In [1]:
import csv
import time
from datetime import datetime
import math
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from sklearn.model_selection import train_test_split

In [2]:
class TimePredictionModel:
    """Remaining-time predictor built from the timings of completed cases.

    For every event of every training case, the window of activities ending at
    that event (and each shorter suffix of that window) is abstracted into a
    state key by ``codeState``. Each key is annotated with a tuple
    ``(t, e, r, s)``:

    * ``t`` - ``time.mktime`` value of the event,
    * ``e`` - elapsed time since the first event of the case,
    * ``r`` - remaining time until the last event of the case,
    * ``s`` - sojourn time until the next event (``-1`` for the last event).

    All durations are differences of ``time.mktime`` values, i.e. seconds.

    A case is a sequence of ``(activity, eventtime)`` pairs where ``eventtime``
    is a value accepted by ``time.mktime`` (e.g. a ``time.struct_time``).
    """

    # Types of abstractions that can be used to build the model
    SEQUENCE_ABSTRACTION = 0
    SET_ABSTRACTION = 1
    MULTISET_ABSTRACTION = 2

    def __init__(self, cases=None, abstraction=SEQUENCE_ABSTRACTION, horizon=0, calendar=None):
        """Create the model and annotate it with ``cases``.

        Args:
            cases: iterable of cases used to build the model; ``None`` (the
                default) builds an empty model.
            abstraction: one of the ``*_ABSTRACTION`` class constants.
            horizon: maximum number of trailing activities kept in a state;
                0 means unlimited.
            calendar: stored on the instance but not used yet (see
                ``elapsedTime``).
        """
        self.abstraction = abstraction
        # Unlimited horizon is applied if this parameter is set to 0
        self.horizon = horizon
        self.calendar = calendar
        self.states = dict()
        self.initialstate = self.addState([])
        # ``None`` replaces the former mutable ``[]`` default argument.
        self.build(cases if cases is not None else [])

    def addState(self, trace):
        """Register the state for ``trace`` (if new) and return its key."""
        state = self.codeState(trace)
        if state not in self.states:
            self.states[state] = []
        return state

    def codeState(self, trace):
        """Return the hashable state key for ``trace`` under the configured abstraction.

        Raises:
            ValueError: if ``self.abstraction`` is not a known abstraction type.
        """
        if self.abstraction == self.SEQUENCE_ABSTRACTION:
            # Sequence of execution matters
            state = tuple(trace)
        elif self.abstraction == self.SET_ABSTRACTION:
            # Sequence and repetitions do not matter
            state = frozenset(trace)
        elif self.abstraction == self.MULTISET_ABSTRACTION:
            # Sequence does not matter, but repetitions do: the key is a sorted
            # tuple of (activity, count) pairs.
            state = tuple(sorted(Counter(trace).items()))
        else:
            raise ValueError("Invalid abstraction type.")
        return state

    def build(self, cases):
        """Annotate the model with every case in ``cases``."""
        for case in cases:
            self.processCase(case)

    def processCase(self, case):
        """Add the annotations of one completed case to the model."""
        activities, eventtimes = zip(*case)
        last = len(case) - 1
        for i in range(len(case)):
            # t is the time the state is visited; e is the elapsed time since the start
            # of the case; r is the remaining flow time; s is the sojourn time, i.e., the
            # time until the next event
            t = time.mktime(eventtimes[i])
            e = self.elapsedTime(eventtimes[0], eventtimes[i])
            r = self.elapsedTime(eventtimes[i], eventtimes[-1])
            if i < last:
                s = self.elapsedTime(eventtimes[i], eventtimes[i+1])
            else:
                # No next event: -1 marks "no sojourn time"
                s = -1
            initial = 0
            if self.horizon > 0 and i >= self.horizon:
                # Use a limited horizon, i.e., consider the k last activities executed
                # (k is the value set to self.horizon)
                initial = i - self.horizon + 1
            # Annotate the full window and every shorter suffix of it, so that
            # predictions can back off to shorter histories.
            for j in range(initial, i+1):
                state = self.addState(activities[j:i+1])
                self.states[state].append((t, e, r, s))

    def elapsedTime(self, starttime, endtime):
        """Return ``endtime - starttime`` as a difference of ``time.mktime`` values."""
        # TODO: Use calendar
        start = time.mktime(starttime)
        end = time.mktime(endtime)
        return end - start

    def timePredictionFunction(self, measurements):
        """Summarise ``measurements`` as ``(mean, std, min, max)`` using numpy."""
        mean = np.mean(measurements)
        std = np.std(measurements)
        # Local names chosen so the builtins ``min``/``max`` are not shadowed.
        lowest = np.min(measurements)
        highest = np.max(measurements)
        return mean, std, lowest, highest

    def predictRemainingTime(self, partialtrace):
        """Predict the remaining time for a running case.

        The longest suffix of ``partialtrace`` allowed by the horizon is tried
        first; shorter suffixes are tried until a known state is found.

        Returns:
            ``(mean, std, min, max)`` of the remaining times stored for the
            matching state, or the result of ``fallThrough`` when no suffix
            matches.
        """
        initial = 0
        if self.horizon > 0 and len(partialtrace) > self.horizon:
            initial = len(partialtrace) - self.horizon
        while initial < len(partialtrace):
            state = self.codeState(partialtrace[initial:])
            if state in self.states:
                t, e, r, s = zip(*self.states[state])
                return self.timePredictionFunction(r)
            initial += 1
        # This will only happen if the partial trace contains an activity that did not
        # appear in the training set
        return self.fallThrough()

    def _stateSize(self, state):
        """Return how many activity occurrences the state key ``state`` encodes."""
        if self.abstraction == self.MULTISET_ABSTRACTION:
            # Multiset keys are tuples of (activity, count) pairs, so the size
            # is the sum of the counts, not the number of pairs.
            return sum(count for _activity, count in state)
        return len(state)

    def fallThrough(self):
        """Predict from all single-activity states when no state matches.

        Returns:
            ``(mean, std, min, max)`` over the remaining times of every state
            made of exactly one activity, or ``(None, None, None, None)`` when
            the model holds no such state.
        """
        # Could not find any match in the model for the given trace, so use all
        # measures stored for states composed of a single activity
        allRemaining = []
        for state, annotations in self.states.items():
            if self._stateSize(state) == 1:
                t, e, r, s = zip(*annotations)
                allRemaining.extend(r)
        if len(allRemaining) > 0:
            return self.timePredictionFunction(allRemaining)
        # This will only happen if the model is empty (no case has been processed)
        print("Partial trace does not fit any state in the model. Cannot predict.")
        return None, None, None, None


In [37]:
# def runTimePredictions(cases, category, eventlog, directory, h, cat_cols):
def runTimePredictions(trainingset, testset, eventlog, directory, h, features):
    """Train a TimePredictionModel and evaluate it on every prefix of the test cases.

    For prefix lengths 2, 3, ... the remaining time of each test case longer
    than the prefix is predicted and compared with the ground truth. One row
    per (case, prefix length) is written to
    ``../results/Will/<directory>/horizon_<h>/predictions_nf<len(features)>_<eventlog>``.

    Args:
        trainingset: sequence of ``(caseid, case)`` pairs used to build the model.
        testset: sequence of ``(caseid, case)`` pairs to predict on.
        eventlog: event-log file name, used as the suffix of the output file.
        directory: results sub-directory under ``../results/Will``.
        h: horizon passed to the model and used in the output path.
        features: additional feature names; only their count is used here.

    Side effects:
        Sets the module-level ``cases_cat`` to
        ``[avg_gt, avg_mae, num_cases, sum_gt, num_events]`` and writes the CSV.
    """
    global cases_cat

    num_cases = len(trainingset) + len(testset)
    feats = len(features)

    # Build the model from the traces only; case ids are not needed for training.
    # A comprehension (rather than zip(*...)) also copes with an empty training set.
    training_cases = [case for _caseid, case in trainingset]
    model = TimePredictionModel(training_cases, abstraction=TimePredictionModel.SEQUENCE_ABSTRACTION, horizon=h)

    columns = ["CaseID", "Prefix length", "RT ground truth", "Predicted RT", "Std Deviation", "MAE", "MAE Days"]

    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append was quadratic and no longer exists in pandas 2.x.
    records = []
    prefixlength = 2
    casestotest = True
    while casestotest:
        print("Predicting remaining time using prefix length", prefixlength)
        casestotest = False
        for caseid, case in testset:
            if len(case) <= prefixlength:
                continue
            activities, eventtimes = zip(*case)
            predicted, std, _lowest, _highest = model.predictRemainingTime(activities[:prefixlength])
            groundtruth = model.elapsedTime(eventtimes[prefixlength-1], eventtimes[-1])
            if predicted is not None:
                mae = abs(predicted - groundtruth)
                mae_days = round(mae/86400, 4)
            else:
                # No prediction available: leave both error columns empty
                # instead of reusing a value from a previous row.
                mae = None
                mae_days = None
            records.append({"CaseID": caseid, "Prefix length": prefixlength, "RT ground truth": groundtruth, "Predicted RT": predicted, "Std Deviation": std, "MAE": mae, "MAE Days": mae_days})
            casestotest = True
        prefixlength += 1

    df_prediction_results = pd.DataFrame(records, columns=columns)

    # Coerce to numeric so missing predictions (None) count as NaN in the means.
    mae_values = pd.to_numeric(df_prediction_results['MAE'], errors='coerce')
    gt_values = pd.to_numeric(df_prediction_results['RT ground truth'], errors='coerce')
    avg_mae = round(mae_values.mean(), 4)
    sum_gt = round(gt_values.sum(), 4)
    avg_gt = round(gt_values.mean(), 4)
    num_events = len(df_prediction_results)

    if math.isnan(avg_mae):
        avg_mae = 0.0
    if math.isnan(avg_gt):
        avg_gt = 0.0
        sum_gt = 0.0

    cases_cat = [avg_gt, avg_mae, num_cases, sum_gt, num_events]
    # Use the ``h`` argument, not a global ``horizon``, to locate the output folder.
    output_path = '../results/Will/'+directory+'/horizon_'+str(h)+'/predictions_nf'+str(feats)+'_%s' % eventlog
    df_prediction_results.to_csv(output_path, index=False, sep=';', decimal=",")

In [27]:
# Calculate the metrics to the running prediction
def calcMetrics(cases_cat, directory, horizon=0, add_feat=None):
    """Append the metrics of one prediction run to the horizon's metrics file.

    The file is ``../results/Will/<directory>/horizon_<horizon>/metrics-<horizon>.txt``;
    a ``Horizon:`` header is written only when the file is still empty.

    Args:
        cases_cat: sequence whose items 0, 1 and 2 are the mean ground truth,
            the mean MAE (divided by 86400 for the "in days" line) and the
            total number of cases.
        directory: results sub-directory under ``../results/Will``.
        horizon: horizon of the run, used in the path and the header.
        add_feat: names of the additional features used; ``None`` or an empty
            sequence is reported as "No Features Used".
    """
    url_arquivo = "../results/Will/"+directory+"/horizon_"+str(horizon)+"/metrics-"+str(horizon)+".txt"

    # The ``with`` block closes the file; no explicit close() is needed.
    with open(url_arquivo, "a+") as f:
        if os.stat(url_arquivo).st_size == 0:
            f.write("Horizon:%d\n" % horizon)
        if add_feat:
            f.write("\n  Features Used: "+'-'.join(add_feat))
        else:
            f.write("\n  No Features Used")
        f.write("\n  Ground Truth:%.4f" % cases_cat[0])
        f.write("\n  Mean for MAE:%.4f" % cases_cat[1])
        f.write("\n  Mean for MAE in days:%.4f" % (cases_cat[1]/86400))
        f.write("\n  Total cases:%d\n" % cases_cat[2])

        f.write("\n ------------------------ \n")

In [28]:
def createDirectory(path, horizon=0):
    """Make sure ``path`` and ``path/horizon_<horizon>`` exist.

    Missing parent directories are created as well. Each directory that had to
    be created is reported on stdout; a failure is reported but not raised.

    Args:
        path: base results directory.
        horizon: horizon whose ``horizon_<horizon>`` sub-directory is needed.
    """
    path_horizon = path+'/horizon_'+str(horizon)

    for target in (path, path_horizon):
        if os.path.isdir(target):
            continue
        try:
            # makedirs (unlike mkdir) also creates missing parents; exist_ok
            # tolerates another process creating the directory concurrently.
            os.makedirs(target, exist_ok=True)
        except OSError:
            print ("Creation of the directory %s failed" % target)
        else:
            print ("Successfully created the directory %s " % target)

In [29]:
def appendCategory(df_eventlog):
    """Return a copy of the log whose ActivityID is shifted by the active feature flags.

    Every column from the fourth one onwards is treated as a feature flag. For
    the flag at position ``k`` (0-based among those columns), rows where the
    flag equals 1 get ``max(ActivityID) + k`` added to their ``ActivityID``;
    the maximum is taken once, before any shift. The input frame is not modified.
    """
    encoded = df_eventlog.copy()

    # Offset base: the largest activity id of the untouched log.
    base_offset = encoded['ActivityID'].max()

    for position, feature in enumerate(encoded.columns[3:]):
        flagged_rows = encoded[feature] == 1
        encoded.loc[flagged_rows, 'ActivityID'] += base_offset + position

    return encoded

In [30]:
def loadCases(logname, dirc, features, timeformat, sep):
    """Load an event log and group its events into cases.

    The log is read from ``../dataset/<dirc>/<logname>``, restricted to
    ``features`` and stored in the module-level ``df_eventlog``. The first three
    entries of ``features`` must be the case id, activity id and timestamp
    columns, in that order; any further columns are consumed by
    ``appendCategory``.

    Args:
        logname: file name of the event log.
        dirc: dataset sub-directory under ``../dataset``.
        features: column names to keep.
        timeformat: ``time.strptime`` format matching ``str()`` of the parsed
            timestamps.
        sep: field separator of the CSV file.

    Returns:
        List of ``(caseid, events)`` pairs, where ``events`` is a list of
        ``(activity, time.struct_time)`` tuples in file order. An empty log
        yields an empty list.
    """
    global df_eventlog
    log_path = "../dataset/"+dirc+"/%s" % logname
    try:
        # 'warn' skips malformed lines with a warning, matching the former
        # error_bad_lines=False behaviour.
        raw_log = pd.read_csv(log_path, sep=sep, on_bad_lines='warn')
    except TypeError:
        # pandas < 1.3 does not know on_bad_lines
        raw_log = pd.read_csv(log_path, sep=sep, error_bad_lines=False)

    # Work on an explicit copy so the timestamp conversion below is not a
    # chained assignment on a view of the raw frame.
    df_eventlog = raw_log[features].copy()

    # Transform timestamp to datetime format
    timestamp_col = features[2]
    df_eventlog[timestamp_col] = pd.to_datetime(df_eventlog[timestamp_col])

    # Columns are (CaseId, ActivityId, CompleteTimestamp)
    csv_data = appendCategory(df_eventlog).iloc[:, :3].values

    cases = []
    previouscase = None
    case = []
    for caseid, activity, timestamp in csv_data:
        # Assume events are ordered by CaseID and then CompleteTimestamp in the event log
        if caseid != previouscase:
            # This is the first event of a new case: store the finished one
            if case:
                cases.append((previouscase, case))
                case = []
            previouscase = caseid
        eventtime = time.strptime(str(timestamp), timeformat)
        case.append((activity, eventtime))
    # Add the last case (skipped when the log had no events at all)
    if case:
        cases.append((previouscase, case))
    return cases


In [39]:
# Module-level state shared with loadCases / runTimePredictions.
df_eventlog = None
cases_cat = []
number_cases = 0

if __name__ == '__main__':
    begin_time = datetime.now()
    # No `global cases_cat` here: at module level it has no effect, and because
    # the name is assigned above, Python 3.6+ rejects it as a SyntaxError.

    # Event log to process. Alternatives used in other runs:
    #   "BPI_Challenge_2013_closed_problems_columns_date_treated_features_encoded_one_hot.csv"
    #   "rm_outliers_WFM-treated_completed_cases_40_feat_select_result.csv"
    #   "BPI_Challenge_2019_treated_treated_final_over_sampling_v1_case_Item Type_one_hot.csv"
    #   "incident_event_log-columns-treated-for-test_onehot_result_assignment_location_cat.csv"
    eventlog = "WFM-treated_completed_cases_40_Type_City_Severity_one_hot.csv"
    sep="|"

    # First three entries: case id, activity id, timestamp; the rest are the
    # one-hot feature columns folded into the activity id. Alternatives:
    #   ['CaseID','ActivityID','CompleteTimestamp', 'category_Category 32']
    #   ['CaseID','ActivityID','CompleteTimestamp', 'organization involved_G199 3rd', 'resource country_Sweden']
    #   ['CaseID','ActivityID','CompleteTimestamp']
    features = ['CaseID','ActivityID','CompleteTimestamp','Type_PC','City_Los Angeles']
    timeformat = "%Y-%m-%d %H:%M:%S"
    horizons = [2, 4, 6, 8, 10, 12]

    # Dataset sub-directory (alternatives: 'BPI2019/by_feat', 'BPI2013')
    directory = 'WFM'
    feats = features[3:]

    # Load the cases and their timestamps
    print("Loading the cases")
    cases = loadCases(eventlog, directory, features, timeformat, sep)
    # NOTE(review): the "+ 1" is kept from the original; number_cases is not
    # read anywhere in this section - confirm whether it is still needed.
    number_cases = len(cases) + 1

    # Chronological-order split: first two thirds for training, rest for testing.
    foldsize = int(round(len(cases)/3))
    trainingset = cases[:2*foldsize]
    testset = cases[2*foldsize:]
    print("Cases Loaded")

    # From here on ``directory`` names the results folder (one per event log);
    # it does not depend on the horizon, so it is computed once.
    directory = eventlog.replace('.csv', '')
    path = '../results/Will/'+directory

    # ``horizon`` is deliberately a module-level name: it stays visible to
    # functions that look it up globally.
    for horizon in horizons:
        createDirectory(path, horizon)

        print("Horizon selected: ", horizon)
        runTimePredictions(trainingset, testset, eventlog, directory, horizon, feats)

        print("\nComputing metrics from prediction")
        calcMetrics(cases_cat, directory, horizon, feats)
        print("\nMetrics computed")
    print(datetime.now() - begin_time)

Loading the cases
Cases Loaded
Horizon selected:  2
Predicting remaining time using prefix length 2
Predicting remaining time using prefix length 3
Predicting remaining time using prefix length 4
Predicting remaining time using prefix length 5
Predicting remaining time using prefix length 6
Predicting remaining time using prefix length 7
Predicting remaining time using prefix length 8
Predicting remaining time using prefix length 9
Predicting remaining time using prefix length 10
Predicting remaining time using prefix length 11
Predicting remaining time using prefix length 12
Predicting remaining time using prefix length 13
Predicting remaining time using prefix length 14
Predicting remaining time using prefix length 15
Predicting remaining time using prefix length 16
Predicting remaining time using prefix length 17
Predicting remaining time using prefix length 18
Predicting remaining time using prefix length 19
Predicting remaining time using prefix length 20
Predicting remaining time