In [1]:
import csv
import time
import math
import os
import pandas as pd
import numpy as np
from collections import Counter

In [2]:
class TimePredictionModel:
    """Remaining-time predictor built from timing annotations on abstracted trace states.

    A training case is a sequence of ``(activity, eventtime)`` pairs, where
    ``eventtime`` is a value accepted by ``time.mktime`` (e.g. a ``time.struct_time``).
    While a case is replayed, every visited prefix (and its suffixes, limited by the
    horizon) is encoded as a state and annotated with a ``(t, e, r, s)`` tuple.
    Predictions are statistics over the ``r`` (remaining time) annotations of the
    state that matches a partial trace.
    """

    # Types of abstractions that can be used to build the model
    SEQUENCE_ABSTRACTION = 0
    SET_ABSTRACTION = 1
    MULTISET_ABSTRACTION = 2

    def __init__(self, cases=None, abstraction=SEQUENCE_ABSTRACTION, horizon=0, calendar=None):
        """Create the model and train it on ``cases``.

        Args:
            cases: iterable of cases (each a sequence of ``(activity, eventtime)``
                pairs); ``None`` builds an empty model.
            abstraction: one of the ``*_ABSTRACTION`` class constants.
            horizon: maximum number of trailing activities encoded in a state;
                0 means unlimited.
            calendar: stored on the instance but not used yet (see ``elapsedTime``).
        """
        self.abstraction = abstraction
        # Unlimited horizon is applied if this parameter is set to 0
        self.horizon = horizon
        self.calendar = calendar
        self.states = dict()
        self.initialstate = self.addState([])
        # A None default avoids sharing one mutable list between instances
        self.build(cases if cases is not None else [])

    def addState(self, trace):
        """Register the state encoding ``trace`` (if new) and return its key."""
        state = self.codeState(trace)
        if state not in self.states:
            self.states[state] = []
        return state

    def codeState(self, trace):
        """Encode ``trace`` as a hashable state key according to the abstraction.

        Raises:
            ValueError: if ``self.abstraction`` is not a known abstraction type.
        """
        if self.abstraction == self.SEQUENCE_ABSTRACTION:
            # Sequence of execution matters
            state = tuple(trace)
        elif self.abstraction == self.SET_ABSTRACTION:
            # Sequence and repetitions do not matter
            state = frozenset(trace)
        elif self.abstraction == self.MULTISET_ABSTRACTION:
            # Sequence does not matter, but repetitions do: the key is a sorted
            # tuple of (activity, count) pairs
            state = tuple(sorted(Counter(trace).items()))
        else:
            raise ValueError("Invalid abstraction type.")
        return state

    def build(self, cases):
        """Annotate the model with every case in ``cases``."""
        for case in cases:
            self.processCase(case)

    def processCase(self, case):
        """Replay one case and append its timing annotations to the visited states.

        Empty cases carry no events and are skipped.
        """
        if len(case) == 0:
            return
        activities, eventtimes = zip(*case)
        for i in range(len(case)):
            # t is the time the state is visited; e is the elapsed time since the start
            # of the case; r is the remaining flow time; s is the sojourn time, i.e., the
            # time until the next event
            t = time.mktime(eventtimes[i])
            e = self.elapsedTime(eventtimes[0], eventtimes[i])
            r = self.elapsedTime(eventtimes[i], eventtimes[-1])
            if i < len(case) - 1:
                s = self.elapsedTime(eventtimes[i], eventtimes[i+1])
            else:
                # The last event has no successor; -1 marks "no sojourn time"
                s = -1
            initial = 0
            if self.horizon > 0 and i >= self.horizon:
                # Use a limited horizon, i.e., consider the k last activities executed
                # (k is the value set to self.horizon)
                initial = i - self.horizon + 1
            # Annotate every suffix of the current prefix so that shorter suffixes
            # can be matched by predictRemainingTime when the longer ones are unknown
            for j in range(initial, i+1):
                state = self.addState(activities[j:i+1])
                self.states[state].append((t, e, r, s))

    def elapsedTime(self, starttime, endtime):
        """Return ``endtime - starttime`` in seconds, both converted with ``time.mktime``."""
        # TODO: Use calendar
        start = time.mktime(starttime)
        end = time.mktime(endtime)
        return end - start

    def timePredictionFunction(self, measurements):
        """Return ``(mean, std, min, max)`` of ``measurements`` computed with numpy."""
        mean = np.mean(measurements)
        std = np.std(measurements)
        lowest = np.min(measurements)
        highest = np.max(measurements)
        return mean, std, lowest, highest

    def predictRemainingTime(self, partialtrace):
        """Predict the remaining time after ``partialtrace``.

        The longest suffix of the trace (bounded by the horizon) that is a known
        state is used. If no suffix matches, ``fallThrough`` supplies the result.

        Returns:
            ``(mean, std, min, max)`` of the remaining times recorded for the
            matching state, or ``(None, None, None, None)`` for an empty model.
        """
        initial = 0
        if self.horizon > 0 and len(partialtrace) > self.horizon:
            initial = len(partialtrace) - self.horizon
        while initial < len(partialtrace):
            state = self.codeState(partialtrace[initial:])
            if state in self.states:
                remaining = [r for _, _, r, _ in self.states[state]]
                return self.timePredictionFunction(remaining)
            initial += 1
        # This will only happen if the partial trace contains an activity that did not
        # appear in the training set
        return self.fallThrough()

    def _activityCount(self, state):
        """Return how many activities (counting repetitions) ``state`` encodes."""
        if self.abstraction == self.MULTISET_ABSTRACTION:
            # Multiset keys are tuples of (activity, count) pairs, so the tuple
            # length is the number of distinct activities, not of activities
            return sum(count for _, count in state)
        return len(state)

    def fallThrough(self):
        """Predict from all single-activity states when no state matches the trace.

        Returns:
            ``(mean, std, min, max)`` over the remaining times of every state made
            of exactly one activity, or ``(None, None, None, None)`` if there are none.
        """
        # Could not find any match in the model for the given trace, so use all
        # measures stored for states composed of a single activity
        allRemaining = []
        for state, annotations in self.states.items():
            if self._activityCount(state) == 1:
                allRemaining.extend(r for _, _, r, _ in annotations)
        if len(allRemaining) > 0:
            return self.timePredictionFunction(allRemaining)
        else:
            # This will only happen if the model is empty (no case has been processed)
            print("Partial trace does not fit any state in the model. Cannot predict.")
            return None, None, None, None


In [3]:
def loadCases(logname, columns, timeformat):
    """Read a "|"-separated event log and group its events into cases.

    The log is read from ``../../dataset/test/<logname>`` and also stored in the
    module-level ``df_eventlog`` for later use by other functions.

    Args:
        logname: file name of the event log.
        columns: positions of (CaseId, ActivityId, CompleteTimestamp) in each row.
            When more than three entries are given, the row values at positions
            ``3 .. len(columns)-1`` are concatenated into the case category.
            NOTE(review): the category is read by row position, not through the
            extra entries of ``columns`` - confirm this is intended.
        timeformat: ``time.strptime`` format of the timestamp column.

    Returns:
        A list of ``(caseid, category, events)`` tuples, where ``events`` is a list
        of ``(activity, time.struct_time)`` pairs in file order. ``category`` is
        ``None`` when only three columns are given. An empty log yields ``[]``.
    """
    global df_eventlog
    logpath = "../../dataset/test/%s" % logname
    try:
        # pandas >= 1.3 spelling of "drop malformed lines but warn about them"
        df_eventlog = pd.read_csv(logpath, sep="|", on_bad_lines="warn")
    except TypeError:
        # Older pandas does not know on_bad_lines; use the legacy keyword
        df_eventlog = pd.read_csv(logpath, sep="|", error_bad_lines=False)

    cases = []
    previouscase = None
    category = None
    case = []
    # Columns are (CaseId, ActivityId, CompleteTimestamp)
    for row in df_eventlog.values:
        # Assume events are ordered by CaseID and then CompleteTimestamp in the event log
        if row[columns[0]] != previouscase:
            # This is the first event for a new case: flush the previous one
            if len(case) > 0:
                cases.append((previouscase, category, case))
                case = []
            previouscase = row[columns[0]]
            if len(columns) > 3:
                category = "".join(str(value) for value in row[3:len(columns)])
        eventtime = time.strptime(row[columns[2]], timeformat)
        case.append((row[columns[1]], eventtime))
    # Add the last case (nothing to add when the log had no events)
    if len(case) > 0:
        cases.append((previouscase, category, case))
    return cases


In [4]:
def splitIntoCategories(cases):
    """Group ``(caseid, category, case)`` triples by category.

    Returns a dict mapping each category to the list of ``(caseid, case)`` pairs
    that belong to it, in their original order.
    """
    grouped = {}
    for identifier, label, events in cases:
        # setdefault creates the bucket the first time a category is seen
        grouped.setdefault(label, []).append((identifier, events))
    return grouped

In [61]:
def runTimePredictions(cases, category, eventlog, directory, h):
    """Train a model on one category's cases and evaluate it on a hold-out set.

    The first two thirds of ``cases`` train a sequence-abstraction
    ``TimePredictionModel`` with horizon ``h``; the rest is the test set. For every
    prefix length starting at 2, the remaining time of each sufficiently long test
    case is predicted and compared with the ground truth. The per-prediction
    results are written to
    ``../../results/<directory>/horizon_<h>/predictions_cat<names>_<eventlog>`` and
    the averages are passed to ``calcPercDataCat``.

    Args:
        cases: list of ``(caseid, case)`` pairs for this category.
        category: iterable of category values (one per category column), or None.
        eventlog: event log file name, used to name the output file.
        directory: results sub-directory name.
        h: horizon for the model and for the output directory name.
    """
    global df_eventlog
    # Divide the data set into folds for model generation and prediction
    foldsize = int(round(len(cases)/3))
    trainingset = cases[:2*foldsize]
    testset = cases[2*foldsize:]

    # Category columns are assumed to start at position 3 of the event log;
    # "or ''" lets logs without category columns (category is None) through
    categories_names = ""
    for cont, i in enumerate(category or ""):
        cat = df_eventlog.columns[3+cont]
        categories_names = categories_names + (str(cat+'-'+i+'_'))

    eventlog = ("cat%s_" % categories_names) + eventlog

    # Build the model (a comprehension also works for an empty training fold,
    # where unpacking zip(*trainingset) would fail)
    trainingcases = [case for _, case in trainingset]
    model = TimePredictionModel(trainingcases, abstraction=TimePredictionModel.SEQUENCE_ABSTRACTION, horizon = h)

    columns = ["CaseID", "Prefix length", "RT ground truth", "Predicted RT", "Std Deviation", "MAE", "MAE Days"]

    # Collect rows in a list and build the DataFrame once: DataFrame.append was
    # removed in pandas 2.0 and is quadratic when called in a loop
    records = []
    casestotest = True
    prefixlength = 2

    while casestotest:
        print("Predicting remaining time using prefix length", prefixlength)
        casestotest = False
        for caseid, case in testset:
            if len(case) > prefixlength:
                activities, eventtimes = zip(*case)
                predicted, std, _, _ = model.predictRemainingTime(activities[:prefixlength])
                groundtruth = model.elapsedTime(eventtimes[prefixlength-1], eventtimes[-1])
                if predicted is not None:
                    mae = abs(predicted - groundtruth)
                    mae_days = round(mae/86400, 4)
                else:
                    # No prediction available: leave both error columns empty
                    mae = None
                    mae_days = None
                records.append({"CaseID": caseid, "Prefix length": prefixlength, "RT ground truth": groundtruth, "Predicted RT": predicted, "Std Deviation": std, "MAE": mae, "MAE Days": mae_days})
                casestotest = True
        prefixlength += 1

    df_prediction_results = pd.DataFrame(records, columns=columns)

    # Averages ignore missing values and default to 0.0 when nothing was predicted
    maes = [record["MAE"] for record in records if record["MAE"] is not None]
    groundtruths = [record["RT ground truth"] for record in records]
    avg_mae = round(float(np.mean(maes)), 4) if maes else 0.0
    avg_gt = round(float(np.mean(groundtruths)), 4) if groundtruths else 0.0

    if math.isnan(avg_mae):
        avg_mae = 0.0
    if math.isnan(avg_gt):
        avg_gt = 0.0

    calcPercDataCat(category, avg_mae, avg_gt)
    # Use the h parameter rather than a global horizon variable
    df_prediction_results.to_csv('../../results/'+directory+'/horizon_'+str(h)+'/predictions_%s' % eventlog, index=False, sep=',')

In [62]:
def calcPercDataCat(categories, mae_mean, ground_truth):
    """Record the share of the full event log that belongs to the given categories.

    Filters the module-level ``df_eventlog`` by each category value (category
    columns are taken from position 3 onwards) and stores
    ``[percentage, ground_truth, mae_mean, matching_rows]`` in the module-level
    ``cases_cat`` under a label built from ``"<column>=<value> "`` parts.

    Args:
        categories: iterable of category values, each convertible with ``int``;
            ``None`` means "no category filter" (the whole log is counted).
        mae_mean: mean absolute error to store for this category.
        ground_truth: mean ground-truth remaining time to store.
    """
    global df_eventlog
    global cases_cat

    data_size = df_eventlog.shape[0]

    df_data_per_cat = df_eventlog

    cat_name = ""
    # "or ''" keeps logs without category columns (categories is None) working
    for cont, i in enumerate(categories or ""):
        category = df_eventlog.columns[3+cont]
        df_data_per_cat = df_data_per_cat.loc[df_data_per_cat[category] == int(i)]
        cat_name = cat_name + (str(category+'='+i+' '))

    matching_rows = df_data_per_cat.shape[0]
    # Guard against an empty event log instead of dividing by zero
    if data_size > 0:
        perc_cats_full_data = round(matching_rows * 100 / data_size, 2)
    else:
        perc_cats_full_data = 0.0
    cases_cat[cat_name] = [perc_cats_full_data, ground_truth, mae_mean, matching_rows]
    

In [63]:
def calcMetrics(cases_cat, directory, horizon):
    """Append per-category metrics and their weighted means to ``metrics.txt``.

    The report goes to ``../../results/<directory>/horizon_<horizon>/metrics.txt``
    (opened in append mode). Each category's ground truth and MAE are weighted by
    its percentage of the full data; the totals are the weighted sums divided by 100.

    Args:
        cases_cat: dict mapping a category label to
            ``[percentage, ground_truth, mae_mean, total]``.
        directory: results sub-directory name.
        horizon: horizon used for the predictions (integer, written with ``%d``).
    """
    avgMAE = 0.0
    avgGT = 0.0

    metricspath = "../../results/"+directory+"/horizon_"+str(horizon)+"/metrics.txt"
    # The context manager closes the file even if a write or format fails
    with open(metricspath, "a+") as f:
        f.write("Horizon:%d" % horizon)

        for i in cases_cat:
            f.write("\n\nCategory: %s" % i)
            f.write("\n  Ground Truth:%.4f" % cases_cat[i][1])
            f.write("\n  Mean for MAE:%.4f" % cases_cat[i][2])
            f.write("\n  Percentage from the full data: %.3f" % cases_cat[i][0])
            f.write("\n  Total cases:%d" % cases_cat[i][3])
            avgMAE = avgMAE + (cases_cat[i][0] * cases_cat[i][2])
            avgGT = avgGT  + (cases_cat[i][0] * cases_cat[i][1])

        # Weights are percentages, so divide the weighted sums by 100
        avgMAE = avgMAE / 100
        avgGT = avgGT / 100
        f.write("\n\nComputed Weighted arithmetic mean for MAE error:")
        f.write("\nTotal Ground Truth:%.4f" % avgGT)
        f.write("\nTotal MAE: %.4f" % avgMAE)
        f.write("\nTotal MAE in days: %.4f" % (avgMAE/86400))
        f.write("\n ------------------------ \n\n")

In [80]:
# Module-level state shared between cells: loadCases stores the event log in
# df_eventlog and calcPercDataCat fills cases_cat. Both are reset on every run
# of this cell so that metrics do not accumulate across re-runs.
df_eventlog = None
cases_cat = dict()

if __name__ == '__main__':
    # No "global cases_cat" here: this code already runs at module level, and a
    # global declaration after the assignment above is a SyntaxError on Python 3.6+.
    # Alternative logs: "wfmpca.csv", "helpdesk.csv"
    eventlog = "bpi2013-pca.csv"
    # Positions of (CaseId, ActivityId, CompleteTimestamp, category...) in the log
    columns = (0, 1, 2, 3)
    timeformat = "%Y-%m-%d %H:%M:%S"
    horizon = 8
    # Settings for "running-example.csv":
    #   columns = (3, 0, 7)
    #   timeformat = "%Y-%m-%d %H:%M:%S%z"
    cases = loadCases(eventlog, columns, timeformat)
    casessets = splitIntoCategories(cases)

    directory = eventlog.replace('.csv', '')
    path = '../../results/'+directory+'/horizon_'+str(horizon)

    if not os.path.isdir(path):
        try:
            # makedirs also creates the missing parent results/<directory> folder
            os.makedirs(path)
        except OSError:
            print ("Creation of the directory %s failed" % path)
        else:
            print ("Successfully created the directory %s " % path)

    # A separate loop name keeps the full case list in `cases` intact
    for category, categorycases in casessets.items():
        print("Predicting remaining time for case category", category)
        runTimePredictions(categorycases, category, eventlog, directory, horizon)

    print("\nComputing metrics from prediction")
    calcMetrics(cases_cat, directory, horizon)
    print("\nMetrics computed")

Predicting remaining time for case category 0
Predicting remaining time using prefix length 2
Predicting remaining time using prefix length 3
Predicting remaining time using prefix length 4
Predicting remaining time using prefix length 5
Predicting remaining time using prefix length 6
Predicting remaining time using prefix length 7
Predicting remaining time using prefix length 8
Predicting remaining time using prefix length 9
Predicting remaining time using prefix length 10
Predicting remaining time using prefix length 11
Predicting remaining time using prefix length 12
Predicting remaining time using prefix length 13
Predicting remaining time using prefix length 14
Predicting remaining time using prefix length 15
Predicting remaining time using prefix length 16
Predicting remaining time using prefix length 17
Predicting remaining time using prefix length 18
Predicting remaining time using prefix length 19
Predicting remaining time using prefix length 20
Predicting remaining time using