# Loading libraries and data

In [1]:
import pandas as pd
from datetime import datetime

# Convert timestamp to datetime (preprocessing)


In [2]:
def convert_time(dataset):
    """Parse the raw event timestamps and store them in a new column.

    Reads ``dataset['event time:timestamp']``, drops the last four characters
    of every value and parses what remains as
    ``day-month-year hour:minute:second``. The parsed values are written to
    ``dataset['time and date']``. The dataset is modified in place and
    nothing is returned.
    """
    timestamp_format = '%d-%m-%Y %H:%M:%S'
    raw_stamps = dataset['event time:timestamp']

    # The slice removes the trailing four characters (presumably a
    # fractional-seconds suffix such as ".000") before parsing.
    dataset['time and date'] = [
        datetime.strptime(raw[:-4], timestamp_format) for raw in raw_stamps
    ]

# Actual next event and time

In [3]:
def add_actual_next(df_case):
    """Add the actual next activity and the time until it to a single trace.

    Two columns are added to ``df_case`` in place:

    * ``'Next event'`` - the value of ``'event concept:name'`` in the
      following row; the last row gets ``'-'`` because nothing follows it.
    * ``'Time to next event'`` - the ``'time and date'`` of the following row
      minus that of the current row; the last row gets a zero difference.

    Args:
        df_case: DataFrame holding the events of one trace, in order, with the
            columns ``'event concept:name'`` and ``'time and date'``.

    Returns:
        The number of rows (events) in ``df_case``.
    """
    events = list(df_case['event concept:name'])
    times = list(df_case['time and date'])

    # Shift the activities one position up; the final event has no successor.
    next_events = (events[1:] + ['-']) if events else []

    # Shift the timestamps the same way and pad with the trace's own last
    # timestamp so the final difference is zero. Slicing (times[-1:]) instead
    # of indexing keeps this valid for single-event and empty traces.
    next_times = times[1:] + times[-1:]
    time_diff = [following - current for current, following in zip(times, next_times)]

    df_case['Next event'] = next_events
    df_case['Time to next event'] = time_diff

    return len(df_case)

# Predicted next event and time 

In [4]:
def get_position_time(df_case, count_dict, time_dict):
    """Accumulate per-position statistics of one trace into two dictionaries.

    Both dictionaries are updated in place; nothing is returned.

    Args:
        df_case: DataFrame of one trace with the columns
            ``'event concept:name'`` and ``'Time to next event'``. The row
            index is used as the position of the event.
        count_dict: ``{activity: {position: count}}`` - how often each
            activity was seen at each position.
        time_dict: ``{position: {'sum': total, 'count': n}}`` - running sum of
            ``'Time to next event'`` and number of events seen per position.
    """
    for position, event_row in df_case.iterrows():
        # Count this activity at this position.
        activity = event_row['event concept:name']
        per_position = count_dict.setdefault(activity, {})
        per_position[position] = per_position.get(position, 0) + 1

        # Add the time to the next event to the running total of the position.
        duration = event_row['Time to next event']
        totals = time_dict.get(position)
        if totals is None:
            time_dict[position] = {'sum': duration, 'count': 1}
        else:
            totals['sum'] += duration
            totals['count'] += 1

def get_position_rank(max_trace_len, count_dict):
    """Pick the most frequent activity for every position.

    Args:
        max_trace_len: Number of positions to rank; positions
            ``0 .. max_trace_len - 1`` are evaluated.
        count_dict: ``{activity: {position: count}}`` mapping.

    Returns:
        ``{position: activity}`` with the activity that has the highest count
        at that position. On a tie the activity that comes first in
        ``count_dict`` wins. A position for which no activity has a positive
        count maps to the placeholder ``0``.
    """
    pos_rank_dict = {}
    for position in range(max_trace_len):
        best_task = 0
        best_count = 0
        for task, counts in count_dict.items():
            # A missing position simply means the activity never occurred
            # there; no exception handling is needed for that.
            count = counts.get(position, 0)
            if count > best_count:
                best_count = count
                best_task = task

        pos_rank_dict[position] = best_task

    return pos_rank_dict

def get_mean_time(total_time_dict):
    """Turn per-position totals into per-position averages.

    Args:
        total_time_dict: ``{position: {'sum': total, 'count': n}}`` mapping.

    Returns:
        A new ``{position: total / n}`` dictionary; the input is not modified.
    """
    return {
        position: totals['sum'] / totals['count']
        for position, totals in total_time_dict.items()
    }

In [5]:
def create_event_pred(df_case, pos_rank_dict, mean_time_dict):
    """Add the predicted next activity and predicted time to a single trace.

    Two columns are added to ``df_case`` in place; nothing is returned:

    * ``'Event prediction'`` - for the row at position ``i`` the entry of
      ``pos_rank_dict`` for position ``i + 1``; the last row gets ``'-'``.
    * ``'Time prediction'`` - for the row at position ``i`` the entry of
      ``mean_time_dict`` for position ``i``.

    A trace can be longer than anything covered by the dictionaries (for
    example an unseen case scored with dictionaries built from other data).
    Positions without an entry get a missing value (``None``) instead of
    raising ``KeyError``.

    Args:
        df_case: DataFrame holding the events of one trace, in order.
        pos_rank_dict: ``{position: activity}`` mapping.
        mean_time_dict: ``{position: mean time to next event}`` mapping.
    """
    n_events = len(df_case)

    # The prediction for the next activity at position i is the activity
    # ranked for position i + 1; the last event is predicted to end the trace.
    pred_act_lst = [pos_rank_dict.get(position) for position in range(1, n_events)]
    if n_events:
        pred_act_lst.append('-')

    # The prediction for the time to the next event at position i.
    pred_time_lst = [mean_time_dict.get(position) for position in range(n_events)]

    df_case['Event prediction'] = pred_act_lst
    df_case['Time prediction'] = pred_time_lst



In [24]:
def train(path, maximum=None):
    """Build the position-based baseline from a training event log.

    The CSV at ``path`` is loaded, its timestamps are converted, and every
    selected case is annotated with the actual next event and the time to it.
    From these, the most frequent activity per position and the mean time to
    the next event per position are derived and written back to every case as
    predictions.

    Args:
        path: Location of the CSV file with the training event log.
        maximum: Optional slice bound on the list of case identifiers (taken
            in order of first appearance); ``None`` uses all cases.

    Returns:
        A tuple ``(df_predicted, pos_rank_dict, mean_time_dict)``: the
        concatenated per-case frames including the prediction columns, the
        ``{position: activity}`` mapping and the ``{position: mean time}``
        mapping.
    """
    dataset = pd.read_csv(path)
    convert_time(dataset)

    case_column = 'case concept:name'
    # Case identifiers in order of first appearance. Rows without a case
    # identifier cannot form a trace and are left out.
    cases = list(dataset[case_column].dropna().unique())[:maximum]
    # Split the log once instead of masking the whole frame for every case.
    grouped = dataset.groupby(case_column, sort=False)

    # Annotate every case with the actual next event and collect statistics.
    max_trace_len = 0
    pos_count_dict = {}
    time_dict = {}
    case_frames = []
    for case in cases:
        df_case = grouped.get_group(case).copy().reset_index(drop=True)
        trace_len = add_actual_next(df_case)
        get_position_time(df_case, pos_count_dict, time_dict)
        case_frames.append(df_case)
        max_trace_len = max(max_trace_len, trace_len)

    # Derive the predictions and write them back to every case.
    pos_rank_dict = get_position_rank(max_trace_len, pos_count_dict)
    mean_time_dict = get_mean_time(time_dict)

    for df_case in case_frames:
        create_event_pred(df_case, pos_rank_dict, mean_time_dict)

    # DataFrame.append no longer exists in pandas 2.x; concatenating once is
    # also linear instead of quadratic. pd.concat rejects an empty list.
    df_predicted = pd.concat(case_frames) if case_frames else pd.DataFrame()

    return df_predicted, pos_rank_dict, mean_time_dict

In [30]:
def test(path, train_pos, train_time):
    """Create the test dataset including predictions based on the training data.

    The CSV at ``path`` is loaded, its timestamps are converted, and every
    case is annotated with the actual next event and time as well as the
    predictions taken from the supplied dictionaries.

    Args:
        path: Location of the CSV file with the test event log.
        train_pos: ``{position: activity}`` mapping as returned by ``train``.
        train_time: ``{position: mean time}`` mapping as returned by ``train``.

    Returns:
        The concatenated per-case frames with the actual and predicted
        columns; an empty DataFrame when the log contains no cases.
    """
    dataset = pd.read_csv(path)
    convert_time(dataset)

    case_column = 'case concept:name'
    case_frames = []
    # Split the log once (in order of first appearance) instead of masking the
    # whole frame for every case.
    for _, group in dataset.groupby(case_column, sort=False):
        df_case = group.copy().reset_index(drop=True)
        add_actual_next(df_case)
        create_event_pred(df_case, train_pos, train_time)
        case_frames.append(df_case)

    # DataFrame.append no longer exists in pandas 2.x; concatenating once is
    # also linear instead of quadratic. pd.concat rejects an empty list.
    return pd.concat(case_frames) if case_frames else pd.DataFrame()

In [44]:
def get_accuracy(dataset):
    """Compute the share of exactly correct event and time predictions.

    A row counts as a correct event prediction when ``'Next event'`` equals
    ``'Event prediction'``, and as a correct time prediction when
    ``'Time to next event'`` equals ``'Time prediction'`` exactly.

    Args:
        dataset: DataFrame with the four columns named above.

    Returns:
        A tuple ``(accuracy_event, accuracy_time)`` of fractions between 0 and
        1. An empty dataset yields ``(0.0, 0.0)`` instead of dividing by zero.
    """
    total = len(dataset)
    if total == 0:
        return 0.0, 0.0

    # Walking the columns pairwise avoids building a Series for every row.
    correct_event = sum(
        bool(actual == predicted)
        for actual, predicted in zip(dataset['Next event'], dataset['Event prediction'])
    )
    correct_time = sum(
        bool(actual == predicted)
        for actual, predicted in zip(dataset['Time to next event'], dataset['Time prediction'])
    )

    return correct_event / total, correct_time / total

In [43]:
# Fit the baseline on at most the first 20000 training cases, then apply the
# learned per-position statistics to the test log.
# Forward slashes avoid the invalid "\B" escape sequence of the backslash form
# and are accepted on Windows as well as on other platforms.
df_train, train_pos, train_time = train('data/BPI_Challenge_2012/BPI_Challenge_2012-training.csv', 20000)
df_test = test('data/BPI_Challenge_2012/BPI_Challenge_2012-test.csv', train_pos, train_time)

In [45]:
# Score the training and test predictions; each call returns the pair
# (event accuracy, time accuracy).
train_event_acc, train_time_acc = get_accuracy(df_train)
test_event_acc, test_time_acc = get_accuracy(df_test)

In [46]:
# Display the (event, time) accuracy on the training set; the tuple below is
# the recorded cell output.
train_event_acc, train_time_acc

(0.4880832800657474, 0.00022828965391288468)

In [40]:
# Display the (event, time) accuracy on the test set; the tuple below is the
# recorded cell output.
test_event_acc, test_time_acc

(0.49124898061602157, 0.0)