# Loading libraries and data

In [None]:
import pandas as pd
from datetime import datetime

In [None]:
# Forward slashes work on every platform and avoid the invalid "\B" escape
# sequence that the backslash-separated literal contained.
df_train = pd.read_csv("data/BPI_Challenge_2012-training.csv")

# Convert timestamp to datetime (preprocessing)


In [None]:
def convert_time(dataset):
    """Add a 'time and date' column holding the parsed event timestamps.

    The 'event time:timestamp' column is expected to contain strings such as
    '01-10-2011 00:38:44.546'. The last four characters (the '.mmm'
    millisecond suffix) are dropped before parsing, so the resulting values
    have one-second resolution.

    The dataframe is modified in place; nothing is returned.
    """
    # Strip the trailing '.mmm' part, then parse the whole column in a single
    # vectorised call instead of calling strptime once per row.
    without_millis = dataset['event time:timestamp'].str[:-4]
    dataset['time and date'] = pd.to_datetime(
        without_millis, format='%d-%m-%Y %H:%M:%S'
    )

# Actual next event and time

In [46]:
def add_actual_next(df_case):
    """Add the actual next activity and the time to it to a single case.

    Two columns are added to ``df_case`` in place:

    * 'Next event': the activity of the following row, '-' for the last row.
    * 'Time to next event': timestamp of the following row minus the
      timestamp of the current row; zero for the last row.

    Args:
        df_case: Dataframe with the events of one case, containing the
            columns 'event concept:name' and 'time and date'.

    Returns:
        The number of events in the case (the trace length).
    """
    events = list(df_case['event concept:name'])
    times = list(df_case['time and date'])

    # Shift the activities one position up; the last activity has no
    # successor and gets '-'. An empty case gets no entries at all.
    next_events = (events[1:] + ['-']) if events else []

    # Shift the timestamps the same way. The last event is paired with its
    # own timestamp so that its time difference is zero. Slicing the original
    # list (times[-1:]) also works for traces with a single event, where the
    # shifted list is empty.
    next_times = times[1:] + times[-1:]

    df_case['Next event'] = next_events
    df_case['Time to next event'] = [
        following - current for current, following in zip(times, next_times)
    ]

    return len(df_case)

# Predicted next event and time 

In [73]:
def get_position_time(df_case, count_dict, time_dict):
    """Accumulate per-position statistics of one case into the given dicts.

    The row index of ``df_case`` is used as the position of an event.

    ``count_dict`` is updated to ``{activity: {position: occurrences}}`` and
    ``time_dict`` to ``{position: {'sum': total time to next event,
    'count': number of events seen}}``. Both dicts are modified in place;
    nothing is returned.
    """
    for position, event_row in df_case.iterrows():
        # How often did this activity occur at this position?
        activity = event_row['event concept:name']
        per_position = count_dict.setdefault(activity, {})
        per_position[position] = per_position.get(position, 0) + 1

        # Running total of the time to the next event for this position.
        elapsed = event_row['Time to next event']
        totals = time_dict.get(position)
        if totals is None:
            time_dict[position] = {'sum': elapsed, 'count': 1}
        else:
            totals['sum'] += elapsed
            totals['count'] += 1

def get_position_rank(max_trace_len, count_dict):
    """Return the most frequent activity for every position of a trace.

    Args:
        max_trace_len: Number of positions to rank (0 .. max_trace_len - 1).
        count_dict: ``{activity: {position: occurrences}}``.

    Returns:
        ``{position: activity}``. On a tie the activity that comes first in
        ``count_dict`` wins. A position at which no activity occurs maps to 0.
    """
    pos_rank_dict = {}
    for position in range(max_trace_len):
        best_count = 0
        best_task = 0
        for task, per_position in count_dict.items():
            # A missing position simply means the activity never occurred
            # there; use .get instead of a bare except that hides real errors.
            occurrences = per_position.get(position, 0)
            # Strict comparison keeps the first activity on ties.
            if occurrences > best_count:
                best_count = occurrences
                best_task = task

        pos_rank_dict[position] = best_task

    return pos_rank_dict

def get_mean_time(total_time_dict):
    """Return the mean time to the next event for every position.

    ``total_time_dict`` maps a position to ``{'sum': ..., 'count': ...}``; the
    result maps the same positions to ``sum / count``.
    """
    return {
        position: totals['sum'] / totals['count']
        for position, totals in total_time_dict.items()
    }

In [74]:
def create_event_pred(df_case, pos_rank_dict, mean_time_dict):
    """Add the predicted next event and predicted time to a single case.

    The columns 'Event prediction' and 'Time prediction' are added to
    ``df_case`` in place. The event prediction for a row is the entry of
    ``pos_rank_dict`` for the following position ('-' for the last row); the
    time prediction is the entry of ``mean_time_dict`` for the row's own
    position.
    """
    positions = range(len(df_case))

    # Look up the ranked activity per position, then shift by one so that
    # every row carries the activity of the position after it.
    ranked = [pos_rank_dict[pos] for pos in positions]
    next_event_guess = ranked[1:] + ['-']

    # Mean time to the next event at each position.
    next_time_guess = [mean_time_dict[pos] for pos in positions]

    df_case['Event prediction'] = next_event_guess
    df_case['Time prediction'] = next_time_guess



In [80]:
def main(path):
    """Build the baseline next-event and time predictions for an event log.

    Args:
        path: Location of the CSV event log, passed to ``pd.read_csv``.

    Returns:
        A dataframe with the events of the log, case by case, extended with
        the columns 'time and date', 'Next event', 'Time to next event',
        'Event prediction' and 'Time prediction'. The row index restarts at 0
        for every case. An empty dataframe is returned for an empty log.
    """
    dataset = pd.read_csv(path)
    convert_time(dataset)

    # First pass: add the actual next event/time to every case and collect
    # the per-position statistics the predictions are based on.
    max_trace_len = 0
    pos_count_dict = {}
    time_dict = {}
    case_frames = []
    # groupby splits the log once instead of filtering the whole dataset for
    # every case; sort=False keeps the cases in order of first appearance.
    for _, case_rows in dataset.groupby('case concept:name', sort=False):
        df_case = case_rows.copy().reset_index(drop=True)
        trace_len = add_actual_next(df_case)
        get_position_time(df_case, pos_count_dict, time_dict)
        case_frames.append(df_case)
        max_trace_len = max(max_trace_len, trace_len)

    if not case_frames:
        # pd.concat raises on an empty list of frames.
        return pd.DataFrame()

    # Derive the predictions from the statistics of the whole log.
    pos_rank_dict = get_position_rank(max_trace_len, pos_count_dict)
    mean_time_dict = get_mean_time(time_dict)

    # Second pass: attach the predictions to every case.
    for df_case in case_frames:
        create_event_pred(df_case, pos_rank_dict, mean_time_dict)

    # A single concat replaces the repeated DataFrame.append calls, which
    # copied the growing frame each time and no longer exist in pandas 2.x.
    return pd.concat(case_frames)

In [81]:
# Forward slashes work on every platform and avoid the invalid "\B" escape
# sequence that the backslash-separated literal contained.
final = main('data/BPI_Challenge_2012-training.csv')

In [79]:
# Show the resulting dataframe as the output of this cell.
final

Unnamed: 0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,time and date,Event prediction,Time prediction
0,0,173688,2011-10-01T00:38:44.546+02:00,20000,A_SUBMITTED,COMPLETE,01-10-2011 00:38:44.546,2011-10-01 00:38:44,A_PARTLYSUBMITTED,0 days 00:00:00.700000
1,1,173688,2011-10-01T00:38:44.546+02:00,20000,A_PARTLYSUBMITTED,COMPLETE,01-10-2011 00:38:44.880,2011-10-01 00:38:44,A_PREACCEPTED,0 days 00:00:41.200000
2,2,173688,2011-10-01T00:38:44.546+02:00,20000,A_PREACCEPTED,COMPLETE,01-10-2011 00:39:37.906,2011-10-01 00:39:37,W_Completeren aanvraag,0 days 00:03:56.200000
3,3,173688,2011-10-01T00:38:44.546+02:00,20000,W_Completeren aanvraag,SCHEDULE,01-10-2011 00:39:38.875,2011-10-01 00:39:38,W_Completeren aanvraag,0 days 02:43:58.250000
4,4,173688,2011-10-01T00:38:44.546+02:00,20000,W_Completeren aanvraag,START,01-10-2011 11:36:46.437,2011-10-01 11:36:46,W_Completeren aanvraag,0 days 00:02:40.125000
...,...,...,...,...,...,...,...,...,...,...
19,38654705683,173715,2011-10-01T09:59:10.501+02:00,45000,W_Nabellen offertes,COMPLETE,10-10-2011 12:54:04.785,2011-10-10 12:54:04,W_Valideren aanvraag,1 days 11:09:46.250000
20,38654705684,173715,2011-10-01T09:59:10.501+02:00,45000,W_Valideren aanvraag,START,13-10-2011 11:00:48.597,2011-10-13 11:00:48,O_CREATED,0 days 00:09:10.250000
21,38654705685,173715,2011-10-01T09:59:10.501+02:00,45000,A_DECLINED,COMPLETE,13-10-2011 11:05:24.620,2011-10-13 11:05:24,O_SENT,0 days 00:00:00.500000
22,38654705686,173715,2011-10-01T09:59:10.501+02:00,45000,O_DECLINED,COMPLETE,13-10-2011 11:05:24.620,2011-10-13 11:05:24,W_Nabellen offertes,0 days 00:00:04.250000
