In [3]:
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import datetime as datetime
from sklearn.model_selection import train_test_split

In [150]:
df_train = pd.read_csv("bpi2017_train.csv", parse_dates = ['time:timestamp'])
df_val = pd.read_csv("bpi2017_val.csv", parse_dates = ['time:timestamp'])
df_test = pd.read_csv("bpi2017_test.csv", parse_dates = ['time:timestamp'])

# The default name indicating the case ID is case:concept:name
# concept:name is the event
# time:timestamp is the corresponding timestamp
#load the datasets, sort them on case and consequently timestamp, then reset the index
df_train = df_train.sort_values(by=['case:concept:name', 'time:timestamp']).reset_index()
df_val = df_val.sort_values(by=['case:concept:name', 'time:timestamp']).reset_index()
df_test = df_test.sort_values(by=['case:concept:name', 'time:timestamp']).reset_index()

#remove obsolete columns
df_train = df_train.drop(['index', 'Unnamed: 0'], axis = 1)
df_val = df_val.drop(['index', 'Unnamed: 0'], axis = 1)
df_test = df_test.drop(['index', 'Unnamed: 0'], axis = 1)

In [135]:
#Cumulative sum function to be used later
def CumSum(lists):
    #```Returns the cumulative sum of a list```
    cu_list = []
    length = len(lists)
    cu_list = [sum(lists[0:x:1]) for x in range(0, length+1)]
    return cu_list[1:]

# 2. Baseline Time Prediction (Only on Training Dataset)

In [137]:
# Calculate time difference between each row
df_train['time_diff'] = df_train['time:timestamp'].diff().dt.total_seconds()
# Set the time difference of the 1st row to 0 as it's currently NaN
df_train.at[0, 'time_diff'] = 0
# Count number of steps per process
Length_per_case_List = df_train.groupby(['case:concept:name'])['time_diff'].count().tolist()

#using the cumulative sum we get all the positions that are a first step in a process, here the time difference 
#should thus be set to 0
position_lst = CumSum(Length_per_case_List)
for i in tqdm(position_lst):
    df_train.at[i,'time_diff']=0
#The for Loop mysteriously creates an empty row at the end of the df, gotta delete it
df_train = df_train.iloc[:-1]

#Unzip the position list to get the number of each steps of each process, make that into a list
step_in_process = []
for x in tqdm(Length_per_case_List):
    for y in range(x):
        step_in_process.append(y + 1)
# Assign position number to each row/process
df_train['Process_Step'] = step_in_process


#The warnings are obsolete, it's because it uses .at which is considerably faster than .loc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['time_diff'] = df_train['time:timestamp'].diff().dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.loc[index, col] = value
100%|██████████| 16308/16308 [00:00<00:00, 65416.10it/s] 
100%|██████████| 16308/16308 [00:00<00:00, 171750.61it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['Process_Step'] = step_in_proce

In [140]:
df_train

Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,...,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,time_diff,Process_Step
0,Created,User_1,A_Create Application,Application,Application_1000086665,complete,2016-08-03 15:57:21.673000+00:00,"Other, see explanation",New credit,Application_1000086665,...,,,,,,,,,0.000,1
1,statechange,User_1,A_Submitted,Application,ApplState_161925113,complete,2016-08-03 15:57:21.734000+00:00,"Other, see explanation",New credit,Application_1000086665,...,,,,,,,,,0.061,2
2,Created,User_1,W_Handle leads,Workflow,Workitem_747707399,schedule,2016-08-03 15:57:21.963000+00:00,"Other, see explanation",New credit,Application_1000086665,...,,,,,,,,,0.229,3
3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1030261128,withdraw,2016-08-03 15:58:28.286000+00:00,"Other, see explanation",New credit,Application_1000086665,...,,,,,,,,,66.323,4
4,Created,User_1,W_Complete application,Workflow,Workitem_1127124826,schedule,2016-08-03 15:58:28.293000+00:00,"Other, see explanation",New credit,Application_1000086665,...,,,,,,,,,0.007,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620079,Obtained,User_68,W_Validate application,Workflow,Workitem_715332932,resume,2016-04-26 09:22:34.654000+00:00,Home improvement,New credit,Application_610205010,...,,,,,,,,,6159.402,67
620080,statechange,User_68,O_Accepted,Offer,OfferState_287854721,complete,2016-04-26 09:23:27.987000+00:00,Home improvement,New credit,Application_610205010,...,,,,,,,,Offer_772484790,53.333,68
620081,statechange,User_68,A_Pending,Application,ApplState_1161629338,complete,2016-04-26 09:23:28.009000+00:00,Home improvement,New credit,Application_610205010,...,,,,,,,,,0.022,69
620082,Deleted,User_68,W_Validate application,Workflow,Workitem_1093600680,complete,2016-04-26 09:23:28.012000+00:00,Home improvement,New credit,Application_610205010,...,,,,,,,,,0.003,70


In [147]:
# Calculate mean time difference grouped by position based on the number of cases
mean_time_lst = df_train.groupby('Process_Step')['time_diff'].mean().tolist()
mean_time_lst

# Create the predicted time column per entry using the mean time difference
pred_time_lst = [mean_time_lst[j - 1] for j in step_in_process]
df_train['baseline_predicted_time'] = pred_time_lst
df_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['baseline_predicted_time'] = pred_time_lst


Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,...,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,time_diff,Process_Step,baseline_predicted_time
0,Created,User_1,A_Create Application,Application,Application_1000086665,complete,2016-08-03 15:57:21.673000+00:00,"Other, see explanation",New credit,Application_1000086665,...,,,,,,,,0.000,1,0.000000
1,statechange,User_1,A_Submitted,Application,ApplState_161925113,complete,2016-08-03 15:57:21.734000+00:00,"Other, see explanation",New credit,Application_1000086665,...,,,,,,,,0.061,2,0.107744
2,Created,User_1,W_Handle leads,Workflow,Workitem_747707399,schedule,2016-08-03 15:57:21.963000+00:00,"Other, see explanation",New credit,Application_1000086665,...,,,,,,,,0.229,3,0.246024
3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1030261128,withdraw,2016-08-03 15:58:28.286000+00:00,"Other, see explanation",New credit,Application_1000086665,...,,,,,,,,66.323,4,2451.263246
4,Created,User_1,W_Complete application,Workflow,Workitem_1127124826,schedule,2016-08-03 15:58:28.293000+00:00,"Other, see explanation",New credit,Application_1000086665,...,,,,,,,,0.007,5,103.970044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620079,Obtained,User_68,W_Validate application,Workflow,Workitem_715332932,resume,2016-04-26 09:22:34.654000+00:00,Home improvement,New credit,Application_610205010,...,,,,,,,,6159.402,67,26557.137350
620080,statechange,User_68,O_Accepted,Offer,OfferState_287854721,complete,2016-04-26 09:23:27.987000+00:00,Home improvement,New credit,Application_610205010,...,,,,,,,Offer_772484790,53.333,68,25754.074620
620081,statechange,User_68,A_Pending,Application,ApplState_1161629338,complete,2016-04-26 09:23:28.009000+00:00,Home improvement,New credit,Application_610205010,...,,,,,,,,0.022,69,30170.973672
620082,Deleted,User_68,W_Validate application,Workflow,Workitem_1093600680,complete,2016-04-26 09:23:28.012000+00:00,Home improvement,New credit,Application_610205010,...,,,,,,,,0.003,70,33482.332418


# 3. Apply Above Calculated Mean Time to Validation and Test Set

In [151]:
# Calculate time difference
df_val['time_diff'] = df_val['time:timestamp'].diff().dt.total_seconds()
# Set the time difference of the 1st row 0
df_val.loc[0, 'time_diff'] = 0
# Count number of processes per trace/ID
count_val_lst = df_val.groupby('case:concept:name').count()['time_diff'].tolist()
# Assign position number to each row/process
position_lst_1_val = [list(range(1, i + 1)) for i in count_val_lst]
position_lst_val = []
for i in position_lst_1_val:
    for j in i:
        position_lst_val.append(j)
df_val['Process_Step'] = position_lst_val
# Set the time difference of every process with position = 1 as 0
df_val.loc[df_val['Process_Step'] == 1, 'time_diff'] = 0
# Create the predicted time column per entry using the mean time difference
pred_time_lst_val = []
for j in position_lst_val:
    if j <= len(mean_time_lst):
        pred_time_lst_val.append(mean_time_lst[j - 1])
    else:
        pred_time_lst_val.append(0)
df_val['baseline_predicted_time'] = pred_time_lst_val
df_val

Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,...,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,time_diff,Process_Step,baseline_predicted_time
0,Created,User_1,A_Create Application,Application,Application_1000339879,complete,2016-03-17 12:57:10.159000+00:00,Existing loan takeover,New credit,Application_1000339879,...,,,,,,,,0.000,1,0.000000
1,statechange,User_1,A_Submitted,Application,ApplState_778367888,complete,2016-03-17 12:57:10.201000+00:00,Existing loan takeover,New credit,Application_1000339879,...,,,,,,,,0.042,2,0.107744
2,Created,User_1,W_Handle leads,Workflow,Workitem_1463338913,schedule,2016-03-17 12:57:10.478000+00:00,Existing loan takeover,New credit,Application_1000339879,...,,,,,,,,0.277,3,0.246024
3,Deleted,User_1,W_Handle leads,Workflow,Workitem_2039736457,withdraw,2016-03-17 12:58:21.610000+00:00,Existing loan takeover,New credit,Application_1000339879,...,,,,,,,,71.132,4,2451.263246
4,Created,User_1,W_Complete application,Workflow,Workitem_1636332312,schedule,2016-03-17 12:58:21.619000+00:00,Existing loan takeover,New credit,Application_1000339879,...,,,,,,,,0.009,5,103.970044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156043,statechange,User_130,O_Returned,Offer,OfferState_932083099,complete,2016-10-05 09:56:01.060000+00:00,Home improvement,New credit,Application_609941821,...,,,,,,,Offer_1124522024,5.348,26,74144.325198
156044,Released,User_130,W_Validate application,Workflow,Workitem_659477062,suspend,2016-10-05 09:57:53.288000+00:00,Home improvement,New credit,Application_609941821,...,,,,,,,,112.228,27,61147.565498
156045,statechange,User_75,O_Accepted,Offer,OfferState_818903550,complete,2016-10-09 19:01:17.881000+00:00,Home improvement,New credit,Application_609941821,...,,,,,,,Offer_1124522024,378204.593,28,52753.889248
156046,statechange,User_75,A_Pending,Application,ApplState_1476651273,complete,2016-10-09 19:01:17.884000+00:00,Home improvement,New credit,Application_609941821,...,,,,,,,,0.003,29,51195.260052


In [149]:
# Calculate time difference
df_test['time_diff'] = df_test['time:timestamp'].diff().dt.total_seconds()
# Set the time difference of the 1st row 0
df_test.loc[0, 'time_diff'] = 0
# Count number of processes per trace/ID
count_test_lst = df_test.groupby('case:concept:name').count()['time_diff'].tolist()
# Assign position number to each row/process
position_lst_1_test = [list(range(1, i + 1)) for i in count_test_lst]
position_lst_test = []
for i in position_lst_1_test:
    for j in i:
        position_lst_test.append(j)
df_test['Process_Step'] = position_lst_test
# Set the time difference of every process with position = 1 as 0
df_test.loc[df_test['Process_Step'] == 1, 'time_diff'] = 0
# Create the predicted time column per entry using the mean time difference
pred_time_lst_test = []
for j in position_lst_test:
    if j <= len(mean_time_lst):
        pred_time_lst_test.append(mean_time_lst[j - 1])
    else:
        pred_time_lst_test.append(0)
df_test['baseline_predicted_time'] = pred_time_lst_test
df_test

Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,...,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,time_diff,Process_Step,baseline_predicted_time
0,Created,User_1,A_Create Application,Application,Application_610717758,complete,2016-11-19 23:12:05.325000+00:00,Car,New credit,Application_610717758,...,,,,,,,,0.000,1,0.000000
1,statechange,User_1,A_Submitted,Application,ApplState_1960150929,complete,2016-11-19 23:12:06.411000+00:00,Car,New credit,Application_610717758,...,,,,,,,,1.086,2,0.107744
2,Created,User_1,W_Handle leads,Workflow,Workitem_684900503,schedule,2016-11-19 23:12:06.690000+00:00,Car,New credit,Application_610717758,...,,,,,,,,0.279,3,0.246024
3,Deleted,User_1,W_Handle leads,Workflow,Workitem_954760450,withdraw,2016-11-19 23:12:47.428000+00:00,Car,New credit,Application_610717758,...,,,,,,,,40.738,4,2451.263246
4,Created,User_1,W_Complete application,Workflow,Workitem_1752703538,schedule,2016-11-19 23:12:47.435000+00:00,Car,New credit,Application_610717758,...,,,,,,,,0.007,5,103.970044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28701,Deleted,User_1,W_Call after offers,Workflow,Workitem_1894495253,ate_abort,2016-12-20 06:32:24.316000+00:00,Existing loan takeover,New credit,Application_999632431,...,,,,,,,,331831.284,19,177611.591210
28702,Created,User_1,W_Call after offers,Workflow,Workitem_1671270331,schedule,2016-12-20 06:32:24.326000+00:00,Existing loan takeover,New credit,Application_999632431,...,,,,,,,,0.010,20,141597.683657
28703,statechange,User_1,A_Cancelled,Application,ApplState_318084843,complete,2017-01-16 07:00:30.429000+00:00,Existing loan takeover,New credit,Application_999632431,...,,,,,,,,2334486.103,21,108116.420252
28704,statechange,User_1,O_Cancelled,Offer,OfferState_560115325,complete,2017-01-16 07:00:30.449000+00:00,Existing loan takeover,New credit,Application_999632431,...,,,,,,,Offer_872978767,0.020,22,144902.788360
