In [1]:
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import datetime as datetime

In [2]:
df_train = pd.read_csv("bpi2017_train.csv", parse_dates = ['time:timestamp'])
df_val = pd.read_csv("bpi2017_val.csv", parse_dates = ['time:timestamp'])
df_test = pd.read_csv("bpi2017_test.csv", parse_dates = ['time:timestamp'])

# The default name indicating the case ID is case:concept:name
# concept:name is the event
# time:timestamp is the corresponding timestamp
# Load the datasets, sort them on case and consequently timestamp, then reset the index
df_train = df_train.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index()
df_val = df_val.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index()
df_test = df_test.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index()

# Remove obsolete columns
df_train = df_train.drop(['index', 'Unnamed: 0'], axis = 1)
df_val = df_val.drop(['index', 'Unnamed: 0'], axis = 1)
df_test = df_test.drop(['index', 'Unnamed: 0'], axis = 1)

# 1. Calculate the time difference

In [3]:
# Cumulative sum function to be used later
def CumSum(lists):
    # Returns the cumulative sum of a list
    length = len(lists)
    cu_list = [sum(lists[0: x: 1]) for x in range(0, length + 1)]
    return cu_list[1: ]

In [4]:
def time_difference(df):
    # Calculate time difference between each row
    df['time_diff'] = df['time:timestamp'].diff().dt.total_seconds()
    # Set the time difference of the 1st row to 0 as it's currently NaN
    df.at[0, 'time_diff'] = 0
    # Count number of steps per process
    length_per_case_List = df.groupby(['case:concept:name'])['time_diff'].count().tolist()

    # Using the cumulative sum we get all the positions that are a first step in a process
    # And then the time difference can be set to 0
    position_lst = CumSum(length_per_case_List)
    for i in tqdm(position_lst):
        df.at[i, 'time_diff'] = 0
    # For Loop mysteriously creates an empty row at the end of the df, gotta delete it
    df = df.iloc[: -1]

    # Unzip the position list to get the number of each steps of each process, make that into a list
    step_in_process = []
    for x in tqdm(length_per_case_List):
        for y in range(x):
            step_in_process.append(y + 1)
    # Assign position number to each row/process
    df['position'] = step_in_process
    return df

In [5]:
# Apply the above changes to all dataframes
# The warnings are obsolete, it's because it uses .at which is considerably faster than .loc
df_train = time_difference(df_train)
df_val = time_difference(df_val)
df_test = time_difference(df_test)

100%|█████████████████████████████████████████████████████████████████████████| 16308/16308 [00:00<00:00, 52537.89it/s]
100%|████████████████████████████████████████████████████████████████████████| 16308/16308 [00:00<00:00, 192965.60it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|███████████████████████████████████████████████████████████████████████████| 4078/4078 [00:00<00:00, 55190.35it/s]
100%|██████████████████████████████████████████████████████████████████████████| 4078/4078 [00:00<00:00, 226974.86it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 751/751 [00:00<00:00, 41832.75it/s]
100%|████████████████████████████████████████████████████████████████████████████| 751/751 [00:00<00:00, 175219.58it/s]


# 2. Baseline Time Prediction (Only on Training Dataset)

In [6]:
# Get the list of position number
step_in_process_train = df_train['position'].tolist()
# Calculate mean time difference grouped by position based on the number of cases
mean_time_lst = df_train.groupby('position')['time_diff'].mean().tolist()

# Create the predicted time column per entry using the mean time difference
pred_time_lst_train = [mean_time_lst[j - 1] for j in step_in_process_train]
df_train['baseline_predicted_time'] = pred_time_lst_train
df_train

Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,...,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,time_diff,position,baseline_predicted_time
0,Created,User_1,A_Create Application,Application,Application_1000086665,complete,2016-08-03 15:57:21.673000+00:00,"Other, see explanation",New credit,Application_1000086665,...,,,,,,,,0.000,1,0.000000
1,statechange,User_1,A_Submitted,Application,ApplState_161925113,complete,2016-08-03 15:57:21.734000+00:00,"Other, see explanation",New credit,Application_1000086665,...,,,,,,,,0.061,2,0.105229
2,Created,User_1,W_Handle leads,Workflow,Workitem_747707399,schedule,2016-08-03 15:57:21.963000+00:00,"Other, see explanation",New credit,Application_1000086665,...,,,,,,,,0.229,3,0.242584
3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1030261128,withdraw,2016-08-03 15:58:28.286000+00:00,"Other, see explanation",New credit,Application_1000086665,...,,,,,,,,66.323,4,2470.953376
4,Created,User_1,W_Complete application,Workflow,Workitem_1127124826,schedule,2016-08-03 15:58:28.293000+00:00,"Other, see explanation",New credit,Application_1000086665,...,,,,,,,,0.007,5,103.429894
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620439,Obtained,User_68,W_Validate application,Workflow,Workitem_715332932,resume,2016-04-26 09:22:34.654000+00:00,Home improvement,New credit,Application_610205010,...,,,,,,,,6159.402,67,24102.327326
620440,statechange,User_68,O_Accepted,Offer,OfferState_287854721,complete,2016-04-26 09:23:27.987000+00:00,Home improvement,New credit,Application_610205010,...,,,,,,,Offer_772484790,53.333,68,21609.747173
620441,statechange,User_68,A_Pending,Application,ApplState_1161629338,complete,2016-04-26 09:23:28.009000+00:00,Home improvement,New credit,Application_610205010,...,,,,,,,,0.022,69,27145.051865
620442,Deleted,User_68,W_Validate application,Workflow,Workitem_1093600680,complete,2016-04-26 09:23:28.012000+00:00,Home improvement,New credit,Application_610205010,...,,,,,,,,0.003,70,28502.377932


# 3. Apply Above Calculated Mean Time to Validation and Test Set

In [7]:
def apply_prediction(df):
    # Get the list of position number
    step_in_process = df['position'].tolist()

    # Create the predicted time column per entry using the mean time difference
    # If some position numbers are not shown in the training dataset, its predicted time will be 0
    pred_time_lst = []
    for j in step_in_process:
        if j <= len(mean_time_lst):
            pred_time_lst.append(mean_time_lst[j - 1])
        else:
            pred_time_lst.append(0)
    df['baseline_predicted_time'] = pred_time_lst
    return df

In [8]:
# Apply the above changes to all dataframes
df_val = apply_prediction(df_val)
df_test = apply_prediction(df_test)
df_val

Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,...,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,time_diff,position,baseline_predicted_time
0,Created,User_71,A_Create Application,Application,Application_1000311556,complete,2016-04-04 15:56:37.675000+00:00,Car,New credit,Application_1000311556,...,,,,,,,,0.000,1,0.000000
1,Created,User_71,W_Complete application,Workflow,Workitem_1212746755,schedule,2016-04-04 15:56:37.688000+00:00,Car,New credit,Application_1000311556,...,,,,,,,,0.013,2,0.105229
2,Obtained,User_71,W_Complete application,Workflow,Workitem_95061077,start,2016-04-04 15:56:37.693000+00:00,Car,New credit,Application_1000311556,...,,,,,,,,0.005,3,0.242584
3,statechange,User_71,A_Concept,Application,ApplState_1508577231,complete,2016-04-04 15:56:37.696000+00:00,Car,New credit,Application_1000311556,...,,,,,,,,0.003,4,2470.953376
4,statechange,User_71,A_Accepted,Application,ApplState_189615352,complete,2016-04-04 16:06:29.479000+00:00,Car,New credit,Application_1000311556,...,,,,,,,,591.783,5,103.429894
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155683,statechange,User_39,A_Validating,Application,ApplState_1662533710,complete,2016-06-20 14:52:05.816000+00:00,Home improvement,New credit,Application_60999665,...,,,,,,,,0.070,37,32463.565499
155684,Released,User_39,W_Validate application,Workflow,Workitem_621404938,suspend,2016-06-20 14:52:25.340000+00:00,Home improvement,New credit,Application_60999665,...,,,,,,,,19.524,38,30576.940455
155685,statechange,User_116,O_Accepted,Offer,OfferState_1906154041,complete,2016-06-20 14:58:31.014000+00:00,Home improvement,New credit,Application_60999665,...,,,,,,,Offer_1603549424,365.674,39,31179.482374
155686,statechange,User_116,A_Pending,Application,ApplState_9557204,complete,2016-06-20 14:58:31.020000+00:00,Home improvement,New credit,Application_60999665,...,,,,,,,,0.006,40,28837.622489


In [9]:
df_test

Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,...,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,time_diff,position,baseline_predicted_time
0,Created,User_1,A_Create Application,Application,Application_610717758,complete,2016-11-19 23:12:05.325000+00:00,Car,New credit,Application_610717758,...,,,,,,,,0.000,1,0.000000
1,statechange,User_1,A_Submitted,Application,ApplState_1960150929,complete,2016-11-19 23:12:06.411000+00:00,Car,New credit,Application_610717758,...,,,,,,,,1.086,2,0.105229
2,Created,User_1,W_Handle leads,Workflow,Workitem_684900503,schedule,2016-11-19 23:12:06.690000+00:00,Car,New credit,Application_610717758,...,,,,,,,,0.279,3,0.242584
3,Deleted,User_1,W_Handle leads,Workflow,Workitem_954760450,withdraw,2016-11-19 23:12:47.428000+00:00,Car,New credit,Application_610717758,...,,,,,,,,40.738,4,2470.953376
4,Created,User_1,W_Complete application,Workflow,Workitem_1752703538,schedule,2016-11-19 23:12:47.435000+00:00,Car,New credit,Application_610717758,...,,,,,,,,0.007,5,103.429894
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28701,Deleted,User_1,W_Call after offers,Workflow,Workitem_1894495253,ate_abort,2016-12-20 06:32:24.316000+00:00,Existing loan takeover,New credit,Application_999632431,...,,,,,,,,331831.284,19,180456.850270
28702,Created,User_1,W_Call after offers,Workflow,Workitem_1671270331,schedule,2016-12-20 06:32:24.326000+00:00,Existing loan takeover,New credit,Application_999632431,...,,,,,,,,0.010,20,137202.070216
28703,statechange,User_1,A_Cancelled,Application,ApplState_318084843,complete,2017-01-16 07:00:30.429000+00:00,Existing loan takeover,New credit,Application_999632431,...,,,,,,,,2334486.103,21,108489.723106
28704,statechange,User_1,O_Cancelled,Offer,OfferState_560115325,complete,2017-01-16 07:00:30.449000+00:00,Existing loan takeover,New credit,Application_999632431,...,,,,,,,Offer_872978767,0.020,22,146098.017933


In [20]:
position_lst = list(range(1, len(mean_time_lst) + 1))
df_prediction = pd.DataFrame(position_lst)
df_prediction['predicted_time'] = mean_time_lst
df_prediction = df_prediction.rename(columns = {0: 'position'})
df_prediction

Unnamed: 0,position,predicted_time
0,1,0.000000
1,2,0.105229
2,3,0.242584
3,4,2470.953376
4,5,103.429894
...,...,...
175,176,292506.206000
176,177,0.007000
177,178,0.008000
178,179,0.140500


In [29]:
df_prediction.to_csv('baseline_time.csv')