In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import datetime as datetime

### Data preparation

In [2]:
df_train = pd.read_csv("bpi2017_train.csv", parse_dates = ['time:timestamp'])
df_val = pd.read_csv("bpi2017_val.csv", parse_dates = ['time:timestamp'])
df_test = pd.read_csv("bpi2017_test.csv", parse_dates = ['time:timestamp'])

# The default name indicating the case ID is case:concept:name
# concept:name is the event
# time:timestamp is the corresponding timestamp
# Load the datasets, sort them on case and consequently timestamp, then reset the index
df_train = df_train.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index()
df_val = df_val.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index()
df_test = df_test.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index()

# Remove obsolete columns
df_train = df_train.drop(['index', 'Unnamed: 0'], axis = 1)
df_val = df_val.drop(['index', 'Unnamed: 0'], axis = 1)
df_test = df_test.drop(['index', 'Unnamed: 0'], axis = 1)

# 1. Calculate the time difference

In [3]:
# Cumulative sum function to be used later
def CumSum(lists):
    # Returns the cumulative sum of a list
    length = len(lists)
    cu_list = [sum(lists[0: x: 1]) for x in range(0, length + 1)]
    return cu_list[1: ]

In [4]:
def time_difference(df):
    # Calculate time difference between each row
    df['time_diff'] = df['time:timestamp'].diff().dt.total_seconds()
    # Set the time difference of the 1st row to 0 as it's currently NaN
    df.at[0, 'time_diff'] = 0
    # Count number of steps per process
    length_per_case_List = df.groupby(['case:concept:name'])['time_diff'].count().tolist()

    # Using the cumulative sum we get all the positions that are a first step in a process
    # And then the time difference can be set to 0
    position_lst = CumSum(length_per_case_List)
    for i in tqdm(position_lst):
        df.at[i, 'time_diff'] = 0
    # For Loop mysteriously creates an empty row at the end of the df, gotta delete it
    df = df.iloc[: -1]

    # Unzip the position list to get the number of each steps of each process, make that into a list
    step_in_process = []
    for x in tqdm(length_per_case_List):
        for y in range(x):
            step_in_process.append(y + 1)
    # Assign position number to each row/process
    df['position'] = step_in_process
    return df

In [5]:
# Apply the above changes to all dataframes
# The warnings are obsolete, it's because it uses .at which is considerably faster than .loc
df_train = time_difference(df_train)
df_val = time_difference(df_val)
df_test = time_difference(df_test)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16308/16308 [00:00<00:00, 68212.24it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16308/16308 [00:00<00:00, 329334.07it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['position'] = step_in_process
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4078/4078 [00:00<00:00, 73859.13it/s]
100%|████████████████

# 2. Baseline Time Prediction (Only on Training Dataset)

In [6]:
# Get the list of position number
step_in_process_train = df_train['position'].tolist()
# Calculate mean time difference grouped by position based on the number of cases
mean_time_lst = df_train.groupby('position')['time_diff'].mean().tolist()

# Create the predicted time column per entry using the mean time difference
pred_time_lst_train = [mean_time_lst[j - 1] for j in step_in_process_train]
df_train['baseline_predicted_time'] = pred_time_lst_train
df_train

Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,...,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,time_diff,position,baseline_predicted_time
0,Created,User_1,A_Create Application,Application,Application_1000158214,complete,2016-06-02 10:14:26.844000+00:00,Home improvement,New credit,Application_1000158214,...,,,,,,,,0.000,1,0.000000
1,statechange,User_1,A_Submitted,Application,ApplState_277536765,complete,2016-06-02 10:14:26.885000+00:00,Home improvement,New credit,Application_1000158214,...,,,,,,,,0.041,2,0.108810
2,Created,User_1,W_Handle leads,Workflow,Workitem_176988109,schedule,2016-06-02 10:14:27.227000+00:00,Home improvement,New credit,Application_1000158214,...,,,,,,,,0.342,3,0.246913
3,Deleted,User_1,W_Handle leads,Workflow,Workitem_895982768,withdraw,2016-06-02 10:15:36.752000+00:00,Home improvement,New credit,Application_1000158214,...,,,,,,,,69.525,4,2336.961114
4,Created,User_1,W_Complete application,Workflow,Workitem_1409452454,schedule,2016-06-02 10:15:36.764000+00:00,Home improvement,New credit,Application_1000158214,...,,,,,,,,0.012,5,101.363755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620621,Obtained,User_68,W_Validate application,Workflow,Workitem_715332932,resume,2016-04-26 09:22:34.654000+00:00,Home improvement,New credit,Application_610205010,...,,,,,,,,6159.402,67,24760.097749
620622,statechange,User_68,O_Accepted,Offer,OfferState_287854721,complete,2016-04-26 09:23:27.987000+00:00,Home improvement,New credit,Application_610205010,...,,,,,,,Offer_772484790,53.333,68,25855.859743
620623,statechange,User_68,A_Pending,Application,ApplState_1161629338,complete,2016-04-26 09:23:28.009000+00:00,Home improvement,New credit,Application_610205010,...,,,,,,,,0.022,69,28523.590462
620624,Deleted,User_68,W_Validate application,Workflow,Workitem_1093600680,complete,2016-04-26 09:23:28.012000+00:00,Home improvement,New credit,Application_610205010,...,,,,,,,,0.003,70,35405.497539


# 3. Baseline Case Prediction (only on the training dataset)

In [7]:
position_df = df_train.groupby('position').agg(pd.Series.mode)['concept:name'].to_frame()
df_train = pd.merge(df_train, position_df, on='position')
df_train

Unnamed: 0,Action,org:resource,concept:name_x,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,...,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,time_diff,position,baseline_predicted_time,concept:name_y
0,Created,User_1,A_Create Application,Application,Application_1000158214,complete,2016-06-02 10:14:26.844000+00:00,Home improvement,New credit,Application_1000158214,...,,,,,,,0.000,1,0.000,A_Create Application
1,Created,User_71,A_Create Application,Application,Application_1000311556,complete,2016-04-04 15:56:37.675000+00:00,Car,New credit,Application_1000311556,...,,,,,,,0.000,1,0.000,A_Create Application
2,Created,User_1,A_Create Application,Application,Application_1000334415,complete,2016-09-15 16:39:17.758000+00:00,"Other, see explanation",New credit,Application_1000334415,...,,,,,,,0.000,1,0.000,A_Create Application
3,Created,User_1,A_Create Application,Application,Application_1000339879,complete,2016-03-17 12:57:10.159000+00:00,Existing loan takeover,New credit,Application_1000339879,...,,,,,,,0.000,1,0.000,A_Create Application
4,Created,User_7,A_Create Application,Application,Application_1000474975,complete,2016-06-16 15:37:24.412000+00:00,"Other, see explanation",New credit,Application_1000474975,...,,,,,,,0.000,1,0.000,A_Create Application
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620621,statechange,User_118,O_Cancelled,Offer,OfferState_2104946863,complete,2016-06-07 08:35:37.303000+00:00,Remaining debt home,New credit,Application_2037628374,...,,,,,,Offer_1158976722,0.012,176,0.012,O_Cancelled
620622,statechange,User_118,O_Cancelled,Offer,OfferState_1671093436,complete,2016-06-07 08:35:37.314000+00:00,Remaining debt home,New credit,Application_2037628374,...,,,,,,Offer_1596032809,0.011,177,0.011,O_Cancelled
620623,statechange,User_118,O_Cancelled,Offer,OfferState_816623561,complete,2016-06-07 08:35:37.327000+00:00,Remaining debt home,New credit,Application_2037628374,...,,,,,,Offer_1305081611,0.013,178,0.013,O_Cancelled
620624,statechange,User_118,O_Cancelled,Offer,OfferState_475329857,complete,2016-06-07 08:35:37.596000+00:00,Remaining debt home,New credit,Application_2037628374,...,,,,,,Offer_966701358,0.269,179,0.269,O_Cancelled


# 4. Apply Above Calculated Mean Time to Validation and Test datasets

In [8]:
def apply_time_prediction(df):
    # Get the list of position number
    step_in_process = df['position'].tolist()

    # Create the predicted time column per entry using the mean time difference
    # If some position numbers are not shown in the training dataset, its predicted time will be 0
    pred_time_lst = []
    for j in step_in_process:
        if j <= len(mean_time_lst):
            pred_time_lst.append(mean_time_lst[j - 1])
        else:
            pred_time_lst.append(0)
    df['baseline_predicted_time'] = pred_time_lst
    return df

In [9]:
# Apply the above changes to all dataframes
df_val = apply_time_prediction(df_val)
df_test = apply_time_prediction(df_test)

# 5. Apply Baseline Case prediction to Validation and Test datasets

In [10]:
def apply_case_prediction(df: pd.DataFrame) -> pd.DataFrame:
    # Merge the dataframe with position with the dataframe prediction is applied to
    df = pd.merge(df, position_df, on='position')
    
    # Sort values by timestamp, like in the original dataset
    df.sort_values(by=['time:timestamp'], inplace=True)
    
    # Rename the column labels due to applying merge
    df.rename(columns = {"concept:name_y":"baseline_action_pred", "concept:name_x":"concept:name"}, inplace=True)
    
    return df

In [11]:
df_val = apply_case_prediction(df_val)
df_test = apply_case_prediction(df_test)

### Dataset preview

In [12]:
df_train

Unnamed: 0,Action,org:resource,concept:name_x,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,...,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,time_diff,position,baseline_predicted_time,concept:name_y
0,Created,User_1,A_Create Application,Application,Application_1000158214,complete,2016-06-02 10:14:26.844000+00:00,Home improvement,New credit,Application_1000158214,...,,,,,,,0.000,1,0.000,A_Create Application
1,Created,User_71,A_Create Application,Application,Application_1000311556,complete,2016-04-04 15:56:37.675000+00:00,Car,New credit,Application_1000311556,...,,,,,,,0.000,1,0.000,A_Create Application
2,Created,User_1,A_Create Application,Application,Application_1000334415,complete,2016-09-15 16:39:17.758000+00:00,"Other, see explanation",New credit,Application_1000334415,...,,,,,,,0.000,1,0.000,A_Create Application
3,Created,User_1,A_Create Application,Application,Application_1000339879,complete,2016-03-17 12:57:10.159000+00:00,Existing loan takeover,New credit,Application_1000339879,...,,,,,,,0.000,1,0.000,A_Create Application
4,Created,User_7,A_Create Application,Application,Application_1000474975,complete,2016-06-16 15:37:24.412000+00:00,"Other, see explanation",New credit,Application_1000474975,...,,,,,,,0.000,1,0.000,A_Create Application
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620621,statechange,User_118,O_Cancelled,Offer,OfferState_2104946863,complete,2016-06-07 08:35:37.303000+00:00,Remaining debt home,New credit,Application_2037628374,...,,,,,,Offer_1158976722,0.012,176,0.012,O_Cancelled
620622,statechange,User_118,O_Cancelled,Offer,OfferState_1671093436,complete,2016-06-07 08:35:37.314000+00:00,Remaining debt home,New credit,Application_2037628374,...,,,,,,Offer_1596032809,0.011,177,0.011,O_Cancelled
620623,statechange,User_118,O_Cancelled,Offer,OfferState_816623561,complete,2016-06-07 08:35:37.327000+00:00,Remaining debt home,New credit,Application_2037628374,...,,,,,,Offer_1305081611,0.013,178,0.013,O_Cancelled
620624,statechange,User_118,O_Cancelled,Offer,OfferState_475329857,complete,2016-06-07 08:35:37.596000+00:00,Remaining debt home,New credit,Application_2037628374,...,,,,,,Offer_966701358,0.269,179,0.269,O_Cancelled


In [13]:
df_val

Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,...,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,time_diff,position,baseline_predicted_time,baseline_action_pred
2297,Created,User_1,A_Create Application,Application,Application_1878239836,complete,2016-01-01 13:35:26.422000+00:00,Home improvement,New credit,Application_1878239836,...,,,,,,,0.000,1,0.000000,A_Create Application
6375,statechange,User_1,A_Submitted,Application,ApplState_649299817,complete,2016-01-01 13:35:26.463000+00:00,Home improvement,New credit,Application_1878239836,...,,,,,,,0.041,2,0.108810,A_Submitted
10453,Created,User_1,W_Handle leads,Workflow,Workitem_1693495632,schedule,2016-01-01 13:35:26.664000+00:00,Home improvement,New credit,Application_1878239836,...,,,,,,,0.201,3,0.246913,W_Handle leads
14531,Deleted,User_1,W_Handle leads,Workflow,Workitem_1080257214,withdraw,2016-01-01 13:36:30.982000+00:00,Home improvement,New credit,Application_1878239836,...,,,,,,,64.318,4,2336.961114,W_Handle leads
18609,Created,User_1,W_Complete application,Workflow,Workitem_1531288681,schedule,2016-01-01 13:36:30.993000+00:00,Home improvement,New credit,Application_1878239836,...,,,,,,,0.011,5,101.363755,W_Complete application
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74150,statechange,User_1,O_Cancelled,Offer,OfferState_516860315,complete,2016-11-13 07:00:36.337000+00:00,Home improvement,New credit,Application_132220187,...,,,,,,Offer_869706358,0.025,19,178269.342986,W_Call after offers
78046,Deleted,User_1,W_Call after offers,Workflow,Workitem_1599009653,ate_abort,2016-11-13 07:00:36.346000+00:00,Home improvement,New credit,Application_132220187,...,,,,,,,0.009,20,141001.874388,W_Call after offers
86711,statechange,User_1,A_Cancelled,Application,ApplState_338161915,complete,2016-11-13 07:00:49.818000+00:00,Home improvement,New credit,Application_1792327676,...,,,,,,,2247927.100,22,146835.274929,W_Call after offers
90242,statechange,User_1,O_Cancelled,Offer,OfferState_341732870,complete,2016-11-13 07:00:49.840000+00:00,Home improvement,New credit,Application_1792327676,...,,,,,,Offer_230690584,0.022,23,106112.924149,W_Validate application


In [14]:
df_test

Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,...,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,time_diff,position,baseline_predicted_time,baseline_action_pred
458,Created,User_1,A_Create Application,Application,Application_837911105,complete,2016-11-14 05:57:57.461000+00:00,Existing loan takeover,New credit,Application_837911105,...,,,,,,,0.000,1,0.000000,A_Create Application
1209,statechange,User_1,A_Submitted,Application,ApplState_1692094008,complete,2016-11-14 05:57:59.458000+00:00,Existing loan takeover,New credit,Application_837911105,...,,,,,,,1.997,2,0.108810,A_Submitted
1960,Created,User_1,W_Handle leads,Workflow,Workitem_1100633268,schedule,2016-11-14 05:57:59.981000+00:00,Existing loan takeover,New credit,Application_837911105,...,,,,,,,0.523,3,0.246913,W_Handle leads
590,Created,User_41,A_Create Application,Application,Application_912681536,complete,2016-11-14 08:12:30.523000+00:00,Home improvement,New credit,Application_912681536,...,,,,,,,0.000,1,0.000000,A_Create Application
1341,Created,User_41,W_Complete application,Workflow,Workitem_1334591020,schedule,2016-11-14 08:12:30.536000+00:00,Home improvement,New credit,Application_912681536,...,,,,,,,0.013,2,0.108810,A_Submitted
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28406,Deleted,User_131,W_Call incomplete files,Workflow,Workitem_1308647312,ate_abort,2017-02-01 14:00:30.259000+00:00,Existing loan takeover,New credit,Application_637536789,...,,,,,,,5193.446,77,29014.518989,W_Call incomplete files
28429,Created,User_131,W_Validate application,Workflow,Workitem_1339594215,schedule,2017-02-01 14:00:30.272000+00:00,Existing loan takeover,New credit,Application_637536789,...,,,,,,,0.013,78,43444.314986,W_Call incomplete files
28451,Obtained,User_131,W_Validate application,Workflow,Workitem_1697785565,start,2017-02-01 14:00:30.275000+00:00,Existing loan takeover,New credit,Application_637536789,...,,,,,,,0.003,79,26900.300938,W_Call incomplete files
28472,statechange,User_131,A_Validating,Application,ApplState_1576288280,complete,2017-02-01 14:00:30.347000+00:00,Existing loan takeover,New credit,Application_637536789,...,,,,,,,0.072,80,32291.172272,W_Call incomplete files
