In [1]:
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import datetime as datetime

# 1. Read the datasets

In [2]:
df_train = pd.read_csv("bpi2017_train.csv", parse_dates = ['time:timestamp'])
df_val = pd.read_csv("bpi2017_val.csv", parse_dates = ['time:timestamp'])
df_test = pd.read_csv("bpi2017_test.csv", parse_dates = ['time:timestamp'])
df_train

Unnamed: 0.1,Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,case:RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID
0,0,Created,User_1,A_Create Application,Application,Application_1691306052,complete,2016-01-01 10:16:11.500000+00:00,Home improvement,New credit,Application_1691306052,10000.0,,,,,,,,
1,1,statechange,User_1,A_Submitted,Application,ApplState_284636842,complete,2016-01-01 10:16:11.549000+00:00,Home improvement,New credit,Application_1691306052,10000.0,,,,,,,,
2,2,Created,User_1,W_Handle leads,Workflow,Workitem_831373279,schedule,2016-01-01 10:16:11.740000+00:00,Home improvement,New credit,Application_1691306052,10000.0,,,,,,,,
3,3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1299098074,withdraw,2016-01-01 10:17:31.573000+00:00,Home improvement,New credit,Application_1691306052,10000.0,,,,,,,,
4,4,Created,User_1,W_Complete application,Workflow,Workitem_1703931302,schedule,2016-01-01 10:17:31.584000+00:00,Home improvement,New credit,Application_1691306052,10000.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620621,620621,Created,User_44,O_Create Offer,Offer,Offer_1879818928,complete,2016-11-11 15:07:05.804000+00:00,Existing loan takeover,New credit,Application_1354066062,25000.0,25000.0,120.0,True,254.56,False,0.0,25000.0,
620622,620622,statechange,User_44,O_Created,Offer,OfferState_975946794,complete,2016-11-11 15:07:06.388000+00:00,Existing loan takeover,New credit,Application_1354066062,25000.0,,,,,,,,Offer_1879818928
620623,620623,statechange,User_44,A_Cancelled,Application,ApplState_1775629292,complete,2016-11-11 15:09:00.488000+00:00,Existing loan takeover,New credit,Application_1354066062,25000.0,,,,,,,,
620624,620624,statechange,User_44,O_Cancelled,Offer,OfferState_2048261328,complete,2016-11-11 15:09:00.507000+00:00,Existing loan takeover,New credit,Application_1354066062,25000.0,,,,,,,,Offer_1879818928


# 2. Baseline Time Prediction (Only on Training Dataset)

In [3]:
# Count number of processes per trace/ID
count_lst = df_train.groupby('case:concept:name').count()['lifecycle:transition'].tolist()

## Assign a postion number to each row/process

In [4]:
position_lst_1 = [list(range(1, i + 1)) for i in count_lst]
position_lst = []
for i in position_lst_1:
    for j in i:
        position_lst.append(j)
df_train['position'] = position_lst

## Select the most common action per position

In [5]:
position_df = df_train.groupby('position').agg(pd.Series.mode)['concept:name'].to_frame()

## Merge the training dataset with with the predicted actions based on their position in a case

In [6]:
pred_df = pd.merge(df_train, position_df, on="position")
pred_df.rename(columns = {"concept:name_y": "baseline_action_pred", "concept:name_x":"concept:name"}, inplace=True)
pred_df

Unnamed: 0.1,Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,...,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,position,baseline_action_pred
0,0,Created,User_1,A_Create Application,Application,Application_1691306052,complete,2016-01-01 10:16:11.500000+00:00,Home improvement,New credit,...,,,,,,,,,1,W_Validate application
1,25,Obtained,User_30,W_Validate application,Workflow,Workitem_1522720538,resume,2016-01-07 11:34:12.366000+00:00,Home improvement,New credit,...,,,,,,,,,1,W_Validate application
2,43,Created,User_19,W_Call after offers,Workflow,Workitem_1530416988,schedule,2016-01-02 11:06:53.310000+00:00,Home improvement,New credit,...,,,,,,,,,1,W_Validate application
3,83,statechange,User_17,A_Concept,Application,ApplState_694592100,complete,2016-01-02 09:06:04.213000+00:00,Car,New credit,...,,,,,,,,,1,W_Validate application
4,134,Obtained,User_95,W_Validate application,Workflow,Workitem_576279767,resume,2016-01-27 13:29:52.530000+00:00,Car,New credit,...,,,,,,,,,1,W_Validate application
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620621,419480,Created,User_17,O_Create Offer,Offer,Offer_581651964,complete,2016-07-27 12:20:23.916000+00:00,Car,New credit,...,0.0,35.0,False,250.0,True,1030.0,8000.0,,176,O_Create Offer
620622,419481,statechange,User_17,O_Created,Offer,OfferState_850958320,complete,2016-07-27 12:20:24.625000+00:00,Car,New credit,...,,,,,,,,Offer_581651964,177,O_Created
620623,419482,statechange,User_17,O_Sent (mail and online),Offer,OfferState_1161686692,complete,2016-07-27 12:20:34.068000+00:00,Car,New credit,...,,,,,,,,Offer_581651964,178,O_Sent (mail and online)
620624,419483,Released,User_17,W_Call after offers,Workflow,Workitem_1096231714,suspend,2016-07-27 12:25:54.440000+00:00,Car,New credit,...,,,,,,,,,179,W_Call after offers


# 3. Apply prediction to Validation and Test datasets

## Validation dataset

In [7]:
# Count number of processes per trace/ID
count_lst = df_val.groupby('case:concept:name').count()['lifecycle:transition'].tolist()

position_lst_1 = [list(range(1, i + 1)) for i in count_lst]
position_lst = []
for i in position_lst_1:
    for j in i:
        position_lst.append(j)
df_val['position'] = position_lst

position_df = df_val.groupby('position').agg(pd.Series.mode)['concept:name'].to_frame()

pred_df = pd.merge(df_val, position_df, on="position")
pred_df.rename(columns = {"concept:name_y":"baseline_action_pred", "concept:name_x":"concept:name"}, inplace=True)
pred_df

Unnamed: 0.1,Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,...,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,position,baseline_action_pred
0,0,Created,User_1,A_Create Application,Application,Application_1878239836,complete,2016-01-01 13:35:26.422000+00:00,Home improvement,New credit,...,,,,,,,,,1,W_Validate application
1,22,Obtained,User_113,W_Validate application,Workflow,Workitem_1853647796,start,2016-01-08 10:08:49.262000+00:00,Home improvement,New credit,...,,,,,,,,,1,W_Validate application
2,77,statechange,User_8,O_Created,Offer,OfferState_1681949223,complete,2016-01-29 09:43:48.735000+00:00,Car,New credit,...,,,,,,,,Offer_274484173,1,W_Validate application
3,151,statechange,User_89,O_Sent (online only),Offer,OfferState_1773249096,complete,2016-01-04 09:27:56.205000+00:00,Car,New credit,...,,,,,,,,Offer_1301660092,1,W_Validate application
4,175,statechange,User_100,A_Incomplete,Application,ApplState_417798179,complete,2016-01-11 09:15:13.251000+00:00,Car,New credit,...,,,,,,,,,1,W_Validate application
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155501,21809,statechange,User_41,A_Accepted,Application,ApplState_1787882934,complete,2016-02-22 19:16:35.835000+00:00,Not speficied,New credit,...,,,,,,,,,176,A_Accepted
155502,21810,Created,User_41,O_Create Offer,Offer,Offer_1230279746,complete,2016-02-22 19:21:20.465000+00:00,Not speficied,New credit,...,1500.0,58.0,True,100.0,True,893.0,5000.0,,177,O_Create Offer
155503,21811,statechange,User_41,O_Created,Offer,OfferState_235472469,complete,2016-02-22 19:21:21.787000+00:00,Not speficied,New credit,...,,,,,,,,Offer_1230279746,178,O_Created
155504,21812,statechange,User_41,O_Sent (mail and online),Offer,OfferState_961775746,complete,2016-02-22 19:21:52.582000+00:00,Not speficied,New credit,...,,,,,,,,Offer_1230279746,179,O_Sent (mail and online)


## Test dataset

In [8]:
# Count number of processes per trace/ID
count_lst = df_test.groupby('case:concept:name').count()['lifecycle:transition'].tolist()

position_lst_1 = [list(range(1, i + 1)) for i in count_lst]
position_lst = []
for i in position_lst_1:
    for j in i:
        position_lst.append(j)
df_test['position'] = position_lst

position_df = df_test.groupby('position').agg(pd.Series.mode)['concept:name'].to_frame()

pred_df = pd.merge(df_test, position_df, on="position")
pred_df.rename(columns={"concept:name_y":"baseline_action_pred", "concept:name_x":"concept:name"}, inplace=True)
pred_df

Unnamed: 0.1,Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,...,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,position,baseline_action_pred
0,0,Created,User_1,A_Create Application,Application,Application_837911105,complete,2016-11-14 05:57:57.461000+00:00,Existing loan takeover,New credit,...,,,,,,,,,1,W_Validate application
1,31,statechange,User_41,O_Created,Offer,OfferState_440426323,complete,2016-11-14 08:18:41.811000+00:00,Home improvement,New credit,...,,,,,,,,Offer_1213862981,1,W_Validate application
2,60,Obtained,User_100,W_Call incomplete files,Workflow,Workitem_2099807820,start,2016-11-25 13:36:12.710000+00:00,Home improvement,New credit,...,,,,,,,,,1,W_Validate application
3,100,Created,User_12,W_Complete application,Workflow,Workitem_506766874,schedule,2016-11-14 09:14:08.742000+00:00,Unknown,New credit,...,,,,,,,,,1,W_Validate application
4,131,Created,User_79,W_Call after offers,Workflow,Workitem_1774219897,schedule,2016-11-14 10:02:46.826000+00:00,Not speficied,New credit,...,,,,,,,,,1,W_Validate application
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28701,24279,Deleted,User_31,W_Validate application,Workflow,Workitem_1948554254,ate_abort,2016-12-29 11:03:27.555000+00:00,Home improvement,New credit,...,,,,,,,,,113,W_Validate application
28702,24280,Created,User_17,A_Create Application,Application,Application_851976076,complete,2016-12-23 08:28:49.067000+00:00,Unknown,Limit raise,...,,,,,,,,,114,A_Create Application
28703,24281,statechange,User_17,A_Concept,Application,ApplState_37396173,complete,2016-12-23 08:28:49.071000+00:00,Unknown,Limit raise,...,,,,,,,,,115,A_Concept
28704,24282,Created,User_17,W_Complete application,Workflow,Workitem_281498894,schedule,2016-12-23 08:28:49.075000+00:00,Unknown,Limit raise,...,,,,,,,,,116,W_Complete application


### For easy exploration:

In [9]:
# Just dropping columns for easier inspection, this shouldn't go to the final version
# pred_df.drop(pred_df.columns.difference(['Action', 'concept:name_x', 'baseline_action_pred', 'position', 'lifecycle:transition']), 1, inplace=True)
# Save to csv for lookup
# pred_df.to_csv('baseline_prediction.csv')