In [13]:
import numpy as np
import pandas as pd
import datetime as datetime

# 1. Read the datasets

In [14]:
df_train = pd.read_csv("bpi2017_train.csv", parse_dates = ['time:timestamp'])
df_val = pd.read_csv("bpi2017_val.csv", parse_dates = ['time:timestamp'])
df_test = pd.read_csv("bpi2017_test.csv", parse_dates = ['time:timestamp'])
df_train

Unnamed: 0.1,Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,case:RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID
0,0,Created,User_1,A_Create Application,Application,Application_1691306052,complete,2016-01-01 10:16:11.500000+00:00,Home improvement,New credit,Application_1691306052,10000.0,,,,,,,,
1,1,statechange,User_1,A_Submitted,Application,ApplState_284636842,complete,2016-01-01 10:16:11.549000+00:00,Home improvement,New credit,Application_1691306052,10000.0,,,,,,,,
2,2,Created,User_1,W_Handle leads,Workflow,Workitem_831373279,schedule,2016-01-01 10:16:11.740000+00:00,Home improvement,New credit,Application_1691306052,10000.0,,,,,,,,
3,3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1299098074,withdraw,2016-01-01 10:17:31.573000+00:00,Home improvement,New credit,Application_1691306052,10000.0,,,,,,,,
4,4,Created,User_1,W_Complete application,Workflow,Workitem_1703931302,schedule,2016-01-01 10:17:31.584000+00:00,Home improvement,New credit,Application_1691306052,10000.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620621,620621,Created,User_44,O_Create Offer,Offer,Offer_1879818928,complete,2016-11-11 15:07:05.804000+00:00,Existing loan takeover,New credit,Application_1354066062,25000.0,25000.0,120.0,True,254.56,False,0.0,25000.0,
620622,620622,statechange,User_44,O_Created,Offer,OfferState_975946794,complete,2016-11-11 15:07:06.388000+00:00,Existing loan takeover,New credit,Application_1354066062,25000.0,,,,,,,,Offer_1879818928
620623,620623,statechange,User_44,A_Cancelled,Application,ApplState_1775629292,complete,2016-11-11 15:09:00.488000+00:00,Existing loan takeover,New credit,Application_1354066062,25000.0,,,,,,,,
620624,620624,statechange,User_44,O_Cancelled,Offer,OfferState_2048261328,complete,2016-11-11 15:09:00.507000+00:00,Existing loan takeover,New credit,Application_1354066062,25000.0,,,,,,,,Offer_1879818928


## Assign a postion number to each row/process

In [15]:
def assign_position(df: pd.DataFrame) -> pd.DataFrame:
    # Count number of processes per trace/ID
    count_lst = df.groupby('case:concept:name').count()['lifecycle:transition'].tolist()
    position_lst_1 = [list(range(1, i + 1)) for i in count_lst]
    position_lst = []
    for i in position_lst_1:
        for j in i:
            position_lst.append(j)
    df['position'] = position_lst
    return df

In [16]:
df_train = assign_position(df_train)
df_val = assign_position(df_val)
df_test = assign_position(df_test)

# 2. Baseline Case Prediction (Only on Training Dataset)

## Select the most common action per position

In [22]:
position_df = df_train.groupby('position')
# .agg(pd.Series.mode)['concept:name'].to_frame()
# position_df['concept:name']

In [24]:
position_df.apply(print)

position
1      16308.0
2      16308.0
3      16308.0
4      16308.0
5      16308.0
        ...   
176        1.0
177        1.0
178        1.0
179        1.0
180        1.0
Name: (Unnamed: 0, count), Length: 180, dtype: float64
position
1      310885.114913
2      310886.114913
3      310887.114913
4      310888.114913
5      310889.114913
           ...      
176    419480.000000
177    419481.000000
178    419482.000000
179    419483.000000
180    419484.000000
Name: (Unnamed: 0, mean), Length: 180, dtype: float64
position
1      179124.217104
2      179124.217104
3      179124.217104
4      179124.217104
5      179124.217104
           ...      
176              NaN
177              NaN
178              NaN
179              NaN
180              NaN
Name: (Unnamed: 0, std), Length: 180, dtype: float64
position
1           0.0
2           1.0
3           2.0
4           3.0
5           4.0
         ...   
176    419480.0
177    419481.0
178    419482.0
179    419483.0
180    419484.0

Unnamed: 0             count    None
                       mean     None
                       std      None
                       min      None
                       25%      None
                       50%      None
                       75%      None
                       max      None
case:RequestedAmount   count    None
                       mean     None
                       std      None
                       min      None
                       25%      None
                       50%      None
                       75%      None
                       max      None
FirstWithdrawalAmount  count    None
                       mean     None
                       std      None
                       min      None
                       25%      None
                       50%      None
                       75%      None
                       max      None
NumberOfTerms          count    None
                       mean     None
                       std      None
 

## Merge the training dataset with with the predicted actions based on their position

In [6]:
df_train = pd.merge(df_train, position_df, on='position')
df_train

Unnamed: 0.1,Unnamed: 0,Action,org:resource,concept:name_x,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,...,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,position,concept:name_y
0,0,Created,User_1,A_Create Application,Application,Application_1691306052,complete,2016-01-01 10:16:11.500000+00:00,Home improvement,New credit,...,,,,,,,,,1,W_Validate application
1,25,Obtained,User_30,W_Validate application,Workflow,Workitem_1522720538,resume,2016-01-07 11:34:12.366000+00:00,Home improvement,New credit,...,,,,,,,,,1,W_Validate application
2,43,Created,User_19,W_Call after offers,Workflow,Workitem_1530416988,schedule,2016-01-02 11:06:53.310000+00:00,Home improvement,New credit,...,,,,,,,,,1,W_Validate application
3,83,statechange,User_17,A_Concept,Application,ApplState_694592100,complete,2016-01-02 09:06:04.213000+00:00,Car,New credit,...,,,,,,,,,1,W_Validate application
4,134,Obtained,User_95,W_Validate application,Workflow,Workitem_576279767,resume,2016-01-27 13:29:52.530000+00:00,Car,New credit,...,,,,,,,,,1,W_Validate application
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620621,419480,Created,User_17,O_Create Offer,Offer,Offer_581651964,complete,2016-07-27 12:20:23.916000+00:00,Car,New credit,...,0.0,35.0,False,250.0,True,1030.0,8000.0,,176,O_Create Offer
620622,419481,statechange,User_17,O_Created,Offer,OfferState_850958320,complete,2016-07-27 12:20:24.625000+00:00,Car,New credit,...,,,,,,,,Offer_581651964,177,O_Created
620623,419482,statechange,User_17,O_Sent (mail and online),Offer,OfferState_1161686692,complete,2016-07-27 12:20:34.068000+00:00,Car,New credit,...,,,,,,,,Offer_581651964,178,O_Sent (mail and online)
620624,419483,Released,User_17,W_Call after offers,Workflow,Workitem_1096231714,suspend,2016-07-27 12:25:54.440000+00:00,Car,New credit,...,,,,,,,,,179,W_Call after offers


# 3. Apply pre-computed prediction to Validation and Test datasets

## Validation dataset

In [7]:
# Count number of processes per trace/ID
df_val = pd.merge(df_val, position_df, on='position')

df_val.sort_values(by=['time:timestamp'], inplace=True)
df_val.rename(columns = {"concept:name_y":"baseline_action_pred", "concept:name_x":"concept:name"}, inplace=True)
df_val

Unnamed: 0.1,Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,...,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,position,baseline_action_pred
0,0,Created,User_1,A_Create Application,Application,Application_1878239836,complete,2016-01-01 13:35:26.422000+00:00,Home improvement,New credit,...,,,,,,,,,1,W_Validate application
4078,1,statechange,User_1,A_Submitted,Application,ApplState_649299817,complete,2016-01-01 13:35:26.463000+00:00,Home improvement,New credit,...,,,,,,,,,2,W_Validate application
8156,2,Created,User_1,W_Handle leads,Workflow,Workitem_1693495632,schedule,2016-01-01 13:35:26.664000+00:00,Home improvement,New credit,...,,,,,,,,,3,W_Validate application
12234,3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1080257214,withdraw,2016-01-01 13:36:30.982000+00:00,Home improvement,New credit,...,,,,,,,,,4,W_Validate application
16312,4,Created,User_1,W_Complete application,Workflow,Workitem_1531288681,schedule,2016-01-01 13:36:30.993000+00:00,Home improvement,New credit,...,,,,,,,,,5,W_Validate application
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154543,150043,statechange,User_1,O_Cancelled,Offer,OfferState_516860315,complete,2016-11-13 07:00:36.337000+00:00,Home improvement,New credit,...,,,,,,,,Offer_869706358,88,W_Call after offers
154605,150044,Deleted,User_1,W_Call after offers,Workflow,Workitem_1599009653,ate_abort,2016-11-13 07:00:36.346000+00:00,Home improvement,New credit,...,,,,,,,,,89,W_Validate application
40604,148804,statechange,User_1,A_Cancelled,Application,ApplState_338161915,complete,2016-11-13 07:00:49.818000+00:00,Home improvement,New credit,...,,,,,,,,,10,W_Validate application
44681,148805,statechange,User_1,O_Cancelled,Offer,OfferState_341732870,complete,2016-11-13 07:00:49.840000+00:00,Home improvement,New credit,...,,,,,,,,Offer_230690584,11,W_Validate application


## Test dataset

In [8]:
df_test = pd.merge(df_test, position_df, on="position")
df_test.sort_values(by=['time:timestamp'], inplace=True)
df_test.rename(columns={"concept:name_y":"baseline_action_pred", "concept:name_x":"concept:name"}, inplace=True)
df_test

Unnamed: 0.1,Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,...,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,position,baseline_action_pred
0,0,Created,User_1,A_Create Application,Application,Application_837911105,complete,2016-11-14 05:57:57.461000+00:00,Existing loan takeover,New credit,...,,,,,,,,,1,W_Validate application
751,1,statechange,User_1,A_Submitted,Application,ApplState_1692094008,complete,2016-11-14 05:57:59.458000+00:00,Existing loan takeover,New credit,...,,,,,,,,,2,W_Validate application
1502,2,Created,User_1,W_Handle leads,Workflow,Workitem_1100633268,schedule,2016-11-14 05:57:59.981000+00:00,Existing loan takeover,New credit,...,,,,,,,,,3,W_Validate application
17931,25,Created,User_41,A_Create Application,Application,Application_912681536,complete,2016-11-14 08:12:30.523000+00:00,Home improvement,New credit,...,,,,,,,,,26,W_Validate application
18470,26,Created,User_41,W_Complete application,Workflow,Workitem_1334591020,schedule,2016-11-14 08:12:30.536000+00:00,Home improvement,New credit,...,,,,,,,,,27,W_Validate application
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24967,3727,Deleted,User_131,W_Call incomplete files,Workflow,Workitem_1308647312,ate_abort,2017-02-01 14:00:30.259000+00:00,Existing loan takeover,New credit,...,,,,,,,,,44,W_Validate application
25203,3728,Created,User_131,W_Validate application,Workflow,Workitem_1339594215,schedule,2017-02-01 14:00:30.272000+00:00,Existing loan takeover,New credit,...,,,,,,,,,45,W_Validate application
25431,3729,Obtained,User_131,W_Validate application,Workflow,Workitem_1697785565,start,2017-02-01 14:00:30.275000+00:00,Existing loan takeover,New credit,...,,,,,,,,,46,W_Validate application
25646,3730,statechange,User_131,A_Validating,Application,ApplState_1576288280,complete,2017-02-01 14:00:30.347000+00:00,Existing loan takeover,New credit,...,,,,,,,,,47,W_Validate application


### For easy exploration/correctness verification:

In [9]:
# Just dropping columns for easier inspection, this shouldn't go to the final version
# df_train.drop(df_train.columns.difference(['Action', 'concept:name_x', 'baseline_action_pred', 'position', 'lifecycle:transition']), 1, inplace=True)
# Save to csv for lookup
df_test.to_csv('baseline_case2.csv')