In [1]:
import numpy as np
import pandas as pd
import datetime as datetime

# 1. Read the datasets

In [2]:
# Load the train / validation / test CSV files with identical settings;
# `time:timestamp` is parsed into datetimes at read time.
df_train, df_val, df_test = (
    pd.read_csv(csv_path, parse_dates=['time:timestamp'])
    for csv_path in ("bpi2017_train.csv", "bpi2017_val.csv", "bpi2017_test.csv")
)
# Last expression of the cell: display the training frame.
df_train

Unnamed: 0.1,Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,case:RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID
0,0,Created,User_1,A_Create Application,Application,Application_1691306052,complete,2016-01-01 10:16:11.500000+00:00,Home improvement,New credit,Application_1691306052,10000.0,,,,,,,,
1,1,statechange,User_1,A_Submitted,Application,ApplState_284636842,complete,2016-01-01 10:16:11.549000+00:00,Home improvement,New credit,Application_1691306052,10000.0,,,,,,,,
2,2,Created,User_1,W_Handle leads,Workflow,Workitem_831373279,schedule,2016-01-01 10:16:11.740000+00:00,Home improvement,New credit,Application_1691306052,10000.0,,,,,,,,
3,3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1299098074,withdraw,2016-01-01 10:17:31.573000+00:00,Home improvement,New credit,Application_1691306052,10000.0,,,,,,,,
4,4,Created,User_1,W_Complete application,Workflow,Workitem_1703931302,schedule,2016-01-01 10:17:31.584000+00:00,Home improvement,New credit,Application_1691306052,10000.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620621,620621,Created,User_44,O_Create Offer,Offer,Offer_1879818928,complete,2016-11-11 15:07:05.804000+00:00,Existing loan takeover,New credit,Application_1354066062,25000.0,25000.0,120.0,True,254.56,False,0.0,25000.0,
620622,620622,statechange,User_44,O_Created,Offer,OfferState_975946794,complete,2016-11-11 15:07:06.388000+00:00,Existing loan takeover,New credit,Application_1354066062,25000.0,,,,,,,,Offer_1879818928
620623,620623,statechange,User_44,A_Cancelled,Application,ApplState_1775629292,complete,2016-11-11 15:09:00.488000+00:00,Existing loan takeover,New credit,Application_1354066062,25000.0,,,,,,,,
620624,620624,statechange,User_44,O_Cancelled,Offer,OfferState_2048261328,complete,2016-11-11 15:09:00.507000+00:00,Existing loan takeover,New credit,Application_1354066062,25000.0,,,,,,,,Offer_1879818928


# 2. Baseline Case Prediction

In [3]:
def predict_next_case(df: pd.DataFrame) -> pd.DataFrame:
    """Attach a position-based baseline prediction to every event in ``df``.

    Each event receives its 1-based ``position`` inside its case (rows that
    share a ``case:concept:name``, counted in the row order of ``df``). The
    baseline for a position is the most frequent ``concept:name`` observed at
    that position across all cases of ``df``; it is stored in the
    ``baseline_action_pred`` column.

    Parameters
    ----------
    df : pd.DataFrame
        Event log with at least the columns ``case:concept:name``,
        ``concept:name`` and ``time:timestamp``. The frame is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame holding every column of ``df`` followed by ``position``
        and ``baseline_action_pred``, sorted by ``time:timestamp``. Rows keep
        the index labels they had in ``df``.

    Notes
    -----
    The per-position modes are computed from ``df`` itself, so calling this on
    a validation or test frame derives the baseline from that same frame.
    When several activities tie for most frequent at a position, the first one
    in sorted order is used.
    """
    case_col = "case:concept:name"
    activity_col = "concept:name"

    def _first_mode(activities: pd.Series):
        # Series.mode() returns every tied value in sorted order; taking the
        # first keeps the aggregate a single, deterministic label. An all-NaN
        # group has no mode at all, hence the empty check.
        modes = activities.mode()
        return modes.iloc[0] if not modes.empty else None

    # cumcount numbers the rows of each case 0, 1, 2, ... in the order they
    # appear in df, so the position is correct regardless of how the cases
    # are ordered or interleaved in the file. assign() works on a copy, which
    # leaves the caller's frame untouched.
    positioned = df.assign(
        position=df.groupby(case_col, sort=False).cumcount() + 1
    )

    # Only the activity column is aggregated; the other columns are irrelevant
    # for the baseline.
    baseline_by_position = positioned.groupby("position")[activity_col].agg(_first_mode)

    pred_df = positioned.assign(
        baseline_action_pred=positioned["position"].map(baseline_by_position)
    )

    # mergesort is stable: events with identical timestamps keep their
    # original relative order.
    return pred_df.sort_values(by=["time:timestamp"], kind="mergesort")

# Applied to training dataset

In [4]:
# Training events, ordered by timestamp, with `position` and `baseline_action_pred` added.
res_train = predict_next_case(df=df_train)

# Applied to validation dataset

In [5]:
# Validation events, ordered by timestamp, with `position` and `baseline_action_pred` added.
res_val = predict_next_case(df=df_val)

# Applied to testing dataset

In [6]:
# Test events, ordered by timestamp, with `position` and `baseline_action_pred` added.
res_test = predict_next_case(df=df_test)

### For easy exploration/correctness verification:

In [7]:
# Exploratory snippet, intentionally left commented out; this shouldn't go to the final version.
# It keeps only a few columns of a prediction frame for easier inspection.
# NOTE: `pred_df` exists only inside predict_next_case, and that function renames
# 'concept:name_x' back to 'concept:name', so the snippet uses a returned frame
# (res_train) and the final column name. `columns=` replaces the positional axis
# argument, which pandas 2.0 no longer accepts.
# res_train.drop(columns=res_train.columns.difference(['Action', 'concept:name', 'baseline_action_pred', 'position', 'lifecycle:transition']), inplace=True)
# Save to csv for lookup
# res_train.to_csv('baseline_prediction.csv')