# Import modules

In [1]:
import numpy as np
import pandas as pd
import datetime as datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from tqdm import tqdm

## Convert time: timestamp and registration date to unix timestamp

In [2]:
# Read the split dataset
# Available on github/data/splits
df_train = pd.read_csv('bpi2017_train.csv')
df_test = pd.read_csv('bpi2017_test.csv')
df_val = pd.read_csv('bpi2017_val.csv')

# Perform conversion
df_train['Date'] = np.array(df_train['time:timestamp'].values, dtype = 'datetime64').astype(datetime.datetime)
df_train['time:unix'] = (df_train['Date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
df_test['Date'] = np.array(df_test['time:timestamp'].values, dtype = 'datetime64').astype(datetime.datetime)
df_test['time:unix'] = (df_test['Date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
df_val['Date'] = np.array(df_val['time:timestamp'].values, dtype = 'datetime64').astype(datetime.datetime)
df_val['time:unix'] = (df_val['Date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

  df_train['Date'] = np.array(df_train['time:timestamp'].values, dtype = 'datetime64').astype(datetime.datetime)
  df_test['Date'] = np.array(df_test['time:timestamp'].values, dtype = 'datetime64').astype(datetime.datetime)
  df_val['Date'] = np.array(df_val['time:timestamp'].values, dtype = 'datetime64').astype(datetime.datetime)


## Assign position

In [6]:
def assign_position(df: pd.DataFrame) -> pd.DataFrame:
    # Count number of processes per trace/ID
    count_lst = df.groupby('case:concept:name').count()['lifecycle:transition'].tolist()
    position_lst_1 = [list(range(1, i + 1)) for i in count_lst]
    position_lst = []
    for i in position_lst_1:
        for j in i:
            position_lst.append(j)
    df['position'] = position_lst
    return df

In [7]:
df_train = assign_position(df_train)
df_val = assign_position(df_val)
df_test = assign_position(df_test)

In [15]:
df_train

Unnamed: 0.1,Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,...,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,Date,time:unix,transition,position,action
0,0,Created,User_1,A_Create Application,Application,Application_1691306052,complete,2016-01-01 10:16:11.500000+00:00,Home improvement,New credit,...,,,,,,2016-01-01 10:16:11.500,1451643371,0,1,0
1,1,statechange,User_1,A_Submitted,Application,ApplState_284636842,complete,2016-01-01 10:16:11.549000+00:00,Home improvement,New credit,...,,,,,,2016-01-01 10:16:11.549,1451643371,0,2,1
2,2,Created,User_1,W_Handle leads,Workflow,Workitem_831373279,schedule,2016-01-01 10:16:11.740000+00:00,Home improvement,New credit,...,,,,,,2016-01-01 10:16:11.740,1451643371,1,3,0
3,3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1299098074,withdraw,2016-01-01 10:17:31.573000+00:00,Home improvement,New credit,...,,,,,,2016-01-01 10:17:31.573,1451643451,2,4,2
4,4,Created,User_1,W_Complete application,Workflow,Workitem_1703931302,schedule,2016-01-01 10:17:31.584000+00:00,Home improvement,New credit,...,,,,,,2016-01-01 10:17:31.584,1451643451,1,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621126,621126,Obtained,User_43,W_Call after offers,Workflow,Workitem_162257588,start,2016-11-12 09:31:06.232000+00:00,Not speficied,New credit,...,,,,,,2016-11-12 09:31:06.232,1478943066,3,67,3
621127,621127,statechange,User_43,A_Complete,Application,ApplState_1435859997,complete,2016-11-12 09:31:06.234000+00:00,Not speficied,New credit,...,,,,,,2016-11-12 09:31:06.234,1478943066,0,68,1
621128,621128,statechange,User_43,A_Cancelled,Application,ApplState_860885554,complete,2016-11-12 09:35:55.142000+00:00,Not speficied,New credit,...,,,,,,2016-11-12 09:35:55.142,1478943355,0,69,1
621129,621129,statechange,User_43,O_Cancelled,Offer,OfferState_2011904028,complete,2016-11-12 09:35:55.161000+00:00,Not speficied,New credit,...,,,,,Offer_1248413119,2016-11-12 09:35:55.161,1478943355,0,70,1


In [8]:
# Define mapping for lifecycle:transition
mapping_train = {item:i for i, item in enumerate(df_train["lifecycle:transition"].unique())}
mapping_test = {item:i for i, item in enumerate(df_test["lifecycle:transition"].unique())}
mapping_val = {item:i for i, item in enumerate(df_val["lifecycle:transition"].unique())}

# Apply mapping
df_train["transition"] = df_train["lifecycle:transition"].apply(lambda x: mapping_train[x])
df_test["transition"] = df_test["lifecycle:transition"].apply(lambda x: mapping_test[x])
df_val["transition"] = df_val["lifecycle:transition"].apply(lambda x: mapping_val[x])

In [14]:
# Define mapping for Action
mapping_train = {item:i for i, item in enumerate(df_train["Action"].unique())}
mapping_test = {item:i for i, item in enumerate(df_test["Action"].unique())}
mapping_val = {item:i for i, item in enumerate(df_val["Action"].unique())}

# Apply mapping
df_train["action"] = df_train["Action"].apply(lambda x: mapping_train[x])
df_test["action"] = df_test["Action"].apply(lambda x: mapping_test[x])
df_val["action"] = df_val["Action"].apply(lambda x: mapping_val[x])

In [16]:
# Define mapping for case:LoanGoal
mapping_train = {item:i for i, item in enumerate(df_train["case:LoanGoal"].unique())}
mapping_test = {item:i for i, item in enumerate(df_test["case:LoanGoal"].unique())}
mapping_val = {item:i for i, item in enumerate(df_val["case:LoanGoal"].unique())}

# Apply mapping
df_train["goal"] = df_train["case:LoanGoal"].apply(lambda x: mapping_train[x])
df_test["goal"] = df_test["case:LoanGoal"].apply(lambda x: mapping_test[x])
df_val["goal"] = df_val["case:LoanGoal"].apply(lambda x: mapping_val[x])

In [21]:
# Define mapping for case:LoanGoal
mapping_train = {item:i for i, item in enumerate(df_train["case:ApplicationType"].unique())}
mapping_test = {item:i for i, item in enumerate(df_test["case:ApplicationType"].unique())}
mapping_val = {item:i for i, item in enumerate(df_val["case:ApplicationType"].unique())}

# Apply mapping
df_train["type"] = df_train["case:ApplicationType"].apply(lambda x: mapping_train[x])
df_test["type"] = df_test["case:ApplicationType"].apply(lambda x: mapping_test[x])
df_val["type"] = df_val["case:ApplicationType"].apply(lambda x: mapping_val[x])

In [25]:
def Forest(n):
    df_1 = df_train.copy()
    df_2 = df_val.copy()
    # Define predictors
    predictors = ['time:unix', 'transition', 'type', 'action']
    # worse accuracy (compared to only using time:unix and transition): position, goal
    # better accuracy (---): action, type
    
    # Define the classifier
    rfc = RandomForestClassifier(n_estimators = n)
    # Fit the model
    rfc.fit(df_1[predictors], df_1['concept:name'])
    pred_val = rfc.predict(df_2[predictors])
    df_2['predicted_action'] = pred_val
    #
    actions_taken = df_2['concept:name']
    actions_taken = actions_taken[1: ]
    #
    actions_pred = df_2['predicted_action']
    actions_pred = actions_pred[: -1]
    #
    test = pd.concat([actions_taken, actions_pred], axis=1)
    test.dropna(axis=0, inplace=True)
    #
    return metrics.accuracy_score(test['concept:name'], test['predicted_action'])

In [26]:
Forest(50)

0.24458222311111685

In [16]:
pred_test = rfc.predict(df_test[predictors])
pred_val = rfc.predict(df_val[predictors])

df_test['predicted_action'] = pred_test
df_val['predicted_action'] = pred_val

In [17]:
from sklearn import metrics
actions_taken = df_val['concept:name']
actions_taken = actions_taken[1:]

actions_pred = df_val['predicted_action']
actions_pred = actions_pred[:-1]

test = pd.concat([actions_taken, actions_pred], axis=1)
test.dropna(axis=0, inplace=True)

print("Accuracy: ", metrics.accuracy_score(test['concept:name'], test['predicted_action']))

Accuracy:  0.2177497919341415


Check if accuracy is better with more estimators and plot:

In [24]:
n_lst = [10 * (i + 1) for i in range(20)]
accuracy_lst = [Forest(i) for i in tqdm(n_lst)]
accuracy_lst

  0%|                                                    | 0/20 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
plt.plot(accuracy_lst, color='magenta', marker='o',mfc='pink' ) #plot the data
#plt.xticks(n_lst) #set the tick frequency on x-axis