# Import modules

In [1]:
import numpy as np
import pandas as pd
import datetime as datetime
from sklearn.ensemble import RandomForestClassifier

# Convert time: timestamp and registration date to unix timestamp

In [2]:
# Read the split dataset
# Available on github/data/splits
df_train = pd.read_csv('bpi2012_train.csv')
df_test = pd.read_csv('bpi2012_test.csv')
df_val = pd.read_csv('bpi2012_val.csv')

# Perform conversion
df_train['Date'] = np.array(df_train['time:timestamp'].values, dtype = 'datetime64').astype(datetime.datetime)
df_train['Regdate'] = np.array(df_train['case:REG_DATE'].values, dtype = 'datetime64').astype(datetime.datetime)

df_train['reg:unix'] = (df_train['Regdate'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
df_train['time:unix'] = (df_train['Date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

df_test['Date'] = np.array(df_test['time:timestamp'].values, dtype = 'datetime64').astype(datetime.datetime)
df_test['Regdate'] = np.array(df_test['case:REG_DATE'].values, dtype = 'datetime64').astype(datetime.datetime)

df_test['reg:unix'] = (df_test['Regdate'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
df_test['time:unix'] = (df_test['Date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

df_val['Date'] = np.array(df_val['time:timestamp'].values, dtype = 'datetime64').astype(datetime.datetime)
df_val['Regdate'] = np.array(df_val['case:REG_DATE'].values, dtype = 'datetime64').astype(datetime.datetime)

df_val['reg:unix'] = (df_val['Regdate'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
df_val['time:unix'] = (df_val['Date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

  df_train['Date'] = np.array(df_train['time:timestamp'].values, dtype = 'datetime64').astype(datetime.datetime)
  df_train['Regdate'] = np.array(df_train['case:REG_DATE'].values, dtype = 'datetime64').astype(datetime.datetime)
  df_test['Date'] = np.array(df_test['time:timestamp'].values, dtype = 'datetime64').astype(datetime.datetime)
  df_test['Regdate'] = np.array(df_test['case:REG_DATE'].values, dtype = 'datetime64').astype(datetime.datetime)
  df_val['Date'] = np.array(df_val['time:timestamp'].values, dtype = 'datetime64').astype(datetime.datetime)
  df_val['Regdate'] = np.array(df_val['case:REG_DATE'].values, dtype = 'datetime64').astype(datetime.datetime)


# Map unique values of lifecycle:transition to a number

In [3]:
# Define mapping 
mapping_train = {item:i for i, item in enumerate(df_train["lifecycle:transition"].unique())}
mapping_test = {item:i for i, item in enumerate(df_test["lifecycle:transition"].unique())}
mapping_val = {item:i for i, item in enumerate(df_val["lifecycle:transition"].unique())}

# Apply mapping
df_train["transition"] = df_train["lifecycle:transition"].apply(lambda x: mapping_train[x])
df_test["transition"] = df_test["lifecycle:transition"].apply(lambda x: mapping_test[x])
df_val["transition"] = df_val["lifecycle:transition"].apply(lambda x: mapping_val[x])

# Fit the model

In [4]:
# Drop all NaNs etc.
df_train.dropna(axis=0, inplace=True)
df_test.dropna(axis=0, inplace=True)
df_val.dropna(axis=0, inplace=True)

# Define predictors
predictors=['time:unix','transition', 'org:resource', 'reg:unix']

# Define the classifier
rfc = RandomForestClassifier(n_estimators=250)

# Fit the model
rfc.fit(df_train[predictors], df_train['concept:name'])

RandomForestClassifier(n_estimators=250)

# Predict

In [7]:
pred_test = rfc.predict(df_test[predictors])
pred_val = rfc.predict(df_val[predictors])

df_test['predicted_action'] = pred_test
df_val['predicted_action'] = pred_val

# Metrics

In [8]:
from sklearn import metrics
actions_taken = df_val['concept:name']
actions_taken = actions_taken[1:]

actions_pred = df_val['predicted_action']
actions_pred = actions_pred[:-1]

test = pd.concat([actions_taken, actions_pred], axis=1)
test.dropna(axis=0, inplace=True)

print("Accuracy: ", metrics.accuracy_score(test['concept:name'], test['predicted_action']))

Accuracy:  0.5325057057887821
