In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import datetime as datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_classif, f_regression

In [2]:
df_train = pd.read_csv("bpi2017_train.csv", parse_dates = ['time:timestamp'])
df_val = pd.read_csv("bpi2017_val.csv", parse_dates = ['time:timestamp'])
df_test = pd.read_csv("bpi2017_test.csv", parse_dates = ['time:timestamp'])

# The default name indicating the case ID is case:concept:name
# concept:name is the event
# time:timestamp is the corresponding timestamp
# Load the datasets, sort them on case and consequently timestamp, then reset the index
df_train = df_train.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)
df_val = df_val.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)
df_test = df_test.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)

# Remove obsolete columns
df_train.drop(columns = ['Unnamed: 0'], inplace = True)
df_val.drop(columns = ['Unnamed: 0'], inplace = True)
df_test.drop(columns = ['Unnamed: 0'], inplace = True)

# 1. Calculate the Time Difference & Find Position

In [3]:
# Cumulative sum function to be used later
def CumSum(lists):
    # Returns the cumulative sum of a list
    length = len(lists)
    cu_list = [sum(lists[0: x: 1]) for x in range(0, length + 1)]
    return cu_list[1: ]

In [4]:
def time_difference(df):
    # Calculate time difference between each row
    df['time_diff'] = df['time:timestamp'].diff().dt.total_seconds()
    # Set the time difference of the 1st row to 0 as it's currently NaN
    df.at[0, 'time_diff'] = 0
    # Count number of steps per process
    length_per_case_List = df.groupby(['case:concept:name'])['time_diff'].count().tolist()

    # Using the cumulative sum we get all the positions that are a first step in a process
    # And then the time difference can be set to 0
    position_lst = CumSum(length_per_case_List)
    for i in tqdm(position_lst):
        df.at[i, 'time_diff'] = 0
    # For Loop mysteriously creates an empty row at the end of the df, gotta delete it
    df = df.iloc[: -1]

    # Unzip the position list to get the number of each steps of each process, make that into a list
    step_in_process = []
    for x in tqdm(length_per_case_List):
        for y in range(x):
            step_in_process.append(y + 1)
    # Assign position number to each row/process
    df['position'] = step_in_process

    # Find future time difference by shifting the current time difference
    df['future_time_diff'] = df['time_diff'].shift(-1)
    df.at[df.shape[0] - 1, 'future_time_diff'] = 0

    return df

In [5]:
# Apply the above changes to all dataframes
# The warnings are obsolete, it's because it uses .at which is considerably faster than .loc
df_train = time_difference(df_train)
df_val = time_difference(df_val)
df_test = time_difference(df_test)

100%|█████████████████████████████████████████████████████████████████████████| 16308/16308 [00:00<00:00, 65933.86it/s]
100%|████████████████████████████████████████████████████████████████████████| 16308/16308 [00:00<00:00, 272514.96it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|███████████████████████████████████████████████████████████████████████████| 4078/4078 [00:00<00:00, 62322.81it/s]
100%|██████████████████████████████████████████████████████████████████████████| 4078/4078 [00:00<00:00, 292062.90

# 2. Find Future Event

In [6]:
def next_event(df):
    # Find the next activity name by shifting the current event label
    df['next:concept:name'] = df['concept:name'].shift(-1)
    last_lst = [i - 1 for i in df[df['position'] == 1].index if i != 0]
    # The next event label is 'Nothing' when the cycle is ended
    df.at[df.shape[0] - 1, 'next:concept:name'] = 'Nothing'
    for i in last_lst:
        df.at[i, 'next:concept:name'] = 'Nothing'
    return df

df_train = next_event(df_train)
df_val = next_event(df_val)
df_test = next_event(df_test)

# 3. New Feature: Weekend or Weekday

In [7]:
def add_weekday(df):
    # Get day of week like Monday, Tuesday, etc
    df_day = pd.DataFrame(data = df['time:timestamp'].dt.dayofweek)
    df_day.rename(columns = {'time:timestamp': 'day'}, inplace = True)
    df['day'] = df_day['day']
    return df

df_train = add_weekday(df_train)
df_val = add_weekday(df_val)
df_test = add_weekday(df_test)

# 4. New Feature: Working Hour

In [8]:
def add_working_hour(df):
    # Get hour like 10, 15, etc
    df_day = pd.DataFrame(data = df['time:timestamp'].dt.hour)
    df_day.rename(columns = {'time:timestamp': 'hour'}, inplace = True)
    df['hour'] = df_day['hour']
    return df

df_train = add_working_hour(df_train)
df_val = add_working_hour(df_val)
df_test = add_working_hour(df_test)

# 5. Feature Selection

In [9]:
X_train_processed_num = df_train[['case:RequestedAmount', 'position', 'day', 'hour']]
X_train_processed_cat = df_train[['Action', 'concept:name', 'EventOrigin', 'lifecycle:transition', 'case:LoanGoal', 'case:ApplicationType']]
y_train_1 = df_train[['future_time_diff']]
y_train_2 = df_train[['next:concept:name']]

# One-hot encoding on categorical data
enc = OneHotEncoder(handle_unknown = 'ignore', sparse=False)
transformed = enc.fit_transform(X_train_processed_cat)
X_train_processed_cat = pd.DataFrame(transformed, columns = enc.get_feature_names())

In [10]:
skb = SelectKBest(score_func = f_classif, k = 15)
skb.fit_transform(X_train_processed_cat, y_train_2)
cols = skb.get_support(indices = True)
features_df_new = X_train_processed_cat.iloc[:, cols]
features_df_new.columns

  return f(**kwargs)
  f = msb / msw


Index(['x1_A_Accepted', 'x1_A_Cancelled', 'x1_A_Create Application',
       'x1_A_Denied', 'x1_A_Validating', 'x1_O_Accepted', 'x1_O_Create Offer',
       'x1_O_Created', 'x1_W_Assess potential fraud', 'x1_W_Call after offers',
       'x1_W_Call incomplete files', 'x1_W_Complete application',
       'x1_W_Validate application', 'x2_Offer', 'x3_start'],
      dtype='object')

In [11]:
skb = SelectKBest(score_func = f_classif, k = 3)
skb.fit_transform(X_train_processed_num, y_train_2)
cols = skb.get_support(indices = True)
features_df_new = X_train_processed_num.iloc[:, cols]
features_df_new.columns

  return f(**kwargs)


Index(['position', 'day', 'hour'], dtype='object')

In [12]:
skb = SelectKBest(score_func = f_regression, k = 15)
skb.fit_transform(X_train_processed_cat, y_train_1)
cols = skb.get_support(indices = True)
features_df_new = X_train_processed_cat.iloc[:, cols]
features_df_new.columns

  return f(**kwargs)


Index(['x0_Created', 'x0_Deleted', 'x0_Obtained', 'x0_Released',
       'x0_statechange', 'x1_W_Call after offers', 'x2_Application',
       'x2_Offer', 'x2_Workflow', 'x3_ate_abort', 'x3_complete', 'x3_resume',
       'x3_schedule', 'x3_start', 'x3_suspend'],
      dtype='object')

In [13]:
skb = SelectKBest(score_func = f_regression, k = 3)
skb.fit_transform(X_train_processed_num, y_train_1)
cols = skb.get_support(indices = True)
features_df_new = X_train_processed_num.iloc[:, cols]
features_df_new.columns

  return f(**kwargs)


Index(['case:RequestedAmount', 'position', 'day'], dtype='object')

# 5. Remove Outliers

In [14]:
# Remove outlier on both training and validation data
df_all = pd.concat([df_train, df_val])
df_all = df_all.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)

def find_outlier(process_name, df):
    # Remove outlier having time_diff larger than mean +- 3 * SD
    df_needed = df[(df['concept:name'] == process_name)]
    mean_value = df_needed['time_diff'].mean()
    std_value = df_needed['time_diff'].std()
    upper_bound =  mean_value + 3 * std_value
    lower_bound = mean_value - 3 * std_value
    new_df = df_needed[(df_needed['time_diff'] < lower_bound) | (df_needed['time_diff'] > upper_bound)]
    # Return case id that has at least 1 process as outlier
    return new_df['case:concept:name'].tolist()

outlier_lst = []
# i refers to the position number
for i in tqdm(range(2, len(df_all['position'].tolist()))):
    df_pos = df_all[df_all['position'] == i]
    # a refers to the concept name per position number
    for a in df_pos['concept:name'].unique().tolist():
        small_outlier_lst = find_outlier(a, df_pos)
        outlier_lst = list(set(outlier_lst + small_outlier_lst))

len(outlier_lst)

100%|████████████████████████████████████████████████████████████████████████| 776130/776130 [12:12<00:00, 1058.90it/s]


8459

In [15]:
# Remove all outliers
df_filtered = df_all[~df_all['case:concept:name'].isin(outlier_lst)]
final_all_train = sorted(df_filtered['case:concept:name'].unique().tolist())

# Split training and validation dataset
final_train, final_val = train_test_split(final_all_train, test_size = 0.2)
df_train = df_filtered[df_filtered['case:concept:name'].isin(final_train)]
df_val = df_filtered[df_filtered['case:concept:name'].isin(final_val)]

# To make sure, again sort the datasets on case and consequently timestamp, then reset the index
df_train = df_train.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)
df_val = df_val.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)

In [17]:
# Var for time: lifecycle:transition, Action, concept:name, EventOrigin, day, position, case:RequestedAmount
# Var for label: concept:name, lifecycle:transition, EventOrigin, Action, day, position, hour
df_train = df_train[['case:concept:name', 'next:concept:name', 'time:timestamp', 'time_diff', 
                     'future_time_diff', 'concept:name', 'lifecycle:transition', 
                     'EventOrigin', 'Action', 'day', 'position', 'hour', 'case:RequestedAmount']]
df_val = df_val[['case:concept:name', 'next:concept:name', 'time:timestamp', 'time_diff', 
                     'future_time_diff', 'concept:name', 'lifecycle:transition', 
                     'EventOrigin', 'Action', 'day', 'position', 'hour', 'case:RequestedAmount']]
df_test = df_test[['case:concept:name', 'next:concept:name', 'time:timestamp', 'time_diff', 
                     'future_time_diff', 'concept:name', 'lifecycle:transition', 
                     'EventOrigin', 'Action', 'day', 'position', 'hour', 'case:RequestedAmount']]
df_train.to_csv('bpi2017_train_filtered.csv', index = False)
df_val.to_csv('bpi2017_val_filtered.csv', index = False)
df_test.to_csv('bpi2017_test_filtered.csv', index = False)