<a href="https://colab.research.google.com/github/dbusn/process-mining-group-5/blob/main/preprocessing_2012.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import datetime as datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_classif, f_regression

In [2]:
from google.colab import drive
drive.mount('/content/drive')
df_train = pd.read_csv("/content/drive/MyDrive/DBL Process Mining/Data/Split/bpi2012_train.csv", parse_dates = ['time:timestamp'])
df_val = pd.read_csv("/content/drive/MyDrive/DBL Process Mining/Data/Split/bpi2012_val.csv", parse_dates = ['time:timestamp'])
df_test = pd.read_csv("/content/drive/MyDrive/DBL Process Mining/Data/Split/bpi2012_test.csv", parse_dates = ['time:timestamp'])

# The default name indicating the case ID is case:concept:name
# concept:name is the event
# time:timestamp is the corresponding timestamp
# Load the datasets, sort them on case and consequently timestamp, then reset the index
df_train = df_train.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)
df_val = df_val.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)
df_test = df_test.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)

Mounted at /content/drive


In [3]:
# Remove obsolete columns
df_train.drop(columns = ['Unnamed: 0'], inplace = True)
df_val.drop(columns = ['Unnamed: 0'], inplace = True)
df_test.drop(columns = ['Unnamed: 0'], inplace = True)

In [4]:
df_train.head()

Unnamed: 0,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ
0,112.0,COMPLETE,A_SUBMITTED,2011-10-01 00:38:44.546000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000
1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000
2,112.0,COMPLETE,A_PREACCEPTED,2011-10-01 00:39:37.906000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000
3,112.0,SCHEDULE,W_Completeren aanvraag,2011-10-01 00:39:38.875000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000
4,,START,W_Completeren aanvraag,2011-10-01 11:36:46.437000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000


1. Calculate the time difference & find position

In [5]:
# Cumulative sum function to be used later
def CumSum(lists):
    # Returns the cumulative sum of a list
    length = len(lists)
    cu_list = [sum(lists[0: x: 1]) for x in range(0, length + 1)]
    return cu_list[1: ]

In [6]:
def time_difference(df):
    # Calculate time difference between each row
    df['time_diff'] = df['time:timestamp'].diff().dt.total_seconds()
    # Set the time difference of the 1st row to 0 as it's currently NaN
    df.at[0, 'time_diff'] = 0
    # Count number of steps per process
    length_per_case_List = df.groupby(['case:concept:name'])['time_diff'].count().tolist()

    # Using the cumulative sum we get all the positions that are a first step in a process
    # And then the time difference can be set to 0
    position_lst = CumSum(length_per_case_List)
    for i in tqdm(position_lst):
        df.at[i, 'time_diff'] = 0
    # For Loop mysteriously creates an empty row at the end of the df, gotta delete it
    df = df.iloc[: -1]

    # Unzip the position list to get the number of each steps of each process, make that into a list
    step_in_process = []
    for x in tqdm(length_per_case_List):
        for y in range(x):
            step_in_process.append(y + 1)
    # Assign position number to each row/process
    df['position'] = step_in_process

    # Find future time difference by shifting the current time difference
    df['future_time_diff'] = df['time_diff'].shift(-1)
    df.at[df.shape[0] - 1, 'future_time_diff'] = 0

    return df

In [7]:
# Apply the above changes to all dataframes
# The warnings are obsolete, it's because it uses .at which is considerably faster than .loc
df_train = time_difference(df_train)
df_val = time_difference(df_val)
df_test = time_difference(df_test)

100%|██████████| 6545/6545 [00:00<00:00, 81558.33it/s]
100%|██████████| 6545/6545 [00:00<00:00, 378665.30it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|██████████| 1637/1637 [00:00<00:00, 80558.43it/s]
100%|██████████| 1637/1637 [00:00<00:00, 357663.99it/s]
100%|██████████| 2045/2045 [00:00<00:00, 71448.16it/s]
100%|██████████| 2045/2045 [00:00<00:00, 311609.09it/s]


2. Find future event

In [8]:
def next_event(df):
    # Find the next activity name by shifting the current event label
    df['next:concept:name'] = df['concept:name'].shift(-1)
    last_lst = [i - 1 for i in df[df['position'] == 1].index if i != 0]
    # The next event label is 'Nothing' when the cycle is ended
    df.at[df.shape[0] - 1, 'next:concept:name'] = 'Nothing'
    for i in last_lst:
        df.at[i, 'next:concept:name'] = 'Nothing'
    return df

df_train = next_event(df_train)
df_val = next_event(df_val)
df_test = next_event(df_test)

In [9]:
df_train.head()

Unnamed: 0,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ,time_diff,position,future_time_diff,next:concept:name
0,112.0,COMPLETE,A_SUBMITTED,2011-10-01 00:38:44.546000+02:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,0.0,1,0.334,A_PARTLYSUBMITTED
1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+02:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,0.334,2,53.026,A_PREACCEPTED
2,112.0,COMPLETE,A_PREACCEPTED,2011-10-01 00:39:37.906000+02:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,53.026,3,0.969,W_Completeren aanvraag
3,112.0,SCHEDULE,W_Completeren aanvraag,2011-10-01 00:39:38.875000+02:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,0.969,4,39427.562,W_Completeren aanvraag
4,,START,W_Completeren aanvraag,2011-10-01 11:36:46.437000+02:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,39427.562,5,356.871,A_ACCEPTED


3. New feature: weekend or weekday

In [10]:
df_train.dtypes

org:resource            float64
lifecycle:transition     object
concept:name             object
time:timestamp           object
case:REG_DATE            object
case:concept:name       float64
case:AMOUNT_REQ         float64
time_diff               float64
position                  int64
future_time_diff        float64
next:concept:name        object
dtype: object

In [11]:
#The problem with this is that the timezone is changed. This should be ok for now.
df_train['time:timestamp'] =  pd.to_datetime(df_train['time:timestamp'], utc=True)
df_val['time:timestamp'] =  pd.to_datetime(df_val['time:timestamp'], utc=True)
df_test['time:timestamp'] =  pd.to_datetime(df_test['time:timestamp'], utc=True)

df_train.head()





Unnamed: 0,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ,time_diff,position,future_time_diff,next:concept:name
0,112.0,COMPLETE,A_SUBMITTED,2011-09-30 22:38:44.546000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,0.0,1,0.334,A_PARTLYSUBMITTED
1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-09-30 22:38:44.880000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,0.334,2,53.026,A_PREACCEPTED
2,112.0,COMPLETE,A_PREACCEPTED,2011-09-30 22:39:37.906000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,53.026,3,0.969,W_Completeren aanvraag
3,112.0,SCHEDULE,W_Completeren aanvraag,2011-09-30 22:39:38.875000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,0.969,4,39427.562,W_Completeren aanvraag
4,,START,W_Completeren aanvraag,2011-10-01 09:36:46.437000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,39427.562,5,356.871,A_ACCEPTED


In [12]:
def add_weekday(df):
    # Get day of week like Monday, Tuesday, etc
    df_day = pd.DataFrame(data = df['time:timestamp'].dt.dayofweek)
    df_day.rename(columns = {'time:timestamp': 'day'}, inplace = True)
    df['day'] = df_day['day']
    return df

df_train = add_weekday(df_train)
df_val = add_weekday(df_val)
df_test = add_weekday(df_test)

4. New feature: working hour

In [14]:
def add_working_hour(df):
    # Get hour like 10, 15, etc
    df_day = pd.DataFrame(data = df['time:timestamp'].dt.hour)
    df_day.rename(columns = {'time:timestamp': 'hour'}, inplace = True)
    df['hour'] = df_day['hour']
    return df

df_train = add_working_hour(df_train)
df_val = add_working_hour(df_val)
df_test = add_working_hour(df_test)

Feature selection

In [17]:
df_train.head()

Unnamed: 0,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ,time_diff,position,future_time_diff,next:concept:name,day,hour
0,112.0,COMPLETE,A_SUBMITTED,2011-09-30 22:38:44.546000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,0.0,1,0.334,A_PARTLYSUBMITTED,4,22
1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-09-30 22:38:44.880000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,0.334,2,53.026,A_PREACCEPTED,4,22
2,112.0,COMPLETE,A_PREACCEPTED,2011-09-30 22:39:37.906000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,53.026,3,0.969,W_Completeren aanvraag,4,22
3,112.0,SCHEDULE,W_Completeren aanvraag,2011-09-30 22:39:38.875000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,0.969,4,39427.562,W_Completeren aanvraag,4,22
4,,START,W_Completeren aanvraag,2011-10-01 09:36:46.437000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,39427.562,5,356.871,A_ACCEPTED,5,9


In [20]:
X_train_processed_num = df_train[['case:AMOUNT_REQ', 'position', 'day', 'hour']]
X_train_processed_cat = df_train[['concept:name','lifecycle:transition', 'next:concept:name']]
y_train_1 = df_train[['future_time_diff']]
y_train_2 = df_train[['next:concept:name']]

# One-hot encoding on categorical data
enc = OneHotEncoder(handle_unknown = 'ignore', sparse=False)
transformed = enc.fit_transform(X_train_processed_cat)
X_train_processed_cat = pd.DataFrame(transformed, columns = enc.get_feature_names())
X_train_processed = pd.concat([X_train_processed_cat, X_train_processed_num], axis = 1)



In [21]:
# Find the score for each variable for time prediction
skb_time = SelectKBest(score_func = f_regression)
skb_time.fit_transform(X_train_processed, y_train_1)
score_dct_time = dict(zip(X_train_processed.columns.tolist(), skb_time.scores_.round(decimals = 1).tolist()))
df_time_score = pd.DataFrame(list(score_dct_time.items()))
df_time_score.rename(columns = {0: 'variable', 1: 'score'}, inplace = True)
df_time_score = df_time_score.sort_values(by = ['score'], ascending = False).reset_index(drop = True)
df_time_score

  y = column_or_1d(y, warn=True)


Unnamed: 0,variable,score
0,x2_W_Nabellen offertes,10607.3
1,x0_W_Nabellen offertes,7915.8
2,x1_COMPLETE,4338.4
3,x1_START,3019.9
4,x0_W_Completeren aanvraag,748.3
5,x1_SCHEDULE,571.2
6,x0_W_Valideren aanvraag,499.8
7,x0_A_SUBMITTED,434.5
8,x2_A_PARTLYSUBMITTED,434.5
9,x2_Nothing,434.5


In [22]:
# Find the score for each variable for event prediction
skb_event = SelectKBest(score_func = f_classif)
skb_event.fit_transform(X_train_processed, y_train_2)
score_dct_event = dict(zip(X_train_processed.columns.tolist(), skb_event.scores_.round(decimals = 1).tolist()))
df_event_score = pd.DataFrame(list(score_dct_event.items()))
df_event_score.rename(columns = {0: 'variable', 1: 'score'}, inplace = True)
df_event_score = df_event_score.sort_values(by = ['score'], ascending = False).reset_index(drop = True)
df_event_score

  y = column_or_1d(y, warn=True)
  f = msb / msw


Unnamed: 0,variable,score
0,x2_A_ACCEPTED,inf
1,x2_O_SENT,inf
2,x2_A_CANCELLED,inf
3,x2_A_APPROVED,inf
4,x2_A_ACTIVATED,inf
5,x2_A_PARTLYSUBMITTED,inf
6,x2_A_PREACCEPTED,inf
7,x2_A_REGISTERED,inf
8,x2_Nothing,inf
9,x2_O_ACCEPTED,inf


Remove outliers

In [23]:
# Remove outlier on both training and validation data
df_all = pd.concat([df_train, df_val])
df_all = df_all.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)

def find_outlier(process_name, df):
    # Remove outlier having time_diff larger than mean +- 3 * SD
    df_needed = df[(df['concept:name'] == process_name)]
    mean_value = df_needed['time_diff'].mean()
    std_value = df_needed['time_diff'].std()
    upper_bound =  mean_value + 3 * std_value
    lower_bound = mean_value - 3 * std_value
    new_df = df_needed[(df_needed['time_diff'] < lower_bound) | (df_needed['time_diff'] > upper_bound)]
    # Return case id that has at least 1 process as outlier
    return new_df['case:concept:name'].tolist()

outlier_lst = []
# i refers to the position number
for i in tqdm(range(2, len(df_all['position'].tolist()))):
    df_pos = df_all[df_all['position'] == i]
    # a refers to the concept name per position number
    for a in df_pos['concept:name'].unique().tolist():
        small_outlier_lst = find_outlier(a, df_pos)
        outlier_lst = list(set(outlier_lst + small_outlier_lst))

len(outlier_lst)

100%|██████████| 162939/162939 [02:20<00:00, 1162.31it/s]


2289

In [24]:
# Remove all outliers
df_filtered = df_all[~df_all['case:concept:name'].isin(outlier_lst)]
final_all_train = sorted(df_filtered['case:concept:name'].unique().tolist())

# Split training and validation dataset
final_train, final_val = train_test_split(final_all_train, test_size = 0.2)
df_train = df_filtered[df_filtered['case:concept:name'].isin(final_train)]
df_val = df_filtered[df_filtered['case:concept:name'].isin(final_val)]

# To make sure, again sort the datasets on case and consequently timestamp, then reset the index
df_train = df_train.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)
df_val = df_val.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)

In [27]:
df_train.head()

Unnamed: 0,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ,time_diff,position,future_time_diff,next:concept:name,day,hour
0,112.0,COMPLETE,A_SUBMITTED,2011-09-30 22:38:44.546000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,0.0,1,0.334,A_PARTLYSUBMITTED,4,22
1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-09-30 22:38:44.880000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,0.334,2,53.026,A_PREACCEPTED,4,22
2,112.0,COMPLETE,A_PREACCEPTED,2011-09-30 22:39:37.906000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,53.026,3,0.969,W_Completeren aanvraag,4,22
3,112.0,SCHEDULE,W_Completeren aanvraag,2011-09-30 22:39:38.875000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,0.969,4,39427.562,W_Completeren aanvraag,4,22
4,,START,W_Completeren aanvraag,2011-10-01 09:36:46.437000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,39427.562,5,356.871,A_ACCEPTED,5,9


In [36]:
df_train.to_csv('bpi2012_train_filtered.csv', index = False)
df_val.to_csv('bpi2012_val_filtered.csv', index = False)
df_test.to_csv('bpi2012_test_filtered.csv', index = False)

In [37]:
from google.colab import files
files.download('bpi2012_train_filtered.csv')
files.download('bpi2012_val_filtered.csv')
files.download('bpi2012_test_filtered.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>