In [1]:
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import datetime as datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# 1. Split the data

In [2]:
filename = 'bpi2017_test.csv'
print('Reading the dataset ' + filename)
df = pd.read_csv(filename, parse_dates = ['time:timestamp'])
if 'time_since_last_event' in df.columns:
    df = df.drop(columns = ['time_since_last_event', 'position', 'future_time_diff', 
                            'future_time_diff', 'time_since_case_starts', 'next:concept:name', 
                            'time_since_midnight', 'time_since_week_start', 'nor_time_since_last_event', 
                            'nor_time_since_case_starts', 'nor_time_since_midnight', 
                            'nor_time_since_week_start', 'nor_case:RequestedAmount']).reset_index(drop = True)
df.info()
# The default name indicating the case ID is case:concept:name
# concept:name is the event
# time:timestamp is the corresponding timestamp

Reading the dataset bpi2017_test.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28706 entries, 0 to 28705
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   Unnamed: 0             28706 non-null  int64              
 1   Action                 28706 non-null  object             
 2   org:resource           28706 non-null  object             
 3   concept:name           28706 non-null  object             
 4   EventOrigin            28706 non-null  object             
 5   EventID                28706 non-null  object             
 6   lifecycle:transition   28706 non-null  object             
 7   time:timestamp         28706 non-null  datetime64[ns, UTC]
 8   case:LoanGoal          28706 non-null  object             
 9   case:ApplicationType   28706 non-null  object             
 10  case:concept:name      28706 non-null  object             
 11  case:RequestedAmo

In [3]:
print('Obtaining datetime format from the dataset')
# Obtain date (datetime format) from datatype of time:timestamp 
df['Date'] = np.array(df['time:timestamp'].values, dtype = 'datetime64[D]').astype(datetime.datetime)
df

Obtaining datetime format from the dataset


Unnamed: 0.1,Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,...,case:RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,Date
0,0,Created,User_1,A_Create Application,Application,Application_837911105,complete,2016-11-14 05:57:57.461000+00:00,Existing loan takeover,New credit,...,12000.0,,,,,,,,,2016-11-14
1,1,statechange,User_1,A_Submitted,Application,ApplState_1692094008,complete,2016-11-14 05:57:59.458000+00:00,Existing loan takeover,New credit,...,12000.0,,,,,,,,,2016-11-14
2,2,Created,User_1,W_Handle leads,Workflow,Workitem_1100633268,schedule,2016-11-14 05:57:59.981000+00:00,Existing loan takeover,New credit,...,12000.0,,,,,,,,,2016-11-14
3,3,Obtained,User_54,W_Handle leads,Workflow,Workitem_167603855,start,2016-11-14 08:27:08.979000+00:00,Existing loan takeover,New credit,...,12000.0,,,,,,,,,2016-11-14
4,4,Deleted,User_54,W_Handle leads,Workflow,Workitem_1887085579,complete,2016-11-14 08:27:59.637000+00:00,Existing loan takeover,New credit,...,12000.0,,,,,,,,,2016-11-14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28701,28701,Obtained,User_49,W_Call incomplete files,Workflow,Workitem_1264169641,resume,2017-01-06 16:13:55.507000+00:00,Car,New credit,...,16000.0,,,,,,,,,2017-01-06
28702,28702,Released,User_49,W_Call incomplete files,Workflow,Workitem_2138456013,suspend,2017-01-06 16:16:17.741000+00:00,Car,New credit,...,16000.0,,,,,,,,,2017-01-06
28703,28703,statechange,User_133,O_Accepted,Offer,OfferState_224690953,complete,2017-01-10 15:35:46.239000+00:00,Car,New credit,...,16000.0,,,,,,,,Offer_1880706415,2017-01-10
28704,28704,statechange,User_133,A_Pending,Application,ApplState_2038793609,complete,2017-01-10 15:35:46.242000+00:00,Car,New credit,...,16000.0,,,,,,,,,2017-01-10


In [4]:
print('Determining the training and testing data\'s date boundaries')
# Determine training and testing data's date boundaries
date_unique = sorted(df['Date'].unique())
total_date = len(date_unique)

# 0.8 is changed to 0.5 here so that there are enough test data
all_train_nr = round(total_date * 0.5)
date_before_test = date_unique[all_train_nr - 1]

Determining the training and testing data's date boundaries


In [5]:
print('Removing entries with case ID across date boundaries')
# Remove entries with case ID across date boundaries
small_df = df[['Date', 'case:concept:name']].drop_duplicates()
small_df_1 = small_df[small_df['Date'] <= date_before_test]
small_df_2 = small_df[small_df['Date'] > date_before_test]
small_df_inter = set(small_df_1['case:concept:name'].unique()).intersection(set(small_df_2['case:concept:name'].unique()))
case_unique_train = sorted(list(set(small_df_1['case:concept:name'].unique()) - small_df_inter))
case_unique_test = sorted(list(set(small_df_2['case:concept:name'].unique()) - small_df_inter))

Removing entries with case ID across date boundaries


In [6]:
print('Determining training and testing data\'s ID boundaries')
# Determine training and testing data's ID boundaries
all_case = sorted(df['case:concept:name'].unique())
total_case = len(all_case)

# 0.8 is changed to 0.5 here so that there are enough test data
all_train_case = round(total_case * 0.5)
case_all_train = sorted(all_case)[: all_train_case]
case_test = sorted(all_case)[all_train_case: ]

Determining training and testing data's ID boundaries


In [7]:
print('Combining ID boundaries ')
# Combine ID boundaries and time boundaries
final_all_train = sorted(list(set(case_unique_train).intersection(set(case_all_train))))
final_test = sorted(list(set(case_unique_test).intersection(set(case_test))))

Combining ID boundaries 


In [8]:
print('Splitting training and validation dataset')
# Split training and validation dataset
final_train, final_val = train_test_split(final_all_train, test_size = 0.2)

Splitting training and validation dataset


In [9]:
# Split the dataset
df_train = df[df['case:concept:name'].isin(final_train)]
df_val = df[df['case:concept:name'].isin(final_val)]
df_test = df[df['case:concept:name'].isin(final_test)]
if 'Unnamed: 0' in df.columns:
    df_train = df_train.drop(columns = ['Unnamed: 0', 'Date']).reset_index(drop = True)
    df_val = df_val.drop(columns = ['Unnamed: 0', 'Date']).reset_index(drop = True)
    df_test = df_test.drop(columns = ['Unnamed: 0', 'Date']).reset_index(drop = True)
else:
    df_train = df_train.drop(columns = ['Date']).reset_index(drop = True)
    df_val = df_val.drop(columns = ['Date']).reset_index(drop = True)
    df_test = df_test.drop(columns = ['Date']).reset_index(drop = True)

In [10]:
df_train = df_train.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)
df_val = df_val.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)
df_test = df_test.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)

# 2. Calculate the Time Difference & Find Position

In [11]:
# Cumulative sum function to be used later
def CumSum(lists):
    # Returns the cumulative sum of a list
    length = len(lists)
    cu_list = [sum(lists[0: x: 1]) for x in range(0, length + 1)]
    return cu_list[1: ]

In [12]:
def time_difference(df):
    # Calculate time difference between each row (Time since last event)
    df['time_since_last_event'] = df['time:timestamp'].diff().dt.total_seconds()
    # Set the time difference of the 1st row to 0 as it's currently NaN
    df.at[0, 'time_since_last_event'] = 0
    # Count number of steps per process
    length_per_case_List = df.groupby(['case:concept:name'])['time_since_last_event'].count().tolist()

    # Using the cumulative sum we get all the positions that are a first step in a process
    # And then the time difference can be set to 0
    position_lst = CumSum(length_per_case_List)
    for i in tqdm(position_lst):
        df.at[i, 'time_since_last_event'] = 0
    # For Loop mysteriously creates an empty row at the end of the df, gotta delete it
    df = df.iloc[: -1]

    # Unzip the position list to get the number of each steps of each process, make that into a list
    step_in_process = []
    for x in tqdm(length_per_case_List):
        for y in range(x):
            step_in_process.append(y + 1)
    # Assign position number to each row/process
    df['position'] = step_in_process

    # Find future time difference by shifting the current time difference
    df['future_time_diff'] = df['time_since_last_event'].shift(-1)
    df.at[df.shape[0] - 1, 'future_time_diff'] = 0

    # Find cumulative time difference (Time since case starts)
    cum_time_diff_lst = [CumSum(df[df['case:concept:name'] == i]['time_since_last_event'].tolist()) for i 
                         in tqdm(df['case:concept:name'].unique().tolist())]
    cum_lst = []
    for i in cum_time_diff_lst:
        for j in i:
            cum_lst.append(j)
    df['time_since_case_starts'] = cum_lst
    
    return df

In [13]:
# Apply the above changes to all dataframes
# The warnings are obsolete, it's because it uses .at which is considerably faster than .loc
df_train = time_difference(df_train)
df_val = time_difference(df_val)
df_test = time_difference(df_test)

100%|█████████████████████████████████████████████████████████████████████████████| 132/132 [00:00<00:00, 43351.98it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████| 132/132 [00:00<?, ?it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|██████████████████████████████████████████████████████████████████████████████| 132/132 [00:00<00:00, 1487.13it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value

# 3. Find Future Event

In [14]:
def next_event(df):
    # Find the next activity name by shifting the current event label
    df['next:concept:name'] = df['concept:name'].shift(-1)
    last_lst = [i - 1 for i in df[df['position'] == 1].index if i != 0]
    # The next event label is 'Nothing' when the cycle is ended
    df.at[df.shape[0] - 1, 'next:concept:name'] = 'Nothing'
    for i in last_lst:
        df.at[i, 'next:concept:name'] = 'Nothing'
    return df

df_train = next_event(df_train)
df_val = next_event(df_val)
df_test = next_event(df_test)

# 4. New Log Features: Time Since Midnight and Time Since Week Start

In [15]:
def add_new_features(df):
    # All the new columns added are calculated in seconds
    df['time_since_midnight'] = df['time:timestamp'].dt.hour * 3600 + df['time:timestamp'].dt.minute * 60 + df['time:timestamp'].dt.second
    # For dayofweek, 0 is Monday, 6 is Sunday
    df['time_since_week_start'] = df['time:timestamp'].dt.dayofweek * 3600 * 24 + df['time_since_midnight']
    return df

df_train = add_new_features(df_train)
df_val = add_new_features(df_val)
df_test = add_new_features(df_test)

# 5. Normalization

In [16]:
def normalization(column):
    # Normalize column so that the column's value is within 0 and 1
    nor_column = 'nor_' + column
    min_max_scaler = MinMaxScaler()
    df_train[nor_column] = min_max_scaler.fit_transform(np.array(df_train[column]).reshape(-1, 1))
    # Use the min max range from training data on validation and test data
    df_val[nor_column] = min_max_scaler.transform(np.array(df_val[column]).reshape(-1, 1))
    df_test[nor_column] = min_max_scaler.transform(np.array(df_test[column]).reshape(-1, 1))

# Normalize numerical columns like time_since_case_starts, time_since_midnight, time_since_week_start, case:RequestedAmount, etc
normalization('time_since_last_event')
normalization('time_since_case_starts')
normalization('time_since_midnight')
normalization('time_since_week_start')
normalization('case:RequestedAmount')

# 5. Remove Outliers

In [17]:
# Remove outlier on both training and validation data
df_all = pd.concat([df_train, df_val])
df_all = df_all.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)

def find_outlier(process_name, df):
    # Remove outlier having future_time_diff larger than mean +- 3 * SD
    df_needed = df[(df['concept:name'] == process_name)]
    mean_value = df_needed['future_time_diff'].mean()
    std_value = df_needed['future_time_diff'].std()
    upper_bound =  mean_value + 3 * std_value
    lower_bound = mean_value - 3 * std_value
    new_df = df_needed[(df_needed['future_time_diff'] < lower_bound) | (df_needed['future_time_diff'] > upper_bound)]
    # Return case id that has at least 1 process as outlier
    return new_df['case:concept:name'].tolist()

outlier_lst = []
# i refers to the position number
for i in tqdm(range(2, len(df_all['position'].tolist()))):
    df_pos = df_all[df_all['position'] == i]
    # a refers to the concept name per position number
    for a in df_pos['concept:name'].unique().tolist():
        small_outlier_lst = find_outlier(a, df_pos)
        outlier_lst = list(set(outlier_lst + small_outlier_lst))

len(outlier_lst)

100%|████████████████████████████████████████████████████████████████████████████| 6417/6417 [00:03<00:00, 2064.72it/s]


101

In [18]:
# Remove all outliers
df_filtered = df_all[~df_all['case:concept:name'].isin(outlier_lst)]
final_all_train = sorted(df_filtered['case:concept:name'].unique().tolist())

# Split training and validation dataset
final_train, final_val = train_test_split(final_all_train, test_size = 0.2)
df_train = df_filtered[df_filtered['case:concept:name'].isin(final_train)]
df_val = df_filtered[df_filtered['case:concept:name'].isin(final_val)]

# To make sure, again sort the datasets on case and consequently timestamp, then reset the index
df_train = df_train.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)
df_val = df_val.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)

In [19]:
print('Exporting to CSVs')
df_train.to_csv('bpi2017_test_train.csv', index = False)
df_val.to_csv('bpi2017_test_val.csv', index = False)
df_test.to_csv('bpi2017_test_test.csv', index = False)

Exporting to CSVs
