# Feature engineering

## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
import datetime as datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_classif, f_regression

In [2]:
df_test = pd.read_csv('bpi2017_test.csv')
df_train = pd.read_csv("bpi2017_train.csv")
df_val = pd.read_csv("bpi2017_val.csv")

df_test['time:timestamp'] = pd.to_datetime(df_test['time:timestamp'])
df_train['time:timestamp'] = pd.to_datetime(df_train['time:timestamp'])
df_val['time:timestamp'] = pd.to_datetime(df_val['time:timestamp'])

df_train = df_train.drop(columns=['Unnamed: 0'])
df_val = df_val.drop(columns=["Unnamed: 0"])
df_test = df_test.drop(columns=["Unnamed: 0"])

## Global features

### Case occurrence number

In [3]:
df_train["case_occurrence_no"] = df_train.groupby(['case:concept:name'])['time:timestamp'].cumcount().tolist()

## One-hot encoding

In [4]:
encoded_cols = ['EventOrigin', 'Action', 'lifecycle:transition']
df_train = pd.get_dummies(df_train, columns=encoded_cols, prefix=["EventOrigin_is", "action_is", 'lifecycle:transition_is'])
df_val = pd.get_dummies(df_val, columns=encoded_cols, prefix=["EventOrigin_is", "action_is", 'lifecycle:transition_is'])
df_test = pd.get_dummies(df_test, columns=encoded_cols, prefix=["EventOrigin_is", "action_is", 'lifecycle:transition_is'])

# Creating additional features

### Next and past activity timedelta

In [5]:
def next_past_activity(df):
    temp = df['time:timestamp']
    next_activity = []
    for i in range(len(temp)-1):
        next_activity.append(temp[i+1])

    df['next_activity_delta_t'] = pd.Series(next_activity) - df['time:timestamp']
    df['past_activity_delta_t'] = df['time:timestamp'] - pd.Series(next_activity)
    
    return df

In [6]:
# Cumulative sum function to be used later
def CumSum(lists):
    # Returns the cumulative sum of a list
    length = len(lists)
    cu_list = [sum(lists[0: x: 1]) for x in range(0, length + 1)]
    return cu_list[1: ]

In [7]:
def next_event(df):
    # Find the next activity name by shifting the current event label
    df['next:concept:name'] = df['concept:name'].shift(-1)
    last_lst = [i - 1 for i in df[df['position'] == 1].index if i != 0]
    # The next event label is 'Nothing' when the cycle is ended
    df.at[df.shape[0] - 1, 'next:concept:name'] = 'Nothing'
    for i in last_lst:
        df.at[i, 'next:concept:name'] = 'Nothing'
    return df

### Time difference feature

In [8]:
def time_difference(df):
    # Calculate time difference between each row
    df['time_diff'] = df['time:timestamp'].diff().dt.total_seconds()
    # Set the time difference of the 1st row to 0 as it's currently NaN
    df.at[0, 'time_diff'] = 0
    # Count number of steps per process
    length_per_case_List = df.groupby(['case:concept:name'])['time_diff'].count().tolist()

    # Using the cumulative sum we get all the positions that are a first step in a process
    # And then the time difference can be set to 0
    position_lst = CumSum(length_per_case_List)
    for i in tqdm(position_lst):
        df.at[i, 'time_diff'] = 0
    # For Loop mysteriously creates an empty row at the end of the df, gotta delete it
    df = df.iloc[: -1]

    # Unzip the position list to get the number of each steps of each process, make that into a list
    step_in_process = []
    for x in tqdm(length_per_case_List):
        for y in range(x):
            step_in_process.append(y + 1)
    # Assign position number to each row/process
    df['position'] = step_in_process

    # Find future time difference by shifting the current time difference
    df['future_time_diff'] = df['time_diff'].shift(-1)
    df.at[df.shape[0] - 1, 'future_time_diff'] = 0

    return df

### Weekday feature

In [9]:
def add_weekday(df):
    # Get day of week like Monday, Tuesday, etc
    df_day = pd.DataFrame(data = df['time:timestamp'].dt.dayofweek)
    df_day.rename(columns = {'time:timestamp': 'day'}, inplace = True)
    df['day'] = df_day['day']
    return df

### Working hour feature

In [10]:
def add_working_hour(df):
    # Get hour like 10, 15, etc
    df_day = pd.DataFrame(data = df['time:timestamp'].dt.hour)
    df_day.rename(columns = {'time:timestamp': 'hour'}, inplace = True)
    df['hour'] = df_day['hour']
    return df

### Timestamp parsing

In [11]:
def parse_timestamp(df):
    temp = df["time:timestamp"]
    day_of_month = []
    month_no = []
    quarters = []
    week = []
    hour = []
    seconds = []

    for i in range(len(temp)):
        day_of_month.append(temp[i].day)
        month_no.append(temp[i].month)
        quarters.append(temp[i].quarter)
        week.append(temp[i].week)
        hour.append(temp[i].hour)
        seconds.append(temp[i].second)

    df['day_of_month'] = pd.Series(day_of_month)
    df['month_no'] = pd.Series(month_no)
    df['quarter'] = pd.Series(quarters)
    df['week'] = pd.Series(week)
    df['hour'] = pd.Series(hour)
    df['second'] = pd.Series(seconds)
    return df

### Time difference normalization

In [12]:
def normalize_delta_t(df):
    min_max_scaler = MinMaxScaler()

    df['norm_next_activity_delta'] = min_max_scaler.fit_transform(np.array(df["next_activity_delta_t"]).reshape(-1,1))
    df['norm_past_activity_delta'] = min_max_scaler.fit_transform(np.array(df["past_activity_delta_t"]).reshape(-1,1))
    return df

# Applying functions on the dataset

In [13]:
df_train = time_difference(df_train)
df_val = time_difference(df_val)
df_test = time_difference(df_test)

df_train = parse_timestamp(df_train)
df_val = parse_timestamp(df_val)
df_test = parse_timestamp(df_test)

df_train = next_past_activity(df_train)
df_val = next_past_activity(df_val)
df_test = next_past_activity(df_test)

df_train = normalize_delta_t(df_train)
df_val = normalize_delta_t(df_val)
df_test = normalize_delta_t(df_test)

df_train = next_event(df_train)
df_val = next_event(df_val)
df_test = next_event(df_test)

df_train = add_weekday(df_train)
df_val = add_weekday(df_val)
df_test = add_weekday(df_test)

df_train = add_working_hour(df_train)
df_val = add_working_hour(df_val)
df_test = add_working_hour(df_test)

100%|██████████| 16308/16308 [00:00<00:00, 21482.21it/s]
100%|██████████| 16308/16308 [00:00<00:00, 243936.28it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['position'] = step_in_process
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['future_time_diff'] = df['time_diff'].shift(-1)
100%|██████████| 4078/4078 [00:00<00:00, 27960.11it/s]
100%|██████████| 4078/4078 [00:00<00:00, 204043.71it/s]
100%|██████████| 751/751 [00:00<00:00, 16248.86it/s]
100%|██████████| 751/751 [00:00<00:00, 126365.88it/s]


## Locating outliers

In [14]:
# Remove outlier on both training and validation data
df_all = pd.concat([df_train, df_val])
df_all = df_all.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)

def find_outlier(process_name, df):
    # Remove outlier having time_diff larger than mean +- 3 * SD
    df_needed = df[(df['concept:name'] == process_name)]
    mean_value = df_needed['time_diff'].mean()
    std_value = df_needed['time_diff'].std()
    upper_bound =  mean_value + 3 * std_value
    lower_bound = mean_value - 3 * std_value
    new_df = df_needed[(df_needed['time_diff'] < lower_bound) | (df_needed['time_diff'] > upper_bound)]
    # Return case id that has at least 1 process as outlier
    return new_df['case:concept:name'].tolist()

In [15]:
outlier_lst = []
# i refers to the position number
for i in tqdm(range(2, len(df_all['position'].tolist()))):
    df_pos = df_all[df_all['position'] == i]
    # a refers to the concept name per position number
    for a in df_pos['concept:name'].unique().tolist():
        small_outlier_lst = find_outlier(a, df_pos)
        outlier_lst = list(set(outlier_lst + small_outlier_lst))

100%|██████████| 776130/776130 [29:42<00:00, 435.36it/s]  


## Removing outliers

In [16]:
# Remove all outliers
df_filtered = df_all[~df_all['case:concept:name'].isin(outlier_lst)]
final_all_train = sorted(df_filtered['case:concept:name'].unique().tolist())

# Split training and validation dataset
final_train, final_val = train_test_split(final_all_train, test_size = 0.2)
df_train = df_filtered[df_filtered['case:concept:name'].isin(final_train)]
df_val = df_filtered[df_filtered['case:concept:name'].isin(final_val)]

# To make sure, again sort the datasets on case and consequently timestamp, then reset the index
df_train = df_train.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)
df_val = df_val.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)

In [17]:
df_val

Unnamed: 0,org:resource,concept:name,EventID,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,case:RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,...,quarter,week,hour,second,next_activity_delta_t,past_activity_delta_t,norm_next_activity_delta,norm_past_activity_delta,next:concept:name,day
0,User_1,A_Create Application,Application_1000691650,2016-01-17 10:54:30.997000+00:00,"Other, see explanation",New credit,Application_1000691650,50000.0,,,...,1,2,10,30,0 days 00:00:00.055000,-1 days +23:59:59.945000,0.999285,0.998428,A_Submitted,6
1,User_1,A_Submitted,ApplState_376135005,2016-01-17 10:54:31.052000+00:00,"Other, see explanation",New credit,Application_1000691650,50000.0,,,...,1,2,10,31,0 days 00:00:00.177000,-1 days +23:59:59.823000,0.999285,0.998428,Nothing,6
2,User_1,W_Handle leads,Workitem_20352517,2016-01-17 10:54:31.229000+00:00,"Other, see explanation",New credit,Application_1000691650,50000.0,,,...,1,2,10,31,0 days 00:00:31.074000,-1 days +23:59:28.926000,0.999285,0.998428,W_Handle leads,6
3,User_1,W_Handle leads,Workitem_707255868,2016-01-17 10:55:02.303000+00:00,"Other, see explanation",New credit,Application_1000691650,50000.0,,,...,1,2,10,2,0 days 00:00:00.008000,-1 days +23:59:59.992000,0.999285,0.998428,W_Complete application,6
4,User_1,W_Complete application,Workitem_1872803633,2016-01-17 10:55:02.311000+00:00,"Other, see explanation",New credit,Application_1000691650,50000.0,,,...,1,2,10,2,0 days 00:00:00.006000,-1 days +23:59:59.994000,0.999285,0.998428,A_Concept,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70886,User_16,W_Call after offers,Workitem_1568830249,2016-10-11 12:13:23.286000+00:00,Home improvement,New credit,Application_609719509,30000.0,,,...,4,41,12,23,0 days 00:00:24.724000,-1 days +23:59:35.276000,0.999285,0.998428,W_Call after offers,1
70887,User_16,W_Call after offers,Workitem_702982090,2016-10-11 12:13:48.010000+00:00,Home improvement,New credit,Application_609719509,30000.0,,,...,4,41,12,48,26 days 18:46:32.104000,-27 days +05:13:27.896000,0.999536,0.998178,A_Cancelled,1
70888,User_1,A_Cancelled,ApplState_557665957,2016-11-07 07:00:20.114000+00:00,Home improvement,New credit,Application_609719509,30000.0,,,...,4,45,7,20,0 days 00:00:00.021000,-1 days +23:59:59.979000,0.999285,0.998428,O_Cancelled,0
70889,User_1,O_Cancelled,OfferState_640057358,2016-11-07 07:00:20.135000+00:00,Home improvement,New credit,Application_609719509,30000.0,,,...,4,45,7,20,0 days 00:00:00.008000,-1 days +23:59:59.992000,0.999285,0.998428,W_Call after offers,0


In [18]:
df_test

Unnamed: 0,org:resource,concept:name,EventID,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,case:RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,...,quarter,week,hour,second,next_activity_delta_t,past_activity_delta_t,norm_next_activity_delta,norm_past_activity_delta,next:concept:name,day
0,User_1,A_Create Application,Application_837911105,2016-11-14 05:57:57.461000+00:00,Existing loan takeover,New credit,Application_837911105,12000.0,,,...,4,46,5,57,0 days 00:00:01.997000,-1 days +23:59:58.003000,0.999710,0.999293,A_Submitted,0
1,User_1,A_Submitted,ApplState_1692094008,2016-11-14 05:57:59.458000+00:00,Existing loan takeover,New credit,Application_837911105,12000.0,,,...,4,46,5,59,0 days 00:00:00.523000,-1 days +23:59:59.477000,0.999710,0.999293,W_Handle leads,0
2,User_1,W_Handle leads,Workitem_1100633268,2016-11-14 05:57:59.981000+00:00,Existing loan takeover,New credit,Application_837911105,12000.0,,,...,4,46,5,59,0 days 02:29:08.998000,-1 days +21:30:51.002000,0.999711,0.999292,W_Handle leads,0
3,User_54,W_Handle leads,Workitem_167603855,2016-11-14 08:27:08.979000+00:00,Existing loan takeover,New credit,Application_837911105,12000.0,,,...,4,46,8,8,0 days 00:00:50.658000,-1 days +23:59:09.342000,0.999710,0.999293,W_Handle leads,0
4,User_54,W_Handle leads,Workitem_1887085579,2016-11-14 08:27:59.637000+00:00,Existing loan takeover,New credit,Application_837911105,12000.0,,,...,4,46,8,59,0 days 00:00:00.005000,-1 days +23:59:59.995000,0.999710,0.999293,W_Complete application,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28701,User_49,W_Call incomplete files,Workitem_1264169641,2017-01-06 16:13:55.507000+00:00,Car,New credit,Application_965278193,16000.0,,,...,1,1,16,55,0 days 00:02:22.234000,-1 days +23:57:37.766000,0.999710,0.999293,W_Call incomplete files,4
28702,User_49,W_Call incomplete files,Workitem_2138456013,2017-01-06 16:16:17.741000+00:00,Car,New credit,Application_965278193,16000.0,,,...,1,1,16,17,3 days 23:19:28.498000,-4 days +00:40:31.502000,0.999748,0.999255,O_Accepted,4
28703,User_133,O_Accepted,OfferState_224690953,2017-01-10 15:35:46.239000+00:00,Car,New credit,Application_965278193,16000.0,,,...,1,2,15,46,0 days 00:00:00.003000,-1 days +23:59:59.997000,0.999710,0.999293,A_Pending,1
28704,User_133,A_Pending,ApplState_2038793609,2017-01-10 15:35:46.242000+00:00,Car,New credit,Application_965278193,16000.0,,,...,1,2,15,46,0 days 00:00:00.002000,-1 days +23:59:59.998000,0.999710,0.999293,W_Call incomplete files,1


In [19]:
df_train

Unnamed: 0,org:resource,concept:name,EventID,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,case:RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,...,quarter,week,hour,second,next_activity_delta_t,past_activity_delta_t,norm_next_activity_delta,norm_past_activity_delta,next:concept:name,day
0,User_1,A_Create Application,Application_1000086665,2016-08-03 15:57:21.673000+00:00,"Other, see explanation",New credit,Application_1000086665,5000.0,,,...,3,31,15,21,0 days 00:00:00.061000,-1 days +23:59:59.939000,0.999285,0.998428,A_Submitted,2
1,User_1,A_Submitted,ApplState_161925113,2016-08-03 15:57:21.734000+00:00,"Other, see explanation",New credit,Application_1000086665,5000.0,,,...,3,31,15,21,0 days 00:00:00.229000,-1 days +23:59:59.771000,0.999285,0.998428,W_Handle leads,2
2,User_1,W_Handle leads,Workitem_747707399,2016-08-03 15:57:21.963000+00:00,"Other, see explanation",New credit,Application_1000086665,5000.0,,,...,3,31,15,21,0 days 00:01:06.323000,-1 days +23:58:53.677000,0.999285,0.998428,W_Handle leads,2
3,User_1,W_Handle leads,Workitem_1030261128,2016-08-03 15:58:28.286000+00:00,"Other, see explanation",New credit,Application_1000086665,5000.0,,,...,3,31,15,28,0 days 00:00:00.007000,-1 days +23:59:59.993000,0.999285,0.998428,W_Complete application,2
4,User_1,W_Complete application,Workitem_1127124826,2016-08-03 15:58:28.293000+00:00,"Other, see explanation",New credit,Application_1000086665,5000.0,,,...,3,31,15,28,0 days 00:00:00.006000,-1 days +23:59:59.994000,0.999285,0.998428,A_Concept,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281234,User_30,W_Validate application,Workitem_1318522010,2016-03-07 13:17:01.830000+00:00,Car,New credit,Application_610156903,5000.0,,,...,1,10,13,1,0 days 00:00:14.274000,-1 days +23:59:45.726000,0.999026,0.998726,W_Validate application,0
281235,User_30,W_Validate application,Workitem_2034548280,2016-03-07 13:17:16.104000+00:00,Car,New credit,Application_610156903,5000.0,,,...,1,10,13,16,0 days 00:05:22.616000,-1 days +23:54:37.384000,0.999026,0.998726,O_Accepted,0
281236,User_119,O_Accepted,OfferState_1905445856,2016-03-07 13:22:38.720000+00:00,Car,New credit,Application_610156903,5000.0,,,...,1,10,13,38,0 days 00:00:00.004000,-1 days +23:59:59.996000,0.999026,0.998726,A_Pending,0
281237,User_119,A_Pending,ApplState_316927626,2016-03-07 13:22:38.724000+00:00,Car,New credit,Application_610156903,5000.0,,,...,1,10,13,38,0 days 00:00:00.003000,-1 days +23:59:59.997000,0.999026,0.998726,W_Validate application,0


## Export

In [20]:
df_train.to_csv('bci2017_train.csv', index=False)
df_test.to_csv("bci2017_test", index=False)
df_val.to_csv("bci2017_val", index=False)