# Feature engineering & Feature selection

## Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
import matplotlib.pyplot as plt
import matplotlib
import random

In [None]:
# Load the three event-log splits from disk.
df_test, df_train, df_val = (
    pd.read_csv(csv_path)
    for csv_path in ('bpi2017_test.csv', "bpi2017_train.csv", "bpi2017_val.csv")
)

# Turn the textual event timestamps into real datetime values.
for frame in (df_test, df_train, df_val):
    frame['time:timestamp'] = pd.to_datetime(frame['time:timestamp'])

# Discard the 'Unnamed: 0' column (presumably a row index saved by an earlier
# export -- verify against the script that produced the CSV files).
df_train, df_val, df_test = (
    frame.drop(columns=['Unnamed: 0']) for frame in (df_train, df_val, df_test)
)

## Global features

### Case occurrence number

In [None]:
# 0-based running counter of the events inside each case, in current row order.
# NOTE(review): only the training frame receives this column.
events_by_case = df_train.groupby(['case:concept:name'])['time:timestamp']
df_train["case_occurrence_no"] = events_by_case.cumcount().tolist()

## One-hot encoding

In [None]:
# Columns to expand into indicator columns and the prefix used for each of them.
encoded_cols = ['EventOrigin', 'Action', 'lifecycle:transition']
dummy_prefixes = ["EventOrigin_is", "action_is", 'lifecycle:transition_is']

# NOTE(review): every split is encoded independently, so a category that is
# absent from one split yields no column there -- confirm all splits contain
# every category before relying on a fixed column list.
df_train, df_val, df_test = (
    pd.get_dummies(frame, columns=encoded_cols, prefix=dummy_prefixes)
    for frame in (df_train, df_val, df_test)
)

# Creating additional features

### Next and past activity timedelta

In [None]:
def next_past_activity(df):
    """Add the time gap to the following and to the preceding row.

    Two timedelta columns are written to ``df`` (in place):

    * ``next_activity_delta_t`` -- timestamp of the next row minus the
      timestamp of this row; missing (NaT) on the last row.
    * ``past_activity_delta_t`` -- timestamp of this row minus the timestamp
      of the previous row; missing (NaT) on the first row.

    The gaps are taken between physically adjacent rows; case boundaries are
    not considered, so the frame is expected to be in chronological order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime column ``'time:timestamp'``.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the two columns added.
    """
    timestamps = df['time:timestamp']

    # Look one row ahead.
    df['next_activity_delta_t'] = timestamps.shift(-1) - timestamps
    # Look one row back. The previous implementation subtracted the *next*
    # timestamp here as well, which only produced the negated forward gap.
    df['past_activity_delta_t'] = timestamps - timestamps.shift(1)

    return df

In [None]:
# Cumulative sum function to be used later
def CumSum(lists):
    """Return the cumulative sums of ``lists`` as a new list.

    Element ``i`` of the result is ``lists[0] + ... + lists[i]``; an empty
    input gives an empty list.

    Parameters
    ----------
    lists : iterable of numbers
        Values to accumulate.

    Returns
    -------
    list
        Running totals, same length as the input.
    """
    # Imported locally so the helper stays self-contained.
    from itertools import accumulate

    # One linear pass instead of re-summing every prefix from scratch.
    return list(accumulate(lists))

In [None]:
def next_event(df):
    """Add ``'next:concept:name'``: the activity label of the following row.

    Rows that close a case (the row right before every row whose ``position``
    is 1, plus the very last row) receive the label ``'Nothing'``. The frame
    is modified in place and returned; row labels are expected to be
    0..len-1 because the closing rows are addressed by label.
    """
    target = 'next:concept:name'

    # Label of the row below, regardless of case boundaries.
    df[target] = df['concept:name'].shift(-1)

    # Rows that finish a case: the last row overall, then the predecessor of
    # every case start (the start at label 0 has no predecessor).
    case_starts = df.loc[df['position'] == 1].index
    closing_rows = [df.shape[0] - 1]
    closing_rows.extend(label - 1 for label in case_starts if label != 0)

    for row in closing_rows:
        df.at[row, target] = 'Nothing'

    return df

### Time difference feature

In [None]:
def time_difference(df):
    """Add per-case timing columns to an event log.

    Three columns are written to ``df`` (in place):

    * ``time_diff`` -- seconds elapsed since the previous event of the same
      case; 0 for the first event of a case.
    * ``position`` -- 1-based index of the event inside its case.
    * ``future_time_diff`` -- ``time_diff`` of the next event of the same
      case; 0 for the last event of a case.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``'case:concept:name'`` and a datetime ``'time:timestamp'``
        column. Events of one case must be in chronological order.

    Returns
    -------
    pandas.DataFrame
        The same frame with the three columns added; no rows are added or
        removed.
    """
    case_ids = df['case:concept:name']
    timestamps = df['time:timestamp']
    # sort=False keeps groups in order of appearance; the results below are
    # aligned on the row index, so the key order itself is irrelevant.
    stamps_by_case = timestamps.groupby(case_ids, sort=False)

    # 1-based step number of each event inside its case.
    position = stamps_by_case.cumcount() + 1

    # Gap to the previous event of the *same* case. Working per case avoids
    # writing to computed row labels, which previously appended a phantom row
    # (and upcast column dtypes) that then had to be sliced off again.
    gap_seconds = (timestamps - stamps_by_case.shift(1)).dt.total_seconds()
    df['time_diff'] = gap_seconds.mask(position == 1, 0.0)

    df['position'] = position

    # Gap that follows each event; the closing event of a case has none.
    following_gap = df['time_diff'].groupby(case_ids, sort=False).shift(-1)
    closes_case = ~case_ids.duplicated(keep='last')
    df['future_time_diff'] = following_gap.mask(closes_case, 0.0)

    return df

### Weekday feature

In [None]:
def add_weekday(df):
    """Add a ``day`` column with the weekday number of ``'time:timestamp'``.

    The value comes from the pandas ``dayofweek`` accessor, i.e. an integer
    where Monday is 0 and Sunday is 6. ``df`` is modified in place and
    returned.
    """
    df['day'] = df['time:timestamp'].dt.dayofweek
    return df

### Working hour feature

In [None]:
def add_working_hour(df):
    """Add an ``hour`` column with the hour of day (0-23) of each event.

    The value is read from ``'time:timestamp'``; ``df`` is modified in place
    and returned.
    """
    df['hour'] = df['time:timestamp'].dt.hour
    return df

### Timestamp parsing

In [None]:
def parse_timestamp(df):
    """Split ``'time:timestamp'`` into calendar and clock components.

    Adds the columns ``day_of_month``, ``month_no``, ``quarter``, ``week``
    (ISO week number), ``hour`` and ``second`` to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime column ``'time:timestamp'``.

    Returns
    -------
    pandas.DataFrame
        The same frame with the six columns added.
    """
    stamps = df['time:timestamp']

    # Vectorised accessors replace the former per-row loop, which was slow
    # and addressed rows by label (so it required a 0..n-1 index).
    df['day_of_month'] = stamps.dt.day
    df['month_no'] = stamps.dt.month
    df['quarter'] = stamps.dt.quarter

    # isocalendar() returns a nullable integer column; convert it to a plain
    # numpy dtype so it matches the other components (float only when some
    # timestamps are missing).
    iso_week = stamps.dt.isocalendar().week
    if iso_week.isna().any():
        df['week'] = iso_week.to_numpy(dtype='float64', na_value=np.nan)
    else:
        df['week'] = iso_week.to_numpy(dtype='int64')

    df['hour'] = stamps.dt.hour
    df['second'] = stamps.dt.second
    return df

### Time difference normalization

In [None]:
def normalize_delta_t(df):
    """Min-max scale the two activity-delta columns into new columns.

    ``next_activity_delta_t`` is scaled into ``norm_next_activity_delta`` and
    ``past_activity_delta_t`` into ``norm_past_activity_delta``. The scaler is
    re-fitted for each column on the frame that is passed in, so the scaling
    is relative to that frame only. ``df`` is modified in place and returned.
    """
    scaler = MinMaxScaler()
    column_pairs = (
        ("next_activity_delta_t", 'norm_next_activity_delta'),
        ("past_activity_delta_t", 'norm_past_activity_delta'),
    )

    for raw_col, scaled_col in column_pairs:
        # The scaler expects a 2-D input: one column, one row per event.
        column_vector = np.array(df[raw_col]).reshape(-1, 1)
        df[scaled_col] = scaler.fit_transform(column_vector)

    return df

# Applying functions on the dataset

In [None]:
# Feature builders in the order they must run: time_difference creates the
# 'position' column that next_event reads, and next_past_activity creates the
# delta columns that normalize_delta_t scales.
feature_steps = (
    time_difference,
    parse_timestamp,
    next_past_activity,
    normalize_delta_t,
    next_event,
    add_weekday,
    add_working_hour,
)

# Run every step on all three splits before moving on to the next step.
for feature_step in feature_steps:
    df_train = feature_step(df_train)
    df_val = feature_step(df_val)
    df_test = feature_step(df_test)

# Feature Selection

In [None]:
# Display the columns of the training frame after feature engineering.
df_train.columns

In [None]:
# Build the feature matrix with the *current* timestamp and activity label as
# targets.
# NOTE(review): the following cell reassigns every name defined here
# (X_train_processed*, y_train_1, y_train_2, enc, transformed) using the
# future targets, so the results of this cell are overwritten when the
# notebook is run top to bottom.
X_train_processed_num = df_train[['case:RequestedAmount']]
X_train_processed_cat = df_train[['action_is_Created',
       'action_is_Deleted', 'action_is_Obtained', 'action_is_Released',
       'action_is_statechange', 'lifecycle:transition_is_ate_abort',
       'lifecycle:transition_is_complete', 'lifecycle:transition_is_resume',
       'lifecycle:transition_is_schedule', 'lifecycle:transition_is_start',
       'lifecycle:transition_is_suspend', 'lifecycle:transition_is_withdraw', 'concept:name', 'EventOrigin_is_Application',
       'EventOrigin_is_Offer', 'EventOrigin_is_Workflow', 'case:LoanGoal', 'case:ApplicationType']]
# Targets: timestamp of the event itself and its own activity label.
y_train_1 = df_train[['time:timestamp']]
y_train_2 = df_train[['concept:name']]

# One-hot encoding on categorical data
# (categories unseen at fit time are ignored on transform; dense output)
enc = OneHotEncoder(handle_unknown = 'ignore', sparse=False)
transformed = enc.fit_transform(X_train_processed_cat)
# Wrap the encoded array in a frame labelled with the encoder's feature names.
X_train_processed_cat = pd.DataFrame(transformed, columns = enc.get_feature_names())
# Column-wise join of the encoded categorical block and the numeric column.
X_train_processed = pd.concat([X_train_processed_cat, X_train_processed_num], axis = 1)

In [None]:
# Candidate predictors: one numeric column plus the categorical / indicator
# columns that get one-hot encoded below.
numeric_feature_cols = ['case:RequestedAmount']
categorical_feature_cols = [
    'action_is_Created',
    'action_is_Deleted',
    'action_is_Obtained',
    'action_is_Released',
    'action_is_statechange',
    'lifecycle:transition_is_ate_abort',
    'lifecycle:transition_is_complete',
    'lifecycle:transition_is_resume',
    'lifecycle:transition_is_schedule',
    'lifecycle:transition_is_start',
    'lifecycle:transition_is_suspend',
    'lifecycle:transition_is_withdraw',
    'concept:name',
    'EventOrigin_is_Application',
    'EventOrigin_is_Offer',
    'EventOrigin_is_Workflow',
    'case:LoanGoal',
    'case:ApplicationType',
]

X_train_processed_num = df_train[numeric_feature_cols]
X_train_processed_cat = df_train[categorical_feature_cols]

# Targets: time until the next event and the label of the next event.
y_train_1 = df_train[['future_time_diff']]
y_train_2 = df_train[['next:concept:name']]

# Dense one-hot encoding of the categorical block.
enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
transformed = enc.fit_transform(X_train_processed_cat)
X_train_processed_cat = pd.DataFrame(transformed, columns=enc.get_feature_names())

# Final design matrix: encoded categoricals followed by the numeric column.
X_train_processed = pd.concat(
    [X_train_processed_cat, X_train_processed_num],
    axis=1,
)

In [None]:
# Find the score for each variable for time prediction
# Univariate regression score of every feature against the time target.
skb_time = SelectKBest(score_func=f_regression)
skb_time.fit_transform(X_train_processed, y_train_1)

# Map feature name -> score rounded to one decimal.
rounded_time_scores = skb_time.scores_.round(decimals=1).tolist()
score_dct_time = dict(zip(X_train_processed.columns.tolist(), rounded_time_scores))

# Ranking table, best-scoring feature first.
df_time_score = (
    pd.DataFrame(list(score_dct_time.items()), columns=['variable', 'score'])
    .sort_values(by=['score'], ascending=False)
    .reset_index(drop=True)
)
df_time_score

In [None]:
# Find the score for each variable for event prediction
# Univariate classification score of every feature against the event target.
skb_event = SelectKBest(score_func=f_classif)
skb_event.fit_transform(X_train_processed, y_train_2)

# Map feature name -> score rounded to one decimal.
rounded_event_scores = skb_event.scores_.round(decimals=1).tolist()
score_dct_event = dict(zip(X_train_processed.columns.tolist(), rounded_event_scores))

# Ranking table, best-scoring feature first.
df_event_score = (
    pd.DataFrame(list(score_dct_event.items()), columns=['variable', 'score'])
    .sort_values(by=['score'], ascending=False)
    .reset_index(drop=True)
)
df_event_score

## Locating outliers

In [None]:
# Remove outlier on both training and validation data
# Outliers are searched in training and validation data together: stack both
# frames, order the events by case and time, and renumber the rows.
df_all = (
    pd.concat([df_train, df_val])
    .sort_values(by=['case:concept:name', 'time:timestamp'])
    .reset_index(drop=True)
)

def find_outlier(process_name, df):
    """List the case ids whose ``process_name`` events have an extreme ``time_diff``.

    Only rows of ``df`` whose ``'concept:name'`` equals ``process_name`` are
    considered. A row is extreme when its ``time_diff`` lies strictly outside
    mean +/- 3 standard deviations of those rows. The result holds one
    ``'case:concept:name'`` entry per extreme row (duplicates possible).
    """
    same_activity = df.loc[df['concept:name'] == process_name]
    durations = same_activity['time_diff']

    centre = durations.mean()
    spread = durations.std()

    too_fast = durations.lt(centre - 3 * spread)
    too_slow = durations.gt(centre + 3 * spread)

    return same_activity.loc[too_fast | too_slow, 'case:concept:name'].tolist()

In [None]:
# Collect the ids of all cases that contain at least one outlier event.
# Events are compared only with events of the same activity at the same
# position within a case. Position 1 is skipped, as before.
outlier_cases = set()
later_steps = df_all[df_all['position'] >= 2]

# One pass over the positions that actually occur. The previous loop ran once
# per *row* of df_all and rescanned the whole frame every time.
for _, df_pos in tqdm(later_steps.groupby('position')):
    # a refers to the concept name per position number
    for a in df_pos['concept:name'].unique().tolist():
        outlier_cases.update(find_outlier(a, df_pos))

# Unique case ids, in no particular order.
outlier_lst = list(outlier_cases)

In [None]:
# Remove all outliers
# Seed for the train/validation split so the exported files are reproducible.
SPLIT_SEED = 42

# Drop every case that contains an outlier event.
df_filtered = df_all[~df_all['case:concept:name'].isin(outlier_lst)]
final_all_train = sorted(df_filtered['case:concept:name'].unique().tolist())

# Split on case ids (not rows) so all events of a case stay in one split.
# Without a fixed random_state each run produced a different split.
final_train, final_val = train_test_split(
    final_all_train,
    test_size=0.2,
    random_state=SPLIT_SEED,
)
df_train = df_filtered[df_filtered['case:concept:name'].isin(final_train)]
df_val = df_filtered[df_filtered['case:concept:name'].isin(final_val)]

# Re-sort both splits by case and timestamp, then renumber the rows.
df_train = df_train.sort_values(by=['case:concept:name', 'time:timestamp']).reset_index(drop=True)
df_val = df_val.sort_values(by=['case:concept:name', 'time:timestamp']).reset_index(drop=True)

In [None]:
# Display the validation split after outlier removal and re-splitting.
df_val

In [None]:
# Display the test split (it is not part of the outlier filtering above).
df_test

## PCA Analysis

### Select data for PCA

In [None]:
# Select a subset of features you want
features = ['case:RequestedAmount', 'EventOrigin_is_Application', 'EventOrigin_is_Offer',
            'EventOrigin_is_Workflow', 'action_is_Created', 'action_is_Deleted',
            'action_is_Obtained', 'action_is_Released', 'action_is_statechange',
            'lifecycle:transition_is_ate_abort', 'lifecycle:transition_is_complete',
            'lifecycle:transition_is_resume', 'lifecycle:transition_is_schedule',
            'lifecycle:transition_is_start', 'lifecycle:transition_is_suspend',
            'lifecycle:transition_is_withdraw', 'position', 'day_of_month', 'month_no',
            'quarter', 'week', 'hour', 'second', 'norm_next_activity_delta', 'norm_past_activity_delta']

# PCA rejects missing values, and the delta features have no value on rows
# without a neighbouring event (e.g. the last row has no "next" event).
# Drop incomplete rows and renumber, so the principal components can later be
# joined back to the labels by row position.
pca_df = df_test.dropna(subset=features).reset_index(drop=True)

# Feature matrix and label column as plain arrays.
x = pca_df.loc[:, features].values

y = pca_df.loc[:, ['concept:name']].values

# Standardize the features (zero mean, unit variance per column)
x = StandardScaler().fit_transform(x)

### Perform analysis

In [None]:
# Project the standardized features onto their first three principal axes.
pca = PCA(n_components=3)
principal_components = pca.fit_transform(x)

component_labels = [
    'principal component 1',
    'principal component 2',
    'principal component 3',
]
principal_df = pd.DataFrame(data=principal_components, columns=component_labels)

# Put the activity label next to the principal components (joined on row index).
pca_res = pd.concat([principal_df, pca_df[['concept:name']]], axis=1)

### Visualize results

In [None]:
# Display the principal components together with the activity label.
pca_res

In [None]:
# 3-D scatter plot of the first three principal components, one colour per
# activity label.
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(projection='3d')

ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3')
# Three components are plotted, so the title no longer says "Dual".
ax.set_title('Three component PCA Visualization')

# Select only the ones you're interested in
targets = ['A_Create Application', 'A_Submitted', 'W_Handle leads',
           'W_Complete application', 'A_Concept', 'A_Accepted',
           'O_Create Offer', 'O_Created', 'O_Sent (mail and online)',
           'W_Call after offers', 'A_Complete', 'A_Cancelled', 'O_Cancelled',
           'W_Validate application', 'A_Validating', 'O_Returned',
           'W_Call incomplete files', 'A_Incomplete', 'O_Accepted',
           'A_Pending', 'A_Denied', 'O_Refused', 'O_Sent (online only)',
           'W_Assess potential fraud']

# One distinct named colour per target. The count was previously taken from
# the feature list, and drawing with replacement could repeat a colour.
colors_keys = list(matplotlib.colors.cnames)
colors = random.sample(colors_keys, k=len(targets))

for target, color in zip(targets, colors):
    indicesToKeep = pca_res['concept:name'] == target
    ax.scatter(pca_res.loc[indicesToKeep, 'principal component 1'],
               pca_res.loc[indicesToKeep, 'principal component 2'],
               pca_res.loc[indicesToKeep, 'principal component 3'],
               c=color,
               s=50,
               label=target)

# The legend entries come from the per-scatter labels set above.
ax.legend()
# Change the view
# ax.view_init(10, 50)
ax.grid()

In [None]:
# Share of the total variance captured by each of the three components.
pca.explained_variance_ratio_

## Export

In [None]:
# Write the processed splits to disk without the row index.
# NOTE(review): the inputs are named 'bpi2017_*' while the outputs use
# 'bci2017_*' -- confirm with downstream consumers whether this is intended.
export_targets = (
    (df_train, 'bci2017_train_filtered.csv'),
    (df_test, "bci2017_test_filtered.csv"),
    (df_val, "bci2017_val_filtered.csv"),
)
for frame, csv_path in export_targets:
    frame.to_csv(csv_path, index=False)