# Feature engineering

## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [2]:
df = pd.read_csv('bpi_2017.csv')
df['time:timestamp'] = pd.to_datetime(df['time:timestamp'])
df = df.rename(columns = {"concept:name": 'event', "case:concept:name": 'case', "org:resource": 'role'})
df = df.drop(columns=['Unnamed: 0'])

## One-hot encoding

In [3]:
encoded_cols = ['EventOrigin', 'Action', 'lifecycle:transition', 'case:ApplicationType']
df = pd.get_dummies(df, columns=['event', 'EventOrigin', 'Action', 'case:ApplicationType', 'lifecycle:transition'], prefix=['event_is', "EventOrigin_is", "action_is", 'case:applicationType_is', 'lifecycle:transition_is'])

# Creating additional features

### Next and past activity timedelta

In [4]:
temp = df['time:timestamp']
next_activity = []
for i in range(len(temp)-1):
    next_activity.append(temp[i+1])

df['next_activity_delta_t'] = pd.Series(next_activity) - df['time:timestamp']
df['past_activity_delta_t'] = df['time:timestamp'] - pd.Series(next_activity)

### Timestamp parsing

In [5]:
# df['delta_t_since_midnight'] = pd.Timestamp() + df['time:timestamp']
temp = df["time:timestamp"]
day_of_month = []
month_no = []
quarters = []
week = []
hour = []
seconds = []

for i in range(len(temp)):
    day_of_month.append(temp[i].day)
    month_no.append(temp[i].month)
    quarters.append(temp[i].quarter)
    week.append(temp[i].week)
    hour.append(temp[i].hour)
    seconds.append(temp[i].second)

df['day_of_month'] = pd.Series(day_of_month)
df['month_no'] = pd.Series(month_no)
df['quarter'] = pd.Series(quarters)
df['week'] = pd.Series(week)
df['hour'] = pd.Series(hour)
df['second'] = pd.Series(seconds)

### Time difference normalization

In [6]:
min_max_scaler = MinMaxScaler()

df['norm_next_activity_delta'] = min_max_scaler.fit_transform(np.array(df["next_activity_delta_t"]).reshape(-1,1))
df['norm_past_activity_delta'] = min_max_scaler.fit_transform(np.array(df["past_activity_delta_t"]).reshape(-1,1))
df

Unnamed: 0,role,EventID,time:timestamp,case:LoanGoal,case,case:RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,...,next_activity_delta_t,past_activity_delta_t,day_of_month,month_no,quarter,week,hour,second,norm_next_activity_delta,norm_past_activity_delta
0,User_1,Application_652823628,2016-01-01 09:51:15.304000+00:00,Existing loan takeover,Application_652823628,20000.0,,,,,...,0 days 00:00:00.048000,-1 days +23:59:59.952000,1,1,1,53,9,15,0.998726,0.997327
1,User_1,ApplState_1582051990,2016-01-01 09:51:15.352000+00:00,Existing loan takeover,Application_652823628,20000.0,,,,,...,0 days 00:00:00.422000,-1 days +23:59:59.578000,1,1,1,53,9,15,0.998726,0.997327
2,User_1,Workitem_1298499574,2016-01-01 09:51:15.774000+00:00,Existing loan takeover,Application_652823628,20000.0,,,,,...,0 days 00:01:20.618000,-1 days +23:58:39.382000,1,1,1,53,9,15,0.998726,0.997327
3,User_1,Workitem_1673366067,2016-01-01 09:52:36.392000+00:00,Existing loan takeover,Application_652823628,20000.0,,,,,...,0 days 00:00:00.011000,-1 days +23:59:59.989000,1,1,1,53,9,36,0.998726,0.997327
4,User_1,Workitem_1493664571,2016-01-01 09:52:36.403000+00:00,Existing loan takeover,Application_652823628,20000.0,,,,,...,0 days 00:00:00.010000,-1 days +23:59:59.990000,1,1,1,53,9,36,0.998726,0.997327
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1202262,User_1,Workitem_1817549786,2017-01-06 06:33:02.212000+00:00,Home improvement,Application_1350494635,20000.0,,,,,...,0 days 00:00:00.009000,-1 days +23:59:59.991000,6,1,1,1,6,2,0.998726,0.997327
1202263,User_1,Workitem_363876066,2017-01-06 06:33:02.221000+00:00,Home improvement,Application_1350494635,20000.0,,,,,...,10 days 03:18:18.893000,-11 days +20:41:41.107000,6,1,1,1,6,2,0.998821,0.997233
1202264,User_28,ApplState_1869071797,2017-01-16 09:51:21.114000+00:00,Home improvement,Application_1350494635,20000.0,,,,,...,0 days 00:00:00.025000,-1 days +23:59:59.975000,16,1,1,3,9,21,0.998726,0.997327
1202265,User_28,OfferState_420066181,2017-01-16 09:51:21.139000+00:00,Home improvement,Application_1350494635,20000.0,,,,,...,0 days 00:00:00.007000,-1 days +23:59:59.993000,16,1,1,3,9,21,0.998726,0.997327


## Export

In [7]:
df.to_csv('BPIC2017_FULL.csv', index=False)