In [1]:
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import datetime as datetime
from sklearn.model_selection import train_test_split

In [2]:
# Load the event log from bpi_2017.csv, parsing the event timestamps as
# datetimes, then print a per-column summary (dtype and non-null count).
#
# Column naming conventions used throughout this notebook:
#   - case:concept:name : the default name indicating the case ID
#   - concept:name      : the event
#   - time:timestamp    : the corresponding timestamp
df = pd.read_csv(
    'bpi_2017.csv',
    parse_dates=['time:timestamp'],
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1202267 entries, 0 to 1202266
Data columns (total 20 columns):
 #   Column                 Non-Null Count    Dtype              
---  ------                 --------------    -----              
 0   Unnamed: 0             1202267 non-null  int64              
 1   Action                 1202267 non-null  object             
 2   org:resource           1202267 non-null  object             
 3   concept:name           1202267 non-null  object             
 4   EventOrigin            1202267 non-null  object             
 5   EventID                1202267 non-null  object             
 6   lifecycle:transition   1202267 non-null  object             
 7   time:timestamp         1202267 non-null  datetime64[ns, UTC]
 8   case:LoanGoal          1202267 non-null  object             
 9   case:ApplicationType   1202267 non-null  object             
 10  case:concept:name      1202267 non-null  object             
 11  case:RequestedAmount   1

# 1. Splitting Data

In [3]:
# Derive a calendar-date column from time:timestamp.
# Step 1: take the raw numpy datetime64 values of the timestamp column.
# Step 2: truncate them to day resolution (datetime64[D]).
# Step 3: convert to Python date objects and store them in df['Date'].
event_stamps = df['time:timestamp'].values
event_days = np.array(event_stamps, dtype='datetime64[D]')
df['Date'] = event_days.astype(datetime.datetime)
df

Unnamed: 0.1,Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,...,case:RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,Date
0,0,Created,User_1,A_Create Application,Application,Application_652823628,complete,2016-01-01 09:51:15.304000+00:00,Existing loan takeover,New credit,...,20000.0,,,,,,,,,2016-01-01
1,1,statechange,User_1,A_Submitted,Application,ApplState_1582051990,complete,2016-01-01 09:51:15.352000+00:00,Existing loan takeover,New credit,...,20000.0,,,,,,,,,2016-01-01
2,2,Created,User_1,W_Handle leads,Workflow,Workitem_1298499574,schedule,2016-01-01 09:51:15.774000+00:00,Existing loan takeover,New credit,...,20000.0,,,,,,,,,2016-01-01
3,3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1673366067,withdraw,2016-01-01 09:52:36.392000+00:00,Existing loan takeover,New credit,...,20000.0,,,,,,,,,2016-01-01
4,4,Created,User_1,W_Complete application,Workflow,Workitem_1493664571,schedule,2016-01-01 09:52:36.403000+00:00,Existing loan takeover,New credit,...,20000.0,,,,,,,,,2016-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1202262,1202262,Deleted,User_1,W_Call after offers,Workflow,Workitem_1817549786,ate_abort,2017-01-06 06:33:02.212000+00:00,Home improvement,New credit,...,20000.0,,,,,,,,,2017-01-06
1202263,1202263,Created,User_1,W_Call after offers,Workflow,Workitem_363876066,schedule,2017-01-06 06:33:02.221000+00:00,Home improvement,New credit,...,20000.0,,,,,,,,,2017-01-06
1202264,1202264,statechange,User_28,A_Cancelled,Application,ApplState_1869071797,complete,2017-01-16 09:51:21.114000+00:00,Home improvement,New credit,...,20000.0,,,,,,,,,2017-01-16
1202265,1202265,statechange,User_28,O_Cancelled,Offer,OfferState_420066181,complete,2017-01-16 09:51:21.139000+00:00,Home improvement,New credit,...,20000.0,,,,,,,,Offer_1580299144,2017-01-16


In [4]:
# Determine training and testing data's date boundaries.
# The distinct dates are put in ascending order; the first 80% of them form
# the training side, and date_before_test is the last date on that side.
date_unique = list(df['Date'].unique())
date_unique.sort()
total_date = len(date_unique)
all_train_nr = round(0.8 * total_date)
date_before_test = date_unique[all_train_nr - 1]
date_before_test

datetime.date(2016, 11, 13)

In [5]:
# Remove entries with case ID across date boundaries.
# One row per distinct (date, case) pair is enough to decide on which side(s)
# of the boundary a case has events.
case_dates = df[['Date', 'case:concept:name']].drop_duplicates()

# Cases with at least one event on or before the boundary date ...
on_train_side = case_dates[case_dates['Date'] <= date_before_test]
cases_before = set(on_train_side['case:concept:name'].unique())

# ... and cases with at least one event after it.
on_test_side = case_dates[case_dates['Date'] > date_before_test]
cases_after = set(on_test_side['case:concept:name'].unique())

# Cases present on both sides span the boundary and are excluded from both lists.
straddling_cases = cases_before & cases_after
case_unique_train = sorted(cases_before - straddling_cases)
case_unique_test = sorted(cases_after - straddling_cases)

In [6]:
# Fixed seed for the train/validation shuffle, so that Restart & Run All
# reproduces the same split (and therefore the same exported CSV files).
SPLIT_SEED = 42

# Determine training and testing data's ID boundaries:
# the first 80% of the sorted case IDs are training candidates and the
# remaining 20% are test candidates.
all_case = sorted(df['case:concept:name'].unique())
total_case = len(all_case)
all_train_case = round(total_case * 0.8)
case_all_train = all_case[:all_train_case]  # all_case is already sorted
case_test = all_case[all_train_case:]

# Combine ID boundaries and time boundaries: a case is kept only if the
# date-based assignment and the ID-based assignment agree.
final_all_train = sorted(set(case_unique_train) & set(case_all_train))
final_test = sorted(set(case_unique_test) & set(case_test))

# Split training and validation dataset (by case ID, 80/20).
# random_state makes the shuffle deterministic across kernel restarts.
final_train, final_val = train_test_split(
    final_all_train, test_size=0.2, random_state=SPLIT_SEED
)


def _rows_for_cases(case_ids):
    """Return the events of df whose case ID is in case_ids.

    The helper columns 'Unnamed: 0' and 'Date' are dropped and the row index
    is renumbered from zero.
    """
    selected = df[df['case:concept:name'].isin(case_ids)]
    return selected.drop(columns=['Unnamed: 0', 'Date']).reset_index(drop=True)


# Split the dataset
df_train = _rows_for_cases(final_train)
df_val = _rows_for_cases(final_val)
df_test = _rows_for_cases(final_test)

In [7]:
# Display the training split (events of the cases in final_train).
df_train

Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,case:RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID
0,Created,User_1,A_Create Application,Application,Application_1691306052,complete,2016-01-01 10:16:11.500000+00:00,Home improvement,New credit,Application_1691306052,10000.0,,,,,,,,
1,statechange,User_1,A_Submitted,Application,ApplState_284636842,complete,2016-01-01 10:16:11.549000+00:00,Home improvement,New credit,Application_1691306052,10000.0,,,,,,,,
2,Created,User_1,W_Handle leads,Workflow,Workitem_831373279,schedule,2016-01-01 10:16:11.740000+00:00,Home improvement,New credit,Application_1691306052,10000.0,,,,,,,,
3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1299098074,withdraw,2016-01-01 10:17:31.573000+00:00,Home improvement,New credit,Application_1691306052,10000.0,,,,,,,,
4,Created,User_1,W_Complete application,Workflow,Workitem_1703931302,schedule,2016-01-01 10:17:31.584000+00:00,Home improvement,New credit,Application_1691306052,10000.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620621,Created,User_44,O_Create Offer,Offer,Offer_1879818928,complete,2016-11-11 15:07:05.804000+00:00,Existing loan takeover,New credit,Application_1354066062,25000.0,25000.0,120.0,True,254.56,False,0.0,25000.0,
620622,statechange,User_44,O_Created,Offer,OfferState_975946794,complete,2016-11-11 15:07:06.388000+00:00,Existing loan takeover,New credit,Application_1354066062,25000.0,,,,,,,,Offer_1879818928
620623,statechange,User_44,A_Cancelled,Application,ApplState_1775629292,complete,2016-11-11 15:09:00.488000+00:00,Existing loan takeover,New credit,Application_1354066062,25000.0,,,,,,,,
620624,statechange,User_44,O_Cancelled,Offer,OfferState_2048261328,complete,2016-11-11 15:09:00.507000+00:00,Existing loan takeover,New credit,Application_1354066062,25000.0,,,,,,,,Offer_1879818928


In [8]:
# Display the validation split (events of the cases in final_val).
df_val

Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,case:RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID
0,Created,User_1,A_Create Application,Application,Application_1878239836,complete,2016-01-01 13:35:26.422000+00:00,Home improvement,New credit,Application_1878239836,15000.0,,,,,,,,
1,statechange,User_1,A_Submitted,Application,ApplState_649299817,complete,2016-01-01 13:35:26.463000+00:00,Home improvement,New credit,Application_1878239836,15000.0,,,,,,,,
2,Created,User_1,W_Handle leads,Workflow,Workitem_1693495632,schedule,2016-01-01 13:35:26.664000+00:00,Home improvement,New credit,Application_1878239836,15000.0,,,,,,,,
3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1080257214,withdraw,2016-01-01 13:36:30.982000+00:00,Home improvement,New credit,Application_1878239836,15000.0,,,,,,,,
4,Created,User_1,W_Complete application,Workflow,Workitem_1531288681,schedule,2016-01-01 13:36:30.993000+00:00,Home improvement,New credit,Application_1878239836,15000.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155501,Obtained,User_43,W_Call after offers,Workflow,Workitem_162257588,start,2016-11-12 09:31:06.232000+00:00,Not speficied,New credit,Application_1583440896,50000.0,,,,,,,,
155502,statechange,User_43,A_Complete,Application,ApplState_1435859997,complete,2016-11-12 09:31:06.234000+00:00,Not speficied,New credit,Application_1583440896,50000.0,,,,,,,,
155503,statechange,User_43,A_Cancelled,Application,ApplState_860885554,complete,2016-11-12 09:35:55.142000+00:00,Not speficied,New credit,Application_1583440896,50000.0,,,,,,,,
155504,statechange,User_43,O_Cancelled,Offer,OfferState_2011904028,complete,2016-11-12 09:35:55.161000+00:00,Not speficied,New credit,Application_1583440896,50000.0,,,,,,,,Offer_1248413119


In [9]:
# Display the test split (events of the cases in final_test).
df_test

Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,case:RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID
0,Created,User_1,A_Create Application,Application,Application_837911105,complete,2016-11-14 05:57:57.461000+00:00,Existing loan takeover,New credit,Application_837911105,12000.0,,,,,,,,
1,statechange,User_1,A_Submitted,Application,ApplState_1692094008,complete,2016-11-14 05:57:59.458000+00:00,Existing loan takeover,New credit,Application_837911105,12000.0,,,,,,,,
2,Created,User_1,W_Handle leads,Workflow,Workitem_1100633268,schedule,2016-11-14 05:57:59.981000+00:00,Existing loan takeover,New credit,Application_837911105,12000.0,,,,,,,,
3,Obtained,User_54,W_Handle leads,Workflow,Workitem_167603855,start,2016-11-14 08:27:08.979000+00:00,Existing loan takeover,New credit,Application_837911105,12000.0,,,,,,,,
4,Deleted,User_54,W_Handle leads,Workflow,Workitem_1887085579,complete,2016-11-14 08:27:59.637000+00:00,Existing loan takeover,New credit,Application_837911105,12000.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28701,Obtained,User_49,W_Call incomplete files,Workflow,Workitem_1264169641,resume,2017-01-06 16:13:55.507000+00:00,Car,New credit,Application_965278193,16000.0,,,,,,,,
28702,Released,User_49,W_Call incomplete files,Workflow,Workitem_2138456013,suspend,2017-01-06 16:16:17.741000+00:00,Car,New credit,Application_965278193,16000.0,,,,,,,,
28703,statechange,User_133,O_Accepted,Offer,OfferState_224690953,complete,2017-01-10 15:35:46.239000+00:00,Car,New credit,Application_965278193,16000.0,,,,,,,,Offer_1880706415
28704,statechange,User_133,A_Pending,Application,ApplState_2038793609,complete,2017-01-10 15:35:46.242000+00:00,Car,New credit,Application_965278193,16000.0,,,,,,,,


In [10]:
# Write each split to its own CSV file in the working directory.
# NOTE(review): to_csv is called with its defaults, so the row index is
# written as a leading unnamed column — confirm downstream readers expect it.
for split_frame, csv_name in (
    (df_train, 'bpi2017_train.csv'),
    (df_val, 'bpi2017_val.csv'),
    (df_test, 'bpi2017_test.csv'),
):
    split_frame.to_csv(csv_name)