In [None]:
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import datetime as datetime
from sklearn.model_selection import train_test_split

In [None]:
# Column conventions used by this event log:
#   case:concept:name -> the case ID (default name)
#   concept:name      -> the event
#   time:timestamp    -> the timestamp of that event
df = pd.read_csv(
    "BPI_Challenge_2012.csv",
    parse_dates=['time:timestamp'],
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262200 entries, 0 to 262199
Data columns (total 8 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Unnamed: 0            262200 non-null  int64  
 1   org:resource          244190 non-null  float64
 2   lifecycle:transition  262200 non-null  object 
 3   concept:name          262200 non-null  object 
 4   time:timestamp        262200 non-null  object 
 5   case:REG_DATE         262200 non-null  object 
 6   case:concept:name     262200 non-null  int64  
 7   case:AMOUNT_REQ       262200 non-null  int64  
dtypes: float64(1), int64(3), object(4)
memory usage: 16.0+ MB


# 1. Splitting Data

In [None]:
# Obtain the calendar date (datetime.date) of every event, expressed in UTC.
#
# The timestamps carry different UTC offsets (+02:00 and +01:00 both occur), so
# each value is converted to UTC explicitly before the date is taken. This
# yields the same dates as casting through NumPy's datetime64[D], without
# relying on NumPy's deprecated handling of timezone-aware values.
def _utc_date(value):
    """Return the UTC calendar date of a single timestamp.

    Parameters
    ----------
    value : str or datetime-like
        One entry of the ``time:timestamp`` column.

    Returns
    -------
    datetime.date
        The date of ``value`` after conversion to UTC; a timezone-naive
        value is used as-is.
    """
    ts = pd.Timestamp(value)
    if ts.tzinfo is not None:
        ts = ts.tz_convert('UTC')
    return ts.date()


df['Date'] = df['time:timestamp'].map(_utc_date)
df

  df['Date'] = np.array(df['time:timestamp'].values, dtype = 'datetime64[D]').astype(datetime.datetime)


Unnamed: 0.1,Unnamed: 0,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ,Date
0,0,112.0,COMPLETE,A_SUBMITTED,2011-10-01 00:38:44.546000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000,2011-09-30
1,1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000,2011-09-30
2,2,112.0,COMPLETE,A_PREACCEPTED,2011-10-01 00:39:37.906000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000,2011-09-30
3,3,112.0,SCHEDULE,W_Completeren aanvraag,2011-10-01 00:39:38.875000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000,2011-09-30
4,4,,START,W_Completeren aanvraag,2011-10-01 11:36:46.437000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000,2011-10-01
...,...,...,...,...,...,...,...,...,...
262195,262195,112.0,COMPLETE,A_PARTLYSUBMITTED,2012-02-29 23:51:17.423000+01:00,2012-02-29 23:51:16.799000+01:00,214376,15000,2012-02-29
262196,262196,112.0,SCHEDULE,W_Afhandelen leads,2012-02-29 23:52:01.287000+01:00,2012-02-29 23:51:16.799000+01:00,214376,15000,2012-02-29
262197,262197,11169.0,START,W_Afhandelen leads,2012-03-01 09:26:46.736000+01:00,2012-02-29 23:51:16.799000+01:00,214376,15000,2012-03-01
262198,262198,11169.0,COMPLETE,A_DECLINED,2012-03-01 09:27:37.118000+01:00,2012-02-29 23:51:16.799000+01:00,214376,15000,2012-03-01


In [None]:
# Find the date boundary between the training period and the testing period:
# the distinct dates are ordered chronologically and the boundary is the last
# date inside the leading 80% of them.
date_unique = sorted(df['Date'].unique())
total_date = len(date_unique)
all_train_nr = round(0.8 * total_date)

# Index of the final date of the leading 80% (inclusive boundary)
date_before_test = date_unique[all_train_nr - 1]
date_before_test

datetime.date(2012, 2, 10)

In [None]:
# Remove case IDs that straddle the date boundary.
# One row per (date, case) pair is enough to know on which days a case is active.
small_df = df[['Date', 'case:concept:name']].drop_duplicates()
small_df_before = small_df[small_df['Date'] <= date_before_test]
small_df_after = small_df[small_df['Date'] > date_before_test]

cases_before = set(small_df_before['case:concept:name'].unique())
cases_after = set(small_df_after['case:concept:name'].unique())

# Cases with events on both sides of the boundary
intersection = cases_before & cases_after

# Only cases lying entirely on or before the boundary are kept, in ascending ID order
case_unique = sorted(cases_before - intersection)

In [None]:
# Determine training and testing data's ID boundaries after determining suitable IDs.
# case_unique is in ascending order, so the first 80% of the IDs form the
# train+validation pool and the remaining 20% form the test set.
total_case = len(case_unique)
all_train_case = round(total_case * 0.8)
case_all_train = case_unique[:all_train_case]
case_test = case_unique[all_train_case:]

# Split training and validation dataset.
# A fixed seed keeps the shuffle identical on every run, so the resulting
# splits are reproducible after a kernel restart.
SPLIT_SEED = 42
case_train, case_val = train_test_split(
    case_all_train, test_size=0.2, random_state=SPLIT_SEED
)

# Every case ID must belong to exactly one split.
assert set(case_train).isdisjoint(case_val)
assert set(case_all_train).isdisjoint(case_test)


def _rows_for_cases(frame, case_ids):
    """Select the events of the given cases and strip the helper columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Event log containing 'case:concept:name', 'Unnamed: 0' and 'Date'.
    case_ids : list
        Case IDs whose rows should be kept.

    Returns
    -------
    pandas.DataFrame
        Matching rows without 'Unnamed: 0' and 'Date', re-indexed from 0.
    """
    selected = frame[frame['case:concept:name'].isin(case_ids)]
    return selected.drop(columns=['Unnamed: 0', 'Date']).reset_index(drop=True)


# Split the dataset
df_train = _rows_for_cases(df, case_train)
df_val = _rows_for_cases(df, case_val)
df_test = _rows_for_cases(df, case_test)

In [None]:
# Display the training split
df_train

Unnamed: 0,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ
0,112.0,COMPLETE,A_SUBMITTED,2011-10-01 00:38:44.546000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000
1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000
2,112.0,COMPLETE,A_PREACCEPTED,2011-10-01 00:39:37.906000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000
3,112.0,SCHEDULE,W_Completeren aanvraag,2011-10-01 00:39:38.875000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000
4,,START,W_Completeren aanvraag,2011-10-01 11:36:46.437000+02:00,2011-10-01 00:38:44.546000+02:00,173688,20000
...,...,...,...,...,...,...,...
130715,10138.0,COMPLETE,W_Valideren aanvraag,2012-02-10 13:02:04.765000+01:00,2012-01-09 17:40:12.748000+01:00,199345,8500
130716,10138.0,START,W_Valideren aanvraag,2012-02-10 13:32:48.693000+01:00,2012-01-09 17:40:12.748000+01:00,199345,8500
130717,10138.0,COMPLETE,A_DECLINED,2012-02-10 13:41:38.050000+01:00,2012-01-09 17:40:12.748000+01:00,199345,8500
130718,10138.0,COMPLETE,O_DECLINED,2012-02-10 13:41:38.050000+01:00,2012-01-09 17:40:12.748000+01:00,199345,8500


In [None]:
# Display the validation split
df_val

Unnamed: 0,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ
0,112.0,COMPLETE,A_SUBMITTED,2011-10-01 08:08:58.256000+02:00,2011-10-01 08:08:58.256000+02:00,173691,5000
1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 08:09:02.195000+02:00,2011-10-01 08:08:58.256000+02:00,173691,5000
2,112.0,COMPLETE,A_PREACCEPTED,2011-10-01 08:09:56.648000+02:00,2011-10-01 08:08:58.256000+02:00,173691,5000
3,112.0,SCHEDULE,W_Completeren aanvraag,2011-10-01 08:09:59.578000+02:00,2011-10-01 08:08:58.256000+02:00,173691,5000
4,,START,W_Completeren aanvraag,2011-10-01 11:37:32.393000+02:00,2011-10-01 08:08:58.256000+02:00,173691,5000
...,...,...,...,...,...,...,...
32216,112.0,COMPLETE,A_PARTLYSUBMITTED,2012-01-09 16:52:14.739000+01:00,2012-01-09 16:52:14.348000+01:00,199321,4000
32217,112.0,SCHEDULE,W_Afhandelen leads,2012-01-09 16:52:24.226000+01:00,2012-01-09 16:52:14.348000+01:00,199321,4000
32218,11003.0,START,W_Afhandelen leads,2012-01-09 17:01:06.277000+01:00,2012-01-09 16:52:14.348000+01:00,199321,4000
32219,11003.0,COMPLETE,A_DECLINED,2012-01-09 17:13:30.037000+01:00,2012-01-09 16:52:14.348000+01:00,199321,4000


In [None]:
# Display the test split
df_test

Unnamed: 0,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ
0,112.0,COMPLETE,A_SUBMITTED,2012-01-09 17:44:18.199000+01:00,2012-01-09 17:44:18.199000+01:00,199348,6000
1,112.0,COMPLETE,A_PARTLYSUBMITTED,2012-01-09 17:44:18.401000+01:00,2012-01-09 17:44:18.199000+01:00,199348,6000
2,112.0,COMPLETE,A_PREACCEPTED,2012-01-09 17:44:55.414000+01:00,2012-01-09 17:44:18.199000+01:00,199348,6000
3,112.0,SCHEDULE,W_Completeren aanvraag,2012-01-09 17:44:55.822000+01:00,2012-01-09 17:44:18.199000+01:00,199348,6000
4,10929.0,START,W_Completeren aanvraag,2012-01-09 17:45:33+01:00,2012-01-09 17:44:18.199000+01:00,199348,6000
...,...,...,...,...,...,...,...
27257,112.0,COMPLETE,A_PARTLYSUBMITTED,2012-02-10 22:15:22.972000+01:00,2012-02-10 22:15:19.240000+01:00,208295,48000
27258,112.0,COMPLETE,A_DECLINED,2012-02-10 22:16:18.231000+01:00,2012-02-10 22:15:19.240000+01:00,208295,48000
27259,112.0,COMPLETE,A_SUBMITTED,2012-02-10 22:57:49.541000+01:00,2012-02-10 22:57:49.541000+01:00,208301,2500
27260,112.0,COMPLETE,A_PARTLYSUBMITTED,2012-02-10 22:57:49.866000+01:00,2012-02-10 22:57:49.541000+01:00,208301,2500


In [None]:
# Write each split to its own CSV file. No extra arguments are passed, so
# pandas' defaults apply (the row index is written as the first column).
for split_df, csv_path in (
    (df_train, 'bpi2012_train.csv'),
    (df_val, 'bpi2012_val.csv'),
    (df_test, 'bpi2012_test.csv'),
):
    split_df.to_csv(csv_path)