In [None]:
import pandas as pd
import numpy as np
import os

In [2]:
from functions.data_preprocessing import *

### Datasets

In [3]:
# BPIC 2017 (real)
# source: https://github.com/ocpm/ocpa/blob/main/sample_logs/csv/BPI2017.zip

# DS2 (simulated)
# source: https://github.com/niklasadams/OCCasesAndVariants/tree/master/example_logs/csv
# info: DS2 is a synthetic event log with an especially high number of connected objects and high variability

### BPIC 2017 Preprocessing

For all applications, the following data is available:

- Requested loan amount (in Euro),
- The application type,
- The reason the loan was applied for (LoanGoal), and
- An application ID.


For all offers, the following data is available:

- An offer ID,
- The offered amount,
- The initial withdrawal amount,
- The number of payback terms agreed to,
- The monthly costs,
- The credit score of the customer,
- The employee who created the offer,
- Whether the offer was selected, and
- Whether the offer was accepted by the customer.

In [4]:
# Load the BPIC 2017 event log; the file is comma-separated, which is the read_csv default.
bpi17 = pd.read_csv('ocel/BPIC17.csv')

In [5]:
# Show the raw BPIC 2017 log with every column visible; the column limit is
# lifted only inside this `with` block, so the global display options stay untouched.
with pd.option_context('display.max_columns', None): 
    display(bpi17)

Unnamed: 0,event_None,event_Unnamed: 0,event_id,application,event_activity,event_start_timestamp,event_timestamp,event_LoanGoal,event_ApplicationType,event_RequestedAmount,event_Action,event_FirstWithdrawalAmount,event_Accepted,event_NumberOfTerms,offer,event_org:resource,event_MonthlyCost,event_EventOrigin,event_EventID,event_Selected,event_CreditScore,event_OfferedAmount,event_CaseID
0,0,0,0,['Application_652823628'],Create application,2016/01/01 10:51:15.304,2016/01/01 10:51:15.304,Existing loan takeover,New credit,20000.0,Created,,,,,User_1,,Application,Application_652823628,,,,Application_652823628
1,1,1,1,['Application_652823628'],Submit,2016/01/01 10:51:15.352,2016/01/01 10:51:15.352,Existing loan takeover,New credit,20000.0,statechange,,,,,User_1,,Application,ApplState_1582051990,,,,Application_652823628
2,2,2,6,['Application_652823628'],Complete,2016/01/02 11:45:22.429,2016/01/02 11:45:22.429,Existing loan takeover,New credit,20000.0,Obtained,,,,,User_17,,Workflow,Workitem_1875340971,,,,Application_652823628
3,3,3,8,['Application_652823628'],Accept,2016/01/02 12:23:04.299,2016/01/02 12:23:04.299,Existing loan takeover,New credit,20000.0,statechange,,,,,User_52,,Application,ApplState_99568828,,,,Application_652823628
4,4,4,10,['Application_652823628'],Create offer,2016/01/02 12:29:05.354,2016/01/02 12:29:05.354,Existing loan takeover,New credit,20000.0,statechange,20000.0,True,44.0,['Offer_148581083'],User_52,498.29,Offer,OfferState_1514834199,True,979.0,20000.0,Application_652823628
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393926,393926,393926,1160394,['Application_1350494635'],Create offer,2017/01/02 20:25:00.040,2017/01/02 20:25:00.040,Home improvement,New credit,20000.0,statechange,20000.0,False,77.0,['Offer_1580299144'],User_96,297.81,Offer,OfferState_30384573,False,0.0,20000.0,Application_1350494635
393927,393927,393927,1160395,,Send (mail and online),2017/01/02 20:27:20.453,2017/01/02 20:27:20.453,Home improvement,New credit,20000.0,statechange,20000.0,False,77.0,['Offer_1580299144'],User_96,297.81,Offer,OfferState_1959881309,False,0.0,20000.0,Application_1350494635
393928,393928,393928,1160397,['Application_1350494635'],Call,2017/01/02 20:27:20.472,2017/01/02 20:27:20.472,Home improvement,New credit,20000.0,Obtained,20000.0,False,77.0,['Offer_1580299144'],User_96,297.81,Workflow,Workitem_358206591,False,0.0,20000.0,Application_1350494635
393929,393929,393929,1160402,['Application_1350494635'],Cancel application,2017/01/16 10:51:21.114,2017/01/16 10:51:21.114,Home improvement,New credit,20000.0,statechange,20000.0,False,77.0,['Offer_1580299144'],User_28,297.81,Application,ApplState_1869071797,False,0.0,20000.0,Application_1350494635


In [6]:
# Discard the columns that are not needed for the rest of the notebook.
bpi17 = bpi17.drop(
    columns=[
        'event_None',
        'event_Unnamed: 0',
        'event_start_timestamp',
        'event_EventID',
        'event_CaseID',
    ]
)

In [7]:
# Run the shared preprocessing helper on the BPIC 2017 log.
# Object-type columns of this log (they hold the related object identifiers).
objects = ['application', 'offer']
# Columns passed as `first_cols`: the event key columns followed by the object columns.
# NOTE(review): presumably the helper moves these to the front of the frame -- confirm in
# functions.data_preprocessing.
first_cols = ['event_id', 'event_timestamp', 'event_activity', *objects]
# No columns are passed as `split` for this log.
split = []

bpi17 = preprocess_dataframe(bpi17, first_cols, split)

In [8]:
# For every column, report whether it contains at least one missing value.
# The row limit is lifted so the per-column result is not truncated by pandas.
with pd.option_context('display.max_rows', None): 
    display(bpi17.isna().any())

event_id                                 False
event_timestamp                          False
event_activity                           False
application                               True
offer                                     True
event_RequestedAmount                    False
event_FirstWithdrawalAmount              False
event_NumberOfTerms                      False
event_MonthlyCost                        False
event_CreditScore                        False
event_OfferedAmount                      False
event_Accepted_True                      False
event_Accepted_nan                       False
event_Selected_True                      False
event_Selected_nan                       False
event_Action_Created                     False
event_Action_Deleted                     False
event_Action_Obtained                    False
event_Action_statechange                 False
event_ApplicationType_Limit raise        False
event_ApplicationType_New credit         False
event_EventOr

In [9]:
# Inspect the preprocessed BPIC 2017 log with all columns visible.
with pd.option_context('display.max_columns', None): 
    display(bpi17)

Unnamed: 0,event_id,event_timestamp,event_activity,application,offer,event_RequestedAmount,event_FirstWithdrawalAmount,event_NumberOfTerms,event_MonthlyCost,event_CreditScore,event_OfferedAmount,event_Accepted_True,event_Accepted_nan,event_Selected_True,event_Selected_nan,event_Action_Created,event_Action_Deleted,event_Action_Obtained,event_Action_statechange,event_ApplicationType_Limit raise,event_ApplicationType_New credit,event_EventOrigin_Application,event_EventOrigin_Offer,event_EventOrigin_Workflow,event_LoanGoal_Boat,event_LoanGoal_Business goal,event_LoanGoal_Car,event_LoanGoal_Caravan / Camper,event_LoanGoal_Debt restructuring,event_LoanGoal_Existing loan takeover,event_LoanGoal_Extra spending limit,event_LoanGoal_Home improvement,event_LoanGoal_Motorcycle,event_LoanGoal_Not speficied,"event_LoanGoal_Other, see explanation",event_LoanGoal_Remaining debt home,event_LoanGoal_Tax payments,event_LoanGoal_Unknown,event_org_resource_User_1,event_org_resource_User_10,event_org_resource_User_100,event_org_resource_User_101,event_org_resource_User_102,event_org_resource_User_103,event_org_resource_User_104,event_org_resource_User_105,event_org_resource_User_106,event_org_resource_User_107,event_org_resource_User_108,event_org_resource_User_109,event_org_resource_User_11,event_org_resource_User_110,event_org_resource_User_111,event_org_resource_User_112,event_org_resource_User_113,event_org_resource_User_114,event_org_resource_User_115,event_org_resource_User_116,event_org_resource_User_117,event_org_resource_User_118,event_org_resource_User_119,event_org_resource_User_12,event_org_resource_User_120,event_org_resource_User_121,event_org_resource_User_122,event_org_resource_User_123,event_org_resource_User_124,event_org_resource_User_125,event_org_resource_User_126,event_org_resource_User_127,event_org_resource_User_128,event_org_resource_User_129,event_org_resource_User_13,event_org_resource_User_130,event_org_resource_User_131,event_org_resource_User_132,event_org_r
esource_User_133,event_org_resource_User_134,event_org_resource_User_135,event_org_resource_User_136,event_org_resource_User_137,event_org_resource_User_138,event_org_resource_User_139,event_org_resource_User_14,event_org_resource_User_140,event_org_resource_User_141,event_org_resource_User_142,event_org_resource_User_143,event_org_resource_User_144,event_org_resource_User_145,event_org_resource_User_15,event_org_resource_User_16,event_org_resource_User_17,event_org_resource_User_18,event_org_resource_User_19,event_org_resource_User_2,event_org_resource_User_20,event_org_resource_User_21,event_org_resource_User_22,event_org_resource_User_23,event_org_resource_User_24,event_org_resource_User_25,event_org_resource_User_26,event_org_resource_User_27,event_org_resource_User_28,event_org_resource_User_29,event_org_resource_User_3,event_org_resource_User_30,event_org_resource_User_31,event_org_resource_User_32,event_org_resource_User_33,event_org_resource_User_34,event_org_resource_User_35,event_org_resource_User_36,event_org_resource_User_37,event_org_resource_User_38,event_org_resource_User_39,event_org_resource_User_4,event_org_resource_User_40,event_org_resource_User_41,event_org_resource_User_42,event_org_resource_User_43,event_org_resource_User_44,event_org_resource_User_45,event_org_resource_User_46,event_org_resource_User_47,event_org_resource_User_48,event_org_resource_User_49,event_org_resource_User_5,event_org_resource_User_50,event_org_resource_User_51,event_org_resource_User_52,event_org_resource_User_53,event_org_resource_User_54,event_org_resource_User_55,event_org_resource_User_56,event_org_resource_User_57,event_org_resource_User_58,event_org_resource_User_59,event_org_resource_User_6,event_org_resource_User_60,event_org_resource_User_61,event_org_resource_User_62,event_org_resource_User_63,event_org_resource_User_64,event_org_resource_User_65,event_org_resource_User_66,event_org_resource_User_67,event_org_resource_User_68,event_org_resource_User_69,event
_org_resource_User_7,event_org_resource_User_70,event_org_resource_User_71,event_org_resource_User_72,event_org_resource_User_73,event_org_resource_User_74,event_org_resource_User_75,event_org_resource_User_76,event_org_resource_User_77,event_org_resource_User_78,event_org_resource_User_79,event_org_resource_User_8,event_org_resource_User_80,event_org_resource_User_81,event_org_resource_User_82,event_org_resource_User_83,event_org_resource_User_84,event_org_resource_User_85,event_org_resource_User_86,event_org_resource_User_87,event_org_resource_User_88,event_org_resource_User_89,event_org_resource_User_9,event_org_resource_User_90,event_org_resource_User_91,event_org_resource_User_92,event_org_resource_User_93,event_org_resource_User_94,event_org_resource_User_95,event_org_resource_User_96,event_org_resource_User_97,event_org_resource_User_98,event_org_resource_User_99
0,0,2016/01/01 10:51:15.304,Create application,['Application_652823628'],,20000.0,0.0,0.0,0.00,0.0,0.0,0,1,0,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,2016/01/01 10:51:15.352,Submit,['Application_652823628'],,20000.0,0.0,0.0,0.00,0.0,0.0,0,1,0,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,6,2016/01/02 11:45:22.429,Complete,['Application_652823628'],,20000.0,0.0,0.0,0.00,0.0,0.0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,8,2016/01/02 12:23:04.299,Accept,['Application_652823628'],,20000.0,0.0,0.0,0.00,0.0,0.0,0,1,0,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,10,2016/01/02 12:29:05.354,Create offer,['Application_652823628'],['Offer_148581083'],20000.0,20000.0,44.0,498.29,979.0,20000.0,1,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393926,1160394,2017/01/02 20:25:00.040,Create offer,['Application_1350494635'],['Offer_1580299144'],20000.0,20000.0,77.0,297.81,0.0,20000.0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
393927,1160395,2017/01/02 20:27:20.453,Send (mail and online),,['Offer_1580299144'],20000.0,20000.0,77.0,297.81,0.0,20000.0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
393928,1160397,2017/01/02 20:27:20.472,Call,['Application_1350494635'],['Offer_1580299144'],20000.0,20000.0,77.0,297.81,0.0,20000.0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
393929,1160402,2017/01/16 10:51:21.114,Cancel application,['Application_1350494635'],['Offer_1580299144'],20000.0,20000.0,77.0,297.81,0.0,20000.0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
# Summary statistics (count, mean, std, quantiles) of the preprocessed BPIC 2017 log.
bpi17.describe()

Unnamed: 0,event_id,event_RequestedAmount,event_FirstWithdrawalAmount,event_NumberOfTerms,event_MonthlyCost,event_CreditScore,event_OfferedAmount,event_Accepted_True,event_Accepted_nan,event_Selected_True,...,event_org_resource_User_90,event_org_resource_User_91,event_org_resource_User_92,event_org_resource_User_93,event_org_resource_User_94,event_org_resource_User_95,event_org_resource_User_96,event_org_resource_User_97,event_org_resource_User_98,event_org_resource_User_99
count,393931.0,393931.0,393931.0,393931.0,393931.0,393931.0,393931.0,393931.0,393931.0,393931.0,...,393931.0,393931.0,393931.0,393931.0,393931.0,393931.0,393931.0,393931.0,393931.0,393931.0
mean,583681.9,16527.636122,5878.404788,58.964428,197.196904,296.31735,13128.649658,0.497521,0.297301,0.460637,...,0.011857,0.002353,0.000632,0.003894,0.001531,0.008517,0.003056,0.001899,0.001447,0.012675
std,334551.4,15488.392964,9727.546681,49.065638,202.608811,425.497728,14337.091854,0.499994,0.457071,0.498449,...,0.108244,0.048453,0.025133,0.062281,0.039095,0.091892,0.0552,0.043534,0.038011,0.111867
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,293445.5,6000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,588656.0,13000.0,74.0,58.0,161.42,0.0,10000.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,872555.0,22000.0,9000.0,120.0,299.55,830.0,20000.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1160403.0,450000.0,75000.0,180.0,6673.83,1145.0,75000.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
# Persist the preprocessed BPIC 2017 log as CSV (the DataFrame index is not written).
directory = 'data-prepro'
filename = os.path.join(directory, 'bpi17_prepro.csv')

# exist_ok=True creates the folder only when it is missing and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(directory, exist_ok=True)

bpi17.to_csv(filename, index=False)

### DS2 Preprocessing

In [12]:
# Load the synthetic DS2 event log; the file is comma-separated, which is the read_csv default.
ds2 = pd.read_csv('ocel/DS2.csv')

In [13]:
# Show the raw DS2 log with every column visible; the column limit is
# lifted only inside this `with` block.
with pd.option_context('display.max_columns', None): 
    display(ds2)

Unnamed: 0,event_id,ocel:timestamp,ocel:activity,weight,price,ocel:type:items,ocel:type:products,ocel:type:customers,ocel:type:orders,ocel:type:packages
0,1,2019-05-20 09:07:47,place order,3.520,524.96,"['880001', '880004', '880003', '880002']","['Echo Studio', 'Echo Show 8', 'Fire Stick 4K'...",['Marco Pegoraro'],['990001'],
1,2,2019-05-20 10:35:21,place order,2.656,3255.99,"['880008', '880005', '880006', '880007']","['Kindle', 'iPad Air', 'iPad', 'MacBook Air']",['Gyunam Park'],['990002'],
2,3,2019-05-20 10:38:17,pick item,0.483,79.99,['880006'],['Kindle'],['Gyunam Park'],['990002'],
3,4,2019-05-20 11:13:54,confirm order,3.520,524.96,"['880001', '880004', '880003', '880002']","['Echo Studio', 'Echo Show 8', 'Fire Stick 4K'...",['Marco Pegoraro'],['990001'],
4,5,2019-05-20 11:20:13,pick item,0.280,89.99,['880002'],['Fire Stick 4K'],['Marco Pegoraro'],['990001'],
...,...,...,...,...,...,...,...,...,...,...
22362,22363,2020-08-18 11:11:09,send package,0.606,1275.00,"['888072', '888071']","['iPhone 11', 'iPad Air']",['Majid Rafiei'],['991976'],['661324']
22363,22364,2020-08-19 17:57:32,package delivered,0.606,1275.00,"['888072', '888071']","['iPhone 11', 'iPad Air']",['Majid Rafiei'],['991976'],['661324']
22364,22365,2020-08-22 01:00:00,create package,0.172,699.00,['888091'],['iPhone X'],['Mohammadreza Fani Sani'],['991983'],['661325']
22365,22366,2020-08-24 11:14:47,send package,0.172,699.00,['888091'],['iPhone X'],['Mohammadreza Fani Sani'],['991983'],['661325']


In [14]:
# Normalise the OCEL column names: strip the 'ocel:' and 'type:' markers, then
# give the timestamp and activity columns the same names used for the BPIC 2017 log.
# regex=False states the literal matching explicitly, so the result does not
# depend on the pandas version's default for `regex`.
ds2.columns = (
    ds2.columns
    .str.replace('ocel:', '', regex=False)
    .str.replace('type:', '', regex=False)
)
ds2 = ds2.rename(columns={'timestamp': 'event_timestamp', 'activity': 'event_activity'})

In [15]:
# Run the shared preprocessing helper on the DS2 log.
# Object-type columns of this log (they hold the related object identifiers).
objects = ['orders', 'packages', 'items']
# Columns passed as `first_cols`: the event key columns followed by the object columns.
# NOTE(review): presumably the helper moves these to the front of the frame -- confirm in
# functions.data_preprocessing.
first_cols = ['event_id', 'event_timestamp', 'event_activity', *objects]
# List-valued columns passed as `split`.
# NOTE(review): presumably expanded into one indicator column per distinct value -- confirm.
split = ['products', 'customers']

ds2 = preprocess_dataframe(ds2, first_cols, split)

  temp = pd.get_dummies(temp.apply(pd.Series), prefix='', prefix_sep='').sum(level=0, axis=1)
  temp = pd.get_dummies(temp.apply(pd.Series), prefix='', prefix_sep='').sum(level=0, axis=1)


In [16]:
# Inspect the preprocessed DS2 log with all columns visible.
with pd.option_context('display.max_columns', None): 
    display(ds2)

Unnamed: 0,event_id,event_timestamp,event_activity,orders,packages,items,weight,price,products_Echo_Dot,products_Echo_Plus,products_Echo_Show_5,products_Echo_Show_8,products_Echo_Studio,products_Echo,products_Fire_Stick_4K,products_Fire_Stick,products_Kindle_Paperwhite,products_Kindle,products_MacBook_Air,products_MacBook_Pro,products_iPad_Air,products_iPad_Pro,products_iPad_mini,products_iPad,products_iPhone_11_Pro,products_iPhone_11,products_iPhone_8,products_iPhone_X,customers_Anahita_Farhang_Ghahfarokhi,customers_Christina_Rensinghof,customers_Christine_Dobbert,customers_Claudia_Graf,customers_Gyunam_Park,customers_Junxiong_Gao,customers_Kefang_Ding,customers_Lisa_Mannel,customers_Luis_Santos,customers_Mahnaz_Qafari,customers_Mahsa_Bafrani,customers_Majid_Rafiei,customers_Marco_Pegoraro,customers_Mohammadreza_Fani_Sani,customers_Seran_Uysal,customers_Tobias_Brockhoff,customers_Wil_van_der_Aalst
0,1,2019-05-20 09:07:47,place order,['990001'],,"['880001', '880004', '880003', '880002']",3.520,524.96,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,2,2019-05-20 10:35:21,place order,['990002'],,"['880008', '880005', '880006', '880007']",2.656,3255.99,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,3,2019-05-20 10:38:17,pick item,['990002'],,['880006'],0.483,79.99,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,4,2019-05-20 11:13:54,confirm order,['990001'],,"['880001', '880004', '880003', '880002']",3.520,524.96,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,5,2019-05-20 11:20:13,pick item,['990001'],,['880002'],0.280,89.99,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22362,22363,2020-08-18 11:11:09,send package,['991976'],['661324'],"['888072', '888071']",0.606,1275.00,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
22363,22364,2020-08-19 17:57:32,package delivered,['991976'],['661324'],"['888072', '888071']",0.606,1275.00,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
22364,22365,2020-08-22 01:00:00,create package,['991983'],['661325'],['888091'],0.172,699.00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
22365,22366,2020-08-24 11:14:47,send package,['991983'],['661325'],['888091'],0.172,699.00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [17]:
# Column dtypes and non-null counts of the preprocessed DS2 log.
ds2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22367 entries, 0 to 22366
Data columns (total 45 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   event_id                               22367 non-null  int64  
 1   event_timestamp                        22367 non-null  object 
 2   event_activity                         22367 non-null  object 
 3   orders                                 22367 non-null  object 
 4   packages                               4366 non-null   object 
 5   items                                  22367 non-null  object 
 6   weight                                 22367 non-null  float64
 7   price                                  22367 non-null  float64
 8   products_Echo_Dot                      22367 non-null  uint8  
 9   products_Echo_Plus                     22367 non-null  uint8  
 10  products_Echo_Show_5                   22367 non-null  uint8  
 11  pr

In [18]:
# Summary statistics (count, mean, std, quantiles) of the preprocessed DS2 log.
ds2.describe()

Unnamed: 0,event_id,weight,price,products_Echo_Dot,products_Echo_Plus,products_Echo_Show_5,products_Echo_Show_8,products_Echo_Studio,products_Echo,products_Fire_Stick_4K,...,customers_Lisa_Mannel,customers_Luis_Santos,customers_Mahnaz_Qafari,customers_Mahsa_Bafrani,customers_Majid_Rafiei,customers_Marco_Pegoraro,customers_Mohammadreza_Fani_Sani,customers_Seran_Uysal,customers_Tobias_Brockhoff,customers_Wil_van_der_Aalst
count,22367.0,22367.0,22367.0,22367.0,22367.0,22367.0,22367.0,22367.0,22367.0,22367.0,...,22367.0,22367.0,22367.0,22367.0,22367.0,22367.0,22367.0,22367.0,22367.0,22367.0
mean,11184.0,1.766915,1645.994526,0.121384,0.131399,0.127912,0.133947,0.129298,0.14204,0.135512,...,0.065722,0.060714,0.063173,0.061609,0.05365,0.054366,0.055305,0.061743,0.052354,0.059194
std,6456.941071,1.861446,1977.50185,0.326581,0.337844,0.333999,0.340603,0.335537,0.349099,0.342277,...,0.247801,0.238811,0.24328,0.240449,0.225332,0.226743,0.228579,0.240693,0.222745,0.235993
min,1.0,0.166,29.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5592.5,0.44,199.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,11184.0,1.25,804.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,16775.5,2.531,2500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,22367.0,16.455,19503.92,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
# Persist the preprocessed DS2 log as CSV (the DataFrame index is not written).
directory = 'data-prepro'
filename = os.path.join(directory, 'ds2_prepro.csv')

# exist_ok=True creates the folder only when it is missing and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(directory, exist_ok=True)

ds2.to_csv(filename, index=False)