# Loading libraries and data

In [1]:
import pandas as pd
from datetime import datetime

In [2]:
df_train = pd.read_csv('event-log-training.csv')

In [3]:
df_train

Unnamed: 0,eventID,case concept:name,case description,case Class,event org:resource,event concept:name,event lifecycle:transition,event time:timestamp
0,0,1,Simulated process instance,Print,PRN1,Job,start,01-01-1970 01:00:00.000
1,1,1,Simulated process instance,Print,PRN1,Remote Print,complete,01-01-1970 01:15:00.000
2,2,1,Simulated process instance,Print,PRN1,Read Print Options,complete,01-01-1970 01:26:00.000
3,3,1,Simulated process instance,Print,PRN1,Rasterization,start,01-01-1970 01:38:00.000
4,4,1,Simulated process instance,Print,PRN1,Interpretation,start,01-01-1970 01:51:00.000
...,...,...,...,...,...,...,...,...
36623,42949673754,18,Simulated process instance,Print,PRN1,Pressure Roller Spin Stop,complete,25-12-1970 15:51:00.000
36624,42949673755,18,Simulated process instance,Print,PRN1,Fusing,complete,25-12-1970 16:05:00.000
36625,42949673756,18,Simulated process instance,Print,PRN1,Wipe Toner on Drum,complete,25-12-1970 16:10:00.000
36626,42949673757,18,Simulated process instance,Print,PRN1,Erase Charge on Drum,complete,25-12-1970 16:22:00.000


# Convert timestamp to datetime

In [4]:
date_list = []

for time in df_train['event time:timestamp']:
    datex = time[:-4]
    date = datetime.strptime(datex, '%d-%m-%Y %H:%M:%S')

    date_list.append(date)

In [5]:
df_train['time and date'] = date_list

In [6]:
df_train

Unnamed: 0,eventID,case concept:name,case description,case Class,event org:resource,event concept:name,event lifecycle:transition,event time:timestamp,time and date
0,0,1,Simulated process instance,Print,PRN1,Job,start,01-01-1970 01:00:00.000,1970-01-01 01:00:00
1,1,1,Simulated process instance,Print,PRN1,Remote Print,complete,01-01-1970 01:15:00.000,1970-01-01 01:15:00
2,2,1,Simulated process instance,Print,PRN1,Read Print Options,complete,01-01-1970 01:26:00.000,1970-01-01 01:26:00
3,3,1,Simulated process instance,Print,PRN1,Rasterization,start,01-01-1970 01:38:00.000,1970-01-01 01:38:00
4,4,1,Simulated process instance,Print,PRN1,Interpretation,start,01-01-1970 01:51:00.000,1970-01-01 01:51:00
...,...,...,...,...,...,...,...,...,...
36623,42949673754,18,Simulated process instance,Print,PRN1,Pressure Roller Spin Stop,complete,25-12-1970 15:51:00.000,1970-12-25 15:51:00
36624,42949673755,18,Simulated process instance,Print,PRN1,Fusing,complete,25-12-1970 16:05:00.000,1970-12-25 16:05:00
36625,42949673756,18,Simulated process instance,Print,PRN1,Wipe Toner on Drum,complete,25-12-1970 16:10:00.000,1970-12-25 16:10:00
36626,42949673757,18,Simulated process instance,Print,PRN1,Erase Charge on Drum,complete,25-12-1970 16:22:00.000,1970-12-25 16:22:00


# Add in true next event and time it takes for case 4

In [7]:
df_train['event lifecycle:transition'].value_counts()

complete    33802
start        2826
Name: event lifecycle:transition, dtype: int64

In [8]:
dct_events = dict(df_train['event concept:name'].value_counts())

In [9]:
df_train['case concept:name'].value_counts()

12    2140
33    1594
42    1390
24    1285
6     1198
      ... 
27      62
80      59
56      55
51      43
4       43
Name: case concept:name, Length: 80, dtype: int64

In [10]:
df_train.groupby(by = ['case concept:name', 'time and date'], as_index = False)
df_case4 = df_train[df_train['case concept:name'] == 4]

In [11]:
df_case4

Unnamed: 0,eventID,case concept:name,case description,case Class,event org:resource,event concept:name,event lifecycle:transition,event time:timestamp,time and date
64,146028888064,4,Simulated process instance,Copy/Scan,PRN1,Job,start,01-01-1970 11:23:00.000,1970-01-01 11:23:00
26983,146028888065,4,Simulated process instance,Copy/Scan,PRN1,Copy/Scan,complete,29-09-1970 12:33:00.000,1970-09-29 12:33:00
26984,146028888066,4,Simulated process instance,Copy/Scan,PRN1,Collect Copy/Scan Options,complete,29-09-1970 12:51:00.000,1970-09-29 12:51:00
26985,146028888067,4,Simulated process instance,Copy/Scan,PRN1,Place Doc,complete,29-09-1970 12:59:00.000,1970-09-29 12:59:00
26986,146028888068,4,Simulated process instance,Copy/Scan,PRN1,Illuminate Document,complete,29-09-1970 13:18:00.000,1970-09-29 13:18:00
26987,146028888069,4,Simulated process instance,Copy/Scan,PRN1,Move Scan Head,complete,29-09-1970 13:40:00.000,1970-09-29 13:40:00
26988,146028888070,4,Simulated process instance,Copy/Scan,PRN1,Focus Light Beam,complete,29-09-1970 13:58:00.000,1970-09-29 13:58:00
26989,146028888071,4,Simulated process instance,Copy/Scan,PRN1,A/D Conversion,complete,29-09-1970 14:05:00.000,1970-09-29 14:05:00
26990,146028888072,4,Simulated process instance,Copy/Scan,PRN1,Interpolation,complete,29-09-1970 14:14:00.000,1970-09-29 14:14:00
26991,146028888073,4,Simulated process instance,Copy/Scan,PRN1,Filtered Image,complete,29-09-1970 14:25:00.000,1970-09-29 14:25:00


Add event column

In [12]:
event_lst = [event for event in df_case4['event concept:name']]

event_lst = event_lst[1:]
event_lst.append('-')
    

In [13]:
df_case4['next_event'] = event_lst

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_case4['next_event'] = event_lst


Add time to next event column

In [14]:
nexttime_lst1 = [time for time in df_case4['time and date']]

nexttime_lst = nexttime_lst1[1:]
nexttime_lst.append(nexttime_lst[-1])

In [15]:
time_diff = []

for i in range(len(nexttime_lst)):
    time_diff.append(nexttime_lst[i] - nexttime_lst1[i])

In [16]:
df_case4['time to next event'] = time_diff

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_case4['time to next event'] = time_diff


In [17]:
df_case4

Unnamed: 0,eventID,case concept:name,case description,case Class,event org:resource,event concept:name,event lifecycle:transition,event time:timestamp,time and date,next_event,time to next event
64,146028888064,4,Simulated process instance,Copy/Scan,PRN1,Job,start,01-01-1970 11:23:00.000,1970-01-01 11:23:00,Copy/Scan,271 days 01:10:00
26983,146028888065,4,Simulated process instance,Copy/Scan,PRN1,Copy/Scan,complete,29-09-1970 12:33:00.000,1970-09-29 12:33:00,Collect Copy/Scan Options,0 days 00:18:00
26984,146028888066,4,Simulated process instance,Copy/Scan,PRN1,Collect Copy/Scan Options,complete,29-09-1970 12:51:00.000,1970-09-29 12:51:00,Place Doc,0 days 00:08:00
26985,146028888067,4,Simulated process instance,Copy/Scan,PRN1,Place Doc,complete,29-09-1970 12:59:00.000,1970-09-29 12:59:00,Illuminate Document,0 days 00:19:00
26986,146028888068,4,Simulated process instance,Copy/Scan,PRN1,Illuminate Document,complete,29-09-1970 13:18:00.000,1970-09-29 13:18:00,Move Scan Head,0 days 00:22:00
26987,146028888069,4,Simulated process instance,Copy/Scan,PRN1,Move Scan Head,complete,29-09-1970 13:40:00.000,1970-09-29 13:40:00,Focus Light Beam,0 days 00:18:00
26988,146028888070,4,Simulated process instance,Copy/Scan,PRN1,Focus Light Beam,complete,29-09-1970 13:58:00.000,1970-09-29 13:58:00,A/D Conversion,0 days 00:07:00
26989,146028888071,4,Simulated process instance,Copy/Scan,PRN1,A/D Conversion,complete,29-09-1970 14:05:00.000,1970-09-29 14:05:00,Interpolation,0 days 00:09:00
26990,146028888072,4,Simulated process instance,Copy/Scan,PRN1,Interpolation,complete,29-09-1970 14:14:00.000,1970-09-29 14:14:00,Filtered Image,0 days 00:11:00
26991,146028888073,4,Simulated process instance,Copy/Scan,PRN1,Filtered Image,complete,29-09-1970 14:25:00.000,1970-09-29 14:25:00,Collect Image,0 days 00:19:00


# Next event and time column for all data


In [18]:
cases = list(df_train['case concept:name'].unique())

In [19]:
#event column
total_events = []
for case in cases: #hier gaat t mis
    df_case = df_train[df_train['case concept:name'] == case]
    
    event_lst = [event for event in df_case['event concept:name']]
    event_lst = event_lst[1:]
    event_lst.append('-')
    
    total_events = total_events + event_lst

In [20]:
#time column
all_differences = []
for case in cases:
    df_case = df_train[df_train['case concept:name'] == case]
    
    nexttime_lst1 = [time for time in df_case['time and date']]
    nexttime_lst = nexttime_lst1[1:]
    nexttime_lst.append(nexttime_lst[-1])
    
    time_diff = []

    for i in range(len(nexttime_lst)):
        time_diff.append(nexttime_lst[i] - nexttime_lst1[i])
    
    all_differences = all_differences + time_diff

In [21]:
df_train['Next event'] = total_events
df_train['Time to next event'] = all_differences

In [22]:
df_train

Unnamed: 0,eventID,case concept:name,case description,case Class,event org:resource,event concept:name,event lifecycle:transition,event time:timestamp,time and date,Next event,Time to next event
0,0,1,Simulated process instance,Print,PRN1,Job,start,01-01-1970 01:00:00.000,1970-01-01 01:00:00,Remote Print,0 days 00:15:00
1,1,1,Simulated process instance,Print,PRN1,Remote Print,complete,01-01-1970 01:15:00.000,1970-01-01 01:15:00,Read Print Options,0 days 00:11:00
2,2,1,Simulated process instance,Print,PRN1,Read Print Options,complete,01-01-1970 01:26:00.000,1970-01-01 01:26:00,Rasterization,0 days 00:12:00
3,3,1,Simulated process instance,Print,PRN1,Rasterization,start,01-01-1970 01:38:00.000,1970-01-01 01:38:00,Interpretation,0 days 00:13:00
4,4,1,Simulated process instance,Print,PRN1,Interpretation,start,01-01-1970 01:51:00.000,1970-01-01 01:51:00,Unformatted Text,0 days 00:18:00
...,...,...,...,...,...,...,...,...,...,...,...
36623,42949673754,18,Simulated process instance,Print,PRN1,Pressure Roller Spin Stop,complete,25-12-1970 15:51:00.000,1970-12-25 15:51:00,Store Image,0 days 00:24:00
36624,42949673755,18,Simulated process instance,Print,PRN1,Fusing,complete,25-12-1970 16:05:00.000,1970-12-25 16:05:00,Transfer Image,0 days 00:17:00
36625,42949673756,18,Simulated process instance,Print,PRN1,Wipe Toner on Drum,complete,25-12-1970 16:10:00.000,1970-12-25 16:10:00,Send SMTP,0 days 00:16:00
36626,42949673757,18,Simulated process instance,Print,PRN1,Erase Charge on Drum,complete,25-12-1970 16:22:00.000,1970-12-25 16:22:00,Job,0 days 00:11:00


In [23]:
#df_predict = df_train.copy()

In [32]:
lst = df_train[['event concept:name', 'event lifecycle:transition', 'Next event']].apply(tuple, axis=1)

In [33]:
lst = [elm for elm in lst]

In [34]:
count_dict = {}
for elm in lst:
    if elm in count_dict:
        count_dict[elm] +=1
    else:
        count_dict[elm] = 1

In [35]:
count_dict

{('Job', 'start', 'Remote Print'): 1,
 ('Remote Print', 'complete', 'Read Print Options'): 1,
 ('Read Print Options', 'complete', 'Rasterization'): 1,
 ('Rasterization', 'start', 'Interpretation'): 1,
 ('Interpretation', 'start', 'Unformatted Text'): 2,
 ('Unformatted Text', 'complete', 'Interpretation'): 3,
 ('Interpretation', 'complete', 'Rendering'): 7,
 ('Rendering', 'complete', 'Screening'): 14,
 ('Screening', 'start', 'Interpretation'): 9,
 ('Interpretation', 'start', 'FM Screening'): 2,
 ('FM Screening', 'complete', 'Unformatted Text'): 1,
 ('Unformatted Text', 'complete', 'Screening'): 10,
 ('Screening', 'complete', 'Interpretation'): 8,
 ('Interpretation', 'complete', 'Current Page Image'): 3,
 ('Current Page Image', 'complete', 'Interpretation'): 15,
 ('Interpretation', 'start', 'Accumulate Images'): 4,
 ('Accumulate Images', 'complete', 'Rendering'): 8,
 ('Rendering', 'complete', 'Unformatted Text'): 2,
 ('Interpretation', 'complete', 'AM Screening'): 5,
 ('AM Screening', 'c