In [None]:
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import datetime as datetime
from sklearn.model_selection import train_test_split

In [None]:
# Load the pre-split BPI 2017 event logs; the event timestamp column is parsed
# to datetime at read time so it can be differenced later.
df_train = pd.read_csv("bpi2017_train.csv", parse_dates = ['time:timestamp'])
df_val = pd.read_csv("bpi2017_val.csv", parse_dates = ['time:timestamp'])
df_test = pd.read_csv("bpi2017_test.csv", parse_dates = ['time:timestamp'])

# Inspect the training split (the original called .info() on an undefined `df`,
# which raises NameError on a fresh kernel).
df_train.info()
# Column conventions used throughout this notebook:
#   case:concept:name - the case (trace) ID
#   concept:name      - the event (activity) name
#   time:timestamp    - the timestamp of the event

# 2. Baseline Time Prediction (Only on Training Dataset)

In [None]:
# Derive two per-event columns on the training set:
#   time_diff - seconds elapsed since the previous event of the same case
#   position  - 1-based index of the event within its case, in row order
case_col = 'case:concept:name'

# Difference the timestamps within each case so that the value never spans two
# different cases. The first event of a case has no predecessor and yields NaN.
df_train['time_diff'] = (
    df_train.groupby(case_col, sort=False)['time:timestamp']
    .diff()
    .dt.total_seconds()
)

# cumcount() numbers the rows of each case 0, 1, 2, ... in their existing row
# order, so positions stay aligned with the rows regardless of how the file is
# sorted. (Flattening per-case counts from groupby().count() only lines up when
# the rows are already ordered by sorted case ID and each case is contiguous.)
df_train['position'] = df_train.groupby(case_col, sort=False).cumcount() + 1

# The first event of every case has no preceding event: define its diff as 0.
df_train.loc[df_train['position'] == 1, 'time_diff'] = 0

# Kept as plain lists because later cells read them.
count_lst = df_train.groupby(case_col).size().tolist()  # events per case (sorted by case ID)
position_lst = df_train['position'].tolist()            # position of every row, in row order
df_train

In [None]:
# Mean time difference (seconds) per position, averaged over all training cases
# that reach that position. Select the column BEFORE aggregating: calling
# .mean() on the whole grouped frame tries to average every column and raises
# TypeError on non-numeric columns in pandas >= 2.0.
mean_time_lst = df_train.groupby('position')['time_diff'].mean().tolist()
# Drop the entry for position 1 (always 0 by construction). After this shift,
# mean_time_lst[p - 1] is the mean time_diff observed at position p + 1, i.e.
# the expected time from the event at position p until the next event.
del mean_time_lst[0]
# Add a final mean time difference of 0 for the maximum position, which has no
# next event in the training data.
mean_time_lst.append(0.0)
# Create the predicted time column per entry using the shifted means. Positions
# are read from the frame itself rather than from a list left over from an
# earlier cell, so the cell does not depend on hidden kernel state.
pred_time_lst = [mean_time_lst[j - 1] for j in df_train['position']]
df_train['baseline_predicted_time'] = pred_time_lst
df_train

# 3. Apply Above Calculated Mean Time to Validation and Test Set

In [None]:
# Apply the training-set baseline to the validation set.
# Adds time_diff, position and baseline_predicted_time columns to df_val.
case_col = 'case:concept:name'

# Seconds since the previous event of the same case; differencing per case
# keeps the value from spanning two different cases (first event -> NaN).
df_val['time_diff'] = (
    df_val.groupby(case_col, sort=False)['time:timestamp']
    .diff()
    .dt.total_seconds()
)

# 1-based position of each event within its case, following row order.
# cumcount() stays aligned with the rows even if the file is not sorted by
# case ID, unlike flattening the per-case counts from groupby().count().
df_val['position'] = df_val.groupby(case_col, sort=False).cumcount() + 1

# The first event of every case has no preceding event: define its diff as 0.
df_val.loc[df_val['position'] == 1, 'time_diff'] = 0

# Kept as plain lists in case later cells read them.
count_val_lst = df_val.groupby(case_col).size().tolist()  # events per case (sorted by case ID)
position_lst_val = df_val['position'].tolist()            # position of every row, in row order

# Look up the mean time computed on the training set (mean_time_lst, defined in
# the baseline cell) for each position. Positions beyond the longest training
# case have no training estimate and fall back to 0.
n_known_positions = len(mean_time_lst)
pred_time_lst_val = [
    mean_time_lst[j - 1] if j <= n_known_positions else 0
    for j in position_lst_val
]
df_val['baseline_predicted_time'] = pred_time_lst_val
df_val

In [None]:
# Apply the training-set baseline to the test set.
# Adds time_diff, position and baseline_predicted_time columns to df_test.
case_col = 'case:concept:name'

# Seconds since the previous event of the same case; differencing per case
# keeps the value from spanning two different cases (first event -> NaN).
df_test['time_diff'] = (
    df_test.groupby(case_col, sort=False)['time:timestamp']
    .diff()
    .dt.total_seconds()
)

# 1-based position of each event within its case, following row order.
# cumcount() stays aligned with the rows even if the file is not sorted by
# case ID, unlike flattening the per-case counts from groupby().count().
df_test['position'] = df_test.groupby(case_col, sort=False).cumcount() + 1

# The first event of every case has no preceding event: define its diff as 0.
df_test.loc[df_test['position'] == 1, 'time_diff'] = 0

# Kept as plain lists in case later cells read them.
count_test_lst = df_test.groupby(case_col).size().tolist()  # events per case (sorted by case ID)
position_lst_test = df_test['position'].tolist()            # position of every row, in row order

# Look up the mean time computed on the training set (mean_time_lst, defined in
# the baseline cell) for each position. Positions beyond the longest training
# case have no training estimate and fall back to 0.
n_known_positions = len(mean_time_lst)
pred_time_lst_test = [
    mean_time_lst[j - 1] if j <= n_known_positions else 0
    for j in position_lst_test
]
df_test['baseline_predicted_time'] = pred_time_lst_test
df_test