<a href="https://colab.research.google.com/github/dbusn/process-mining-group-5/blob/main/preprocessing_2012.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import datetime as datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib
import random

In [2]:
from google.colab import drive
drive.mount('/content/drive')
df_train = pd.read_csv("/content/drive/MyDrive/DBL Process Mining/Data/Split/bpi2012_train.csv", parse_dates = ['time:timestamp'])
df_val = pd.read_csv("/content/drive/MyDrive/DBL Process Mining/Data/Split/bpi2012_val.csv", parse_dates = ['time:timestamp'])
df_test = pd.read_csv("/content/drive/MyDrive/DBL Process Mining/Data/Split/bpi2012_test.csv", parse_dates = ['time:timestamp'])


df_test['time:timestamp'] = pd.to_datetime(df_test['time:timestamp'], utc=True)
df_train['time:timestamp'] = pd.to_datetime(df_train['time:timestamp'], utc=True)
df_val['time:timestamp'] = pd.to_datetime(df_val['time:timestamp'], utc=True)



# The default name indicating the case ID is case:concept:name
# concept:name is the event
# time:timestamp is the corresponding timestamp
# Load the datasets, sort them on case and consequently timestamp, then reset the index
df_train = df_train.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)
df_val = df_val.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)
df_test = df_test.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)

Mounted at /content/drive


In [3]:
df_train

Unnamed: 0.1,Unnamed: 0,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ
0,0,112.0,COMPLETE,A_SUBMITTED,2011-09-30 22:38:44.546000+00:00,2011-10-01 00:38:44.546000+02:00,173688,20000
1,1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-09-30 22:38:44.880000+00:00,2011-10-01 00:38:44.546000+02:00,173688,20000
2,2,112.0,COMPLETE,A_PREACCEPTED,2011-09-30 22:39:37.906000+00:00,2011-10-01 00:38:44.546000+02:00,173688,20000
3,3,112.0,SCHEDULE,W_Completeren aanvraag,2011-09-30 22:39:38.875000+00:00,2011-10-01 00:38:44.546000+02:00,173688,20000
4,4,,START,W_Completeren aanvraag,2011-10-01 09:36:46.437000+00:00,2011-10-01 00:38:44.546000+02:00,173688,20000
...,...,...,...,...,...,...,...,...
130715,130715,10138.0,COMPLETE,W_Valideren aanvraag,2012-02-10 12:02:04.765000+00:00,2012-01-09 17:40:12.748000+01:00,199345,8500
130716,130716,10138.0,START,W_Valideren aanvraag,2012-02-10 12:32:48.693000+00:00,2012-01-09 17:40:12.748000+01:00,199345,8500
130717,130717,10138.0,COMPLETE,A_DECLINED,2012-02-10 12:41:38.050000+00:00,2012-01-09 17:40:12.748000+01:00,199345,8500
130718,130718,10138.0,COMPLETE,O_DECLINED,2012-02-10 12:41:38.050000+00:00,2012-01-09 17:40:12.748000+01:00,199345,8500


In [4]:
# Remove obsolete columns
df_train.drop(columns = ['Unnamed: 0'], inplace = True)
df_val.drop(columns = ['Unnamed: 0'], inplace = True)
df_test.drop(columns = ['Unnamed: 0'], inplace = True)

In [5]:
df_train.head()

Unnamed: 0,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ
0,112.0,COMPLETE,A_SUBMITTED,2011-09-30 22:38:44.546000+00:00,2011-10-01 00:38:44.546000+02:00,173688,20000
1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-09-30 22:38:44.880000+00:00,2011-10-01 00:38:44.546000+02:00,173688,20000
2,112.0,COMPLETE,A_PREACCEPTED,2011-09-30 22:39:37.906000+00:00,2011-10-01 00:38:44.546000+02:00,173688,20000
3,112.0,SCHEDULE,W_Completeren aanvraag,2011-09-30 22:39:38.875000+00:00,2011-10-01 00:38:44.546000+02:00,173688,20000
4,,START,W_Completeren aanvraag,2011-10-01 09:36:46.437000+00:00,2011-10-01 00:38:44.546000+02:00,173688,20000


In [6]:
df_train.shape

(130720, 7)

Case Occurence number

In [7]:
df_train["case_occurrence_no"] = df_train.groupby(['case:concept:name'])['time:timestamp'].cumcount().tolist()

One-hot encoding

In [8]:
encoded_cols = ['lifecycle:transition']
df_train = pd.get_dummies(df_train, columns=encoded_cols, prefix=['lifecycle:transition_is'])
df_val = pd.get_dummies(df_val, columns=encoded_cols, prefix=['lifecycle:transition_is'])
df_test = pd.get_dummies(df_test, columns=encoded_cols, prefix=['lifecycle:transition_is'])

## Creating additional features

Next and past activity timedelta

In [9]:
def next_past_activity(df):
    temp = df['time:timestamp']
    next_activity = []
    for i in range(len(temp)-1):
        next_activity.append(temp[i+1])

    df['next_activity_delta_t'] = pd.Series(next_activity) - df['time:timestamp']
    df['past_activity_delta_t'] = df['time:timestamp'] - pd.Series(next_activity)
    
    return df

1. Calculate the time difference & find position

In [10]:
# Cumulative sum function to be used later
def CumSum(lists):
    # Returns the cumulative sum of a list
    length = len(lists)
    cu_list = [sum(lists[0: x: 1]) for x in range(0, length + 1)]
    return cu_list[1: ]

In [11]:
def next_event(df):
    # Find the next activity name by shifting the current event label
    df['next:concept:name'] = df['concept:name'].shift(-1)
    last_lst = [i - 1 for i in df[df['position'] == 1].index if i != 0]
    # The next event label is 'Nothing' when the cycle is ended
    df.at[df.shape[0] - 1, 'next:concept:name'] = 'Nothing'
    for i in last_lst:
        df.at[i, 'next:concept:name'] = 'Nothing'
    return df

In [12]:
def time_difference(df):
    # Calculate time difference between each row
    df['time_diff'] = df['time:timestamp'].diff().dt.total_seconds()
    # Set the time difference of the 1st row to 0 as it's currently NaN
    df.at[0, 'time_diff'] = 0
    # Count number of steps per process
    length_per_case_List = df.groupby(['case:concept:name'])['time_diff'].count().tolist()

    # Using the cumulative sum we get all the positions that are a first step in a process
    # And then the time difference can be set to 0
    position_lst = CumSum(length_per_case_List)
    for i in tqdm(position_lst):
        df.at[i, 'time_diff'] = 0
    # For Loop mysteriously creates an empty row at the end of the df, gotta delete it
    df = df.iloc[: -1]

    # Unzip the position list to get the number of each steps of each process, make that into a list
    step_in_process = []
    for x in tqdm(length_per_case_List):
        for y in range(x):
            step_in_process.append(y + 1)
    # Assign position number to each row/process
    df['position'] = step_in_process

    # Find future time difference by shifting the current time difference
    df['future_time_diff'] = df['time_diff'].shift(-1)
    df.at[df.shape[0] - 1, 'future_time_diff'] = 0

    return df

 Weekday feature

In [13]:
def add_weekday(df):
    # Get day of week like Monday, Tuesday, etc
    df_day = pd.DataFrame(data = df['time:timestamp'].dt.dayofweek)
    df_day.rename(columns = {'time:timestamp': 'day'}, inplace = True)
    df['day'] = df_day['day']
    return df

Working hour feature

In [14]:
def add_working_hour(df):
    # Get hour like 10, 15, etc
    df_day = pd.DataFrame(data = df['time:timestamp'].dt.hour)
    df_day.rename(columns = {'time:timestamp': 'hour'}, inplace = True)
    df['hour'] = df_day['hour']
    return df

Timestamp parsing

In [15]:
def parse_timestamp(df):
    temp = df["time:timestamp"]
    day_of_month = []
    month_no = []
    quarters = []
    week = []
    hour = []
    seconds = []

    for i in range(len(temp)):
        day_of_month.append(temp[i].day)
        month_no.append(temp[i].month)
        quarters.append(temp[i].quarter)
        week.append(temp[i].week)
        hour.append(temp[i].hour)
        seconds.append(temp[i].second)

    df['day_of_month'] = pd.Series(day_of_month)
    df['month_no'] = pd.Series(month_no)
    df['quarter'] = pd.Series(quarters)
    df['week'] = pd.Series(week)
    df['hour'] = pd.Series(hour)
    df['second'] = pd.Series(seconds)
    return df

Time difference normalization

In [16]:
def normalize_delta_t(df):
    min_max_scaler = MinMaxScaler()

    df['norm_next_activity_delta'] = min_max_scaler.fit_transform(np.array(df["next_activity_delta_t"]).reshape(-1,1))
    df['norm_past_activity_delta'] = min_max_scaler.fit_transform(np.array(df["past_activity_delta_t"]).reshape(-1,1))
    return df

## Apply all functions

In [17]:
# Apply the above changes to all dataframes
# The warnings are obsolete, it's because it uses .at which is considerably faster than .loc
df_train = time_difference(df_train)
df_val = time_difference(df_val)
df_test = time_difference(df_test)

df_train = parse_timestamp(df_train)
df_val = parse_timestamp(df_val)
df_test = parse_timestamp(df_test)

df_train = next_past_activity(df_train)
df_val = next_past_activity(df_val)
df_test = next_past_activity(df_test)

df_train = normalize_delta_t(df_train)
df_val = normalize_delta_t(df_val)
df_test = normalize_delta_t(df_test)

df_train = next_event(df_train)
df_val = next_event(df_val)
df_test = next_event(df_test)

df_train = add_weekday(df_train)
df_val = add_weekday(df_val)
df_test = add_weekday(df_test)

df_train = add_working_hour(df_train)
df_val = add_working_hour(df_val)
df_test = add_working_hour(df_test)

100%|██████████| 6545/6545 [00:00<00:00, 63424.23it/s]
100%|██████████| 6545/6545 [00:00<00:00, 328912.79it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|██████████| 1637/1637 [00:00<00:00, 58503.90it/s]
100%|██████████| 1637/1637 [00:00<00:00, 378692.61it/s]
100%|██████████| 2045/2045 [00:00<00:00, 66448.34it/s]
100%|██████████| 2045/2045 [00:00<00:00, 242284.38it/s]


In [18]:
df_train

Unnamed: 0,org:resource,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ,case_occurrence_no,lifecycle:transition_is_COMPLETE,lifecycle:transition_is_SCHEDULE,lifecycle:transition_is_START,...,quarter,week,hour,second,next_activity_delta_t,past_activity_delta_t,norm_next_activity_delta,norm_past_activity_delta,next:concept:name,day
0,112.0,A_SUBMITTED,2011-09-30 22:38:44.546000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,0.0,1.0,0.0,0.0,...,3,39,22,44,0 days 00:00:00.334000,-1 days +23:59:59.666000,0.999710,0.999144,A_PARTLYSUBMITTED,4
1,112.0,A_PARTLYSUBMITTED,2011-09-30 22:38:44.880000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,1.0,1.0,0.0,0.0,...,3,39,22,44,0 days 00:00:53.026000,-1 days +23:59:06.974000,0.999710,0.999144,A_PREACCEPTED,4
2,112.0,A_PREACCEPTED,2011-09-30 22:39:37.906000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,2.0,1.0,0.0,0.0,...,3,39,22,37,0 days 00:00:00.969000,-1 days +23:59:59.031000,0.999710,0.999144,W_Completeren aanvraag,4
3,112.0,W_Completeren aanvraag,2011-09-30 22:39:38.875000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,3.0,0.0,1.0,0.0,...,3,39,22,38,0 days 10:57:07.562000,-1 days +13:02:52.438000,0.999715,0.999140,W_Completeren aanvraag,4
4,,W_Completeren aanvraag,2011-10-01 09:36:46.437000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,4.0,0.0,0.0,1.0,...,4,39,9,46,0 days 00:05:56.871000,-1 days +23:54:03.129000,0.999710,0.999144,A_ACCEPTED,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130715,10138.0,W_Valideren aanvraag,2012-02-10 12:02:04.765000+00:00,2012-01-09 17:40:12.748000+01:00,199345.0,8500.0,49.0,1.0,0.0,0.0,...,1,6,12,4,0 days 00:30:43.928000,-1 days +23:29:16.072000,0.999711,0.999144,W_Valideren aanvraag,4
130716,10138.0,W_Valideren aanvraag,2012-02-10 12:32:48.693000+00:00,2012-01-09 17:40:12.748000+01:00,199345.0,8500.0,50.0,0.0,0.0,1.0,...,1,6,12,48,0 days 00:08:49.357000,-1 days +23:51:10.643000,0.999710,0.999144,A_DECLINED,4
130717,10138.0,A_DECLINED,2012-02-10 12:41:38.050000+00:00,2012-01-09 17:40:12.748000+01:00,199345.0,8500.0,51.0,1.0,0.0,0.0,...,1,6,12,38,0 days 00:00:00,0 days 00:00:00,0.999710,0.999144,O_DECLINED,4
130718,10138.0,O_DECLINED,2012-02-10 12:41:38.050000+00:00,2012-01-09 17:40:12.748000+01:00,199345.0,8500.0,52.0,1.0,0.0,0.0,...,1,6,12,38,0 days 00:00:03.898000,-1 days +23:59:56.102000,0.999710,0.999144,W_Valideren aanvraag,4


# Feature selection

In [19]:
df_train.columns

Index(['org:resource', 'concept:name', 'time:timestamp', 'case:REG_DATE',
       'case:concept:name', 'case:AMOUNT_REQ', 'case_occurrence_no',
       'lifecycle:transition_is_COMPLETE', 'lifecycle:transition_is_SCHEDULE',
       'lifecycle:transition_is_START', 'time_diff', 'position',
       'future_time_diff', 'day_of_month', 'month_no', 'quarter', 'week',
       'hour', 'second', 'next_activity_delta_t', 'past_activity_delta_t',
       'norm_next_activity_delta', 'norm_past_activity_delta',
       'next:concept:name', 'day'],
      dtype='object')

In [20]:
df_train

Unnamed: 0,org:resource,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ,case_occurrence_no,lifecycle:transition_is_COMPLETE,lifecycle:transition_is_SCHEDULE,lifecycle:transition_is_START,...,quarter,week,hour,second,next_activity_delta_t,past_activity_delta_t,norm_next_activity_delta,norm_past_activity_delta,next:concept:name,day
0,112.0,A_SUBMITTED,2011-09-30 22:38:44.546000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,0.0,1.0,0.0,0.0,...,3,39,22,44,0 days 00:00:00.334000,-1 days +23:59:59.666000,0.999710,0.999144,A_PARTLYSUBMITTED,4
1,112.0,A_PARTLYSUBMITTED,2011-09-30 22:38:44.880000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,1.0,1.0,0.0,0.0,...,3,39,22,44,0 days 00:00:53.026000,-1 days +23:59:06.974000,0.999710,0.999144,A_PREACCEPTED,4
2,112.0,A_PREACCEPTED,2011-09-30 22:39:37.906000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,2.0,1.0,0.0,0.0,...,3,39,22,37,0 days 00:00:00.969000,-1 days +23:59:59.031000,0.999710,0.999144,W_Completeren aanvraag,4
3,112.0,W_Completeren aanvraag,2011-09-30 22:39:38.875000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,3.0,0.0,1.0,0.0,...,3,39,22,38,0 days 10:57:07.562000,-1 days +13:02:52.438000,0.999715,0.999140,W_Completeren aanvraag,4
4,,W_Completeren aanvraag,2011-10-01 09:36:46.437000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,4.0,0.0,0.0,1.0,...,4,39,9,46,0 days 00:05:56.871000,-1 days +23:54:03.129000,0.999710,0.999144,A_ACCEPTED,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130715,10138.0,W_Valideren aanvraag,2012-02-10 12:02:04.765000+00:00,2012-01-09 17:40:12.748000+01:00,199345.0,8500.0,49.0,1.0,0.0,0.0,...,1,6,12,4,0 days 00:30:43.928000,-1 days +23:29:16.072000,0.999711,0.999144,W_Valideren aanvraag,4
130716,10138.0,W_Valideren aanvraag,2012-02-10 12:32:48.693000+00:00,2012-01-09 17:40:12.748000+01:00,199345.0,8500.0,50.0,0.0,0.0,1.0,...,1,6,12,48,0 days 00:08:49.357000,-1 days +23:51:10.643000,0.999710,0.999144,A_DECLINED,4
130717,10138.0,A_DECLINED,2012-02-10 12:41:38.050000+00:00,2012-01-09 17:40:12.748000+01:00,199345.0,8500.0,51.0,1.0,0.0,0.0,...,1,6,12,38,0 days 00:00:00,0 days 00:00:00,0.999710,0.999144,O_DECLINED,4
130718,10138.0,O_DECLINED,2012-02-10 12:41:38.050000+00:00,2012-01-09 17:40:12.748000+01:00,199345.0,8500.0,52.0,1.0,0.0,0.0,...,1,6,12,38,0 days 00:00:03.898000,-1 days +23:59:56.102000,0.999710,0.999144,W_Valideren aanvraag,4


In [21]:
X_train_processed_num = df_train[['case:AMOUNT_REQ']]
X_train_processed_cat = df_train[['lifecycle:transition_is_COMPLETE', 'lifecycle:transition_is_SCHEDULE',
       'lifecycle:transition_is_START', 'concept:name']]
y_train_1 = df_train[['time:timestamp']]
y_train_2 = df_train[['concept:name']]

# One-hot encoding on categorical data
enc = OneHotEncoder(handle_unknown = 'ignore', sparse=False)
transformed = enc.fit_transform(X_train_processed_cat)
X_train_processed_cat = pd.DataFrame(transformed, columns = enc.get_feature_names())
X_train_processed = pd.concat([X_train_processed_cat, X_train_processed_num], axis = 1)



In [22]:
X_train_processed_num = df_train[['case:AMOUNT_REQ']]
X_train_processed_cat = df_train[['lifecycle:transition_is_COMPLETE', 'lifecycle:transition_is_SCHEDULE',
       'lifecycle:transition_is_START', 'concept:name']]

y_train_1 = df_train[['future_time_diff']]
y_train_2 = df_train[['next:concept:name']]

# One-hot encoding on categorical data
enc = OneHotEncoder(handle_unknown = 'ignore', sparse=False)
transformed = enc.fit_transform(X_train_processed_cat)
X_train_processed_cat = pd.DataFrame(transformed, columns = enc.get_feature_names())
X_train_processed = pd.concat([X_train_processed_cat, X_train_processed_num], axis = 1)



In [23]:
# Find the score for each variable for time prediction
skb_time = SelectKBest(score_func = f_regression)
skb_time.fit_transform(X_train_processed, y_train_1)
score_dct_time = dict(zip(X_train_processed.columns.tolist(), skb_time.scores_.round(decimals = 1).tolist()))
df_time_score = pd.DataFrame(list(score_dct_time.items()))
df_time_score.rename(columns = {0: 'variable', 1: 'score'}, inplace = True)
df_time_score = df_time_score.sort_values(by = ['score'], ascending = False).reset_index(drop = True)
df_time_score

  y = column_or_1d(y, warn=True)


Unnamed: 0,variable,score
0,x3_W_Nabellen offertes,7915.8
1,x0_0.0,4338.4
2,x0_1.0,4338.4
3,x2_0.0,3019.9
4,x2_1.0,3019.9
5,x3_W_Completeren aanvraag,748.3
6,x1_0.0,571.2
7,x1_1.0,571.2
8,x3_W_Valideren aanvraag,499.8
9,x3_A_SUBMITTED,434.5


In [24]:
# Find the score for each variable for event prediction
skb_event = SelectKBest(score_func = f_classif)
skb_event.fit_transform(X_train_processed, y_train_2)
score_dct_event = dict(zip(X_train_processed.columns.tolist(), skb_event.scores_.round(decimals = 1).tolist()))
df_event_score = pd.DataFrame(list(score_dct_event.items()))
df_event_score.rename(columns = {0: 'variable', 1: 'score'}, inplace = True)
df_event_score = df_event_score.sort_values(by = ['score'], ascending = False).reset_index(drop = True)
df_event_score

  y = column_or_1d(y, warn=True)
  f = msb / msw


Unnamed: 0,variable,score
0,x3_A_SUBMITTED,inf
1,x3_O_CREATED,inf
2,x3_W_Nabellen incomplete dossiers,7315.9
3,x3_W_Completeren aanvraag,5598.9
4,x3_W_Nabellen offertes,5357.6
5,x3_O_SELECTED,4935.5
6,x3_A_PARTLYSUBMITTED,4451.1
7,x3_A_ACCEPTED,3701.3
8,x3_A_FINALIZED,3200.0
9,x3_W_Beoordelen fraude,2673.5


Remove outliers

In [25]:
# Remove outlier on both training and validation data
df_all = pd.concat([df_train, df_val])
df_all = df_all.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)

def find_outlier(process_name, df):
    # Remove outlier having time_diff larger than mean +- 3 * SD
    df_needed = df[(df['concept:name'] == process_name)]
    mean_value = df_needed['time_diff'].mean()
    std_value = df_needed['time_diff'].std()
    upper_bound =  mean_value + 3 * std_value
    lower_bound = mean_value - 3 * std_value
    new_df = df_needed[(df_needed['time_diff'] < lower_bound) | (df_needed['time_diff'] > upper_bound)]
    # Return case id that has at least 1 process as outlier
    return new_df['case:concept:name'].tolist()

outlier_lst = []
# i refers to the position number
for i in tqdm(range(2, len(df_all['position'].tolist()))):
    df_pos = df_all[df_all['position'] == i]
    # a refers to the concept name per position number
    for a in df_pos['concept:name'].unique().tolist():
        small_outlier_lst = find_outlier(a, df_pos)
        outlier_lst = list(set(outlier_lst + small_outlier_lst))

len(outlier_lst)

100%|██████████| 162939/162939 [05:16<00:00, 514.37it/s]


2289

In [26]:
# Remove all outliers
df_filtered = df_all[~df_all['case:concept:name'].isin(outlier_lst)]
final_all_train = sorted(df_filtered['case:concept:name'].unique().tolist())

# Split training and validation dataset
final_train, final_val = train_test_split(final_all_train, test_size = 0.2)
df_train = df_filtered[df_filtered['case:concept:name'].isin(final_train)]
df_val = df_filtered[df_filtered['case:concept:name'].isin(final_val)]

# To make sure, again sort the datasets on case and consequently timestamp, then reset the index
df_train = df_train.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)
df_val = df_val.sort_values(by = ['case:concept:name', 'time:timestamp']).reset_index(drop = True)

In [27]:
df_train

Unnamed: 0,org:resource,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ,case_occurrence_no,lifecycle:transition_is_COMPLETE,lifecycle:transition_is_SCHEDULE,lifecycle:transition_is_START,...,quarter,week,hour,second,next_activity_delta_t,past_activity_delta_t,norm_next_activity_delta,norm_past_activity_delta,next:concept:name,day
0,112.0,A_SUBMITTED,2011-09-30 22:38:44.546000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,0.0,1.0,0.0,0.0,...,3,39,22,44,0 days 00:00:00.334000,-1 days +23:59:59.666000,0.999710,0.999144,A_PARTLYSUBMITTED,4
1,112.0,A_PARTLYSUBMITTED,2011-09-30 22:38:44.880000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,1.0,1.0,0.0,0.0,...,3,39,22,44,0 days 00:00:53.026000,-1 days +23:59:06.974000,0.999710,0.999144,A_PREACCEPTED,4
2,112.0,A_PREACCEPTED,2011-09-30 22:39:37.906000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,2.0,1.0,0.0,0.0,...,3,39,22,37,0 days 00:00:00.969000,-1 days +23:59:59.031000,0.999710,0.999144,W_Completeren aanvraag,4
3,112.0,W_Completeren aanvraag,2011-09-30 22:39:38.875000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,3.0,0.0,1.0,0.0,...,3,39,22,38,0 days 10:57:07.562000,-1 days +13:02:52.438000,0.999715,0.999140,W_Completeren aanvraag,4
4,,W_Completeren aanvraag,2011-10-01 09:36:46.437000+00:00,2011-10-01 00:38:44.546000+02:00,173688.0,20000.0,4.0,0.0,0.0,1.0,...,4,39,9,46,0 days 00:05:56.871000,-1 days +23:54:03.129000,0.999710,0.999144,A_ACCEPTED,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64629,112.0,A_PARTLYSUBMITTED,2012-01-09 16:19:49.173000+00:00,2012-01-09 17:19:48.940000+01:00,199336.0,12000.0,1.0,1.0,0.0,0.0,...,1,2,16,49,0 days 00:00:39.421000,-1 days +23:59:20.579000,0.999710,0.999144,A_DECLINED,0
64630,112.0,A_DECLINED,2012-01-09 16:20:28.594000+00:00,2012-01-09 17:19:48.940000+01:00,199336.0,12000.0,2.0,1.0,0.0,0.0,...,1,2,16,28,0 days 00:16:59.744000,-1 days +23:43:00.256000,0.999711,0.999144,Nothing,0
64631,112.0,A_SUBMITTED,2012-01-09 16:37:34.249000+00:00,2012-01-09 17:37:34.249000+01:00,199342.0,11500.0,0.0,1.0,0.0,0.0,...,1,2,16,34,0 days 00:00:00.500000,-1 days +23:59:59.500000,0.999710,0.999144,A_PARTLYSUBMITTED,0
64632,112.0,A_PARTLYSUBMITTED,2012-01-09 16:37:34.749000+00:00,2012-01-09 17:37:34.249000+01:00,199342.0,11500.0,1.0,1.0,0.0,0.0,...,1,2,16,34,0 days 00:00:36.401000,-1 days +23:59:23.599000,0.999710,0.999144,A_DECLINED,0


# Export data

In [None]:
df_train.to_csv('bpi2012_train_filtered.csv', index = False)
df_val.to_csv('bpi2012_val_filtered.csv', index = False)
df_test.to_csv('bpi2012_test_filtered.csv', index = False)

In [None]:
from google.colab import files
files.download('bpi2012_train_filtered.csv')
files.download('bpi2012_val_filtered.csv')
files.download('bpi2012_test_filtered.csv')