# Homework 4: Predictive Process Monitoring

### Task 1.

In [2]:
import pandas as pd
import numpy as np

df_org = pd.read_csv('BPI_Challenge_2012_filtered.csv')

In [3]:
df = df_org.sort_values(['case_id', 'Start_Time', 'End_Time'])

In [4]:
df['Start_Time'] = pd.to_datetime(df['Start_Time'])
df['End_Time'] = pd.to_datetime(df['End_Time'])

In [5]:
df['remtime'] = (df.groupby('case_id')['End_Time'].transform('max') - df.groupby('case_id')['Start_Time'].transform('min')) // pd.Timedelta('1s')

In [6]:
df.head()

Unnamed: 0,case_id,AMOUNT_REQ,resource,activity,Start_Time,End_Time,REG_DATE,remtime
8272,195455,5000,112.0,A_SUBMITTED,2011-12-24 11:18:27.336,2011-12-24 11:18:27.336,2011-12-24T11:18:27.336,861917
8273,195455,5000,112.0,A_PARTLYSUBMITTED,2011-12-24 11:18:29.367,2011-12-24 11:18:29.367,2011-12-24T11:18:27.336,861917
8274,195455,5000,112.0,A_PREACCEPTED,2011-12-24 11:19:07.492,2011-12-24 11:19:07.492,2011-12-24T11:18:27.336,861917
8275,195455,5000,10913.0,W_Completeren aanvraag,2011-12-24 13:52:01.453,2011-12-24 13:52:12.990,2011-12-24T11:18:27.336,861917
8276,195455,5000,10913.0,W_Completeren aanvraag,2011-12-24 15:04:35.228,2011-12-24 15:13:54.172,2011-12-24T11:18:27.336,861917


### Task 2.

In [7]:
df['start_month'] = df['Start_Time'].dt.month
df['end_month'] = df['End_Time'].dt.month

df['start_day'] = df['Start_Time'].dt.weekday
df['end_day'] = df['End_Time'].dt.weekday

df['start_reltime'] = (df['Start_Time'] - pd.to_datetime(df['Start_Time'].dt.date)) // pd.Timedelta('1s')
df['end_reltime'] = (df['End_Time'] - pd.to_datetime(df['End_Time'].dt.date)) // pd.Timedelta('1s')

In [8]:
df.head()

Unnamed: 0,case_id,AMOUNT_REQ,resource,activity,Start_Time,End_Time,REG_DATE,remtime,start_month,end_month,start_day,end_day,start_reltime,end_reltime
8272,195455,5000,112.0,A_SUBMITTED,2011-12-24 11:18:27.336,2011-12-24 11:18:27.336,2011-12-24T11:18:27.336,861917,12,12,5,5,40707,40707
8273,195455,5000,112.0,A_PARTLYSUBMITTED,2011-12-24 11:18:29.367,2011-12-24 11:18:29.367,2011-12-24T11:18:27.336,861917,12,12,5,5,40709,40709
8274,195455,5000,112.0,A_PREACCEPTED,2011-12-24 11:19:07.492,2011-12-24 11:19:07.492,2011-12-24T11:18:27.336,861917,12,12,5,5,40747,40747
8275,195455,5000,10913.0,W_Completeren aanvraag,2011-12-24 13:52:01.453,2011-12-24 13:52:12.990,2011-12-24T11:18:27.336,861917,12,12,5,5,49921,49932
8276,195455,5000,10913.0,W_Completeren aanvraag,2011-12-24 15:04:35.228,2011-12-24 15:13:54.172,2011-12-24T11:18:27.336,861917,12,12,5,5,54275,54834


### Task 3.

In [9]:
def split_data_strict(data, timestamp_col, case_id_col, train_ratio, split="temporal"):  
    data = data.sort_values([timestamp_col, 'activity'], ascending=True, kind='mergesort')
    grouped = data.groupby(case_id_col)
    start_timestamps = grouped[timestamp_col].min().reset_index()
    start_timestamps = start_timestamps.sort_values(timestamp_col, ascending=True, kind='mergesort')
    train_ids = list(start_timestamps[case_id_col])[:int(train_ratio*len(start_timestamps))]
    train = data[data[case_id_col].isin(train_ids)].sort_values([timestamp_col, 'activity'], ascending=True, kind='mergesort')
    test = data[~data[case_id_col].isin(train_ids)].sort_values([timestamp_col, 'activity'], ascending=True, kind='mergesort')
    split_ts = test[timestamp_col].min()
    train = train[train[timestamp_col] < split_ts]
    return (train, test)

train, test = split_data_strict(df, 'End_Time', 'case_id', 0.8, split="temporal")

In [10]:
def get_pos_case_length_quantile(data, quantile=0.90):
    return int(np.ceil(data.groupby('case_id').size().quantile(quantile)))

min_prefix_length = 1
max_prefix_length = min(40, get_pos_case_length_quantile(df, 0.90))

In [11]:
from sklearn.model_selection import StratifiedKFold

In [12]:
def generate_prefix_data(data, min_length, max_length, gap=1):

    data['case_length'] = data.groupby('case_id')['activity'].transform(len)
    dt_prefixes = data[data['case_length'] >= min_length].groupby('case_id').head(min_length)
    dt_prefixes["prefix_nr"] = 1
    dt_prefixes["orig_case_id"] = dt_prefixes['case_id']   
    for nr_events in range(min_length+gap, max_length+1, gap):
        tmp = data[data['case_length'] >= nr_events].groupby('case_id').head(nr_events)
        tmp["orig_case_id"] = tmp['case_id']
        tmp['case_id'] = tmp['case_id'].apply(lambda x: "%s_%s"%(x, nr_events))
        tmp["prefix_nr"] = nr_events
        dt_prefixes = pd.concat([dt_prefixes, tmp], axis=0)
    dt_prefixes['case_length'] = dt_prefixes['case_length'].apply(lambda x: min(max_length, x))
    dt_prefixes = dt_prefixes.sort_values(['case_id', 'prefix_nr'], ascending=[True, True])
    return dt_prefixes

In [13]:
dt_train_prefixes = generate_prefix_data(train, min_prefix_length, max_prefix_length)
dt_test_prefixes = generate_prefix_data(test, min_prefix_length, max_prefix_length)

In [14]:
import BucketFactory

random_state=1234
bucketer_args = {'encoding_method': 'last', 
                 'case_id_col': 'case_id', 
                 'cat_cols':['activity'], 
                 'num_cols':[], 
                 'random_state':random_state}

bucket_method = 'prefix'
if bucket_method == "cluster":
    bucketer_args["n_clusters"] = 5
bucketer = BucketFactory.get_bucketer(bucket_method, **bucketer_args)
bucket_assignments_train = bucketer.fit_predict(dt_train_prefixes)
bucket_assignments_test = bucketer.predict(dt_test_prefixes)

for bucket_number in bucketer.n_states:
    bucket_indexes = dt_train_prefixes.groupby('case_id').first().index
    bucket_indexes = bucket_indexes[bucket_assignments_train == bucket_number]
    bucket_data = dt_train_prefixes[dt_train_prefixes['case_id'].isin(bucket_indexes)]
    bucket_data
    def get_label_numeric(data):
        y = data.groupby('case_id').first()['remtime']
        return y
    train_y = get_label_numeric(bucket_data)
    
bucket_number = 2  
bucket_indexes = dt_train_prefixes.groupby('case_id').first().index
bucket_indexes = bucket_indexes[bucket_assignments_train == bucket_number]
print(bucket_indexes)
bucket_data = dt_train_prefixes[dt_train_prefixes['case_id'].isin(bucket_indexes)]

def get_label_numeric(data):
    y = data.groupby('case_id').first()['remtime']
    return y
train_y = get_label_numeric(bucket_data)
bucket_indexes = dt_test_prefixes.groupby('case_id').first().index
bucket_indexes = bucket_indexes[bucket_assignments_test == bucket_number]
bucket_data_test = dt_test_prefixes[dt_test_prefixes['case_id'].isin(bucket_indexes)]

test_y = get_label_numeric(bucket_data_test)

Index(['195455_2', '195458_2', '195461_2', '195464_2', '195467_2', '195470_2',
       '195473_2', '195485_2', '195491_2', '195497_2',
       ...
       '200916_2', '200928_2', '200934_2', '200937_2', '200940_2', '200943_2',
       '200946_2', '200949_2', '200955_2', '200961_2'],
      dtype='object', name='case_id', length=1308)


In [15]:
import EncoderFactory
from sklearn.pipeline import FeatureUnion, Pipeline

encoding_dict = {
    "laststate": ["static", "last"],
    "agg": ["static", "agg"],
    "index": ["static", "index"],
    "combined": ["static", "last", "agg"]
}

methods = encoding_dict['combined']

In [17]:
log_schema = {'case_id_col': 'case_id',
'timestamp_col': ['Start_Time', 'End_Time', 'REG_DATE'],
'activity_col': 'activity',
'label': 'remtime',
'negative_outcome': 'deviant',
'static_cat_cols': ["resource"],
'static_num_cols': ["AMOUNT_REQ"],
'dynamic_cat_cols': ["activity", "resource"],
'dynamic_num_cols': ["remtime", 'start_month', 'end_month', 'start_day', 'end_day', 'start_reltime', 'end_reltime'],
}

feature_combiner = FeatureUnion([(method, EncoderFactory.get_encoder(method, **log_schema)) for method in methods])
feature_combiner
encoding = feature_combiner.fit_transform(bucket_data, train_y)

In [23]:
import xgboost as XGB

model = XGB.XGBRegressor(n_estimators=1000, learning_rate=0.02, n_jobs=2)
pipeline = Pipeline([('encoder', feature_combiner), ('cls', model)])
pipeline.fit(bucket_data, train_y)

Pipeline(steps=[('encoder',
                 FeatureUnion(transformer_list=[('static',
                                                 <transformers.StaticTransformer.StaticTransformer object at 0x00000274A190B460>),
                                                ('last',
                                                 <transformers.LastStateTransformer.LastStateTransformer object at 0x00000274A190B550>),
                                                ('agg',
                                                 <transformers.AggregateTransformer.AggregateTransformer object at 0x00000274A190B730>)])),
                ('cls',
                 XGBRegresso...
                              feature_types=None, gamma=0, gpu_id=-1,
                              grow_policy='depthwise', importance_type=None,
                              interaction_constraints='', learning_rate=0.02,
                              max_bin=256, max_cat_threshold=64,
                              max_cat_to_oneho

In [24]:
preds = pipeline.predict(bucket_data_test)
from sklearn.metrics import mean_absolute_error
score = mean_absolute_error(test_y, preds)
score

106.72122971071016

### Task 4.

In [25]:
log_schema_2 = {'case_id_col': 'case_id',
'timestamp_col': ['Start_Time', 'End_Time', 'REG_DATE'],
'activity_col': 'activity',
'label': 'remtime',
'negative_outcome': 'deviant',
'static_cat_cols': ["resource"],
'static_num_cols': ["AMOUNT_REQ"],
'dynamic_cat_cols': ["activity", "resource"],
'dynamic_num_cols': ["remtime"],
}

In [26]:
feature_combiner = FeatureUnion([(method, EncoderFactory.get_encoder(method, **log_schema_2)) for method in methods])
feature_combiner
encoding = feature_combiner.fit_transform(bucket_data, train_y)

In [28]:
model = XGB.XGBRegressor(n_estimators=1000, learning_rate=0.02, n_jobs=2)
pipeline = Pipeline([('encoder', feature_combiner), ('cls', model)])
pipeline.fit(bucket_data, train_y)

Pipeline(steps=[('encoder',
                 FeatureUnion(transformer_list=[('static',
                                                 <transformers.StaticTransformer.StaticTransformer object at 0x00000274A1916D00>),
                                                ('last',
                                                 <transformers.LastStateTransformer.LastStateTransformer object at 0x00000274A190B580>),
                                                ('agg',
                                                 <transformers.AggregateTransformer.AggregateTransformer object at 0x00000274A190B5B0>)])),
                ('cls',
                 XGBRegresso...
                              feature_types=None, gamma=0, gpu_id=-1,
                              grow_policy='depthwise', importance_type=None,
                              interaction_constraints='', learning_rate=0.02,
                              max_bin=256, max_cat_threshold=64,
                              max_cat_to_oneho

In [29]:
preds = pipeline.predict(bucket_data_test)
from sklearn.metrics import mean_absolute_error
score = mean_absolute_error(test_y, preds)
score

105.31946385672333

There was a minor improvement in the model by removing the extra variables. The extra variables impacted the models ability to predict and were not a necessary addition.