In [None]:
import os
from os.path import dirname
# Project root = two directory levels above the notebook's working directory.
root_path = dirname(dirname(os.getcwd()))
print(root_path)
import sys
# Must run before the `Event_log_processing_utils` import below: the scripts
# folder is added to the module search path here (presumably where that
# module lives -- verify against the repository layout).
sys.path.append(root_path + '/RemainingCycleTimePrediction/2_Scripts/')
import pandas as pd
import numpy as np
import time, datetime

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import Subset
from sklearn.model_selection import train_test_split

from Event_log_processing_utils import Extract_trace_and_temporal_features, Extract_prefix
from sklearn.preprocessing import OneHotEncoder

import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation notices from torch / sklearn.
warnings.filterwarnings("ignore")


# Both paths end with '/' so file names can be appended by plain concatenation.
data_dir = root_path + '/RemainingCycleTimePrediction/1_Data/'
project_dir = root_path + '/RemainingCycleTimePrediction/'

## Load data

In [None]:
# Name of the event log to process. It is used as the prefix of the CSV files
# that are loaded and as part of the output folder / file names.
# Swap the commented line to run on the other data set.
# data_name = 'BPIC20'
data_name = 'Helpdesk'

In [None]:
# Three pre-processed views of the same event log:
#   tab_all   - used below for the activity vocabulary and the longest trace,
#   tab_train - used for the normalisation constants and for model fitting,
#   tab_test  - used only for the final evaluation.
tab_all = pd.read_csv(data_dir+data_name+"_processed_all.csv")
tab_train= pd.read_csv(data_dir+data_name+"_processed_train.csv")
tab_test = pd.read_csv(data_dir+data_name+"_processed_test.csv")

In [None]:
# Sanity check: display the resolved path of the full-log CSV.
data_dir+data_name+"_processed_all.csv"

## Prepare inputs and outputs for model training

In [None]:
def Prepare_X_Y_remaining_time(tab, list_activities, encoder, divisor, divisor2, divisor_rt):
    """Build one chain graph per trace prefix plus its scaled remaining time.

    Parameters
    ----------
    tab : event-log table handed unchanged to
        ``Extract_trace_and_temporal_features``.
    list_activities : list of activity labels; only its length is used here
        (it fixes the width of the one-hot part of the node features).
    encoder : fitted encoder whose ``transform`` returns an object with
        ``toarray()`` (e.g. a scikit-learn ``OneHotEncoder``).
    divisor, divisor2 : scaling constants for the first two temporal features.
    divisor_rt : scaling constant for the remaining-time target.

    Returns
    -------
    ([list_features, list_edge_idx, list_edge_weight], targets)
        * ``list_features[k]``  - float tensor ``(prefix_len, n_activities + 5)``:
          one-hot activity, 1-based position in the prefix, then the four
          temporal features. The third and fourth are divided by 86400 and 7
          (presumably seconds-in-day and days-in-week -- confirm against
          ``Extract_trace_and_temporal_features``).
        * ``list_edge_idx[k]``  - long tensor ``(2, prefix_len - 1)`` linking
          every event to its successor.
        * ``list_edge_weight[k]`` - long tensor ``(prefix_len - 1,)`` with
          0 when the next activity equals the current one, -1 when the next
          activity already occurred earlier in the prefix, and 1 otherwise.
        * ``targets`` - 1-D tensor of ``outputs[2] / divisor_rt``.
    """
    lines, lines_t, lines_t2, lines_t3, lines_t4 = Extract_trace_and_temporal_features(tab)
    prefixes, outputs = Extract_prefix(lines, lines_t, lines_t2, lines_t3, lines_t4)
    num_activities = len(list_activities)
    list_features = []
    list_edge_idx = []
    list_edge_weight = []
    list_rt = []
    for i, sentence in enumerate(prefixes[0]):
        list_rt.append(outputs[2][i]/divisor_rt)
        x = torch.zeros(len(sentence), num_activities+5)
        edge_weight = []
        edge_idx = []
        sentence_t = prefixes[1][i]
        sentence_t2 = prefixes[2][i]
        sentence_t3 = prefixes[3][i]
        sentence_t4 = prefixes[4][i]
        for j, char in enumerate(sentence):
            x[j, :num_activities] = torch.from_numpy(encoder.transform(np.array([[char]])).toarray()[0])
            x[j, num_activities] = j+1
            x[j, num_activities+1] = sentence_t[j]/divisor
            x[j, num_activities+2] = sentence_t2[j]/divisor2
            x[j, num_activities+3] = sentence_t3[j]/86400
            x[j, num_activities+4] = sentence_t4[j]/7
            if j < len(sentence)-1:
                if sentence[j] == sentence[j+1]:
                    weight = 0
                elif sentence[j+1] in sentence[:j+1]:
                    weight = -1
                else:
                    weight = 1
                edge_weight.append(weight)
                edge_idx.append([j, j+1])
        list_features.append(x)
        # A one-event prefix has no edges. Forcing the dtype and the
        # (-1, 2) shape keeps the result a valid integer [2, 0] edge index
        # instead of an empty float tensor of shape (0,).
        list_edge_idx.append(torch.tensor(edge_idx, dtype=torch.long).reshape(-1, 2).t())
        list_edge_weight.append(torch.tensor(edge_weight, dtype=torch.long))
    return [list_features, list_edge_idx, list_edge_weight], torch.tensor(list_rt)

class EventLogData(Dataset):
    """Dataset of prefix graphs with one regression target per graph.

    ``input_x`` is a 3-item sequence ``[features, edge_indices, edge_weights]``
    of equally long lists; ``output`` holds one target value per graph.
    """

    def __init__ (self, input_x, output):
        self.X = input_x[0]  # per-graph node-feature tensors
        self.A = input_x[1]  # per-graph edge-index tensors
        self.V = input_x[2]  # per-graph edge-weight tensors
        # as_tensor keeps accepting tensors and additionally accepts plain
        # lists / NumPy arrays; targets are stored as a float32 column vector.
        self.y = torch.as_tensor(output, dtype=torch.float32)
        self.y = self.y.reshape((len(self.y),1))

    def __len__(self):
        """Return the number of graphs in the dataset."""
        return len(self.X)

    def __getitem__ (self,idx):
        """Return ``[[features, edge_index, edge_weight], target]`` for ``idx``."""
        return [[self.X[idx], self.A[idx], self.V[idx]],self.y[idx]]

    def get_splits(self, n_valid = 0.2, random_state = None):
        """Randomly split the dataset into training and validation subsets.

        Parameters
        ----------
        n_valid : forwarded to ``train_test_split`` as ``test_size``.
        random_state : optional seed forwarded to ``train_test_split``; the
            default ``None`` keeps the previous unseeded behaviour, any fixed
            value makes the split repeatable.

        Returns
        -------
        (train, valid) : two ``torch.utils.data.Subset`` views of this dataset.
        """
        train_idx,valid_idx = train_test_split(list(range(len(self.X))),test_size = n_valid,
                                               shuffle = True, random_state = random_state)
        train = Subset(self, train_idx)
        valid = Subset(self, valid_idx)
        return train, valid
    
def my_collate(batch):
    """Regroup a batch of ``(inputs, target)`` samples into two parallel lists.

    Graphs in a batch differ in size, so nothing is stacked: the result is
    ``[list_of_inputs, list_of_targets]`` in the original sample order.
    """
    graphs = []
    labels = []
    for sample in batch:
        graphs.append(sample[0])
        labels.append(sample[1])
    return [graphs, labels]

In [None]:
# Activity vocabulary taken from the complete log.
list_activities = list(tab_all["Activity"].unique())
# One-hot encoder fitted on the whole vocabulary (one row per activity).
encoder = OneHotEncoder(handle_unknown='ignore')
activity_column = np.array(list_activities).reshape((len(list_activities), 1))
encoder.fit(activity_column)

# Longest trace in the complete log.
lines, lines_t, lines_t2, lines_t3, lines_t4 = Extract_trace_and_temporal_features(tab_all)
maxlen = max(len(trace) for trace in lines)

# From here on the trace variables refer to the training split only, so the
# scaling constants below are derived from training data.
lines, lines_t, lines_t2, lines_t3, lines_t4 = Extract_trace_and_temporal_features(tab_train)

# Average time between events.
divisor = np.mean([value for trace_times in lines_t for value in trace_times])
print('divisor: {}'.format(divisor))

# Average time between the current and the first event.
divisor2 = np.mean([value for trace_times in lines_t2 for value in trace_times])
print('divisor2: {}'.format(divisor2))

# Average of the third output returned by Extract_prefix, used to scale targets.
prefixes, outputs = Extract_prefix(lines, lines_t, lines_t2, lines_t3, lines_t4)
divisor_rt = np.mean(outputs[2])
print('divisor_rt: {}'.format(divisor_rt))

In [None]:
# Training log -> graphs, then a random train / validation split.
X_train, Y_train = Prepare_X_Y_remaining_time(
    tab_train, list_activities, encoder, divisor, divisor2, divisor_rt
)
train_dataset = EventLogData(X_train, Y_train)
train, valid = train_dataset.get_splits()
train_dl = DataLoader(dataset=train, batch_size=16, shuffle=True, collate_fn=my_collate)
valid_dl = DataLoader(dataset=valid, batch_size=1, shuffle=False, collate_fn=my_collate)

# Test log -> graphs, scaled with the same training-derived constants.
X_test, Y_test = Prepare_X_Y_remaining_time(
    tab_test, list_activities, encoder, divisor, divisor2, divisor_rt
)
test = EventLogData(X_test, Y_test)
test_dl = DataLoader(dataset=test, batch_size=1, shuffle=False, collate_fn=my_collate)

In [None]:
# Number of prefix graphs built from the training log (before the
# train / validation split).
len(X_train[0])

In [None]:
# Number of prefix graphs built from the test log.
len(X_test[0])

In [None]:
# Number of validation batches; valid_dl uses batch_size=1, so this is also
# the number of validation samples.
len(valid_dl)

## Train GGNN model

In [None]:
import torch.nn as nn
from torch_geometric.nn import GatedGraphConv, global_mean_pool

In [None]:
class TimePredictor(torch.nn.Module):
    """Gated graph network followed by an MLP that predicts one value per graph.

    Parameters
    ----------
    out_channels : width of the ``GatedGraphConv`` node state and, at the same
        time, the input width of the MLP head. The notebook passes the number
        of node features here.
    """

    def __init__(self, out_channels=15):
        super(TimePredictor, self).__init__()

        self.layer1 = GatedGraphConv(out_channels, num_layers=5)
        self.layer2 = nn.Sequential(
            nn.Dropout(),
            nn.Linear(out_channels,256),
            nn.ReLU(),
            nn.Linear(256,256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256,1),
        )

    def forward(self, x):
        """Predict one value for every graph in ``x``.

        ``x`` is an iterable of ``(node_features, edge_index, edge_weight)``
        triples, one per graph. The result has shape ``(num_graphs, 1)``.
        """
        # The graphs are plain tensors that never went through `.to(device)`.
        # Moving them to wherever the parameters live lets the same model run
        # on CPU and on GPU (a no-op when they are already on that device).
        target_device = next(self.parameters()).device
        pooled = []
        for X, A, V in x:
            node_states = self.layer1(X.to(target_device), A.to(target_device), V.to(target_device))
            # batch=None pools all nodes of this single graph into one row.
            pooled.append(global_mean_pool(node_states, batch = None))
        x = torch.stack(pooled)   # (num_graphs, 1, out_channels)
        x = x.squeeze(1)          # (num_graphs, out_channels)
        x = self.layer2(x)
        return x

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
save_folder = project_dir + '4_Outputs/Output_files/'+data_name+'_GGNN'
# makedirs creates every missing parent folder and is a no-op when the
# folder already exists, so a fresh checkout does not fail here.
os.makedirs(save_folder, exist_ok=True)

num_features = len(list_activities)+5
num_epochs = 100
early_stop_patience = 10   # epochs without improvement before a run stops
min_delta = 0              # minimum drop in validation loss that counts as improvement
lr_value = 1e-03
num_runs = 5               # independent trainings, each saved as its own checkpoint
running_time = []          # wall-clock seconds per run
for run in range(num_runs):
    start=datetime.datetime.now()
    print("Run: {}".format(run+1))
    model = TimePredictor(num_features)
    criterion = nn.L1Loss()
    optimizer = torch.optim.Adam(model.parameters(),lr=lr_value)

    model = model.to(device)
    best_model_path = '{}/best_model_run_{}.pt'.format(save_folder,run+1)
    epochs_plt = []
    mae_plt = []
    valid_loss_plt = []
    not_improved_count = 0

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        training_loss = 0
        num_train = 0

        for inputs, targets in train_dl:
            # The collate function returns plain lists of CPU tensors, so both
            # the graphs and the targets are moved to the model's device here.
            inputs = [[part.to(device) for part in graph] for graph in inputs]
            targets = torch.stack(targets).to(device)
            optimizer.zero_grad() # Clearing the gradients
            yhat = model(inputs)
            loss = criterion(yhat.reshape((1,-1)),targets.reshape((1,-1)))
            loss.backward()
            optimizer.step()

            training_loss+= loss.item()
            num_train+=1

        # ---- validation pass (no gradients, dropout disabled) ----
        with torch.no_grad():
            model.eval()
            num_valid = 0
            validation_loss = 0
            for inputs, targets in valid_dl:
                inputs = [[part.to(device) for part in graph] for graph in inputs]
                targets = torch.stack(targets).to(device)
                yhat_valid = model(inputs)
                loss_valid = criterion(yhat_valid.reshape((1,-1)),targets.reshape((1,-1)))
                validation_loss+= loss_valid.item()
                num_valid+= 1

        avg_training_loss = training_loss/num_train
        avg_validation_loss = validation_loss/num_valid
        print("Epoch: {}, Training MAE : {}, Validation loss : {}".format(epoch,avg_training_loss,avg_validation_loss))
        epochs_plt.append(epoch+1)
        mae_plt.append(avg_training_loss)
        valid_loss_plt.append(avg_validation_loss)

        # The first epoch always becomes the initial best; later epochs must
        # lower the validation loss by at least min_delta to replace it.
        if epoch == 0 or (best_loss - avg_validation_loss >= min_delta):
            torch.save(model.state_dict(), best_model_path)
            best_loss = avg_validation_loss
            not_improved_count = 0
        else:
            not_improved_count += 1

        # Early stopping
        if not_improved_count == early_stop_patience:
            print("Validation performance didn\'t improve for {} epochs. "
                            "Training stops.".format(early_stop_patience))
            break

    # One line per epoch: (epoch, training MAE, validation loss).
    filepath = '{}/Loss_'.format(save_folder)+data_name+'_run_{}.txt'.format(run+1)
    with open(filepath, 'w') as file:
        for item in zip(epochs_plt,mae_plt,valid_loss_plt):
            file.write("{}\n".format(item))
    running_time.append((datetime.datetime.now()-start).total_seconds())

In [None]:
# Wall-clock duration of every training run, in seconds.
running_time

In [None]:
# Average training time per run, in minutes.
np.mean(running_time)/60

## Evaluate the trained models

In [None]:
def evaluate_model(model, data_loader=None):
    """Collect per-sample MAPE and MAE, grouped by prefix length.

    Parameters
    ----------
    model : network called as ``model(inputs)`` on each batch.
    data_loader : iterable yielding ``(inputs, targets)`` batches in the
        format produced by ``my_collate``. Defaults to the notebook-level
        ``test_dl``.

    Returns
    -------
    dict mapping prefix length (number of nodes of the first graph in the
    batch) to a list of ``[mape_percent, mae]`` float pairs, one per batch.
    Errors are in the scaled target unit; the caller rescales the MAE.

    Notes
    -----
    * The grouping key is read from the first graph only, so the loader is
      expected to use ``batch_size=1`` (as ``test_dl`` does).
    * A target of exactly 0 makes the MAPE of that sample ``inf``/``nan``.
    """
    if data_loader is None:
        data_loader = test_dl

    # Work on whatever device the model's weights live on, so a model that was
    # loaded on the CPU is not fed targets that sit on the GPU (or vice versa).
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    criterion = nn.L1Loss()
    err_dict = {}
    with torch.no_grad():
        model.eval()
        for inputs, targets in data_loader:
            prefix_len = inputs[0][0].size(0)
            inputs = [[part.to(model_device) for part in graph] for graph in inputs]
            targets = torch.stack(
                [torch.as_tensor(t, dtype=torch.float32) for t in targets]
            ).to(model_device).reshape((1,-1))
            yhat = model(inputs).reshape((1,-1))
            # Stored as plain floats so the result can be turned into a tensor
            # later without mixing tensors and Python numbers.
            loss_mape = (torch.abs((targets - yhat)/targets)*100).mean().item()
            loss_mae = criterion(yhat, targets).item()
            err_dict.setdefault(prefix_len, []).append([loss_mape, loss_mae])
    return err_dict

In [None]:
# err_total_dict[prefix_len] -> one tensor [MAPE (%), MAE (days)] per run.
err_total_dict = {}
print(save_folder)
# Evaluate exactly the checkpoints written by the training cell.
for run in range(num_runs):
    print("Run: {}".format(run+1))
    trained_model = TimePredictor(num_features)
    trained_model.load_state_dict(torch.load('{}/best_model_run_{}.pt'.format(save_folder,run+1),
                                         map_location=torch.device(device)))
    err_dict = evaluate_model(trained_model)

    for key, errors in err_dict.items():
        # Average [MAPE, MAE] over all test samples of this prefix length.
        err = torch.mean(torch.tensor(errors), axis = 0)
        # Undo the target scaling (divisor_rt), then divide by 86400 to
        # report the MAE in days.
        err_total_dict.setdefault(key, []).append(torch.tensor([err[0], err[1]*divisor_rt/86400]))

In [None]:
# Count how many test samples exist for every prefix length.
num_samples_dict = {}
for i, (inputs, targets) in enumerate(test_dl):
    key = inputs[0][0].size(0)  # number of nodes of the (single) graph in the batch
    num_samples_dict[key] = num_samples_dict.get(key, 0) + 1

In [None]:
# Peek at the first run's [MAPE, MAE] entry for one prefix length.
# NOTE(review): `key` is whatever value the most recently executed loop left
# behind, so the entry shown depends on cell execution order.
err_total_dict[key][0]

In [None]:
# Aggregate the per-run errors into one row per prefix length:
# mean and standard deviation across runs for both metrics.
list_prefix_len = []
list_num_samples = []
list_mape_err = []
list_mape_std = []
list_mae_err = []
list_mae_std = []
for key, run_errors in err_total_dict.items():
    # Shape (num_runs, 2): column 0 = MAPE (%), column 1 = MAE (days).
    stacked = torch.stack(run_errors)
    mean_err = stacked.mean(axis = 0)
    std_err = stacked.std(axis = 0)   # NaN when only a single run is available
    list_prefix_len.append(key)
    list_num_samples.append(num_samples_dict[key])
    list_mape_err.append(round(mean_err[0].item(), 3))
    list_mape_std.append(round(std_err[0].item(), 3))
    list_mae_err.append(round(mean_err[1].item(), 3))
    list_mae_std.append(round(std_err[1].item(), 3))
tab_result = pd.DataFrame({"Prefix length":list_prefix_len, "Num samples": list_num_samples, 
                           "MAPE(%)":list_mape_err, "MAPE std": list_mape_std,
                           "MAE(days)": list_mae_err, "MAE std": list_mae_std})

# makedirs also creates missing parent folders and tolerates an existing one.
os.makedirs(project_dir + '4_Outputs/Evaluation', exist_ok=True)
tab_result.to_csv(project_dir+"4_Outputs/Evaluation/"+data_name+"_GGNN_eval.csv", index = False)