In [None]:
# Notebook setup: locate the repository root, make the project's helper scripts
# importable, and define the data / project directories used by later cells.
import os
from os.path import dirname
# Repository root is taken to be two directory levels above the current working directory.
root_path = dirname(dirname(os.getcwd()))
print(root_path)
import sys
# Needed so that `Event_log_processing_utils` (imported below) can be found.
sys.path.append(root_path + '/RemainingCycleTimePrediction/2_Scripts/')
import pandas as pd
import numpy as np
import time, datetime

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import Subset
from sklearn.model_selection import train_test_split

from Event_log_processing_utils import Extract_trace_and_temporal_features, Extract_prefix

import warnings
# Silences every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Folder holding the processed CSV event logs, and the project folder under which outputs are written.
data_dir = root_path + '/RemainingCycleTimePrediction/1_Data/'
project_dir = root_path + '/RemainingCycleTimePrediction/'

## Load data

In [None]:
# Dataset selector: used as the prefix of the input CSV file names and in the names
# of the output folder / result files. Swap the commented line to use the other log.
data_name = 'BPIC20'
# data_name = 'Helpdesk'

In [None]:
# Read the complete log plus its training and test partitions for the selected dataset.
tab_all = pd.read_csv(f"{data_dir}{data_name}_processed_all.csv")
tab_train = pd.read_csv(f"{data_dir}{data_name}_processed_train.csv")
tab_test = pd.read_csv(f"{data_dir}{data_name}_processed_test.csv")

## Prepare inputs and outputs for model training

In [None]:
def Prepare_X_Y_remaining_time(tab, list_activities, num_features, divisor, divisor2, divisor_rt):
    """Build per-prefix node-feature matrices and scaled remaining-time targets.

    Every prefix produced by ``Extract_prefix`` becomes one matrix of shape
    ``(len(list_activities), num_features)``. The row of each activity that occurs
    in the prefix holds the scaled temporal features of that event; rows of
    activities that do not occur stay zero. When an activity occurs more than once
    in a prefix, its row keeps the features of the last occurrence.

    Args:
        tab: Event-log table handed to ``Extract_trace_and_temporal_features``.
        list_activities: Activity labels; a label's position is its row index.
        num_features: Number of feature columns per row. Four temporal features
            are written to each row, so this has to be 4.
        divisor: Scale applied to the first temporal feature.
        divisor2: Scale applied to the second temporal feature.
        divisor_rt: Scale applied to the remaining-time target (``outputs[2]``).

    Returns:
        Tuple ``(X, y)``: ``X`` stacks the per-prefix matrices along a new first
        dimension, ``y`` is a 1-D tensor with one scaled target per prefix.

    Raises:
        ValueError: If a prefix contains an activity missing from ``list_activities``.
    """
    lines, lines_t, lines_t2, lines_t3, lines_t4 = Extract_trace_and_temporal_features(tab)
    prefixes, outputs = Extract_prefix(lines, lines_t, lines_t2, lines_t3, lines_t4)

    # Resolve activity -> row once instead of scanning the list for every event.
    # setdefault keeps the first position, which is what list.index() would return.
    activity_row = {}
    for row, activity in enumerate(list_activities):
        activity_row.setdefault(activity, row)
    num_nodes = len(list_activities)

    list_features = []
    list_rt = []
    for i, sentence in enumerate(prefixes[0]):
        list_rt.append(outputs[2][i] / divisor_rt)
        x = torch.zeros(num_nodes, num_features)
        sentence_t = prefixes[1][i]
        sentence_t2 = prefixes[2][i]
        sentence_t3 = prefixes[3][i]
        sentence_t4 = prefixes[4][i]
        for j, char in enumerate(sentence):
            if char not in activity_row:
                raise ValueError("{!r} is not in list_activities".format(char))
            # NOTE(review): 86400 and 7 look like seconds-per-day and days-per-week
            # scalings for time-of-day / weekday features -- confirm against Extract_prefix.
            x[activity_row[char], :] = torch.tensor([sentence_t[j] / divisor,
                                                     sentence_t2[j] / divisor2,
                                                     sentence_t3[j] / 86400,
                                                     sentence_t4[j] / 7])
        list_features.append(x)
    return torch.stack(list_features), torch.tensor(list_rt)


class EventLogData(Dataset):
    """Torch dataset pairing input tensors with float32 column-vector targets.

    Attributes:
        X: Input tensor, indexed along its first dimension.
        y: Targets converted to ``torch.float32`` and reshaped to ``(num_samples, 1)``.
    """

    def __init__ (self, input_x, output):
        """Store the inputs and normalise the targets' dtype and shape.

        Args:
            input_x: Tensor of inputs; one sample per entry of the first dimension.
            output: Tensor of targets with one value per sample.
        """
        self.X = input_x
        self.y = output.to(torch.float32).reshape((len(output), 1))

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__ (self, idx):
        """Return ``[input, target]`` for the sample at ``idx``."""
        return [self.X[idx], self.y[idx]]

    def get_splits(self, n_valid = 0.2, random_state = None):
        """Randomly split the dataset into training and validation subsets.

        Args:
            n_valid: Validation share, forwarded as ``test_size`` to ``train_test_split``.
            random_state: Optional seed forwarded to ``train_test_split``. Pass an
                integer to get the same split on every run; the default ``None``
                keeps the previous unseeded behaviour.

        Returns:
            Tuple ``(train, valid)`` of ``Subset`` views on this dataset.
        """
        train_idx, valid_idx = train_test_split(list(range(len(self.X))), test_size = n_valid,
                                                shuffle = True, random_state = random_state)
        train = Subset(self, train_idx)
        valid = Subset(self, valid_idx)
        return train, valid

In [None]:
# Activity vocabulary: every distinct activity label found in the complete log.
list_activities = [*tab_all["Activity"].unique()]

# Length of the longest trace in the complete log.
lines, lines_t, lines_t2, lines_t3, lines_t4 = Extract_trace_and_temporal_features(tab_all)
maxlen = max(map(len, lines))

# All scaling constants below are estimated from the training log only.
lines, lines_t, lines_t2, lines_t3, lines_t4 = Extract_trace_and_temporal_features(tab_train)

# average time between events
divisor = np.mean([value for trace_values in lines_t for value in trace_values])
print(f'divisor: {divisor}')

# average time between current and first events
divisor2 = np.mean([value for trace_values in lines_t2 for value in trace_values])
print(f'divisor2: {divisor2}')

# mean of the target values in outputs[2], used to scale the prediction target
prefixes, outputs = Extract_prefix(lines, lines_t, lines_t2, lines_t3, lines_t4)
divisor_rt = np.mean(outputs[2])
print(f'divisor_rt: {divisor_rt}')

In [None]:
num_features = 4

# Training tensors, split at random into training and validation subsets
# (get_splits defaults to a 20% validation share).
X_train, Y_train = Prepare_X_Y_remaining_time(
    tab_train, list_activities, num_features, divisor, divisor2, divisor_rt
)
train_dataset = EventLogData(X_train, Y_train)
train, valid = train_dataset.get_splits()
train_dl = DataLoader(dataset=train, batch_size=16, shuffle=True)
valid_dl = DataLoader(dataset=valid, batch_size=1, shuffle=False)

# Test tensors are scaled with the same constants as the training data and
# served one sample at a time in their original order.
X_test, Y_test = Prepare_X_Y_remaining_time(
    tab_test, list_activities, num_features, divisor, divisor2, divisor_rt
)
test = EventLogData(X_test, Y_test)
test_dl = DataLoader(dataset=test, batch_size=1, shuffle=False)

In [None]:
# Sanity check of the training input shape: (num_prefixes, num_activities, num_features).
X_train.size()

### Compute adjacency matrix

In [None]:
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.algo.discovery.dfg import algorithm as dfg_algorithm
from scipy.linalg import fractional_matrix_power
import copy
import torch 
import torch.nn as nn
from torch.nn import Parameter
from torch_geometric.nn.inits import glorot, zeros


def generate_process_graph(tab, list_nodes, showProcessGraph = False, binary_adjacency = False, laplacian_matrix = True):
    """Build a degree-normalised graph matrix from an event log's directly-follows graph.

    The directly-follows graph (DFG) is discovered with pm4py, its edge frequencies
    are written into a ``len(list_nodes) x len(list_nodes)`` matrix ``A``, and the
    result is computed as ``D^-1/2 @ M @ D^-1/2`` where ``D`` is the diagonal matrix
    of row sums of ``A + I`` and ``M`` is ``D - (A + I)`` when ``laplacian_matrix``
    is true, otherwise ``A + I``.

    Args:
        tab: DataFrame with exactly three columns in the order case id, activity,
            timestamp; the columns are renamed by position on an internal copy.
        list_nodes: Node labels; position ``i`` corresponds to row/column ``i``.
            Activities are cast to ``str`` before discovery, so only labels that
            compare equal to those strings receive edges.
        showProcessGraph: If true, ``visualize_process_graph(dfg, log)`` is called.
        binary_adjacency: If true, every non-zero edge frequency is replaced by 1.
        laplacian_matrix: If true, normalise ``D - (A + I)`` instead of ``A + I``.

    Returns:
        ``torch.float32`` tensor of shape ``(len(list_nodes), len(list_nodes))``.
    """
    data = copy.deepcopy(tab)  # keep the caller's frame untouched by the renaming below
    num_nodes = len(list_nodes)
    data.columns = ['case:concept:name', 'concept:name', 'time:timestamp']
    data['time:timestamp'] = pd.to_datetime(data['time:timestamp'])
    data['concept:name'] = data['concept:name'].astype(str)
    log = log_converter.apply(data, variant=log_converter.Variants.TO_EVENT_LOG)
    dfg = dfg_algorithm.apply(log)
    if showProcessGraph:
        # NOTE(review): visualize_process_graph is not defined in the visible part of
        # this notebook; it has to be provided elsewhere before enabling this flag.
        visualize_process_graph(dfg, log)

    # Label -> matrix positions, so each DFG edge is placed by lookup rather than by
    # scanning all node pairs. Positions are kept as lists so that a label listed
    # more than once still fills every matching row/column.
    node_positions = {}
    for position, node in enumerate(list_nodes):
        node_positions.setdefault(node, []).append(position)

    adj = np.zeros((num_nodes, num_nodes))
    for edge, frequency in dfg.items():
        for i in node_positions.get(edge[0], ()):
            for j in node_positions.get(edge[1], ()):
                adj[i, j] = frequency

    if binary_adjacency:
        adj[adj != 0] = 1

    # Self-loops are added before the degrees are computed, so every degree is >= 1
    # as long as the DFG frequencies are non-negative counts.
    adj = adj + np.identity(num_nodes)
    degree = adj.sum(axis=1)

    if laplacian_matrix:
        adj = np.diag(degree) - adj  # Laplacian transform

    # D is diagonal, so D^-1/2 @ adj @ D^-1/2 reduces to scaling row i by
    # degree[i]^-1/2 and column j by degree[j]^-1/2.
    inv_sqrt_degree = 1.0 / np.sqrt(degree)
    adj = inv_sqrt_degree[:, np.newaxis] * adj * inv_sqrt_degree[np.newaxis, :]
    return torch.tensor(adj, dtype=torch.float)

In [None]:
# Graph matrix of the training log: weighted (non-binary) directly-follows
# frequencies with the Laplacian transform enabled. The last line displays its shape.
adj = generate_process_graph(tab_train, list_activities, binary_adjacency = False, laplacian_matrix = True)
adj.size()

## Train GCN model

In [None]:
class GCNConv(torch.nn.Module):
    """Dense graph-convolution layer computing ``adj @ x @ weight + bias``.

    Args:
        num_nodes: Number of graph nodes; the bias holds one value per node.
        num_features: Number of input features per node.
        out_channels: Number of output channels per node.
    """

    def __init__(self, num_nodes, num_features, out_channels):
        super(GCNConv, self).__init__()

        self.in_channels = num_features
        self.out_channels = out_channels

        # Allocated uninitialised; the values are set by reset_parameters().
        self.weight = Parameter(torch.Tensor(num_features, out_channels))
        self.bias = Parameter(torch.Tensor(num_nodes))

        self.reset_parameters()

    def reset_parameters(self):
        """Initialise the weight with Glorot/Xavier-uniform values and the bias with zeros."""
        # Same scheme as torch_geometric's glorot()/zeros(): uniform in
        # [-sqrt(6 / (fan_in + fan_out)), +sqrt(6 / (fan_in + fan_out))] and zero fill.
        torch.nn.init.xavier_uniform_(self.weight)
        torch.nn.init.zeros_(self.bias)

    def forward(self, x, adj):
        """Apply the layer to a batch of node-feature matrices.

        Args:
            x: Tensor of shape ``(batch, num_nodes, num_features)``.
            adj: Graph matrix of shape ``(num_nodes, num_nodes)``, shared by the batch.

        Returns:
            Tensor of shape ``(batch, num_nodes)`` when ``out_channels == 1``,
            otherwise ``(batch, num_nodes, out_channels)``.
        """
        # matmul broadcasts adj and weight over the batch dimension, so no
        # per-sample copies of either are needed.
        x = adj @ x @ self.weight
        # One bias value per node, shared by all of that node's output channels.
        x = x + self.bias.unsqueeze(-1)
        if self.out_channels == 1:
            x = x.squeeze(2)
        return x

    
class TimePredictor(torch.nn.Module):
    """Regression network: one single-channel graph convolution followed by an MLP head.

    Args:
        num_nodes: Number of graph nodes; also the input width of the MLP head.
        num_features: Number of input features per node.
    """

    def __init__(self,num_nodes, num_features = 4):
        super().__init__()
        hidden_units = 256

        # Graph convolution producing one value per node.
        self.layer1 = GCNConv(num_nodes, num_features, out_channels=1)

        # MLP head mapping the per-node values to a single output.
        head = [
            nn.Dropout(),
            nn.Linear(num_nodes, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(hidden_units, 1),
        ]
        self.layer2 = nn.Sequential(*head)

    def forward(self, x, adj):
        """Return the head's output for node features ``x`` and graph matrix ``adj``."""
        return self.layer2(self.layer1(x, adj))

In [None]:
# Train `num_runs` independent models with early stopping on the validation loss.
# Each run stores its best checkpoint and a per-epoch loss log in `save_folder`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
variant = 'laplacianOnWeighted'
save_folder = project_dir + '4_Outputs/Output_files/'+data_name+'_GCN'
# makedirs also creates missing parent folders (e.g. 4_Outputs) and does nothing
# when the folder already exists.
os.makedirs(save_folder, exist_ok=True)
num_runs = 5
num_nodes = len(list_activities)
num_features = 4
num_epochs = 100
lr_value = 1e-03
early_stop_patience = 10  # stop after this many consecutive epochs without improvement
min_delta = 0             # minimum decrease of the validation loss that counts as improvement
running_time = []

# The graph matrix is shared by all runs, so it is moved to the device once.
adj = adj.to(device)

for run in range(num_runs):
    start = datetime.datetime.now()
    print("Run: {}".format(run+1))
    model = TimePredictor(num_nodes, num_features).to(device)
    criterion = nn.L1Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr_value)
    best_model_path = '{}/best_model_run_{}.pt'.format(save_folder, run+1)

    # print("************* Timestamp Predictor ***************")
    # print("Train size: {}, Validation size:{}, Test size: {}".format(len(train_dl.dataset),len(valid_dl.dataset),len(test_dl.dataset)))
    # print(model)
    epochs_plt = []
    mae_plt = []
    valid_loss_plt = []
    not_improved_count = 0

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        training_loss = 0
        num_train = 0
        for inputs, targets in train_dl:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad() # Clearing the gradients
            yhat = model(inputs.float(), adj.float())
            loss = criterion(yhat.reshape((1,-1)), targets.reshape((1,-1)))
            loss.backward()
            optimizer.step()

            training_loss += loss.item()
            num_train += 1

        # ---- validation pass (no gradients, dropout disabled) ----
        model.eval()
        num_valid = 0
        validation_loss = 0
        with torch.no_grad():
            for inputs, targets in valid_dl:
                inputs, targets = inputs.to(device), targets.to(device)
                yhat_valid = model(inputs, adj)
                loss_valid = criterion(yhat_valid.reshape((1,-1)), targets.reshape((1,-1)))
                validation_loss += loss_valid.item()
                num_valid += 1

        # Losses are averaged over the number of mini-batches.
        avg_training_loss = training_loss/num_train
        avg_validation_loss = validation_loss/num_valid
        print("Epoch: {}, Training MAE : {}, Validation loss : {}".format(epoch,avg_training_loss,avg_validation_loss))
        epochs_plt.append(epoch+1)
        mae_plt.append(avg_training_loss)
        valid_loss_plt.append(avg_validation_loss)

        # Checkpoint on the first epoch and whenever the validation loss has not
        # got worse by more than the tolerance; otherwise count towards early stopping.
        if epoch == 0 or best_loss - avg_validation_loss >= min_delta:
            torch.save(model.state_dict(), best_model_path)
            best_loss = avg_validation_loss
            not_improved_count = 0
        else:
            not_improved_count += 1

        # Early stopping
        if not_improved_count == early_stop_patience:
            print("Validation performance didn\'t improve for {} epochs. "
                            "Training stops.".format(early_stop_patience))
            break

    delta_time = (datetime.datetime.now()-start).total_seconds()
    running_time.append(delta_time)
    # NOTE(review): the loss log is named with the 0-based `run`, while the checkpoint
    # and the printed run number use `run+1`; kept as is so existing file names stay valid.
    filepath = '{}/Loss_'.format(save_folder)+data_name+'_{}_{}_run{}.txt'.format(variant,lr_value,run)
    with open(filepath, 'w') as file:
        for item in zip(epochs_plt,mae_plt,valid_loss_plt):
            file.write("{}\n".format(item))
        file.write("Running time: {}\n".format(delta_time))
    

In [None]:
# Wall-clock training time of each run, in seconds.
running_time

In [None]:
# Average training time per run, converted from seconds to minutes.
np.mean(running_time)/60

## Evaluation

In [None]:
# Re-extract the prefixes of the test log; the evaluation cells below use
# prefixes[0][i] to look up the prefix length of the i-th test sample.
# Note that this overwrites the globals previously computed from the training log.
lines, lines_t, lines_t2, lines_t3, lines_t4 = Extract_trace_and_temporal_features(tab_test)
prefixes, outputs = Extract_prefix(lines, lines_t, lines_t2, lines_t3, lines_t4)

In [None]:
def evaluate_model(model):
    """Compute per-sample MAPE and MAE on the test loader, grouped by prefix length.

    Reads the notebook globals ``test_dl``, ``prefixes``, ``adj`` and ``device``.
    The i-th batch of ``test_dl`` is matched with ``prefixes[0][i]``, so the loader
    has to yield one sample per batch in the same order as the prefixes.

    Args:
        model: Trained network called as ``model(inputs, adj)``. It is moved to
            ``device`` and switched to evaluation mode.

    Returns:
        Dict mapping prefix length to a list of ``[mape_percent, mae]`` float pairs,
        one pair per test sample; the MAE is in the scaled units of the targets.
    """
    # Model, inputs, targets and graph matrix must all live on the same device.
    model = model.to(device)
    model.eval()
    adj_on_device = adj.to(device)
    criterion = nn.L1Loss()

    err_dict = {}
    with torch.no_grad():
        for i, (inputs, targets) in enumerate(test_dl):
            prefix_len = len(prefixes[0][i])
            inputs = inputs.to(device)
            targets = targets.to(device).reshape((1, -1))
            yhat = model(inputs, adj_on_device).reshape((1, -1))
            # NOTE(review): a target of exactly 0 makes this ratio inf/nan -- confirm
            # whether zero remaining times can occur in the test prefixes.
            loss_mape = (torch.abs((targets - yhat) / targets) * 100).mean().item()
            loss_mae = criterion(yhat, targets).item()
            err_dict.setdefault(prefix_len, []).append([loss_mape, loss_mae])
    return err_dict

In [None]:
# Evaluate the best checkpoint of every training run and collect, per prefix
# length, one [mean MAPE, mean MAE] tensor per run.
err_total_dict = {}
print(save_folder)
for run in range(num_runs):  # same number of runs as were trained
    print("Run: {}".format(run+1))
    trained_model = TimePredictor(num_nodes, num_features).to(device)
    trained_model.load_state_dict(torch.load('{}/best_model_run_{}.pt'.format(save_folder,run+1),
                                         map_location=torch.device(device)))
    err_dict = evaluate_model(trained_model)

    for key, sample_errors in err_dict.items():
        # Average the per-sample [MAPE, MAE] pairs of this prefix length.
        err = torch.mean(torch.tensor(sample_errors), axis = 0)
        # The MAE is rescaled with divisor_rt and divided by 86400; the result is
        # reported as "MAE(days)" in the results table.
        err_total_dict.setdefault(key, []).append(torch.tensor([err[0], err[1]*divisor_rt/86400]))

In [None]:
# Tally how many test samples there are for each prefix length.
num_samples_dict = {}
for i, (inputs, targets) in enumerate(test_dl):
    key = len(prefixes[0][i])
    num_samples_dict[key] = num_samples_dict.get(key, 0) + 1

In [None]:
# Aggregate the per-run errors into mean and standard deviation per prefix length
# and write the table to the Evaluation folder.
list_prefix_len = []
list_num_samples = []
list_mape_err = []
list_mape_std = []
list_mae_err = []
list_mae_std = []
for key, run_errors in err_total_dict.items():
    # Rows are runs; column 0 is the MAPE, column 1 the MAE. Stack once per key.
    stacked = torch.stack(run_errors)
    mean_err = stacked.mean(axis = 0)
    std_err = stacked.std(axis = 0)
    list_prefix_len.append(key)
    list_num_samples.append(num_samples_dict[key])
    list_mape_err.append(round(mean_err[0].item(), 3))
    list_mape_std.append(round(std_err[0].item(), 3))
    list_mae_err.append(round(mean_err[1].item(), 3))
    list_mae_std.append(round(std_err[1].item(), 3))
tab_result = pd.DataFrame({"Prefix length":list_prefix_len, "Num samples": list_num_samples,
                           "MAPE(%)":list_mape_err, "MAPE std": list_mape_std,
                           "MAE(days)": list_mae_err, "MAE std": list_mae_std})

# makedirs also creates missing parent folders and tolerates an existing folder.
os.makedirs(project_dir + '4_Outputs/Evaluation', exist_ok=True)
tab_result.to_csv(project_dir+"4_Outputs/Evaluation/"+data_name+"_GCN_eval.csv", index = False)