In [None]:
import os
from os.path import dirname
root_path = dirname(dirname(os.getcwd()))
print(root_path)
import sys
sys.path.append(root_path + '/RemainingCycleTimePrediction/2_Scripts/')
import pandas as pd
import numpy as np
import time, datetime
import pickle as pkl
import copy

import torch 
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import GatedGraphConv, global_mean_pool
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import Subset
from sklearn.model_selection import train_test_split

from ax.plot.contour import plot_contour
from ax.plot.trace import optimization_trace_single_method
from ax.service.managed_loop import optimize
from ax.utils.notebook.plotting import render

from Event_log_processing_utils import Extract_trace_and_temporal_features, Extract_prefix
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings("ignore")

# Project layout: inputs and outputs live under <root>/RemainingCycleTimePrediction/.
project_dir = root_path + '/RemainingCycleTimePrediction/'
data_dir = root_path + '/RemainingCycleTimePrediction/1_Data/'

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

## 1. Load data

In [None]:
# Name of the event log to process. It selects the "<data_name>_processed_*.csv"
# and "GGNN_<data_name>_*.pkl" files read from data_dir in the cells below.
# Alternative dataset (uncomment to switch):
# data_name = 'BPIC20'
data_name = 'Helpdesk'

In [None]:
# The complete log and its three splits share the
# "<data_dir><data_name>_processed_<split>.csv" naming scheme.
processed_stem = data_dir + data_name
tab_all = pd.read_csv(processed_stem + "_processed_all.csv")
tab_train = pd.read_csv(processed_stem + "_processed_train.csv")
tab_valid = pd.read_csv(processed_stem + "_processed_valid.csv")
tab_test = pd.read_csv(processed_stem + "_processed_test.csv")
tab_all.head()

## 2. Prepare inputs and outputs for model training

In [None]:
def _mean_of_nested(nested):
    """Return the mean over every item of a list of lists."""
    return np.mean([item for inner in nested for item in inner])


# One-hot encoder fitted on every activity label that occurs in the complete log.
list_activities = list(tab_all["Activity"].unique())
activity_column = np.array(list_activities).reshape((len(list_activities), 1))
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(activity_column)

# The maximum trace length is measured on the complete log ...
lines, lines_t, lines_t2, lines_t3, lines_t4 = Extract_trace_and_temporal_features(tab_all)
maxlen = max(map(len, lines))

# ... while the normalisation constants below come from the training split only.
lines, lines_t, lines_t2, lines_t3, lines_t4 = Extract_trace_and_temporal_features(tab_train)

# Average time between events (per the original notebook comment).
divisor = _mean_of_nested(lines_t)
print('divisor: {}'.format(divisor))

# Average time between the current and the first event (per the original notebook comment).
divisor2 = _mean_of_nested(lines_t2)
print('divisor2: {}'.format(divisor2))

# Mean of the third output series of Extract_prefix; the evaluation section multiplies
# the MAE by this value and divides by 86400 to report it in days.
prefixes, outputs = Extract_prefix(lines, lines_t, lines_t2, lines_t3, lines_t4)
divisor_rt = np.mean(outputs[2])
print('divisor_rt: {}'.format(divisor_rt))

In [None]:
def _read_pickled_split(file_suffix):
    """Unpickle the (inputs, targets) pair stored in data_dir for one GGNN split."""
    # NOTE(review): pickle executes code on load; only read files produced by this project.
    with open(data_dir+"GGNN_"+data_name+file_suffix, "rb") as handle:
        return pkl.load(handle)


X_train, Y_train = _read_pickled_split("_train.pkl")
X_valid, Y_valid = _read_pickled_split("_valid.pkl")
X_test, Y_test = _read_pickled_split("_test.pkl")

In [None]:
class EventLogData(Dataset):
    """Dataset pairing three parallel per-sample input collections with a target.

    ``input_x[0]``, ``input_x[1]`` and ``input_x[2]`` are stored as ``X``, ``A`` and
    ``V`` and are always read with the same sample index (the model in this notebook
    passes them to the graph layer in that order). ``output`` must support ``.to``
    and ``.reshape`` (e.g. a tensor); it is kept as a float32 column of shape
    ``(len(output), 1)``.
    """

    def __init__(self, input_x, output):
        self.X, self.A, self.V = input_x[0], input_x[1], input_x[2]
        targets = output.to(torch.float32)
        self.y = targets.reshape((len(targets), 1))

    def __len__(self):
        """Number of samples, taken from the first input collection."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``[[X, A, V], y]`` for the sample at position ``idx``."""
        sample_inputs = [self.X[idx], self.A[idx], self.V[idx]]
        return [sample_inputs, self.y[idx]]

    def get_splits(self, n_valid = 0.2):
        """Randomly split the dataset into ``(train, valid)`` ``Subset`` objects.

        ``n_valid`` is forwarded to ``train_test_split`` as ``test_size``.
        """
        all_indices = list(range(len(self.X)))
        train_idx, valid_idx = train_test_split(all_indices, test_size = n_valid, shuffle = True)
        return Subset(self, train_idx), Subset(self, valid_idx)
    
def my_collate(batch):
    """Collate ``(inputs, target)`` samples without stacking them.

    Returns ``[inputs_list, targets_list]``: two plain Python lists holding the first
    and second element of every sample, in batch order.
    """
    sample_inputs, sample_targets = [], []
    for sample in batch:
        sample_inputs.append(sample[0])
        sample_targets.append(sample[1])
    return [sample_inputs, sample_targets]

In [None]:
# Validation data is served as one batch covering the whole split.
valid_dataset = EventLogData(X_valid, Y_valid)
valid_loader = DataLoader(valid_dataset, batch_size=len(X_valid[0]),
                          shuffle=False, collate_fn=my_collate)

# Test data is served one sample at a time; the evaluation section groups the
# per-sample errors by the size of each sample's first input.
test_dataset = EventLogData(X_test, Y_test)
test_loader = DataLoader(test_dataset, batch_size=1,
                         shuffle=False, collate_fn=my_collate)

## 3. Hyperparameter tuning with Ax package

In [None]:
# Creating the model class
class GGNN_model(nn.Module):
    """Gated graph network regressor configured from an Ax parameter dict.

    ``parameterization`` is read with ``.get`` for ``"neurons"`` (default 15),
    ``"layers"`` (default 1) and ``"dropout"`` (default 0.2).

    NOTE: a class with the same name but explicit constructor arguments is defined
    again further down in this notebook and shadows this one once that cell has run.
    """

    def __init__(self, parameterization):
        super().__init__()
        self.ggnn_dim = parameterization.get("neurons", 15)
        self.num_layers = parameterization.get("layers", 1)
        self.droppout_prob = parameterization.get("dropout", 0.2)

        # Graph encoder followed by a two-hidden-layer MLP head with one output.
        hidden_width = 256
        self.ggnn = GatedGraphConv(self.ggnn_dim, num_layers=self.num_layers)
        self.fc = nn.Sequential(
            nn.Dropout(p=self.droppout_prob),
            nn.Linear(self.ggnn_dim, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, hidden_width),
            nn.ReLU(),
            nn.Dropout(p=self.droppout_prob),
            nn.Linear(hidden_width, 1),
        )

    def forward(self, x):
        """Predict one value per graph.

        ``x`` is an iterable of ``(X, A, V)`` triples; each triple is passed to the
        gated graph layer as ``(X, A cast to long, V)``. Returns the output of the
        MLP head applied to the stacked, mean-pooled graph embeddings.
        """
        node_states = []
        for X, A, V in x:
            node_states.append(self.ggnn(X, A.to(torch.long), V))
        # batch=None pools each graph on its own; the stacked result is squeezed on
        # dim 1 (presumably the singleton left by the pooling - TODO confirm).
        pooled = [global_mean_pool(states, batch = None) for states in node_states]
        graph_embeddings = torch.stack(pooled).squeeze(1)
        return self.fc(graph_embeddings)
    
def net_train(net, train_loader, valid_loader, parameters, dtype, device, early_stop_patience):
    """Train ``net`` with Adam on an L1 loss and return the best snapshot.

    Args:
        net: module to train; it is moved to ``dtype``/``device`` and trained in place.
        train_loader: iterable of ``(inputs, labels)`` batches, where ``inputs`` is a
            list of per-sample lists of tensors and ``labels`` is a list accepted by
            ``torch.tensor``.
        valid_loader: iterable of batches in the same format, scored after every epoch.
        parameters: mapping read with ``.get("lr", 0.001)`` for the learning rate.
        dtype: dtype the network is cast to (the inputs are only moved to ``device``).
        device: device used for the network and every batch.
        early_stop_patience: stop once this many epochs passed without improvement.

    Returns:
        A deep copy of ``net`` taken at the epoch with the best mean validation loss.
        Because ``min_delta`` is 0, an equal loss also counts as an improvement.
    """
    def _on_device(samples):
        # Move every tensor of every sample to the target device.
        return [[part.to(device=device) for part in sample] for sample in samples]

    net.to(dtype=dtype, device=device)
    mae = nn.L1Loss()
    adam = optim.Adam(net.parameters(), lr=parameters.get("lr", 0.001))
    max_epochs = 100
    min_delta = 0
    stale_epochs = 0
    best_loss, best_model = None, None

    started_at = time.time()
    for epoch in range(max_epochs):
        # ---- one optimisation pass over the training batches ----
        net.train()
        train_total, train_batches = 0, 0
        for samples, labels in train_loader:
            samples = _on_device(samples)
            labels = torch.tensor(labels).to(device=device)

            adam.zero_grad()
            predictions = net(samples)
            # Flatten both sides so the element-wise loss never broadcasts.
            batch_loss = mae(predictions.reshape((1,-1)), labels.reshape((1,-1)))
            batch_loss.backward()
            adam.step()

            train_total += batch_loss.item()
            train_batches += 1

        # ---- score the validation batches without tracking gradients ----
        valid_total, valid_batches = 0, 0
        with torch.no_grad():
            net.eval()
            for samples, targets in valid_loader:
                samples = _on_device(samples)
                targets = torch.tensor(targets).to(device=device)
                predictions = net(samples)
                valid_total += mae(predictions.reshape((1,-1)), targets.reshape((1,-1))).item()
                valid_batches += 1

        avg_training_loss = train_total / train_batches
        avg_validation_loss = valid_total / valid_batches
        print("Epoch: {}, Training MAE : {}, Validation loss : {}".format(epoch,avg_training_loss,avg_validation_loss))

        # The first epoch always becomes the reference snapshot.
        if epoch == 0 or best_loss - avg_validation_loss >= min_delta:
            best_model = copy.deepcopy(net)
            best_loss = avg_validation_loss
            stale_epochs = 0
        else:
            stale_epochs += 1

        if stale_epochs == early_stop_patience:
            print("Validation performance didn\'t improve for {} epochs. "
                            "Training stops.".format(early_stop_patience))
            break

    training_time = time.time() - started_at
    print("Training time:", training_time)
    return best_model


def model_evaluate(net, data_loader, dtype, device):
    """Return the mean per-batch MAE of ``net`` over ``data_loader``.

    Args:
        net: trained module; it is switched to eval mode.
        data_loader: iterable of ``(inputs, targets)`` batches, where ``inputs`` is a
            list of per-sample lists of tensors and ``targets`` is a list accepted by
            ``torch.tensor``.
        dtype: dtype every input tensor is cast to.
        device: device every input and target tensor is moved to.

    Returns:
        Average of the per-batch L1 losses as a Python float.

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    criterion = nn.L1Loss()
    net.eval()
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for inputs, targets in data_loader:
            # move data to proper dtype and device
            inputs = [[sub_item.to(dtype=dtype, device=device) for sub_item in item] for item in inputs]
            targets = torch.tensor(targets).to(device=device)
            outputs = net(inputs)
            # Flatten both sides exactly as the training loop does. Without this an
            # (N, 1) prediction broadcasts against an (N,) target into an (N, N)
            # matrix and the result is not the MAE.
            batch_loss = criterion(outputs.reshape((1, -1)), targets.reshape((1, -1)))
            total_loss += batch_loss.item()
            num_batches += 1
    if num_batches == 0:
        raise ValueError("data_loader yielded no batches; cannot compute a mean loss")
    return total_loss / num_batches


def train_evaluate(parameterization):
    """Ax evaluation function: train one configuration and return its validation MAE.

    Builds a fresh shuffled training loader (batch size from ``"batchsize"``,
    default 32), trains a new ``GGNN_model`` with ``net_train`` (patience 10) and
    returns ``model_evaluate`` of the best snapshot on ``valid_loader``. Reads the
    notebook-level ``X_train``, ``Y_train``, ``valid_loader``, ``dtype`` and ``device``.
    """
    # A new loader per trial is what lets the batch size be tuned.
    batch_size = parameterization.get("batchsize", 32)
    training_data = EventLogData(X_train,Y_train)
    train_loader = DataLoader(training_data, batch_size=batch_size,
                              shuffle=True, collate_fn=my_collate)

    candidate_net = GGNN_model(parameterization)
    fitted_net = net_train(
        net=candidate_net,
        train_loader=train_loader,
        valid_loader=valid_loader,
        parameters=parameterization,
        dtype=dtype,
        device=device,
        early_stop_patience=10,
    )

    # Objective reported to Ax: the validation loss of the best snapshot (lower is better).
    validation_mae = model_evaluate(
        net=fitted_net,
        data_loader=valid_loader,
        dtype=dtype,
        device=device,
    )
    return validation_mae

In [None]:
dtype = torch.float
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Search space explored by Ax; every name matches a key read by GGNN_model,
# net_train or train_evaluate.
search_space = [
    {"name": "neurons", "type": "choice", "values": [40, 60, 80, 100], "value_type": "int"},
    {"name": "layers", "type": "choice", "values": [3, 4, 5], "value_type": "int"},
    {"name": "lr", "type": "range", "bounds": [1e-4, 0.01], "value_type": "float", "log_scale": True},
    {"name": "dropout", "type": "range", "bounds": [0, 0.5], "value_type": "float"},
    {"name": "batchsize", "type": "choice", "values": [16, 32, 64], "value_type": "int"},
]

# Minimise the value returned by train_evaluate over 100 trials with a fixed seed.
best_parameters, values, experiment, model = optimize(
    parameters=search_space,
    evaluation_function=train_evaluate,
    objective_name='MAE loss',
    minimize=True,
    random_seed=123,
    total_trials=100,
)

print(best_parameters)
means, covariances = values
print(means)

In [None]:
# Pick the arm whose recorded mean objective is the lowest (first one on ties).
data = experiment.fetch_data()
df = data.df
lowest_mean = df['mean'].min()
best_arm_name = df.loc[df['mean'] == lowest_mean, 'arm_name'].values[0]
best_arm = experiment.arms_by_name[best_arm_name]
best_arm

## 4. Retrain the model with the tuned hyperparameters

In [None]:
# Creating the model class
class GGNN_model(nn.Module):
    """Gated graph network regressor built from explicit hyper-parameters.

    Args:
        ggnn_dim: output size of the gated graph layer and input size of the MLP head.
        num_layers: ``num_layers`` passed to ``GatedGraphConv``.
        droppout_prob: dropout probability used twice inside the MLP head.
    """

    def __init__(self, ggnn_dim, num_layers, droppout_prob):
        super().__init__()
        self.ggnn_dim = ggnn_dim
        self.num_layers = num_layers
        self.droppout_prob = droppout_prob

        self.ggnn = GatedGraphConv(self.ggnn_dim, num_layers=self.num_layers)
        # MLP head: dropout -> 256 -> 256 -> dropout -> 1.
        head_layers = [
            nn.Dropout(p=self.droppout_prob),
            nn.Linear(self.ggnn_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Dropout(p=self.droppout_prob),
            nn.Linear(256, 1),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return one prediction per ``(X, A, V)`` triple in ``x``.

        Each triple is run through the gated graph layer as ``(X, A cast to long, V)``,
        mean-pooled with ``batch=None``, stacked, squeezed on dim 1 and fed to the head.
        """
        encoded = [self.ggnn(X, A.to(torch.long), V) for X, A, V in x]
        pooled = []
        for graph_states in encoded:
            pooled.append(global_mean_pool(graph_states, batch = None))
        stacked = torch.stack(pooled)
        out = self.fc(stacked.squeeze(1))
        return out

In [None]:
# Hyper-parameters of the best arm found during tuning.
tuned_parameters = best_arm.parameters
batch_size = tuned_parameters['batchsize']
ggnn_dim = tuned_parameters['neurons']
num_layers = tuned_parameters['layers']
lr_value = tuned_parameters['lr']
droppout_prob = tuned_parameters['dropout']

dtype = torch.float
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Shuffled training loader using the tuned batch size.
train_dataset = EventLogData(X_train,Y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size,
                          shuffle=True, collate_fn=my_collate)

In [None]:
save_folder = project_dir + '5_Output_files/Remaining_time_prediction/'+data_name+'_model_Gated_GNN'
# Neither torch.save nor open(..., 'w') creates missing directories, so make sure
# the output folder exists before the first checkpoint is written.
os.makedirs(save_folder, exist_ok=True)
num_epochs = 100
early_stop_patience = 20
min_delta = 0  # required drop in validation loss to count as an improvement (0 => ties count)
num_runs = 5
running_time = []  # wall-clock seconds of every run
for run in range(num_runs):
    start=datetime.datetime.now()
    print("Run: {}".format(run+1))
    checkpoint_path = '{}/best_model_run_{}.pt'.format(save_folder,run+1)

    # Every run starts from a freshly initialised model and optimizer.
    model = GGNN_model(ggnn_dim, num_layers, droppout_prob)
    criterion = nn.L1Loss()
    optimizer = torch.optim.Adam(model.parameters(),lr=lr_value)

    model = model.to(device)
    epochs_plt = []
    mae_plt = []
    valid_loss_plt = []
    not_improved_count = 0

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        training_loss = 0
        num_train = 0

        for inputs, targets in train_loader:
            inputs = [[sub_item.to(device=device) for sub_item in item] for item in inputs]
            targets = torch.tensor(targets).to(device=device)

            optimizer.zero_grad() # Clearing the gradients
            yhat = model(inputs)
            # Flatten both sides so the element-wise loss never broadcasts.
            loss = criterion(yhat.reshape((1,-1)),targets.reshape((1,-1)))
            loss.backward()
            optimizer.step()

            training_loss+= loss.item()
            num_train+=1

        # ---- validation pass ----
        with torch.no_grad():
            model.eval()
            num_valid = 0
            validation_loss = 0
            for inputs, targets in valid_loader:
                inputs = [[sub_item.to(device=device) for sub_item in item] for item in inputs]
                targets = torch.tensor(targets).to(device=device)
                yhat_valid = model(inputs)
                loss_valid = criterion(yhat_valid.reshape((1,-1)),targets.reshape((1,-1)))
                validation_loss+= loss_valid.item()
                num_valid+= 1

        avg_training_loss = training_loss/num_train
        avg_validation_loss = validation_loss/num_valid
        print("Epoch: {}, Training MAE : {}, Validation loss : {}".format(epoch,avg_training_loss,avg_validation_loss))
        epochs_plt.append(epoch+1)
        mae_plt.append(avg_training_loss)
        valid_loss_plt.append(avg_validation_loss)

        # Checkpoint on the first epoch and whenever the validation loss does not get worse.
        if epoch == 0 or best_loss - avg_validation_loss >= min_delta:
            torch.save(model.state_dict(), checkpoint_path)
            best_loss = avg_validation_loss
            not_improved_count = 0
        else:
            not_improved_count += 1

        # Early stopping
        if not_improved_count == early_stop_patience:
            print("Validation performance didn\'t improve for {} epochs. "
                            "Training stops.".format(early_stop_patience))
            break

    # One "(epoch, training loss, validation loss)" tuple per line.
    filepath = '{}/Loss_'.format(save_folder)+data_name+'_run_{}.txt'.format(run+1)
    with open(filepath, 'w') as file:
        for item in zip(epochs_plt,mae_plt,valid_loss_plt):
            file.write("{}\n".format(item))
    running_time.append((datetime.datetime.now()-start).total_seconds())

## 5. Evaluation

In [None]:
def evaluate_model(model):
    """Collect per-batch MAPE and MAE on ``test_loader``, grouped by input size.

    Reads the notebook-level ``test_loader``, ``dtype`` and ``device``. Batches are
    keyed by ``inputs[0][0].size(0)``, i.e. the first dimension of the first tensor
    of the first sample (used as the prefix length in the result table).

    Args:
        model: trained module; it is switched to eval mode.

    Returns:
        dict mapping that size to a list of ``[mape, mae]`` pairs, where ``mape`` is
        a tensor holding ``|target - prediction| / target * 100`` and ``mae`` is a
        Python float. A target of 0 yields a non-finite MAPE.
    """
    criterion = nn.L1Loss()  # built once instead of once per batch
    err_dict = {}
    with torch.no_grad():
        model.eval()
        for inputs, targets in test_loader:
            prefix_len = inputs[0][0].size(0)
            inputs = [[sub_item.to(dtype=dtype, device=device) for sub_item in item] for item in inputs]
            yhat = model(inputs)
            # Give the targets the prediction's shape so neither the MAPE nor the
            # L1 loss depends on implicit broadcasting.
            targets = torch.tensor(targets).to(device=device).reshape(yhat.shape)
            loss_mape = torch.abs((targets - yhat)/targets)*100
            loss_mae = criterion(yhat,targets).item()
            err_dict.setdefault(prefix_len, []).append([loss_mape, loss_mae])
    return err_dict

In [None]:
# For every training run: reload its best checkpoint, evaluate it on the test set and
# keep one [mean MAPE, mean MAE in days] tensor per input size.
err_total_dict = {}
print(save_folder)
# Iterate over the runs that were actually trained (num_runs is set together with
# save_folder in the training cell) instead of a hard-coded count.
for run in range(num_runs):
    print("Run: {}".format(run+1))
    trained_model = GGNN_model(ggnn_dim, num_layers, droppout_prob)
    trained_model = trained_model.to(device)
    checkpoint_path = '{}/best_model_run_{}.pt'.format(save_folder,run+1)
    trained_model.load_state_dict(torch.load(checkpoint_path,
                                             map_location=torch.device(device)))
    err_dict = evaluate_model(trained_model)

    for key in err_dict.keys():
        # Column 0 is the MAPE, column 1 the MAE; average both over the samples.
        err = torch.mean(torch.tensor(err_dict[key]), axis = 0)
        # The MAE is multiplied by divisor_rt and divided by 86400 to express it in days.
        err_in_days = torch.tensor([err[0], err[1]*divisor_rt/86400])
        err_total_dict.setdefault(key, []).append(err_in_days)

In [None]:
# Count the test batches per input size (first dimension of the first tensor of the
# first sample), using the same key as evaluate_model.
num_samples_dict = {}
for inputs, targets in test_loader:
    key = inputs[0][0].size(0)
    num_samples_dict[key] = num_samples_dict.get(key, 0) + 1

In [None]:
# One row per input size: mean and standard deviation across runs of the MAPE
# (column 0) and the MAE in days (column 1), rounded to three decimals.
list_prefix_len = []
list_num_samples = []
list_mape_err = []
list_mape_std = []
list_mae_err = []
list_mae_std = []
for key, value in err_total_dict.items():
    per_run = torch.stack(value)
    across_runs_mean = per_run.mean(axis = 0)
    across_runs_std = per_run.std(axis=0)

    list_prefix_len.append(key)
    list_num_samples.append(num_samples_dict[key])
    list_mape_err.append(round(across_runs_mean[0].item(), 3))
    list_mape_std.append(round(across_runs_std[0].item(), 3))
    list_mae_err.append(round(across_runs_mean[1].item(), 3))
    list_mae_std.append(round(across_runs_std[1].item(), 3))

result_columns = {
    "Prefix length": list_prefix_len,
    "Num samples": list_num_samples,
    "MAPE(%)": list_mape_err,
    "MAPE std": list_mape_std,
    "MAE(days)": list_mae_err,
    "MAE std": list_mae_std,
}
tab_result = pd.DataFrame(result_columns)
tab_result

In [None]:
# Sample-weighted average MAE over the rows backed by at least 20 test samples.
tab = tab_result[tab_result["Num samples"] >= 20]
sample_counts = tab["Num samples"]
sum(sample_counts*tab["MAE(days)"])/sum(sample_counts)

In [None]:
# Persist the evaluation table as CSV (without the DataFrame index).
# NOTE(review): this writes under '4_Outputs/Evaluation/' while the checkpoints are
# saved under '5_Output_files/...' - confirm the folder is intended and that it exists.
tab_result.to_csv(project_dir+"4_Outputs/Evaluation/"+data_name+"_GGNN_eval.csv", index = False)