# Artist Similarity with Graph Neural Network 1st Notebook

This notebook implements the experiments that were previously conducted in the paper ['Artist Similarity with Graph Neural Network'](https://archives.ismir.net/ismir2021/paper/000043.pdf).  
In addition to the architectures described in the paper, there are also experiments with the well-known Graph Attention layer, in order to show how this approach can sharply outperform the GraphSAGE configuration.  
The notebook reports the performance obtained with the different artist embeddings.

In [None]:
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)
!pip install torchmetrics
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import json
import torch.nn as nn
import torch.nn.functional as F
from torchmetrics.functional import pairwise_euclidean_distance
from torch_geometric.nn import GATConv, SAGEConv
from torch.optim import lr_scheduler
import random
from random import choice,randrange
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
import math
import time
from torch_geometric import seed_everything


# Fixed seed used for the whole notebook.
random_seed=280085

# seed_everything is imported from torch_geometric above.
# NOTE(review): presumably seeds the Python, NumPy and PyTorch RNGs — confirm against the PyG docs.
seed_everything(random_seed)

In [None]:
# from google.colab import drive
# drive.mount("/content/drive")
%cd 
from utils import *  #In this files are reported the most useful functions
from architectures import *

Mounted at /content/drive
/content/drive/My Drive/asproject


In [None]:
import IPython
# JavaScript injected into the notebook page: every 60000 ms it logs "Working" and
# clicks the element matched by "colab-toolbar-button#connect".
# NOTE(review): presumably a keep-alive so the Colab session is not disconnected during long trainings — confirm.
js_code = '''
function ClickConnect(){
console.log("Working");
document.querySelector("colab-toolbar-button#connect").click()
}
setInterval(ClickConnect,60000)
'''
display(IPython.display.Javascript(js_code))

<IPython.core.display.Javascript object>

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

With the help of the [Torch geometric framework](https://pytorch-geometric.readthedocs.io/en/latest/) it was really easy to handle the graph attributes and nodes, and then to train the GNNs.

In [None]:
# Load the pre-computed tensors from the working directory.
# map_location=device lets files that were saved from a GPU session be opened on a
# CPU-only machine as well (without it torch.load tries to restore the original device).
X = torch.load('instance', map_location=device).T.to(device)      # Instance matrix
A = torch.load('adjacencyCOO', map_location=device).to(device)    # Adjacency matrix in the COO format, the one supported by torch geometric
A1 = torch.load('adjacency', map_location=device).to(device)      # Normal adjacency matrix format is obtained with torch.load('adjacency')
num_samples = X.shape[0]  # first dimension of X after the transpose
print(num_samples)

In [None]:
# Lookup tables between the artists' position in the dataset and their names: they make it
# easy to look an artist up by name and to draw conclusions at inference time.
# NOTE(review): assumes load_data returns a dict (position -> name) — confirm in utils.
num2artist = load_data('dizofartist.json')
# Inverse mapping (name -> position).
artist2num = {name: position for position, name in num2artist.items()}

print(num2artist)
print(artist2num)

## Import the data with Torch geometric:


In [None]:
# To conduct the experiments the dataset has to be split, both the nodes and the edges.
# The splitting follows the information in the paper, taking into account that a lot of
# data were lost in the preprocessing step.

from torch_geometric.data import Data
from torch_geometric.utils import structured_negative_sampling


# Index bounds (low inclusive, high exclusive) of every split; they are used to build
# the boolean node masks expected by the torch geometric pipeline.
data_summary = {'train_with_val' : {'low' : 0, 'high': 9022},
                'train' : {'low' : 0, 'high' : 10190},
                'val' : {'low' : 9022, 'high' : 10190},
                'test' : {'low' : 10190, 'high' : 11261}}

# All-False template with one entry per node.
total_mask = torch.zeros(X.shape[0], dtype = torch.bool)

# One independent copy of the template per mask.
vtrain_mask, train_mask, val_mask, test_mask, eval_val = (total_mask.clone() for _ in range(5))

# Switch on the node range that belongs to each split.
for _mask, _split_name in ((vtrain_mask, 'train_with_val'),
                           (val_mask, 'val'),
                           (train_mask, 'train'),
                           (test_mask, 'test')):
  _bounds = data_summary[_split_name]
  _mask[_bounds['low']:_bounds['high']] = True

# eval_val covers everything from the start of 'train_with_val' to the end of 'val'.
eval_val[data_summary['train_with_val']['low']:data_summary['val']['high']] = True

kwargs = {'vtrain_mask':vtrain_mask, 'train_mask':train_mask, 'val_mask':val_mask, 'test_mask':test_mask}

# The masks are stored as extra attributes of the graph object.
data = Data(x=X, edge_index = A, **kwargs)


class data_split:
  ''' Alternative to the torch geometric masks procedure. It is needed at inference time,
      where the whole graph is required for the embedding computation.

      Attributes set by the constructor:
      * data:       the whole dataset (an object exposing `x` and `edge_index`).
      * rang:       torch.arange(low, high + 1) on the GPU if available, otherwise on the CPU.
      * x:          the node features in [low, high).
      * edge_index: the edges whose two endpoints both lie in [low, high).

      NOTE: `x` is sliced (so its rows are relative to `low`) while `edge_index` keeps the
      original node ids; the two are only aligned when low == 0. '''
  def __init__(self, data, low, high):
    ''' Select the samples of interest.
        * data: the whole dataset (nodes and edges).
        * low:  first node index of the interval (inclusive).
        * high: end of the interval (exclusive for x and edge_index). '''
    self.data = data
    self.rang = torch.arange(low, high + 1,1, device = 'cuda' if torch.cuda.is_available() else 'cpu')
    self.get_split_from_bounds(low, high)

  def get_split_from_bounds(self, low, high):
    ''' Store and return (x, edge_index) restricted to the nodes in [low, high),
        resembling the attributes of a torch geometric Data object. '''
    self.x = self.data.x[low:high]
    sources = self.data.edge_index[0]
    targets = self.data.edge_index[1]

    # An edge survives only if both of its endpoints are inside the interval.
    inside = (sources >= low) & (sources < high) & (targets >= low) & (targets < high)
    self.edge_index = self.data.edge_index[:, inside]

    return self.x, self.edge_index

  def split_for_inference(self, low_train, low_test, high_train, high_test):
    ''' At inference time the embedding is computed through the train and test artists, but
        the links between two test artists must not be used, because they have to be predicted.
        * low_train, high_train: bounds of the train set (accepted for interface compatibility, not used).
        * low_test, high_test:   bounds of the test set, [low_test, high_test).
        Returns (x, edge_index) where edge_index is a long tensor, on the same device as
        self.edge_index, without the test-test connections. Used to compute the accuracy. '''
    sources = self.edge_index[0]
    targets = self.edge_index[1]

    source_is_test = (sources >= low_test) & (sources < high_test)
    target_is_test = (targets >= low_test) & (targets < high_test)

    # A single boolean mask replaces the per-edge torch.cat loop; the edge order is preserved.
    keep = ~(source_is_test & target_is_test)

    return self.x, self.edge_index[:, keep].long()


## Define the architectures

* GraphSage-based architecture is the one defined in the original paper [Artist similarity with Graph Neural Networks](https://arxiv.org/pdf/2107.14541.pdf).  
  1. SAGEConv(2613,256)
  2. SAGEConv(256,256) *
  3. SAGEConv(256,256) *
  4. Linear(256,100)  
  5. TripletLoss(a,p,n)
  
The second and third layers are optional, depending on the desired configuration.

* The GAT-based architectures are the ones used for our own experiments; they are defined as follows:  

GAT1: 
  1. GATConv(2613, 256)    *Multi-head attention mechanism
  2. GATConv(256 * n_heads, 256) *Multi-head attention mechanism
  3. Linear(256 * n_heads, 256)  
  4. Linear(256, 256)
  5. TripletLoss(a,p,n)

GAT2:  
  1. Linear(2613, 256)
  2. Linear(256,256)
  3. Linear(256, 256) 
  4. GATConv(256 , 256) *Multi-head attention mechanism
  5. GATConv(256* n_heads, 256)    *Multi-head attention mechanism
  6. TripletLoss(a,p,n)

According to the paper's authors, the training was done with mini-batches of size 512. To do so, the torch-geometric framework offers the NeighborLoader class, which can shuffle the data and looks up the neighbors of each batch in a few seconds.

In [None]:
from torch_geometric.loader import NeighborLoader

In [None]:
lamb = 0.8 # This is the value used for the distance weighted sampling

def get_triplets(embedding, edges, batch_size, get_vecs = False):
  ''' The loss function to minimize is a triplet loss, thus a positive and a negative are
      needed for each anchor sample of the mini-batch.
      This function takes as input:
      * embedding:  the output of the GCN.
      * edges:      the edges for the mini-batch (a 2-row index tensor); it is NOT modified.
      * batch_size: the size for the batch; the anchors are the nodes 0 .. batch_size - 1.
      * get_vecs:   It is a boolean, if True the function returns the tensors of positives and
                    negatives, otherwise only the indices are returned.
      Returns a pair (positives, negatives): long index tensors on the embedding's device, or
      the corresponding embedding rows when get_vecs is True.
      An anchor without any sampled positive/negative uses itself as positive/negative. '''

  n_of_neigh = 10 # How many samples to consider for the sampling

  # Swap the two rows on a copy, so that the caller's tensor is left untouched.
  edges = edges[[1, 0]]

  total_ancor, total_pos, total_negs = structured_negative_sampling(edges)

  # Index tensors are created directly with the final dtype and device.
  positives = torch.zeros(batch_size, dtype = torch.long, device = embedding.device)
  negatives = torch.zeros(batch_size, dtype = torch.long, device = embedding.device)

  for ancor in range(batch_size):
    of_ancor = total_ancor == ancor
    pos = total_pos[of_ancor][:n_of_neigh]   # At most n_of_neigh positives sampled for the anchor
    neg = total_negs[of_ancor][:n_of_neigh]  # At most n_of_neigh negatives sampled for the anchor

    # Fall back to the anchor itself when there is nothing to choose from
    # (the original code compared with `==` here instead of assigning).
    if pos.shape[0] == 0:
      positives[ancor] = ancor
    else:
      positives[ancor] = compute_idx_p(embedding, pos, ancor)

    if neg.shape[0] == 0:
      negatives[ancor] = ancor
    else:
      negatives[ancor] = compute_idx_n(embedding, neg, ancor)

  if get_vecs:
    return embedding[positives], embedding[negatives] # Return the embedding vectors

  return positives, negatives                         # Or return the indices


def compute_idx_p(embedding, pos, ancor):
  ''' Distance weighted sampling of the positive for one anchor.
      * embedding: the embedding matrix, indexed by node.
      * pos:       tensor with the candidate positive indices (must not be empty).
      * ancor:     index of the anchor.
      Every candidate is weighted by its Euclidean distance from the anchor, normalised by the
      largest distance. Candidates whose normalised distance exceeds `lamb` are discarded one
      at a time (in insertion order) as long as more than one candidate is left; the furthest
      of the remaining candidates is returned. '''
  anchor_vec = embedding[ancor].unsqueeze(0)

  dist_by_idx = {}
  for candidate in pos:
    node = candidate.item()
    dist_by_idx[node] = pairwise_euclidean_distance(anchor_vec, embedding[node].unsqueeze(0))[0][0].item()

  # Guard against a division by zero when every candidate coincides with the anchor.
  furthest = max(dist_by_idx.values())
  scale = furthest if furthest != 0 else 1e-5

  # Iterate over a snapshot of the keys because entries are removed on the fly.
  for node in list(dist_by_idx):
    if len(dist_by_idx) != 1 and dist_by_idx[node] / scale > lamb:
      del dist_by_idx[node]

  return max(dist_by_idx, key = dist_by_idx.get)

def compute_idx_n(embedding, neg, ancor):
  ''' Distance weighted sampling of the negative for one anchor.
      * embedding: the embedding matrix, indexed by node.
      * neg:       tensor with the candidate negative indices (must not be empty).
      * ancor:     index of the anchor.
      Every candidate is weighted by its Euclidean distance from the anchor, normalised by the
      largest distance. Candidates whose normalised distance is below 1 - `lamb` are discarded
      one at a time (in insertion order) as long as more than one candidate is left; the
      closest of the remaining candidates is returned. '''
  anchor_vec = embedding[ancor].unsqueeze(0)

  dist_by_idx = {}
  for candidate in neg:
    node = candidate.item()
    dist_by_idx[node] = pairwise_euclidean_distance(anchor_vec, embedding[node].unsqueeze(0))[0][0].item()

  # Guard against a division by zero when every candidate coincides with the anchor.
  furthest = max(dist_by_idx.values())
  scale = furthest if furthest != 0 else 1e-5

  # Iterate over a snapshot of the keys because entries are removed on the fly.
  for node in list(dist_by_idx):
    if len(dist_by_idx) != 1 and dist_by_idx[node] / scale < 1 - lamb:
      del dist_by_idx[node]

  return min(dist_by_idx, key = dist_by_idx.get)


class Trainer:
  ''' This class contains all the methods needed to train, test, evaluate, save and load the model.

      It relies on names defined at notebook level: `train_loader`, `test_loader`, `data`, `A1`,
      `data_summary`, `device`, `get_triplets`, `data_split`, `load_model` and `save_model`. '''
  def __init__(self, model, optimizer, scheduler, loss, num_epochs, mode, path, first):
    ''' * model:      the model to train.
        * optimizer:  Adam optimizer.
        * scheduler:  Cosine learning rate scheduler, stepped once per epoch.
        * loss:       Triplet loss function, called as loss(anchor, positive, negative).
        * num_epochs: number of epochs for the training.
        * mode:       1 to evaluate on the validation split, anything else for the test split.
        * path:       where to save the model at the end of each epoch; if None the model is not saved.
        * first:      boolean, True when no checkpoint exists yet; when False the checkpoint at `path` is loaded. '''

    self.model = model
    self.optimizer = optimizer
    self.scheduler = scheduler
    self.loss = loss
    self.num_epochs = num_epochs
    self.mode = mode
    self.path = path
    self.first = first
    if self.first:
      self.checkpoint = {}
    else:
      self.checkpoint = load_model(self.path, self.model, device) # If a checkpoint for the model already exists we load it.

  def train(self):
    ''' Run self.num_epochs epochs; after each one compute the test loss and the accuracy,
        save the checkpoint (see end_epoch) and step the scheduler. '''
    for epoch in range(self.num_epochs):
      # test() and eval_accuracy() leave the model in eval mode, so the training
      # mode has to be restored at the beginning of every epoch.
      self.model.train()

      print("Processing {}-th epoch".format(epoch+1))
      print("Training step....")
      loss_train_list = []
      for batch in train_loader:
        self.optimizer.zero_grad()

        out = self.model(batch.x, batch.edge_index.to(device))

        # Clones are passed so that the sampling cannot touch the tensors of the batch.
        positives, negatives = get_triplets(out.clone(), batch.edge_index.clone(), batch.batch_size, get_vecs = True)
        out_l = out[:batch.batch_size]  # the first batch_size rows are used as anchors

        loss_train = self.loss(out_l, positives, negatives)
        loss_train.backward()
        self.optimizer.step()
        loss_train_list.append(loss_train.item())

      self.loss_test_f = self.test()
      self.loss_train_f = sum(loss_train_list)/len(loss_train_list)
      self.accuracy_on_test = self.eval_accuracy(self.mode)

      self.end_epoch()
      self.scheduler.step()

      print("At the {}-th epoch we have obtained: train_loss {:.6f} \t test_loss {:.6f} \t test_accuracy {:.6f}".format(epoch+1,self.loss_train_f, self.loss_test_f, self.accuracy_on_test))

  def test(self):
    ''' Return the mean triplet loss over the batches of test_loader (no gradients, eval mode). '''
    self.model.eval()
    with torch.no_grad():
      print("Testing step....")
      loss_test_list = []
      for batch in test_loader:

        out = self.model(batch.x, batch.edge_index.to(device))

        positives, negatives = get_triplets(out.clone(), batch.edge_index.clone(), batch.batch_size, get_vecs = True)

        out_l = out[:batch.batch_size]

        loss_test = self.loss(out_l, positives, negatives)
        loss_test_list.append(loss_test.item())

    return sum(loss_test_list)/len(loss_test_list)

  def calcG(self,ID):
    ''' Denominator used in the evaluation of the accuracy, as described in the paper:
        the sum of 1/log2(1 + c) for c = 1 .. min(ID, 200). Returns 0 when ID < 1. '''
    somm=0
    for c in range(1, int(min(ID, 200)) + 1):
        somm+=1/(math.log2(1+c))
    return somm

  def eval_accuracy(self, mode = 1):
    ''' This function computes the Normalized Discounted Cumulative Gain, that is the metric adopted in the research.
        * mode: 1 evaluates the 'val' split (trained on 'train_with_val'), any other value the 'test' split (trained on 'train').
        Returns the mean over the evaluated artists that have at least one connection inside
        the evaluated split, or 0.0 when there is none. '''
    if mode == 1:
      train_key, test_key = 'train_with_val', 'val'
    else:
      train_key, test_key = 'train', 'test'

    low_train = data_summary[train_key]['low']
    high_train = data_summary[train_key]['high']
    low_test = data_summary[test_key]['low']
    high_test = data_summary[test_key]['high']

    self.model.eval()
    with torch.no_grad():
      # The embedding is computed with the edges between train and test data, but without the
      # links between test samples, because they must be predicted in the evaluation.
      inference_data = data_split(data, low = low_train, high = high_test).split_for_inference(low_train, low_test, high_train, high_test) # This call takes care of the link removal.
      out = self.model(inference_data[0], inference_data[1].to(device))[torch.arange(low_test,high_test)]

      # Ground-truth connections restricted to the evaluated artists.
      A_acc = A1[low_test:high_test, low_test:high_test]

      print("Evaluation step....")
      test_embs = out.to(torch.device('cpu')).numpy()

      # 200 neighbours plus the query itself; capped so that small splits do not make the K-NN fail.
      n_neighbors = min(200 + 1, test_embs.shape[0])
      neigh=NearestNeighbors(n_neighbors=n_neighbors,algorithm='ball_tree').fit(test_embs)

      _, ind=neigh.kneighbors(test_embs)

      acc=[]
      for neighbours in ind:
          # The first neighbour returned for a row is used as the query artist.
          query = neighbours[0]

          ideal = A_acc[query].sum().item()
          den = self.calcG(ideal)
          if den==0:
              continue  # no ground-truth neighbour: the artist is skipped

          summ=0
          for rank, other in enumerate(neighbours[1:], start = 1):
              if A_acc[query][other].item()!=0:
                  summ+= 1/(math.log2(1+rank))

          acc.append(summ/den)

    # Avoid a division by zero when no artist could be evaluated.
    if not acc:
      return 0.0
    return sum(acc)/len(acc)

  def end_epoch(self):
    ''' This method is called at the end of each epoch and its purpose is to save the model state and
        the metrics, in order to be loaded again when needed. Nothing is done when self.path is None. '''
    if self.path is None:
      return

    history = (('loss_train', self.loss_train_f),
               ('loss_test', self.loss_test_f),
               ('accuracy', self.accuracy_on_test))

    for key, value in history:
      if self.first:
        self.checkpoint[key] = [value]     # first epoch: start a new history
      else:
        self.checkpoint[key].append(value) # later epochs: extend the stored history

    self.checkpoint['modelState'] = self.model.state_dict()
    save_model(self.checkpoint, self.path)
    self.first = False

    load_model(self.path, self.model, device)



    

In [None]:
set_mode = 'val'  # 'val' or 'test', in order to set up the specific training procedure

# Pick the masks and the evaluation mode that match the chosen procedure.
if set_mode == 'val':
  trainmask = data.vtrain_mask
  testmask = data.val_mask
  mode = 1
elif set_mode == 'test':
  trainmask = data.train_mask
  testmask = data.test_mask
  mode = 2
else:
  raise ValueError("set_mode must be 'val' or 'test', got {!r}".format(set_mode))
n_layers = 2
model =  Gat1Model = GAT1() #Gat2Model = GAT2()GraphSage(n_layers = n_layers) #
path = None                    #'models/GAT1.pt'   #Choose the path where to save the trained model
first = True

# NeighborLoader is really important when it comes to training with mini-batches: it easily
# provides the splits and the training is faster.
# The loaders use the masks selected above, so that 'test' mode really trains on the
# 'train' split and evaluates on the 'test' split.
# NOTE(review): the training loader does not shuffle while the test loader does — confirm this is intended.
train_loader = NeighborLoader(data, input_nodes = trainmask, num_neighbors=[25]*n_layers, shuffle = False, batch_size = 512, num_workers = 2, persistent_workers = True)
test_loader = NeighborLoader(data, input_nodes = testmask, num_neighbors=[25]*n_layers, shuffle = True, batch_size = 512, num_workers = 2, persistent_workers = True)



In [None]:
# Hyper-parameters and training objects; these module-level names are passed to Trainer below.
lr = 6e-5    # SAGE1: lr = 1e-5, wd = 1e-2 , SAGE2: lr = 5e-5, wd = 1e-2 , SAGE3: lr = 1e-5, wd = 1e-2 , GAT1: lr = 5e-5, wd = 1e-2.
weight_decay = 1e-2
num_epochs = 50
optimizer = torch.optim.Adam(model.parameters(), lr = lr, weight_decay = weight_decay)
# Cosine annealing over the whole training (T_max equals the number of epochs).
# NOTE(review): `verbose` is deprecated in recent PyTorch releases — confirm it is still accepted by the installed version.
scheduler=lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min= 0, last_epoch= -1, verbose=True)
loss = torch.nn.TripletMarginLoss(margin=0.2)
# The model is moved to the selected device before being handed to the trainer.
trainer = Trainer(model.to(device), optimizer, scheduler, loss, num_epochs, mode, path, first)


In [None]:
trainer.train()  # This cell starts the training.

# Results

* Now that the training runs are finished, we can load our models and see how they performed.

In [None]:
# Number of layers #
G1 = GraphSage(1)
G2 = GraphSage(2)
G3 = GraphSage(3)

Gat1 = GAT1()
Gat2 = GAT2()

def _load_entry(path, model):
  ''' Load the checkpoint stored at `path` for `model` only once and build its summary entry:
      the path, the model, the stored accuracy history and the whole checkpoint. '''
  checkpoint = load_model(path, model, device)
  return {'path': path, 'model': model, 'accuracy': checkpoint['accuracy'], 'checkpoint': checkpoint}

# One entry per trained architecture.
diz_of_models = {'G1': _load_entry("./models/one_layerSAGE.pt", G1),
                 'G2': _load_entry("./models/two_layerSAGE.pt", G2),
                 'G3': _load_entry("./models/three_layerSAGE.pt", G3),
                 'GAT1': _load_entry("./models/GAT1.pt", Gat1),
                 'GAT2': _load_entry("./models/GAT2.pt", Gat2)}

* We can see how the GAT architectures, under the same conditions, are able to reach definitely better results w.r.t. GraphSAGE.

In [None]:
#@title
# One bar per architecture, showing the last accuracy value stored in its checkpoint.
_bar_specs = (('GraphSAGE1', 'G1'),
              ('GraphSAGE2', 'G2'),
              ('GraphSAGE3', 'G3'),
              ('GAT1', 'GAT1'),
              ('GAT2', 'GAT2'))

fig = go.Figure(data=[
    go.Bar(name=_label,
           y=[diz_of_models[_key]['accuracy'][-1]],
           text=round(diz_of_models[_key]['accuracy'][-1], 4),
           textposition='auto')
    for _label, _key in _bar_specs
])

_yaxis_style = dict(title='Normalized Discounted Cumulative Gain', titlefont_size=16, tickfont_size=14)
_legend_style = dict(x=0, y=1.0, bgcolor='rgba(255, 255, 255, 0)', bordercolor='rgba(255, 255, 255, 0)')

fig.update_layout(
    title='Performances over the different architectures',
    xaxis_tickfont_size=14,
    yaxis=_yaxis_style,
    legend=_legend_style,
    bargap=0.15,      # gap between bars of adjacent location coordinates.
    bargroupgap=0.1,  # gap between bars of the same location coordinate.
    width=1000,
    height=700,
)
fig.show()

In [None]:
#@title
# Loss histories: 1-5 are the test losses, 6-10 the train losses (SAGE1, SAGE2, SAGE3, GAT1, GAT2).
loss1, loss2, loss3, loss4, loss5 = (diz_of_models[_key]['checkpoint']['loss_test'] for _key in ('G1', 'G2', 'G3', 'GAT1', 'GAT2'))
loss6, loss7, loss8, loss9, loss10 = (diz_of_models[_key]['checkpoint']['loss_train'] for _key in ('G1', 'G2', 'G3', 'GAT1', 'GAT2'))

# Accuracy histories, in the same order.
acc1, acc2, acc3, acc4, acc5 = (diz_of_models[_key]['checkpoint']['accuracy'] for _key in ('G1', 'G2', 'G3', 'GAT1', 'GAT2'))


# The longest loss history fixes the epochs axis shared by all the curves.
max_length = max(len(_history) for _history in (loss1, loss2, loss3, loss4, loss5, loss6, loss7, loss8, loss9, loss10))

# First figure: train vs test loss.
fig = go.Figure()
for _name, _values, _mode in (('SAGE1 (test)', loss1, 'lines+markers'),
                              ('SAGE2 (test)', loss2, 'lines+markers'),
                              ('SAGE3 (test)', loss3, 'lines+markers'),
                              ('GAT1 (test)', loss4, 'lines+markers'),
                              ('GAT2 (test)', loss5, 'lines+markers'),
                              ('SAGE1 (train)', loss6, 'lines'),
                              ('SAGE2 (train)', loss7, 'lines'),
                              ('SAGE3 (train)', loss8, 'lines'),
                              ('GAT1 (train)', loss9, 'lines'),
                              ('GAT2 (train)', loss10, 'lines')):
  fig.add_trace(go.Scatter(x=list(range(max_length)), y=_values,
                           mode=_mode, name=_name, showlegend = True))

fig.update_layout(title='Comparison of the training results: Loss function values and Accuracy values',
                  xaxis_title='Epochs',
                  yaxis_title='Train vs Test Loss',
                  width = 1200, height = 700)

fig.show()

# Second figure: accuracy per epoch.
fig1 = go.Figure()
for _name, _values in (('SAGE1 (test)', acc1),
                       ('SAGE2 (test)', acc2),
                       ('SAGE3 (test)', acc3),
                       ('GAT1 (test)', acc4),
                       ('GAT2 (test)', acc5)):
  fig1.add_trace(go.Scatter(x=list(range(max_length)), y=_values,
                            mode='lines+markers', name=_name, showlegend = True))

fig1.update_layout(xaxis_title='Epochs',
                   yaxis_title='Normalized Discounted Cumulative Gain',
                   width = 1200, height = 700)

fig1.show()