# Artist Similarity with Graph Neural Network 1st Notebook

In this notebook are implemented the experiments that were previously conducted in the paper  ['Artist Similarity with Graph Neural Network'](https://archives.ismir.net/ismir2021/paper/000043.pdf).  
In addition to the architectures described in the research there are also experiments with the famous Graph Attention layer, in order to show how this approach can sharply outperform the GraphSAGE configuration.  
In this notebook are shown the performances from the different embeddings of artists.

In [1]:
# !pip install torchmetrics
# !pip install torch-scatter -f https://data.pyg.org/whl/torch-1.12.0+cu113.html
# !pip install torch-sparse -f https://data.pyg.org/whl/torch-1.12.0+cu113.html
# !pip install torch-geometric
# !pip install pytorch_metric_learning

In [1]:
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import json
import torch.nn as nn
import torch.nn.functional as F
from torchmetrics.functional import pairwise_euclidean_distance
from torch_geometric.nn import GATConv, SAGEConv
from torch.optim import lr_scheduler
import random
from random import choice,randrange
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
import math
import time
from torch_geometric import seed_everything
from sklearn.manifold import TSNE
import pandas as pd
from pytorch_metric_learning import losses
from utils import *
from sklearn.metrics import f1_score


random_seed=280085

seed_everything(random_seed)

2.5.1+cu118


In [2]:
import torch_geometric
print(torch_geometric.__version__)

2.6.1


In [3]:
from utils import *  #In this files are reported the most useful functions
from architectures import *

2.5.1+cu118


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


With the help of the [Torch geometric framework](https://pytorch-geometric.readthedocs.io/en/latest/) was really easy to handle the graph attributes and nodes and then the training of the GNNs.

In [6]:
random = False
if random == True:
  A1 = torch.load('adjacency').to(device)      # Normal adjacency matrix format is obtained with torch.load('adjacency')
  X = torch.load('random_instance').to(device)
else:
  X = torch.load('instance').T.to(device)      # Instance matrix
  A1 = torch.load('adjacency').to(device) 

A = torch.load('adjacencyCOO').to(device)    # Adjacency matrix in the COO format, that is that supported by torch geometric
filt = True


art_of_interest = torch.load('intrst_artists.pt')
labels = torch.load('labels.pt')
if filt:
  X = X[art_of_interest].detach()
  A1 = A1[art_of_interest, :][:, art_of_interest].detach()
  labels = labels[art_of_interest].detach()
  A = torch.nonzero(A1).T.type(torch.LongTensor).detach()

label_diz = load_data('data/encode_labels.json') # Genre to idx
label_diz2 = {label_diz[key] : key for key in label_diz} # Idx to genre
artists = load_data('data/artist_genres.json') # Artist names
family_diz = load_data('data/family_diz.json')


 
num_classes = torch.unique(labels).shape[0]


num_samples = X.shape[0]
print('The number of samples and classes is {} and {} respectively'.format(num_samples, num_classes))

  X = torch.load('instance').T.to(device)      # Instance matrix
  A1 = torch.load('adjacency').to(device)


The number of samples and classes is 10986 and 25 respectively


  A = torch.load('adjacencyCOO').to(device)    # Adjacency matrix in the COO format, that is that supported by torch geometric
  art_of_interest = torch.load('intrst_artists.pt')
  labels = torch.load('labels.pt')


In [7]:
''' These variables contain the information about the artists' names, and their position in the dataset, this makes easy to look for their name and to better draw conclusions at inference time '''
num2artist = load_data('data/dizofartist.json')
artist2num = {num2artist[key]:key for key in num2artist}

# print(num2artist)
# print(artist2num)

## Import the data with Torch geometric:


In [8]:
''' In order to conduct the experiments was fundamental to split the dataset, either the nodes and also the edges.
    The splitting was performed according to the information in the paper, and considering the fact that a lot of date were lost in the preprocessing part.'''

from torch_geometric.data import Data
from torch_geometric.utils import structured_negative_sampling


##############################################

data_for_train_ = int(len(art_of_interest)*0.8) if filt == True else 9022
data_for_train = int(len(art_of_interest)*0.8) + int(len(art_of_interest)*0.1) if filt == True else 10190
fin_val = len(art_of_interest) if filt == True else 11261


''' This variable contains the indices for the splitting, that are necessary to compute the masks, according to the torch geometric pipeline '''
data_summary = {'train_with_val' : {'low' : 0, 'high': data_for_train_},
                'train' : {'low' : 0, 'high' : data_for_train},
                'val' : {'low' : data_for_train_, 'high' : data_for_train},
                'test' : {'low' : data_for_train, 'high' : fin_val}}



total_mask = torch.zeros(X.shape[0], dtype = torch.bool)


vtrain_mask = total_mask.clone()
train_mask = total_mask.clone()
val_mask = total_mask.clone()
test_mask = total_mask.clone()
eval_val = total_mask.clone()


vtrain_mask[data_summary['train_with_val']['low']:data_summary['train_with_val']['high']] = True
val_mask[data_summary['val']['low']:data_summary['val']['high']] = 1
train_mask[data_summary['train']['low']:data_summary['train']['high']] = 1
test_mask[data_summary['test']['low']:data_summary['test']['high']] = 1

eval_val[data_summary['train_with_val']['low']:data_summary['val']['high']] = 1

kwargs = {'vtrain_mask':vtrain_mask, 'train_mask':train_mask, 'val_mask':val_mask, 'test_mask':test_mask}



data = Data(x=X.to(device), edge_index = A.to(device), y = labels.to(device), **kwargs)


class data_split:
  ''' This class shows an alternative to the torch geometric masks procedure, it was necessary at inference time, where was needed the whole graph for the embedding compuutation '''
  def __init__(self, data, low, high):
    ''' Starting from the index 0 to 11260, we choose the interval of intersting samples
        self.data: contains the whole dataset (nodes and edges)
        self.rang: define the boundaries
        self.get_split_from_bounds perform the splitting, returning a x and edge_index attribute resembling the torch geometric Data objects.'''
    self.data = data
    self.rang = torch.arange(low, high + 1,1, device = 'cuda' if torch.cuda.is_available() else 'cpu')
    self.get_split_from_bounds(low, high)

  def get_split_from_bounds(self, low, high):
    self.x= self.data.x[low:high]
    v1_0 = self.data.edge_index[0]
    v2_0 = self.data.edge_index[1]
    v1_1 = v1_0[v1_0 < high]
    v1_2 = v1_1[v1_1 >= low]

    v2_1 = v2_0[v1_0 < high]
    v2_2 = v2_1[v1_1 >= low]
    v2_3 = v2_2[v2_2 < high]
    v2_4 = v2_3[v2_3 >= low]
    v1_3 = v1_2[v2_2 < high]
    v1_4 = v1_3[v2_3 >= low]

    self.edge_index = torch.cat((v1_4.unsqueeze(0), v2_4.unsqueeze(0)), dim = 0)
    
    return self.x, self.edge_index

  def split_for_inference(self, low_train, low_test, high_train, high_test):
    ''' At inference time we need to compute the embedding through the train and test artists, but we don want to consider the linkings between the test artist, those must be predicted.
        This function takes as input the boundaries of the train, and test set, computes the edge indices by removing the undesired connection.
        This method will be used to compute the accuracy. '''
    
    final_edge_indices = torch.tensor([[],[]], device = 'cuda' if torch.cuda.is_available() else 'cpu')
    for edge in range(self.edge_index.shape[1]):
      up = self.edge_index[0][edge].item()
      down = self.edge_index[1][edge].item()

      if up in range(low_test,high_test) and down in range(low_test, high_test): # If the connection is between test artist we remove it from the edge indices.
        continue

      else:
        final_edge_indices = torch.cat((final_edge_indices, self.edge_index[:,edge].reshape((2,1))), dim = 1)

    if device.type == 'cuda':
      return self.x, final_edge_indices.type(torch.cuda.LongTensor)
    else:
      return self.x, final_edge_indices.type(torch.LongTensor)


  
# visualize(torch.randn((11261, 2)), labels)


In [9]:


def visualize(h, color, dim = 2):

    if not filt:
        h = h[art_of_interest]
        color = color[art_of_interest]

    z = TSNE(n_components = dim).fit_transform(h.detach().cpu().numpy())
    # z = h
    if dim == 2:
        p = pd.DataFrame(columns = ['x1', 'x2', 'y'])
        p['x1'] = z[:, 0]
        p['x2'] = z[:, 1]
    elif dim == 3:
        p = pd.DataFrame(columns = ['x1', 'x2', 'x3', 'y'])
        p['x1'] = z[:, 0]
        p['x2'] = z[:, 1]
        p['x3'] = z[:, 2]


    p['y'] = [label_diz[str(int(num.item()))] for num in color]
    p['family'] = [family_diz[str(int(num.item()))] for num in color]

    if not filt:
        p['name'] = [artists[str(int(num))][1] for num in range(color.shape[0])]
    else:
        p['name'] = [artists[str(int(art_of_interest[num].item()))][1] for num in range(color.shape[0])]

    if not filt:
        p['old genre'] = [[list(artists[str(int(num))][0][genre].values())[0] for genre in range(len(artists[str(int(num))][0]))] for num in range(color.shape[0])]
    else:
        p['old genre'] = [[list(artists[str(int(art_of_interest[num].item()))][0][genre].values())[0] for genre in range(len(artists[str(int(art_of_interest[num].item()))][0]))] for num in range(color.shape[0])]

    
    if dim == 2:
        fig = px.scatter(p, x="x1", y="x2", color="family", hover_data = ['name', 'old genre', 'y'], color_discrete_sequence=px.colors.qualitative.Light24)
    elif dim == 3:
        fig = px.scatter_3d(p, x="x1", y="x2", z = "x3", color="family", hover_data = ['name', 'old genre', 'y'], color_discrete_sequence=px.colors.qualitative.Light24)
        fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))

    fig.show()
    return z, p


## Define the architectures

* GraphSage-based architecture is the one defined in the original paper [Artist similarity with Graph Neural Networks](https://arxiv.org/pdf/2107.14541.pdf).  
  1. SAGEConv(2613,256)
  2. SAGEConv(256,256) *
  3. SAGEConv(256,256) *
  4. Linear(256,100)  
  5. TripletLoss(a,p,n)
  
The second and third layer are optionals based on the desired configuration.

* The GAT-based architectures are the one used for our own experiments, they are defined as follows:  

GAT1: 
  1. GATConv(2613, 256)    *Multi-head attention mechanism
  2. GATConv(256 * n_heads, 256) *Multi-head attention mechanism
  3. Linear(256 * n_heads, 256)  
  4. Linear(256, 256)
  5. TripletLoss(a,p,n)

GAT2:  
  1. Linear(2613, 256)
  2. Linear(256,256)
  3. Linear(256, 256) 
  4. GATConv(256 , 256) *Multi-head attention mechanism
  5. GATConv(256* n_heads, 256)    *Multi-head attention mechanism
  6. TripletLoss(a,p,n)

According to the paper's authors the training was done with the aid of mini-batches of 512 size, in order to do so the torch-geometric framework offer the NeighborLoader method, that shuffles the data and look for the neighbors of each batch in a few seconds.

In [10]:
from torch_geometric.loader import NeighborLoader


In [11]:
lamb = 0.8 # This is the value used for the distance weighted sampling

def get_triplets(embedding, edges, batch_size, get_vecs = False, label_idx = None):
  ''' The loss function to minimize is  a triplet loss function, thus we need to look for positives and negatives for each sample in the batches.
      This function takes as input:
      * embedding:  the output of the GCN.
      * edges:      the edges for the mini-batch.
      * label_idx:  if class are available, this is chosen to improve the triplet selection
      * batch_size: the size for the batch.
      * get_vecs:   It is a boolean, if True the function returns the tensor of positives and negatives, otherwise only the indices are returned.  '''

  n_of_neigh = 25 # How many samples to consider for the sampling
  edges_n = edges.clone()
  edges[0] = edges_n[1]
  edges[1] = edges_n[0]
  
  total_triplets = structured_negative_sampling(edges)
  total_ancor = total_triplets[0]
  total_pos = total_triplets[1]
  total_negs = total_triplets[2]

 
  shape = batch_size
  positives = torch.zeros(shape)
  negatives = torch.zeros(shape)
  for ancor in range(shape):
    pos = total_pos[total_ancor == ancor] # We get the positives that are also neighbors for the anchor
    
    # pos = pos[pos < batch_size]  
    # if pos.shape[0] > n_of_neigh:
    #   pos = pos[:n_of_neigh]
    neg = total_negs[total_ancor == ancor] # We get the negatives that are also neighbors for the anchor
    # neg = neg[neg < batch_size]  

    # if neg.shape[0] > n_of_neigh:
    #   neg = neg[:n_of_neigh]
    p = 1
    n = 1

    if pos.shape[0] == 0:
      positives[ancor] == ancor
      p = 0
    
    if neg.shape[0] == 0:
      negatives[ancor] == ancor
      n = 0
    
    if p:

      pos_index = compute_idx_p(embedding, pos, ancor, label_idx)
      positives[ancor] = pos_index
    if n:
      neg_index = compute_idx_n(embedding, neg, ancor, label_idx)
      negatives[ancor] = neg_index
  
  
  if device.type == 'cuda':
    positives = positives.type(torch.cuda.LongTensor) 
    negatives = negatives.type(torch.cuda.LongTensor)
  else:
    positives = positives.type(torch.LongTensor)
    negatives = negatives.type(torch.LongTensor)

  
  if get_vecs:
    return embedding[positives], embedding[negatives] #Return the embedding vectors

  else:
    return positives, negatives                       # Or return the list of indices


def compute_idx_p(embedding, pos, ancor, label_idx):
# This function performs the distance weighted sampling.
# We look for all positives for the anchor sample and we weight their distance from it. We choose then the 'hard positive' namely one of the furthest positives for the anchor
  diz_pos_ = {}
  #print(pos)
  
  for idx in pos:
    if label_idx != None:
      # print(ancor, idx)
      # print(label_idx.shape)
      # print(pos)

      # if family_diz[str(label_idx[ancor].item())] != family_diz[str(label_idx[idx].item())]:
      #     continue
      if label_idx[ancor].item() != label_idx[idx].item():
          continue
    diz_pos_[idx.item()] = pairwise_euclidean_distance(embedding[ancor].unsqueeze(0), embedding[idx.item()].unsqueeze(0))[0][0].item()
  
  if len(diz_pos_) == 0:
    for idx in pos:

      # print('pos: ', pos.shape)


      # print(family_diz[str(label_idx[ancor].item())], family_diz[str(label_idx[idx].item())])

      diz_pos_[idx.item()] = pairwise_euclidean_distance(embedding[ancor].unsqueeze(0), embedding[idx.item()].unsqueeze(0))[0][0].item()

  
  max_dist = max(list(diz_pos_.values())) if max(list(diz_pos_.values()))!=0 else 1e-5
  keys = list(diz_pos_.keys())

  for key in keys:
    if diz_pos_[key]/max_dist > lamb and len(diz_pos_) != 1:
      diz_pos_.pop(key)


  return max(diz_pos_,key=diz_pos_.get)

def compute_idx_n(embedding, neg, ancor, label_idx = None):
# This function performs the distance weighted sampling.
# We look for all negatives for the anchor sample and we weight their distance from it. We choose then the 'hard negative' namely one of the closest negative for the anchor 
    diz_neg_ = {}
    #print(neg)
    for idx in neg:
      if label_idx != None:
        # if family_diz[str(label_idx[ancor].item())] == family_diz[str(label_idx[idx].item())]:
        #   continue
        if label_idx[ancor].item() == label_idx[idx].item():
          continue
      diz_neg_[idx.item()] = pairwise_euclidean_distance(embedding[ancor].unsqueeze(0), embedding[idx.item()].unsqueeze(0))[0][0].item()

    if len(diz_neg_) == 0:
      diz_neg_[ancor] = 0

    max_dist = max(list(diz_neg_.values())) if max(list(diz_neg_.values()))!=0 else 1e-5
    keys = list(diz_neg_.keys())
    for key in keys:
      if diz_neg_[key]/max_dist < 1 - lamb and len(diz_neg_) != 1:
        diz_neg_.pop(key)


    return min(diz_neg_,key=diz_neg_.get)


class Trainer:
  ''' This class contains all the method needed to train, test, evaluate, save and load the model.''' 
  def __init__(self, model, optimizer, scheduler, loss, num_epochs, mode, path, first, loss_mode = 'triplet'):

    self.model = model  # model class
    self.optimizer = optimizer # Adam optimizer
    self.scheduler = scheduler # Cosine learning rate scheduler
    self.loss = loss           # Triplet Loss function
    self.num_epochs = num_epochs # Num of epochs for the training
    self.mode = mode             # This must be specified to know what splitting to use in the model evaluation
    self.path = path             # This is the path where to save the model, if None the model won't be saved, otherwise it will be at the end of each epoch.
    self.first = first
    self.loss_mode = loss_mode

    if self.loss_mode == 'sphere':
      self.loss_optimizer = torch.optim.Adam(model.parameters(), lr = lr, weight_decay = weight_decay)

    if self.loss_mode == 'triplet+':
      self.loss2 = nn.CrossEntropyLoss()
      self.predictor = Predictor(n_heads).to(device)
      self.pred_opt = torch.optim.Adam(self.predictor.parameters(), lr = 0.01, weight_decay = weight_decay)
      self.scheduler_pred = lr_scheduler.CosineAnnealingLR(self.pred_opt, T_max=num_epochs, eta_min= 0, last_epoch= -1, verbose=True)


    if self.first != True:
      self.checkpoint = load_model(self.path, self.model, device) # If exists already a checkpoint for the model we load it.
    else:
      self.checkpoint = {}



  def train(self):
    self.model.train()

    for epoch in range(self.num_epochs):
      print("Processing {}-th epoch".format(epoch+1))
      print("Training step....")
      loss_train_list = []
      for batch in train_loader:
        self.optimizer.zero_grad()
        if self.loss_mode == 'triplet+':
          self.pred_opt.zero_grad()
        if self.loss_mode == 'sphere':
          self.loss_optimizer.zero_grad()

        out = self.model(batch.x, batch.edge_index.to(device))
        out_l = out[:batch.batch_size]

        if self.loss_mode.startswith('triplet'):
          positives, negatives = get_triplets(out.clone(), batch.edge_index.clone(), batch.batch_size, get_vecs = True, label_idx = batch.y.to(device) if self.loss_mode == 'triplet+' else None)
          


          loss_train = self.loss(out_l, positives, negatives)
          if self.loss_mode == 'triplet+':
            out_n = self.predictor(out_l)
            
            loss2 = self.loss2(out_n, batch.y[:batch.batch_size].to(device)) 

            loss_train +=  1 * loss2
          
        elif self.loss_mode == 'sphere':
    

          loss_train = self.loss(out_l, batch.y[:batch.batch_size].to(device))
          
        loss_train.backward()
        self.optimizer.step()
        if self.loss_mode == 'triplet+':
          self.pred_opt.step()
        if self.loss_mode == 'sphere':
          self.loss_optimizer.step()
        loss_train_list.append(loss_train.item())
        

      if self.loss_mode != 'triplet+':
        if self.loss_mode == 'sphere':
          self.loss_test_f, f1_s = self.test()
        else:
          self.loss_test_f = self.test()

      else:
        self.loss_test_f, self.loss_pred, f1_s = self.test()

      self.loss_train_f = sum(loss_train_list)/len(loss_train_list)
      self.accuracy_on_test = self.eval_accuracy(self.mode)

      self.end_epoch()
      self.scheduler.step()
      if self.loss_mode == 'triplet+':
        self.scheduler_pred.step()


      if self.loss_mode == 'triplet':
        print("At the {}-th epoch we have obtained: train_loss {:.6f} \t test_loss {:.6f} \t test_accuracy {:.6f}".format(epoch+1,self.loss_train_f, self.loss_test_f, self.accuracy_on_test))
        
      elif self.loss_mode == 'sphere':
        print("At the {}-th epoch we have obtained: train_loss {:.6f} \t test_loss {:.6f} \t test_accuracy {:.6f} \t f1_score {:.6f}".format(epoch+1,self.loss_train_f, self.loss_test_f, self.accuracy_on_test, f1_s))
      else:
        print("At the {}-th epoch we have obtained: train_loss {:.6f} \t test_loss {:.6f} \t test_loss_pred {:.6f} \t test_accuracy {:.6f} \t f1_score {:.6f}".format(epoch+1,self.loss_train_f, self.loss_test_f, self.loss_pred, self.accuracy_on_test, f1_s))

    if self.loss_mode == 'triplet+':
      return self.accuracy_on_test, f1_s
    return self.accuracy_on_test
    

  def test(self):
    self.model.eval()
    with torch.no_grad():
      print("Testing step....")
      loss_test_list = []
      loss_pred = []
      f1_list_pred = torch.tensor([], device = device)
      f1_list_true = torch.tensor([], device = device)

      for batch in test_loader:

        out = self.model(batch.x, batch.edge_index.to(device))
        out_l = out[:batch.batch_size]

        if self.loss_mode.startswith('triplet'):
            positives, negatives = get_triplets(out.clone(), batch.edge_index.clone(), batch.batch_size, get_vecs = True, label_idx = batch.y.to(device) if self.loss_mode == 'triplet+' else None)
            loss_test = self.loss(out_l, positives, negatives)
        
        if self.loss_mode == 'triplet+':
          out_n = self.predictor(out_l)
          loss2 = self.loss2(out_n, batch.y[:batch.batch_size].to(device))
          loss_pred.append(loss2)
          # f1_s = self.compute_f1_score(batch.y[:batch.batch_size], out_n)
          f1_list_pred = torch.cat([f1_list_pred, torch.argmax(F.softmax(out_n, dim = -1), dim = -1)])
          f1_list_true = torch.cat([f1_list_true, batch.y[:batch.batch_size]])



        elif self.loss_mode == 'sphere':
          loss_test = self.loss(out_l, batch.y[:batch.batch_size].to(device))
          f1_list_pred = torch.cat([f1_list_pred, torch.argmax(F.softmax(out_l, dim = -1), dim = -1)])
          f1_list_true = torch.cat([f1_list_true, batch.y[:batch.batch_size]])
        
        loss_test_list.append(loss_test.item())

    if self.loss_mode != 'triplet+':

      if self.loss_mode == 'sphere':
        f1_s = self.compute_f1_score(f1_list_true, f1_list_pred)
        return sum(loss_test_list)/len(loss_test_list), f1_s
      
      return sum(loss_test_list)/len(loss_test_list) 
    else:
      f1_s = self.compute_f1_score(f1_list_true, f1_list_pred)
      return sum(loss_test_list)/len(loss_test_list), sum(loss_pred)/len(loss_pred), f1_s

  def calcG(self,ID):  #This method is used for the evaluation of accuracy, in particular it computes the denominator
    if ID>200:       # as described in the paper.
        ID=200
    c=1
    somm=0
    while c<=ID:
        somm+=1/(math.log2(1+c))
        c+=1
    return somm
  
  def compute_f1_score(self, true, pred, avg = 'macro'):

    pred = pred.detach().cpu().numpy()
    true = true.cpu().numpy()

    f1_s = f1_score(true, pred, labels = np.arange(num_classes), average = avg)

    return f1_s

  def eval_accuracy(self, mode = 1): 
    ''' This function computes the Normalized Discounted Cumulative Gain, that is the metric adopted in the research. '''
    if mode == 1:
      low_train = data_summary['train_with_val']['low']
      high_train = data_summary['train_with_val']['high']
      low_test = data_summary['val']['low']
      high_test = data_summary['val']['high']

    else:
      low_train = data_summary['train']['low']
      high_train = data_summary['train']['high']
      low_test = data_summary['test']['low']
      high_test = data_summary['test']['high']      


    self.model.eval()
    with torch.no_grad():
      ''' It is necessary to compute the embedding by condisering the edges between train and test data, but without considering the linkings between test samples, because they ,must be predicted in the evaluation. '''
      inference_data = data_split(data, low = low_train, high = high_test).split_for_inference(low_train, low_test, high_train, high_test) # This function takes care of the link remotion.
      out = self.model(inference_data[0], inference_data[1].to(device))[torch.arange(low_test,high_test)]

      A_acc = A1[low_test:high_test, low_test:high_test]

      print("Evaluation step....")
      test_embs = out.to(torch.device('cpu')).numpy()
      
      neigh=NearestNeighbors(n_neighbors=(200+1),algorithm='ball_tree').fit(test_embs) #With the K-NN we get the nearest 
      
      dist,ind=neigh.kneighbors(test_embs) 

      acc=[]

      c=0
      for test_artist in range(high_test-low_test):
          summ=0
          # ideal=len([i for i in range(self.test[0],self.test[0]+A_acc[k,:].shape[0]) if A_acc[k,i]!=0]) 
          
          ideal = A_acc[ind[c][0]].sum().item()
          
          
          
          den = self.calcG(ideal)
          if den==0:
              c+=1
              continue  
          for j in range(len(ind[c][1:])):
              if A_acc[ind[c][0]][ind[c][1:][j]].item()!=0:
                  summ+= 1/(math.log2(1+(j+1)))
                  
              else:
                  continue
          c+=1    
          summ/=den
          acc.append(summ)
      return sum(acc)/len(acc)

  def end_epoch(self):

    ''' This method is called at the end of each epoch and its purpose is to save the model state, and metrics, in order to be loaded again when needed.'''
    if self.path != None:
      if self.first:
        self.checkpoint['loss_train'] = [self.loss_train_f]
        self.checkpoint['loss_test'] = [self.loss_test_f]
        self.checkpoint['accuracy'] = [self.accuracy_on_test]
        self.checkpoint['modelState'] = self.model.state_dict()
        if self.loss_mode == 'triplet+':
          self.checkpoint['modelState2'] = self.predictor.state_dict()
        save_model(self.checkpoint, self.path)
        self.first = False
      else:
        self.checkpoint['loss_train'] += [self.loss_train_f]
        self.checkpoint['loss_test'] += [self.loss_test_f]
        self.checkpoint['accuracy'] += [self.accuracy_on_test]
        self.checkpoint['modelState'] = self.model.state_dict()
        if self.loss_mode == 'triplet+':
          self.checkpoint['modelState2'] = self.predictor.state_dict()
        save_model(self.checkpoint, self.path)

      load_model(self.path, self.model, device)



    

In [12]:
set_mode = 'test'  # 'val' or 'test', in order to set up the specific training procedure

data = data

if set_mode == 'val':
  trainmask = data.vtrain_mask
  testmask = data.val_mask
  mode = 1
elif set_mode == 'test':
  trainmask = data.train_mask
  testmask = data.test_mask
  mode = 2


#Gat2Model = GAT2(n_heads = 1) #, GraphSage(n_layers = n_layers), FCL() #
first = True

''' NeighborLoader is really important when it comes to train with mini-batches, it provides easily the splittings and the training is faster. '''

' NeighborLoader is really important when it comes to train with mini-batches, it provides easily the splittings and the training is faster. '

In [13]:
model_name = 'GAT2'
seed_everything(30726)
n_heads = 1

lr = 6e-5

n_layers = 2
weight_decay = 0#1e-2 #1e-4
num_epochs = 20
model = GATSY(n_heads, n_layers).to('cpu') # GraphSage(), FCL()
# visualize(model(X.to('cpu'), A.to('cpu')), color = labels, dim = 3)
# visualize(model(X.to('cpu'), A.to('cpu')), color = labels, dim = 2)


model.to(device)
X.to(device)
A.to(device)
labels.to(device)
loss_mode = 'triplet+'

if loss_mode == 'triplet' or loss_mode == 'triplet+':
    loss = torch.nn.TripletMarginLoss(margin=0.2)
elif loss_mode == 'sphere':
    loss = losses.SphereFaceLoss(num_classes, 
                    256*n_heads, 
                    margin=100, 
                    scale=1)

train_loader = NeighborLoader(data, input_nodes = data.vtrain_mask, num_neighbors=[25]*n_layers, shuffle = True, batch_size = 512)
test_loader = NeighborLoader(data, input_nodes = data.val_mask, num_neighbors=[25]*n_layers, shuffle = False, batch_size = 512)

    
optimizer = torch.optim.Adam(model.parameters(), lr = lr, weight_decay = weight_decay)
scheduler=lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min= 0, last_epoch= -1, verbose=True)

path =  './models/GATSY.pt'#'./models/best_gat_random.pt'
trainer = Trainer(model.to(device), optimizer, scheduler, loss, num_epochs, mode, path, first, loss_mode)
accuracy = trainer.train()  # This cell make the training start.






Processing 1-th epoch
Training step....


KeyboardInterrupt: 

In [25]:
h = GATSYFC(1, 2)

load_model('models/GATSY.pt', h, 'cpu')

{'loss_train': [2.4773150417539807,
  1.748911162217458,
  1.5860115885734558,
  1.5091288553343878,
  1.4460652603043451,
  1.432824108335707,
  1.3829878436194525,
  1.3396363523271348,
  1.3344598809878032,
  1.2859657605489094,
  1.2468823856777616,
  1.2122200263871088,
  1.2186521689097087,
  1.163508481449551,
  1.1488256123330858,
  1.1161361237366993,
  1.105948805809021,
  1.08142974641588,
  1.075261394182841,
  1.0762389202912648],
 'loss_test': [0.026277852555116017,
  0.023726988040531676,
  0.0273049542059501,
  0.027900919939080875,
  0.019417823059484363,
  0.02236774160216252,
  0.01691449185212453,
  0.013937514973804355,
  0.030375607932607334,
  0.028609815364082653,
  0.018687915056943893,
  0.024205774689714115,
  0.027689316620429356,
  0.011943826529507836,
  0.01106167157801489,
  0.012547614130501946,
  0.010203591004634896,
  0.010696170773978034,
  0.01600884273648262,
  0.010127008271714052],
 'accuracy': [0.5304588944550673,
  0.5384717021863155,
  0.5528

In [27]:
z, p = visualize(h(X.to('cpu'), A.to('cpu')), labels)


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



In [23]:
torch.save(z, 'embeds/embedding.pt')

In [None]:
lista_seeds = [int(np.random.uniform(0,100000)) for i in range(10)]


In [21]:
import requests
model_name = 'GAT2_filtered_dataset_alltheneighbors'


n_heads = 1

lr = 6e-5


weight_decay = 1e-2#0 #0  #learning rate is realted with the f1-score, whereas the  
num_epochs = 50
loss_mode = 'triplet'

if loss_mode == 'triplet' or loss_mode == 'triplet+':
    loss = torch.nn.TripletMarginLoss(margin=0.2)

for seed in range(len(lista_seeds)):
    print(lista_seeds[seed])
    seed_everything(lista_seeds[seed])
    print('Starting the GAT training with {} heads'.format(n_heads))
    n_layers = 2
    model = GATSY(n_heads, n_layers)
      
    train_loader = NeighborLoader(data, input_nodes = data.vtrain_mask, num_neighbors=[25]*n_layers, shuffle = True, batch_size = 512, num_workers = 4, persistent_workers = True)
    test_loader = NeighborLoader(data, input_nodes = data.val_mask, num_neighbors=[25]*n_layers, shuffle = False, batch_size = 512, num_workers = 4, persistent_workers = True)

    

    optimizer = torch.optim.Adam(model.parameters(), lr = lr, weight_decay = weight_decay)
    scheduler=lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min= 0, last_epoch= -1, verbose=True)

    path =  None#'./models/GATSY.pt'#'./models/best_gat_random.pt'
    trainer = Trainer(model.to(device), optimizer, scheduler, loss, num_epochs, mode, path, first, loss_mode)
    if loss_mode == 'triplet+':
      accuracy, f1_s = trainer.train()  # This cell make the training start.
    else:  
      accuracy = trainer.train()  # This cell make the training start.
    

    if random:
      text = '{}r results is {}, with seed {}:{}'.format(model_name, accuracy, seed+1, lista_seeds[seed])
    else:
      text = '{} trials: accuracy is {} and f1-score is {}, with seed {}:{}'.format(model_name, accuracy, f1_s, seed+1, lista_seeds[seed])
    requests.get('https://api.telegram.org/bot5014268239:AAFKCrVdcf5VdrEyQiH3TiNnuAIysqh0_RA/sendmessage?chat_id=-697248386&text={}'.format(text))


30726
Starting the GAT training with 1 heads
Adjusting learning rate of group 0 to 6.0000e-05.
Processing 1-th epoch
Training step....
Testing step....
Evaluation step....
Adjusting learning rate of group 0 to 5.9941e-05.
At the 1-th epoch we have obtained: train_loss 0.305053 	 test_loss 0.056984 	 test_accuracy 0.534644
Processing 2-th epoch
Training step....
Testing step....
Evaluation step....
Adjusting learning rate of group 0 to 5.9763e-05.
At the 2-th epoch we have obtained: train_loss 0.045184 	 test_loss 0.040371 	 test_accuracy 0.541349
Processing 3-th epoch
Training step....
Testing step....
Evaluation step....
Adjusting learning rate of group 0 to 5.9469e-05.
At the 3-th epoch we have obtained: train_loss 0.039096 	 test_loss 0.028895 	 test_accuracy 0.548895
Processing 4-th epoch
Training step....
Testing step....
Evaluation step....
Adjusting learning rate of group 0 to 5.9057e-05.
At the 4-th epoch we have obtained: train_loss 0.030815 	 test_loss 0.035681 	 test_accurac

In [17]:



z, p = visualize(out.detach(), color = labels)




In [18]:
visualize(out.detach(), color = labels, dim = 3) # Artist can be classified in more than one genre.



The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



(array([[-12.842239 ,  -6.5154696,   3.713126 ],
        [ 20.77207  ,  -1.7452232,  -6.8654666],
        [-12.03238  ,  10.9852705,  16.966688 ],
        ...,
        [ -4.075374 ,  18.462294 , -12.170263 ],
        [ -1.5682812,   7.107685 ,  -5.462662 ],
        [  8.984141 , -11.241777 , -13.698984 ]], dtype=float32),
               x1         x2         x3          y        family  \
 0     -12.842239  -6.515470   3.713126      dance  disco family   
 1      20.772070  -1.745223  -6.865467      metal    heavy rock   
 2     -12.032380  10.985271  16.966688       soul   soul family   
 3      11.865769   5.122225   7.079499       folk  country folk   
 4     -11.862017  -7.628762   9.802443      dance  disco family   
 ...          ...        ...        ...        ...           ...   
 10981   0.245868 -16.791679 -16.993006       rock   rock family   
 10982  12.989983   5.395065 -16.555662  hard rock    heavy rock   
 10983  -4.075374  18.462294 -12.170263      indie         indie

In [None]:
model.to('cpu')

In [None]:
X = X.to('cpu')
A = A.to('cpu')
embedding = model(X, A)

In [None]:
losss(embedding, labels)

In [11]:
no_labels_accuracies = [0.5702102804547252,
                        0.5700736528406515,
                        0.5748801720056828,
                        0.5594285347682718,
                        0.5709628801941624,
                        0.5673125015419235,
                        0.5652883667995772,
                        0.576895892106536,
                        0.56275933969578,
                        0.5648742227326402]
labels_accuracies = [0.5681596505875104,
                    0.551769260448272,
                    0.5613982297153829,
                    0.5538530200266898,
                    0.5532030362992817,
                    0.5611319424342442,
                    0.557423869214372,
                    0.5674569369156667,
                    0.5452268054621623,
                    0.5640221890387122
                    ]

labels_f1_s = [0.5580585038812711,
               0.5492937661263206,
               0.5516235228614412,
               0.5295153123856763,
               0.5362882005801246,
               0.5364068537636376,
               0.5585590041127101,
               0.5384772398939248,
               0.53699690709336,
               0.5503526828625457]


np.std(labels_f1_s)

0.00969266808403347