In [2]:
import pandas as pd
import requests
import numpy as np
from requests_html import HTML,HTMLSession
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from random import choice
from sklearn.neighbors import NearestNeighbors
import math
import time

## Loading of the dataset Olga

In [3]:
olga=pd.read_csv('olga.csv')
#olga[olga.partition=='train'].count()   #train 0-14138, #val 14139-15905, #test 15906-17673

In [11]:
class DatasetOlga(): #In this class, we obtain through different methods the main characteristics of the graph of artists
                     # thanks to the available information in the olga dataset
    def __init__(self,olga):
        self.olga=olga
        self.mb=olga.musicbrainz_id
        self.artists={} #Needed for obtaining the mapping from musicbrainz to the allmusic ids
        self.l=len(self.mb)
        self.d={}       #Needed for obtaining a dict. where keys are artists, and values are the artists similar to them, based on self.artists
        self.NI={}      #Dict. that will contain the artist's features
    
    def get_mapping(self,i): #This method returns the allmusic page of an artist (if exists), given his id from the dataset 
        response = requests.get(f'https://musicbrainz.org/ws/2/artist/{str(self.mb[i])}?inc=url-rels&fmt=json')
        if response.ok:
            data = response.json()
            refs = [r['url']['resource'] for r in data['relations'] if r['type'] == 'allmusic']        
            return refs[0] if len(refs) != 0 else "Not found"

        

    def get_mappingList(self,init,end,increm=500):
        Lmusicbrainz_id=self.mb[init:end] #We can specify the range of the artists of our interest, for the purpose of this NN task
        length=len(Lmusicbrainz_id)       #we will take all of them into consideration.
        c=0
        for i in range(len(Lmusicbrainz_id)):
            mapp=self.get_mapping(i)   #get_mapping method again.
            if mapp==None:
                while mapp==None:
                    mapp=self.get_mapping(i)
                    
            if mapp!="Not found":   #Some of the ids has not a respective allmusic id, so we lose that information
                mapp=str(mapp)      #Mapp are strings of links
                key=mapp[-12:]
                self.artists[key]=i
            c+=1
            if c%increm==0 or c==30:
                    print("{}/{} artists were processed".format(c,length)) #This is just to keep track of the processed artist
                    
            
        self.save_data(self.artists,'MsbMapped1.json')  #We do save the Artists Ids map, this function, when called, takes a lot
                                                        #of time, for this reason its result is already saved in the file:
        return self.artists                             # 'MsbMapped1.json'
    
    
    def get_GraphDict(self,name='MsbMapped1.json',increm=500):
        session=HTMLSession()
        c=0 #Counter
        artID=self.load_data(name) #We load the mapped artists (between MusicBrainz Ids, and AllMusic Ids)
        length=len(artID.keys())
        for k in artID.keys(): #dict of mapped mbids, this has to be computed before from getmapping
            if k!=None:
                url='https://www.allmusic.com/artist/'+ k+ '/related' #k is just the code, every link for the artist is distinguished 
                r=session.get(url)                                    #by a unique code in the link.
                sess=r.html.find('body',first=True)
                div=sess.find('.overflow-container')                  #The information of the related artists are exctracted
                divn=div[0]                                           #from the html of the allmusic's related web page
                divn=divn.find('.content-container')
                divn=divn[0]
                divn=divn.find('.content')
                divn=divn[0]
                divn=divn.find('section',first=True)
                if divn==None:
                    self.d[artID[k]]=[] #That artist has not related artists (or we have missing information)
                    continue
                artists=divn.find('li')
                artistL=[]


                for i in range(len(artists)):
                    art=artists[i]
                    art=art.find('a')            #We look for all the k's related artists links
                    link=list(art[0].absolute_links)[0] #Absolute_link returns a one-element set, that we convert into a list and
                    link=str(link)[-12:]                #we get its code
                    if link in artID.keys(): #g is the dict of all the mapped musicbrainz_ids
                        artistL.append(self.artists[link]) #Some of the related artists may not be in the musicbrainz_ids list.
                self.d[artID[k]]=artistL
                c+=1
                if c%increm==0 or c==30:
                    print("{}/{} artists were processed".format(c,length))
        self.save_data(self.d,'graphSimilarities1.json') #Here we save the connection amongst the artists, obtained with this method
        print("Done...")     #Also it takes some time to process, for this reason the result of this method can be 
        return self.d        #found at the 'graphSimilarities.json' file.
    
    def save_data(self,dicti,name):
        jfile = open(name, "w")
        jfile = json.dump(dicti, jfile)
    
    def load_data(self,name):
        jfile = open(name, "r")
        dicti = json.load(jfile)
        return dicti


## Graph construction

In [5]:
n_features=2613
class Graph():  #The purpose of this class is to construct the graph of artists, in particular the Adjacency matrix A, and the  
                # node features tensor X
        
    def __init__(self,mapfile,gfile):  #The expected files are the ones mentioned before.
        self.mfile=self.load_data(mapfile)
        self.gfile=self.load_data(gfile)
        self.A=torch.zeros((len(self.mfile),len(self.mfile)))
        self.X=torch.zeros((n_features,len(self.mfile)))
        self.ord=sorted(list(map(int,self.gfile.keys())))
        self.enc1={}
        self.enc2={}
    
    #With the preprocessing step at the previous cell we have lost some information
    #and also the ordering of the artists, so i have defined a method that for each previous artist index
    #we can encode it to a new ordered list of artists.
    
    
    def encoding1(self):   #From ordered to unordered, Dict are not ordered data structures, so is better to order them before
        for k in range(len(self.mfile)): #This encoding is used to get the Instance matrix
            self.enc1[k]=self.ord[k]
        return self.enc1
    
    def encoding2(self):   #From unordered to ordered,  From real number, to ordered one.
        for k in range(len(self.mfile)): #This encoding is used to get the Adjacency matrix
            self.enc2[self.ord[k]]=k
        return self.enc2
    
    def get_instance(self,instances,df=False):#We take the features centroid, obtained from 25 track from artists discographies.
        X=np.load(instances)                  #The instances file is provided by the repository mentioned in the paper.
        X=torch.from_numpy(X).requires_grad_(True) #We take the allmusicIDs, which contain the key of the artists for which we haven't 
        c=0                                        # lost information
        enc=self.encoding1()
        for k in self.mfile:
            z=enc[c]
            self.X[:,c]=X[z] 
            c+=1
        return self.X
    
    def get_adjacency(self,symmetry=False,df=False):  #The hypothesis could be either a symmetric matrix (paper), or not.
        enc=self.encoding2()
        for k in self.gfile:
            c1=enc[int(k)]
            for j in self.gfile[k]:
                c2=enc[int(j)]
                if self.A[c2,c1]==1 and symmetry==True:
                    continue
                self.A[c1,c2]=1
                if symmetry:
                    self.A[c2,c1]=1

            
        return self.A
    
    
    def load_data(self,name):
        jfile = open(name, "r")
        dicti = json.load(jfile)
        return dicti
    
g=Graph('MsbMapped.json','graphSimilarities.json')
X1=g.get_instance('acousticbrainz.npy')
A1=g.get_adjacency(symmetry=True)


# GraphSAGE model

In [57]:
#Here we have some hyperparamters, and also the list of ordered artists, and their indices with respect their own set (train,val,test).
outf=[2613,512,128,32,100] #Train: 0:9021, #Val: 9022:10189, #Test: 10190:11260
train_=list(range(0,9021+1)) #Train set, without val
train=list(range(0,10189+1)) #Train set, with val
val=list(range(9022,10189+1))
test=list(range(10190,11260+1))
KNN=200       #K-nearest-neighbors for the evaluation metrics.
device=torch.device('cuda')
class GraphSAGE(nn.Module):
    
    def __init__(self,X,A):
        super(GraphSAGE,self).__init__()
        self.A=A #Tensors version of adjacency matrix and Instances
        self.X=X    
        self.V={}  #In this dictionary we insert the tracing of a certain mini_batch, needed for the forward step
        self.l11=nn.Linear(outf[0],1024)
        self.l12=nn.Linear(3637,outf[1])
        self.l21=nn.Linear(outf[1],256)
        self.l22=nn.Linear(768,outf[2])
        self.l31=nn.Linear(outf[2],64)
        self.l32=nn.Linear(192,outf[3])
        self.FC11=nn.Linear(outf[1],256)
        self.FC12=nn.Linear(outf[2],256)
        self.FC13=nn.Linear(outf[3],256)
        
        self.FC2=nn.Linear(256,256)
        self.out=nn.Linear(256,100) #final output layer
        
        
    
    def forward(self,V,L):
        K=L+1                      #We add a number, because if we have three layers, we need to count also the batch
        self.V[K]=set(V)
        for k in range(K-1,0,-1):
            d=set()
            for idx in self.V[k+1]: 
                d=d.union(self.get_n(idx))
                
            self.V[k]=d
        
        Es=self.select(self.X,set(),self.V[1])
        for k in range(0,K-1):                   #k starts from 0, 0 is associated with the first layer, 1 with the second and so on....
            t=self.tfunc(outf[k],Es,self.V[k+1]) #tfunc is a matrix that has 0 for the column outside the mini-batch sets,  
            Esn=self.select(t,set(),self.V[k+2]) #and has the transformed vectors for the columns that belongs to the mini_batch set
            An=self.select(self.A,self.V[k+1],self.V[k+2]) #We do select either the vectors from X, and from A
            
            #Graph convolution block, according to the GraphSAGE structure described in the paper.
            
            if k==0:
                N=torch.mm(torch.transpose(F.elu(self.l11(torch.transpose(Es,0,1))),0,1),An)
                
                Es=F.elu(self.l12(torch.transpose(torch.cat((N,Esn)),0,1)))
            if k==1:
                N=torch.mm(torch.transpose(F.elu(self.l21(torch.transpose(Es,0,1))),0,1),An)
                
                Es=F.elu(self.l22(torch.transpose(torch.cat((N,Esn)),0,1)))
            if k==2:
                N=torch.mm(torch.transpose(F.elu(self.l31(torch.transpose(Es,0,1))),0,1),An)
                
                Es=F.elu(self.l32(torch.transpose(torch.cat((N,Esn)),0,1)))
            
            Es=torch.transpose(F.normalize(Es,dim=0),0,1)
        
        #Fully connected layers. The input layer depends on the number of Graph layers.
        
        if L==1:                                         #There are different input dimensions, because they depend on the 
            Es=F.elu(self.FC11(torch.transpose(Es,0,1))) #output dimension returned by the GCN
        elif L==2:
            Es=F.elu(self.FC12(torch.transpose(Es,0,1)))
        elif L==3:
            Es=F.elu(self.FC13(torch.transpose(Es,0,1)))
        Es=F.elu(self.FC2(Es))
        Es=torch.transpose(self.out(Es),0,1) #Final linear layer that represents the obtained embedded space.

        return Es
            
    def get_n(self,idx):    #This function is the neighbor's function. Given a batch index we get its neighborhood.
        t=torch.nonzero(self.A[idx])
        s=set()
        
        for k in t:
            if t.shape[0]!=0:
                s.add(k.item())
        s.add(idx)     
                
        return s
    def select(self,mat,row,col):  #Given a set of indices for rows or column or both, we get the respective elements.
        col=sorted(list(col))      #This is applied when we get the t matrix.
        
        c=0
        if row==set():
            ma=torch.zeros((mat.shape[0],len(col)))
            for k in col:
                ma[:,c]=mat[:,k]
                c+=1
            return ma
        else:
            row=torch.tensor(sorted(list(row)))
            col=torch.tensor(col)
            ma=torch.index_select(mat,0,row)
            ma=torch.index_select(ma,1,col)
            return ma
    
    def tfunc(self,n_feat,es,V):               #This is the t function, which was previously described
        t=torch.zeros((n_feat,self.X.shape[1]))
        V=sorted(list(V))
        c=0
        for k in V:
            t[:,k]=es[:,c]
            c+=1
        return t
    def tfunc2(self,n_feat,es,V,prev):               #This tfunction is later used for the accuracy evaluation step
        V=sorted(list(V))
        c=0
        for k in V:
            prev[:,k]=es[:,c]
            c+=1
        return prev
    def mini_batches(self,indices,bs=32): #This function generates a list of minibatches, of size bs
        indicesN=indices.copy()           #sets are unordered data structure, so there is no need to shuffle them. 
        mbList=[]                         #Lists of lists of mini_batches indices 
        while len(indicesN)!=0:
            mb=set()                      #Inner list, with the indices of a particular mini_batch
            while len(mb)<bs:
                if len(indicesN)==0:
                    mbList.append(mb)
                    return mbList
                r=choice(indicesN)
                sample=indicesN.pop(indicesN.index(r))
                mb.add(sample)
            mbList.append(mb)
        return mbList          #obj.mini_batches(#,bs=128) #: train_,train,val,test, we get lists of list of batches from here
    
    
    def calcG(self,ID):  #This method is used for the evaluation of accuracy, in particular it computes the denominator
        if ID>200:       # as described in the paper.
            ID=200
        c=1
        somm=0
        while c<=ID:
            somm+=1/(math.log2(1+c))
            c+=1
        return somm

    def evalAcc(self,T,S,kneigh):  #This function is to compute accuracy for the test and train set. 
        T=T.detach().numpy().transpose()
        neigh=NearestNeighbors(n_neighbors=(kneigh+1),algorithm='ball_tree').fit(T)  #With the K-NN we get the nearest 
        dist,ind=neigh.kneighbors(T)                                                 #neighbors in the embedded.
        acc=[]                          
        for k in S:
            summ=0
            ideal=self.A[k,:].sum().item()  #gs
            den=self.calcG(ideal)
            c=1
            if den==0:#There is the problem of the distance for the people without neighbors
                continue  #1-(n/200) or ignore them.
            for j in ind[k][1:]:
                if self.A[k][j]!=0:
                    summ+= 1/(math.log2(1+c))
                else:
                    continue
                c+=1
            summ/=den
            acc.append(summ)
        return acc
    


gs=GraphSAGE(X1,A1)

### Here there will be the training step....

In [60]:
training=train_
testing=val
n_layer=1   #n_of graph conv.layer.
batch_size=512 #This is the batch size used in the paper which insired artist similarity
mbb=gs.mini_batches(training,bs=batch_size)
num_epochs=1 #According to the paper there will be 50 epochs for each experiment 


In [24]:
#With these lines of code we obtain the embedded space sample
start=time.time()
for epoch in range(num_epochs):
    print("Processing epoch n° ",epoch+1)
    num=int(len(training)/batch_size)+1     
    for k in range(len(mbb)):
        Ex=gs(mbb[k],n_layer)
        #TODO: Loss function, Optimizer
        name="C:\\Users\\Peppe\\OneDrive\\Desktop\\Università\\magistrale\\Neural_Networks\\MIR_project\\minibatches\\file"+str(k)+".pt"
        torch.save(Ex,name)

    for k in range(len(mbb)):
        name="C:\\Users\\Peppe\\OneDrive\\Desktop\\Università\\magistrale\\Neural_Networks\\MIR_project\\minibatches\\file"+str(k)+".pt"
        ex=torch.load(name)
        
        if k==0:
            t1=gs.tfunc(outf[-1],ex,mbb[k])

        else:
            t1=gs.tfunc2(outf[-1],ex,mbb[k],t1)
    print("Evaluating the epoch n° ",epoch+1)
    accL1=gs.evalAcc(t1[:,:training[-1]+1],set(training),KNN)
    t2=gs(set(testing),n_layer)
    t2=gs.tfunc2(outf[-1],t2,set(testing),t1) #We integrate the testing set to the previous training set
    accL2=gs.evalAcc(t2,set(testing),KNN)
    TestAcc=sum(accL1)/len(accL1)
    TrainAcc=sum(accL2)/len(accL2)
    print("Processesed epoch n° {}, \tTrain accuracy: {:.4f}, \tTest accuracy: {:.4f}".format((epoch+1),TrainAcc,TestAcc))
    print("done")
end=time.time()
print(end-start) #78 sec (1 graph layer), 132 sec (2 graph layers), 214 sec (3 graph layers).

Processing epoch n°  1
Evaluating the epoch n°  1
Processesed epoch n° 1, 	Train accuracy: 0.2365, 	Test accuracy: 0.3087
done
73.32841181755066
