In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from collections import Counter
%matplotlib inline

### Process data

In [2]:
dfs = [pd.read_csv(f'archive/dataset-of-{decade}0s.csv') for decade in ['6', '7', '8', '9', '0', '1']]

for i, decade in enumerate([1960, 1970, 1980, 1990, 2000, 2010]):
    dfs[i]['decade'] = pd.Series(decade, index=dfs[i].index)

data = pd.concat(dfs, axis=0).sample(frac=1.0, random_state=1).reset_index(drop=True)
data = data.dropna('columns')

In [3]:
def split_artists(data):
    for index, row in data.iterrows():
        if type(row['artist'])==list:
            break
        artists = [row['artist']]

        def check(s, artists):
            artists_list = []
            for item in artists:
                if s in item:
                    artists_list += item.split(s)
                    #print(s,' ', item,' ',artists_list)
                else:
                    artists_list.append(item)
            return artists_list

        artists = check('Featuring', artists)
        artists = check('featuring', artists)
        artists = check('Feat.', artists)
        artists = check(',', artists)
        #artists = check('+', artists)
        data.at[index, 'artist'] = artists
    return data

data = split_artists(data.copy())

In [4]:
data

Unnamed: 0,track,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,target,decade
0,Hard Working Woman - Live / 1983,[Otis Clay],spotify:track:2g8zPTDvSWlYTZs9ylQQ9L,0.574,0.7630,8,-11.921,0,0.0467,0.23400,0.000039,0.5720,0.916,126.158,364333,4,27.85422,11,0,1980
1,Quatuor pour la fin du temps (Quartet for the ...,[Olivier Messiaen],spotify:track:2Ow0yMUSccQ4vmfq8pyCCF,0.121,0.0238,4,-28.614,1,0.0400,0.97800,0.910000,0.0771,0.030,95.821,420040,4,32.71495,20,0,2000
2,Wokeuplikethis*,"[Playboi Carti , Lil Uzi Vert]",spotify:track:59J5nzL1KniFHnU120dQzt,0.785,0.6200,8,-6.668,1,0.2540,0.01380,0.000000,0.1500,0.478,78.476,235535,4,28.46889,13,1,2010
3,Plug Walk,[Rich The Kid],spotify:track:1ZAyjvIk9YiD76yYy0TEG6,0.876,0.5190,11,-6.531,1,0.1430,0.20200,0.000000,0.1080,0.158,94.981,175230,4,20.94400,9,1,2010
4,Key Largo,[Bertie Higgins],spotify:track:41v56RunWqxgqTD4H2cwOD,0.503,0.6590,0,-6.829,1,0.0356,0.44300,0.000000,0.1530,0.330,101.668,198947,4,21.74949,9,1,1980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41006,Kiko,[Jimmy McGriff],spotify:track:7oHFJ7VuwjeAvEgQbEmM8r,0.853,0.6300,5,-14.193,1,0.0894,0.18700,0.868000,0.1120,0.882,115.674,164467,4,47.68413,8,1,1960
41007,How We Do,"[The Game , 50 Cent]",spotify:track:4PmMVdIFpJTB9WQbmILf4p,0.862,0.6480,4,-7.401,0,0.2510,0.04550,0.000000,0.0332,0.637,98.012,235533,4,23.35783,12,1,2000
41008,G. Rossini: Der Barbier von Sevilla/Die Verleu...,[Staatskapelle Berlin],spotify:track:41C2tPBf2JmHafRunOkNVk,0.349,0.1230,0,-19.248,1,0.1000,0.91000,0.000035,0.1330,0.157,76.240,268907,3,48.43741,11,0,1960
41009,Bella Ciao (Lied der italienischen Partisanen),[Hannes Wader],spotify:track:2OYLSRaVD3Dp35NwoS1b8r,0.457,0.5740,9,-16.066,1,0.2590,0.79300,0.000000,0.7590,0.709,109.195,196333,4,37.54967,9,0,1970


In [5]:
d = dict(data['uri'].value_counts())
len(d.keys())

40560

In [7]:
# drop duplicate uri
drop_list = []
for k, v in d.items():
    if d[k]>1:
        decade = min(data.loc[data['uri']==k]['decade'])
        for index, row in data.loc[data['uri']==k].iterrows():
            if row['decade']>decade:
                drop_list.append(index)
data = data.drop(drop_list).reset_index()

In [8]:
data=data.drop(['index'], axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40560 entries, 0 to 40559
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track             40560 non-null  object 
 1   artist            40560 non-null  object 
 2   uri               40560 non-null  object 
 3   danceability      40560 non-null  float64
 4   energy            40560 non-null  float64
 5   key               40560 non-null  int64  
 6   loudness          40560 non-null  float64
 7   mode              40560 non-null  int64  
 8   speechiness       40560 non-null  float64
 9   acousticness      40560 non-null  float64
 10  instrumentalness  40560 non-null  float64
 11  liveness          40560 non-null  float64
 12  valence           40560 non-null  float64
 13  tempo             40560 non-null  float64
 14  duration_ms       40560 non-null  int64  
 15  time_signature    40560 non-null  int64  
 16  chorus_hit        40560 non-null  float6

## Construct Graph

In [51]:
#graphData = data.loc[data['decade']==2010]
scalar = StandardScaler()
scalar.fit(data.drop(['track', 'artist', 'uri','target'], axis=1).values)

def getDecadeData(decade):
    graphData = data.loc[data['decade']==decade]
    label = graphData['target']
    artist = graphData['artist']
    uri = graphData['uri']
    track = graphData['track']
    features = graphData.drop(['track', 'artist', 'uri','target'], axis=1).values
#     scalar = StandardScaler()
#     scalar.fit(features)
    features = scalar.transform(features)
    return features, list(label), list(artist), list(uri), list(track)

#getDecadeData(2010)
decadesList = [1960,1970,1980,1990,2000,2010]
nodeIdOffset = [0]

In [52]:
data['decade'].value_counts()

1960    8625
1970    7685
1980    6803
2010    6262
2000    5768
1990    5417
Name: decade, dtype: int64

In [53]:
def constructGraphWithinDecade(G, threshold):
    for i, decade in enumerate(decadesList):
        features, label, artist, uri, track = getDecadeData(decade)
        n = len(list(label))
        offset = nodeIdOffset[i]
        for i in range(offset, offset+n):
            G.add_node(i, feature=features[i-offset], label=label[i-offset], artist=artist[i-offset], uri=uri[i-offset], track=track[i-offset])
        #G.add_nodes_from(list(range(offset, offset+n)))
        nodeIdOffset.append(offset+n)
        print('Decade ',decade, ' nodes: ',n, ' id range: ',offset,offset+n)
  
        nfeatures = normalize(features, norm="l2")
        similarity = np.triu(np.array(nfeatures@nfeatures.T),1)
        t = np.max(similarity)*threshold
        x, y = np.where(similarity>t)
        x, y = x+offset, y+offset
        edges = list(zip(list(x), list(y)))
        print('edges: ',len(edges))
        G.add_edges_from(edges)

In [54]:
def constructGraphCrossDecade(G, threshold):
    for i in range(1, len(decadesList)):
        features1 = getDecadeData(decadesList[i-1])[0]
        features2 = getDecadeData(decadesList[i])[0]
        
        nfeatures1 = normalize(features1, norm="l2")
        nfeatures2 = normalize(features2, norm="l2")
        #print(nfeatures1.shape, nfeatures2.shape)
        similarity = np.triu(np.array(nfeatures1@nfeatures2.T),1)
        #print(similarity.shape)
        t = np.max(similarity)*threshold
        x, y = np.where(similarity>t)
        x, y = x+nodeIdOffset[i-1], y+nodeIdOffset[i]
        edges = list(zip(list(x), list(y)))
        print('Decades: ',decadesList[i-1] ,decadesList[i] ,'edges: ',len(edges))
        G.add_edges_from(edges)

In [55]:
trackNet = nx.Graph()
constructGraphWithinDecade(trackNet, 0.8)
print("node: ", trackNet.number_of_nodes(), ' edges: ',trackNet.number_of_edges())
constructGraphCrossDecade(trackNet, 0.85)
print("node: ", trackNet.number_of_nodes(), ' edges: ',trackNet.number_of_edges())

Decade  1960  nodes:  8625  id range:  0 8625
edges:  483225
Decade  1970  nodes:  7685  id range:  8625 16310
edges:  164462
Decade  1980  nodes:  6803  id range:  16310 23113
edges:  135458
Decade  1990  nodes:  5417  id range:  23113 28530
edges:  78610
Decade  2000  nodes:  5768  id range:  28530 34298
edges:  166550
Decade  2010  nodes:  6262  id range:  34298 40560
edges:  278399
node:  40560  edges:  1306704
Decades:  1960 1970 edges:  64479
Decades:  1970 1980 edges:  33345
Decades:  1980 1990 edges:  23110
Decades:  1990 2000 edges:  34275
Decades:  2000 2010 edges:  73741
node:  40560  edges:  1535654


In [56]:
density = nx.density(trackNet)
transitivity = nx.transitivity(trackNet)
average_clustering_coefficient = nx.average_clustering(trackNet)

#############################################################
print (f'=== Network statistics ===')

print (f'Number of nodes: {trackNet.number_of_nodes()}')
print (f'Number of edges: {trackNet.number_of_edges()}')
print (f'Network density: {density:.4f}')
print (f'Transitivity: {transitivity:.4f}')
print (f'Average Clustering Coefficient: {average_clustering_coefficient:.4f}')
print (f'======')

=== Network statistics ===
Number of nodes: 40560
Number of edges: 1535654
Network density: 0.0019
Transitivity: 0.3645
Average Clustering Coefficient: 0.3466


In [57]:
trackNet.nodes[0]

{'feature': array([-1.40407257, -0.47644003,  0.505762  ,  0.55749513,  0.66558662,
        -0.43150688,  1.5279161 , -0.51229509, -0.65380731,  1.10037491,
         2.91386167, -0.87203517,  0.25250372, -0.80540805, -0.91354678,
        -1.29538647]),
 'label': 0,
 'artist': ['Gerardo Reyes'],
 'uri': 'spotify:track:5k718uRIW64ojlBMDXo0jH',
 'track': 'Libro Abierto'}

In [58]:
#map trackNet node id and artist node id
data_artist = []
trackNet_ids = trackNet.nodes()
for i in trackNet_ids:
    data_artist.append(trackNet.nodes[i]['artist'])
artist_id_map = {}
for index, dlist in enumerate(data_artist):
    for d in dlist:
        if d not in artist_id_map:
            artist_id_map[d] = [index]
        else:
            artist_id_map[d].append(index)

In [59]:
#build artist graph
artist_node = []
artist_edge = []
artist_net = nx.Graph()
for dlist in data_artist:
    for i, d in enumerate(dlist):
        if (d not in artist_node):
            artist_node.append(d)
            artist_net.add_node(d)
        for j in range(len(dlist[i+1:-1])):
            if [d, dlist[j]] not in artist_edge and [dlist[j],d] not in artist_edge:
                artist_edge.append([d, dlist[j]])
                artist_net.add_edge(d, dlist[j])

In [60]:
def print_network_statistics (net):
    nodes = len(net.nodes())
    edges = len(net.edges())
    density = nx.density(net)
    transitivity = nx.transitivity(net)
    average_clustering_coefficient = nx.average_clustering(net)

    print (f'=== Network statistics ===')
    print (f'Number of nodes: {nodes}')
    print (f'Number of edges: {edges}')
    print (f'Network density: {density:.4f}')
    print (f'Transitivity: {transitivity:.4f}')
    print (f'Average Clustering Coefficient: {average_clustering_coefficient:.4f}')
    print (f'======')
    
print_network_statistics(artist_net)

=== Network statistics ===
Number of nodes: 12133
Number of edges: 166
Network density: 0.0000
Transitivity: 0.2364
Average Clustering Coefficient: 0.0009


In [61]:
def GetArtistClusters(artist_net, artist_id_map):
    artist_clusters = []
    for node in artist_net.nodes():
        artists = sorted(list(nx.bfs_tree(artist_net,node)))
        if len(artists) > 1 and artists not in artist_clusters:
            artist_clusters.append(artists)
    artist_clusters_id = []
    for cluster in artist_clusters:
        id_cluster = []
        for name in cluster:
            for e in artist_id_map.get(name):
                if e not in id_cluster:
                    id_cluster.append(e)
        artist_clusters_id.append(id_cluster)
    return artist_clusters_id
artist_clusters_id = GetArtistClusters(artist_net, artist_id_map)

In [62]:
def AddEdgesFromRelatedArtists(trackNet, artist_clusters_id, threthold):
    for cluster in artist_clusters_id:
        for i in range(len(cluster)):
            for j in range(i,len(cluster)):
                nfeatures1 = trackNet.nodes[i]['feature']
                nfeatures2 = trackNet.nodes[j]['feature']
                similarity = nfeatures1@nfeatures2.T / (np.linalg.norm(nfeatures1) * np.linalg.norm(nfeatures2))
                if similarity >= threthold:
                    trackNet.add_edge(i,j)
AddEdgesFromRelatedArtists(trackNet, artist_clusters_id, 0.6)

In [63]:
print_network_statistics(trackNet)

=== Network statistics ===
Number of nodes: 40560
Number of edges: 1537494
Network density: 0.0019
Transitivity: 0.3641
Average Clustering Coefficient: 0.3464


### Construct DGL Graph for Graph Convolutional Networks

In [64]:
import dgl
import dgl.function as fn
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph

In [65]:
from sklearn.model_selection import train_test_split
training_size = 0.9

In [66]:
#convert the networkx graph to dgl graph
trackNet_selfloops = trackNet.copy()
# print([e for e in nx.selfloop_edges(trackNet_selfloops)])
trackNet_selfloops.remove_edges_from(nx.selfloop_edges(trackNet_selfloops))
# print(nx.info(trackNet_selfloops))
source = [e[0] for e in trackNet_selfloops.edges()]
target = [e[1] for e in trackNet_selfloops.edges()]
source_ids = torch.tensor(source)
target_ids = torch.tensor(target)
g_trackNet = dgl.graph((source_ids,target_ids))

In [67]:
trackNet_features = []
trackNet_labels = []
trackNet_ids = trackNet.nodes()
for i in trackNet_ids:
    trackNet_features.append(trackNet.nodes[i]['feature'])
    trackNet_labels.append(trackNet.nodes[i]['label'])
g_trackNet.ndata['feature'] = torch.FloatTensor(trackNet_features)
g_trackNet.ndata['label'] = torch.LongTensor(trackNet_labels)

In [68]:
# trackNet_train_mask = []
# trackNet_test_mask = []
# trackNet_ids = list(trackNet_ids)
# X_train, X_test, y_train, y_test = train_test_split(trackNet_ids,trackNet_labels,train_size=training_size,random_state=0)
# for i in trackNet_ids:
#     if trackNet_ids[i] in X_train:
#         trackNet_train_mask.append(True)
#         trackNet_test_mask.append(False)
#     else:
#         trackNet_train_mask.append(False)
#         trackNet_test_mask.append(True)
# # print(trackNet_train_mask[:10])
# # print(trackNet_test_mask[:10])
# g_trackNet.ndata['train_mask'] = torch.BoolTensor(trackNet_train_mask)
# g_trackNet.ndata['test_mask'] = torch.BoolTensor(trackNet_test_mask)

In [69]:
trackNet_train_mask = []
trackNet_test_mask = []
trackNet_ids = list(trackNet_ids)
for i in range(len(nodeIdOffset) - 1):
    decade_size = nodeIdOffset[i+1] - nodeIdOffset[i]
#     train_size_decade = int(decade_size * training_size)
#     print(train_size_decade)
    nodes_train, nodes_test = train_test_split(trackNet_ids[nodeIdOffset[i]:nodeIdOffset[i+1]],train_size=training_size)
    for n in range(nodeIdOffset[i],nodeIdOffset[i+1]):
        if trackNet_ids[n] in nodes_train:
            trackNet_train_mask.append(True)
            trackNet_test_mask.append(False)
        else:
            trackNet_train_mask.append(False)
            trackNet_test_mask.append(True)
    
#     trackNet_train_mask.extend([True] * train_size_decade)
#     trackNet_train_mask.extend([False] * (decade_size - train_size_decade))
#     trackNet_test_mask.extend([False] * train_size_decade)
#     trackNet_test_mask.extend([True] * (decade_size - train_size_decade))

g_trackNet.ndata['train_mask'] = torch.BoolTensor(trackNet_train_mask)
g_trackNet.ndata['test_mask'] = torch.BoolTensor(trackNet_test_mask)

## Graph Convolutional Network

In [79]:
def load_dgl_data(g):
    g = g
    features = g.ndata['feature']
    labels = g.ndata['label']
    train_mask = g.ndata['train_mask']
    test_mask = g.ndata['test_mask']
    return g, features, labels, train_mask, test_mask

In [80]:
gcn_msg = fn.copy_u(u='h', out='m')
gcn_reduce = fn.sum(msg='m', out='h')

In [81]:
class GCNLayer(nn.Module):
    def __init__(self, in_feats, out_feats):
        super(GCNLayer, self).__init__()
        self.linear = nn.Linear(in_feats, out_feats)

    def forward(self, g, feature):
        # Create a local scope so that all the stored ndata and edata
        # (such as the `'h'` ndata below) are automatically popped out
        # when the scope exits.
        with g.local_scope():
            g.ndata['h'] = feature
            g.update_all(gcn_msg, gcn_reduce)
            h = g.ndata['h']
            return self.linear(h)

In [82]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.layer1 = GCNLayer(16, 64)
        self.layer2 = GCNLayer(64, 16)
        self.layer3 = GCNLayer(16,16)
        self.layerNN1 = nn.Linear(16,64)
        self.layerNN2 = nn.Linear(64,16)
        self.layerNN3 = nn.Linear(16,2)

    def forward(self, g, features):
        x = F.logsigmoid(self.layer1(g, features))
        x = F.logsigmoid(self.layer2(g, x))
        x = F.logsigmoid(self.layer3(g, x))
        x = F.logsigmoid(self.layerNN1(x))
        x = F.logsigmoid(self.layerNN2(x))
        x = self.layerNN3(x)
        return x
net = Net()
print(net)

Net(
  (layer1): GCNLayer(
    (linear): Linear(in_features=16, out_features=64, bias=True)
  )
  (layer2): GCNLayer(
    (linear): Linear(in_features=64, out_features=16, bias=True)
  )
  (layer3): GCNLayer(
    (linear): Linear(in_features=16, out_features=16, bias=True)
  )
  (layerNN1): Linear(in_features=16, out_features=64, bias=True)
  (layerNN2): Linear(in_features=64, out_features=16, bias=True)
  (layerNN3): Linear(in_features=16, out_features=2, bias=True)
)


In [83]:
# evaluate accuracy when tuning parameters
def evaluate(model, g, features, labels, mask):
    model.eval()
    with torch.no_grad():
        logits = model(g, features)
        logits = logits[mask]
        labels = labels[mask]
        _, indices = torch.max(logits, dim=1)
        correct = torch.sum(indices == labels)
        return correct.item() * 1.0 / len(labels)

In [84]:
g, features, labels, train_mask, test_mask = load_dgl_data(g_trackNet)
# Add edges between each node and itself to preserve old node representations
g.add_edges(g.nodes(), g.nodes())

In [134]:
# torch.manual_seed(0)
# np.random.seed(0)
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)

In [135]:
for epoch in range(200):

    net.train()
    logits = net(g, features)
    logp = F.log_softmax(logits, 1)
    loss = F.nll_loss(logp[train_mask], labels[train_mask])

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    acc = evaluate(net, g, features, labels, test_mask)
    if epoch%10 == 0:
        print("Epoch {:05d} | Loss {:.4f} | Test Acc {:.4f}".format(epoch, loss.item(), acc))

Epoch 00000 | Loss 0.5200 | Test Acc 0.7260
Epoch 00010 | Loss 0.5142 | Test Acc 0.7317
Epoch 00020 | Loss 0.5081 | Test Acc 0.7364
Epoch 00030 | Loss 0.5004 | Test Acc 0.7448
Epoch 00040 | Loss 0.4895 | Test Acc 0.7512
Epoch 00050 | Loss 0.4809 | Test Acc 0.7583
Epoch 00060 | Loss 0.4750 | Test Acc 0.7632
Epoch 00070 | Loss 0.4712 | Test Acc 0.7662
Epoch 00080 | Loss 0.4683 | Test Acc 0.7630
Epoch 00090 | Loss 0.4661 | Test Acc 0.7620
Epoch 00100 | Loss 0.4645 | Test Acc 0.7635
Epoch 00110 | Loss 0.4630 | Test Acc 0.7657
Epoch 00120 | Loss 0.4630 | Test Acc 0.7650
Epoch 00130 | Loss 0.4616 | Test Acc 0.7660
Epoch 00140 | Loss 0.4605 | Test Acc 0.7664
Epoch 00150 | Loss 0.4597 | Test Acc 0.7672
Epoch 00160 | Loss 0.4589 | Test Acc 0.7669
Epoch 00170 | Loss 0.4583 | Test Acc 0.7652
Epoch 00180 | Loss 0.4578 | Test Acc 0.7642
Epoch 00190 | Loss 0.4571 | Test Acc 0.7660


In [136]:
# calc metrics for final model
def evaluate2(model, g, features, labels, mask):
    model.eval()
    with torch.no_grad():
        logits = model(g, features)
        logits = logits[mask]
        labels = labels[mask]
        _, indices = torch.max(logits, dim=1)
        correct = torch.sum(indices == labels)
        true_labels = indices[torch.where(indices == labels)]
        false_labels = indices[torch.where(indices != labels)]
        true_negative = torch.sum(true_labels == 0)
        true_positive = torch.sum(true_labels == 1)
        false_positive = torch.sum(false_labels == 1)
        false_negative = torch.sum(false_labels == 0)
        acc = correct.item() * 1.0 / len(labels)
        precision = true_positive.item() * 1.0 / (true_positive.item() + false_positive.item())
        recall = true_positive.item() * 1.0 / (true_positive.item() + false_negative.item())
        F1 = 2 * ((precision * recall)/(precision + recall))
        return acc, precision, recall, F1

In [137]:
acc, precision, recall, F1 = evaluate2(net, g, features, labels, test_mask)
print("GCN: accuracy {:.2f}%".format(acc * 100) + ", F1 {:.2f}".format(F1) + ", precision {:.2f}%".format(precision*100) + ", recall {:.2f}%".format(recall*100))

GCN: accuracy 76.47%, F1 0.78, precision 72.26%, recall 84.49%


In [120]:
# print(torch.seed())

8573421760787922685
