<center><h1>Music recommendation using graphs</h1>
<h2>MLNS PROJECT</h2>
<h3>Coded by Chloé Daems, Amir Mahmoudi and Anne-Claire Laisney</h3>
</center>

This is the main notebook to create a benchmark of graph based music recommendation systems inspired by the *Katarya, R., Verma, O.P. Efficient music recommender system using context graph and particle swarm. Multimed Tools Appl 77, 2673–2687 (2018).* [paper](URL 'https://link.springer.com/article/10.1007/s11042-017-4447-x'), using data from the user.getRecentTracks of the [Last.fm](URL 'https://www.last.fm/api/show/user.getRecentTracks') API.

In [1]:
#Import the libraries
from os.path import exists
import pandas as pd
import networkx as nx
import numpy as np
import datetime
from scipy.sparse import *
from IPython.display import clear_output
from networkx.algorithms import bipartite

#from scratch methods
from dataframe_utils import *
from graph_utils import *

## Create the graph

**Get the dataset**

**Initially, There are too many track-ids missing, we are going to recreate them using the uuid library**

In [2]:
user_id_profile = pd.read_csv('lastfm-dataset-1K/userid-profile.tsv', sep = '\t')
user_id_logs = users('lastfm-dataset-1K/user_id_logs_v2.tsv') #all the process is done in the users function check dataframe_utils.py

  mask |= (ar1 == a)


In [3]:
user_id_profile.head()

Unnamed: 0,#id,gender,age,country,registered
0,user_000001,m,,Japan,"Aug 13, 2006"
1,user_000002,f,,Peru,"Feb 24, 2006"
2,user_000003,m,22.0,United States,"Oct 30, 2005"
3,user_000004,f,,,"Apr 26, 2006"
4,user_000005,m,,Bulgaria,"Jun 29, 2006"


In [4]:
user_id_logs.head()

Unnamed: 0,userid,timestamp,artist-id,artist-name,track-id,track-name
0,user_000001,2009-05-04 23:08:57,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,7369ec4f-b377-5683-86bd-f02897317103,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007
1,user_000001,2009-05-04 13:54:10,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,8a0799b1-2f64-5e7b-9436-2228c9d65637,Composition 0919 (Live_2009_4_15)
2,user_000001,2009-05-04 13:52:04,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,44da66dc-6a34-54de-a4d9-686bc38ede0f,Mc2 (Live_2009_4_15)
3,user_000001,2009-05-04 13:42:52,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,e625acbe-1360-528d-8afe-4ad88424e0c0,Hibari (Live_2009_4_15)
4,user_000001,2009-05-04 13:42:11,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,fa332ed7-b701-5669-9e8e-0961658cdb43,Mc1 (Live_2009_4_15)


**We create a train and test set**

In the test set, we would have only the two last month of listening for each users.

In [5]:
try:
    test_user_id_logs = user_id_logs[user_id_logs['timestamp'] > datetime.datetime(2009, 3, 4).replace(tzinfo=datetime.timezone.utc)]
    train_user_id_logs = user_id_logs[user_id_logs['timestamp'] < datetime.datetime(2009, 3, 4).replace(tzinfo=datetime.timezone.utc)]
except:
    test_user_id_logs = user_id_logs[user_id_logs['timestamp'] > datetime.datetime(2009, 3, 4)]
    train_user_id_logs = user_id_logs[user_id_logs['timestamp'] < datetime.datetime(2009, 3, 4)]

In [6]:
print(f'train shape : ({train_user_id_logs.shape} and test shape : ({test_user_id_logs.shape})')

train shape : ((8519568, 6) and test shape : ((613400, 6))


In [7]:
try:
    val_user_id_logs = train_user_id_logs[train_user_id_logs['timestamp'] > datetime.datetime(2009, 1, 4).replace(tzinfo=datetime.timezone.utc)]
    train_user_id_logs = train_user_id_logs[train_user_id_logs['timestamp'] < datetime.datetime(2009, 1, 4).replace(tzinfo=datetime.timezone.utc)]
except:
    val_user_id_logs = train_user_id_logs[train_user_id_logs['timestamp'] > datetime.datetime(2009, 1, 4)]
    train_user_id_logs = train_user_id_logs[train_user_id_logs['timestamp'] < datetime.datetime(2009, 1, 4)]

In [8]:
print(f'train shape : ({train_user_id_logs.shape}, val shape : ({val_user_id_logs.shape}) and test shape : ({test_user_id_logs.shape})')

train shape : ((7975441, 6), val shape : ((544126, 6)) and test shape : ((613400, 6))


**Let's only take the n most listened songs of each users**

In [9]:
if not exists('./saved_data/train_user_top_logs.tsv'):
    train_user_top_logs = get_only_top(train_user_id_logs,user_id_profile['#id'], n_top = 50)
    train_user_top_logs = train_user_top_logs.nlargest(10000,'count')
    train_user_top_logs.to_csv('./saved_data/train_user_top_logs.tsv')

else : 
    train_user_top_logs = pd.read_csv('./saved_data/train_user_top_logs.tsv', index_col=0)

train_user_top_logs.head(2)

Unnamed: 0,track-name,artist-name,userid,artist-id,track-id,count
43661,Jolene,Cake,user_000949,fa7b9055-3703-473a-8a09-adf2fe031a24,60f0bfa4-8da9-4840-b5fe-23c1fc470f34,1456.0
43662,Staring At The Sun,Tv On The Radio,user_000949,eb872766-98f6-453d-883f-2ae908a18315,64581a21-566d-4b24-99ae-c5f48b75e660,1409.0


**Transform the dataset into a bipartite graph**

In [10]:
track_df,artist_df=bipartite_graph('./saved_data/track_df.tsv','./saved_data/artists_df.tsv',train_user_top_logs)
track_df.head(2)

Unnamed: 0,track-name,artist-name,artist-id,track-id
0,Music,Cornelius,df765d93-621c-437f-99fe-fda9e135f89a,52bef5e2-17b6-5742-b846-09a6b750e857
1,Gum,Cornelius,df765d93-621c-437f-99fe-fda9e135f89a,bb9a7981-016d-596e-b17f-ba07a346d2d4


In [11]:
artist_df.head(2)

Unnamed: 0,artist-name,artist-id
0,Cornelius,df765d93-621c-437f-99fe-fda9e135f89a
1,Gilles Peterson,4c4e3121-4d12-4f7a-a77c-5becd849fb3c


In [12]:
train_G=graph_generator(train_user_top_logs,train_user_top_logs,user_id_profile,track_df,artist_df)

**Now we transform the validation and tests data sets in bipartite graphs**

In [13]:
#First we select only the music tracks that are in the train set
val_user_id_logs = val_user_id_logs[val_user_id_logs['track-id'].isin(track_df['track-id'])]
val_user_id_logs_top = get_only_top(val_user_id_logs,user_id_profile['#id'], n_top = -1)

test_gt_user_id_logs = test_user_id_logs[test_user_id_logs['track-id'].isin(track_df['track-id'])]
test_gt_user_id_logs_top = get_only_top(test_user_id_logs,user_id_profile['#id'], n_top = -1)

Just finished for user_001000


"\ntest_gt_user_id_logs = test_user_id_logs[test_user_id_logs['track-id'].isin(track_df['track-id'])]\ntest_gt_user_id_logs_top = get_only_top(test_user_id_logs,user_id_profile['#id'], n_top = -1)"

In [17]:

test_gt_user_id_logs = test_user_id_logs[test_user_id_logs['track-id'].isin(track_df['track-id'])]
test_gt_user_id_logs_top = get_only_top(test_user_id_logs,user_id_profile['#id'], n_top = -1)

Just finished for user_001000


In [21]:
"""#test part :
temp_test = get_only_top(val_user_id_logs,user_id_profile['#id'], n_top = 50)
temp_test = temp_test.nlargest(10000,'count')
temp_test.to_csv('./saved_data/val_user_top_logs.tsv')"""
#Concatenate the train and val sets summing the count for the same track and the same user
test_user_id_logs_top = pd.concat([train_user_top_logs, val_user_id_logs_top], ignore_index=True)
#sum count column for same track and same user
#test_user_id_logs_top.groupby(['userid', 'track-id']).sum('count')

In [22]:
test_user_id_logs_top

Unnamed: 0,track-name,artist-name,userid,artist-id,track-id,count
0,Jolene,Cake,user_000949,fa7b9055-3703-473a-8a09-adf2fe031a24,60f0bfa4-8da9-4840-b5fe-23c1fc470f34,1456.0
1,Staring At The Sun,Tv On The Radio,user_000949,eb872766-98f6-453d-883f-2ae908a18315,64581a21-566d-4b24-99ae-c5f48b75e660,1409.0
2,Wings Of Words,Chemistry,user_000141,c8524763-e7d7-4225-8a9a-e7b64db5a1e2,2caf09d4-aad9-5f2d-b66d-616f2c2d0e35,1365.0
3,Heartbeats,The Knife,user_000949,bf710b71-48e5-4e15-9bd6-96debb2e4e98,db4c9220-df76-4b42-b6f5-8bf52cc80f77,1362.0
4,Anthems For A Seventeen Year Old Girl,Broken Social Scene,user_000949,2eada8f8-056a-4093-bbc2-004909ce743b,91951530-d978-4648-95b1-08b1f49ffba5,1352.0
...,...,...,...,...,...,...
74120,Karma,Kamelot,user_000507,2449300a-6ca7-45da-8102-22789d256475,058c4d03-5244-5c3b-b09c-0ad9cdd3618c,1.0
74121,March Of Mephisto,Kamelot,user_000507,2449300a-6ca7-45da-8102-22789d256475,c55d46c8-20bb-5c22-b07a-bc72ec402caa,1.0
74122,Hero In A Dream,Ensiferum,user_000507,6e64cbfa-1a60-450e-81f4-c044c868ab24,3e3d20e3-9226-51ec-9ef6-a9b314076bbb,1.0
74123,Wasted Years,Anton Maiden,user_000507,51086134-0896-4c00-b54a-c5c37aeaf2bf,6420dc6c-c1b1-5f42-b0e2-f6207415abd3,1.0


In [23]:
#Then we create the graph
val_G=graph_generator(val_user_id_logs_top,train_user_top_logs,user_id_profile,track_df,artist_df)
G_test_gt=graph_generator(test_gt_user_id_logs_top,train_user_top_logs,user_id_profile,track_df,artist_df)
G_test=graph_generator(test_user_id_logs_top,train_user_top_logs,user_id_profile,track_df,artist_df)

## Clean the graph
The graph is too big to be used for training and validation

We can remove the low degree nodes

In [24]:
G_test, train_G,val_G,G_test_gt = remove_low_degree_nodes(G_test,train_G,val_G,G_test_gt)
len(G_test.nodes),len(train_G.nodes),len(val_G.nodes),len(G_test_gt.nodes)

(9359, 9359, 9359, 178556)

In [25]:
G_test, train_G,val_G,G_test_gt = remove_low_artists(G_test,train_G,val_G,G_test_gt)
len(G_test.nodes),len(train_G.nodes),len(val_G.nodes),len(G_test_gt.nodes)

(686, 686, 686, 169883)

## Create the methods
Pearson coefficient, Bellman ford algorithm ...

#### Unsupervised link prediction

ALL THE METHODS ARE TO BE PUT IN A UTIL.PY FILE

In [26]:
def preferential_attachement(graph, edges=train_G.edges()):
    PA = {}
    
    for edge in edges:
        PA[edge] = graph.degree(edge[0]) * graph.degree(edge[1])
        
    return PA
    
pa = preferential_attachement(train_G)

In [27]:
def Jaccard(graph, edges=train_G.edges()):
    Jaccard = {}
    # Compute Jaccard metric for each non_edge of the graph
    
    for edge in edges: 
        inter_size = len(list(nx.common_neighbors(graph, edge[0], edge[1])))
        union_size = len(set(graph[edge[0]]) | set(graph[edge[1]]))

        if union_size != 0:
            Jaccard[edge] = inter_size / union_size
        else : 
            Jaccard[edge] = 0

    
    return Jaccard

In [28]:
def AdamicAdar(graph, edges=train_G.edges()):
    AdamicAdar = {}
    
    for edge in edges: 
        inter_list = nx.common_neighbors(graph, edge[0], edge[1])
        AdamicAdar[edge] = sum( [1/np.log(graph.degree(node)) for node in inter_list] )
    
    return AdamicAdar

In [29]:
#the result in the state of the art is bad as the one we obtained here
def evaluation(G_train, G_val):
    R = G_val.copy()
    R.remove_edges_from(e for e in G_val.edges if e in G_train.edges)
    gt = R.edges
    k = len(gt)
    print(k)
    print("Starting predictions")
    # --- Apply each method defined above and calculate its accuracy ---
    methods = ['Jaccard', 'AdamicAdar', 'preferential_attachement']
    
    # For each method, compute the similarity scores between all non-edges
    # Predict k node pairs with highest score 
    # Compute accuracy wrt edges actually removed 
    for method in methods: 
        res = eval(method)(G_train, nx.non_edges(G_train))
        print("Predicting...")
        pred = sorted(res.items(), key = lambda x:x[1], reverse=True)[:k]
        pred = [el[0] for el in pred]
        accuracy = len(set(pred).intersection(set(gt))) / k
        print(method,"with an accuracy of ", accuracy,"\n")

In [30]:
evaluation(train_G, val_G)

1287
Starting predictions
Predicting...
Jaccard with an accuracy of  0.0 

Predicting...
AdamicAdar with an accuracy of  0.0 

Predicting...
preferential_attachement with an accuracy of  0.00777000777000777 



#### Supervized link prediction

In [31]:
from sklearn.metrics import accuracy_score,recall_score,precision_score
import collections
from tqdm import tqdm

In [32]:
def feature_extractor(graph, samples):
    """
    Creates a feature vector for each edge of the graph contained in samples 
    """
    feature_vector = []
    
    # --- Extract manually diverse features relative to each edge contained in samples --- 
    # Fill in the blanks

    # Degree Centrality measure
    deg_centrality = nx.degree_centrality(graph)
    
    # Betweeness centrality measure
    betweeness_centrality = nx.betweenness_centrality(graph)

    for edge in tqdm(samples):
        source_node, target_node = edge[0], edge[1]

        # Degree Centrality
        source_degree_centrality = deg_centrality[source_node]
        target_degree_centrality = deg_centrality[target_node]
        
        # Betweeness centrality measure 
        diff_bt = betweeness_centrality[target_node] - betweeness_centrality[source_node]

        # Preferential Attachement 
        pref_attach = list(nx.preferential_attachment(graph, [(source_node, target_node)]))[0][2]

        # AdamicAdar
        aai = list(nx.adamic_adar_index(graph, [(source_node, target_node)]))[0][2]

        # Jaccard
        jacard_coeff = list(nx.jaccard_coefficient(graph, [(source_node, target_node)]))[0][2]
        
        # Create edge feature vector with all metric computed above
        feature_vector.append(np.array([source_degree_centrality, target_degree_centrality, 
                                        diff_bt, pref_attach, aai, jacard_coeff]) ) 
        
    return feature_vector

##### Lets try with only detecting for non edges

In [33]:
def get_neg_edges_bipartite(G):
    top_nodes = set(n for n,d in G.nodes(data=True) if d['bipartite']==0)
    low_nodes = set(n for n,d in G.nodes(data=True) if d['bipartite']==1)
    adj_matrix = bipartite.biadjacency_matrix(G, row_order=top_nodes, column_order=low_nodes)

    negative_edges = []
    top = list(top_nodes)
    low = list(low_nodes)
    for i in tqdm(range(len(top_nodes))):
        for j in range(len(low_nodes)):
            if adj_matrix[i,j]==0:
                negative_edges.append([top[i],low[j]])
    return negative_edges

In [34]:
train_neg_edges = get_neg_edges_bipartite(train_G)

100%|██████████| 352/352 [00:01<00:00, 189.02it/s]


In [35]:
def get_edges_values(G, edges):
    all_positive_edges = G.edges()
    output = []
    for edge in edges:
        if(edge in all_positive_edges): 
            #output.append(G.edges[edge[0],edge[1]]['weight'])
            output.append(1)  
        else:
            output.append(0)

    return output

In [36]:
new_links = get_edges_values(val_G, train_neg_edges)
new_links.count(1)

1287

In [37]:
# --- Create feature vector for all edges in training set and test set ---
train_features = feature_extractor(train_G, train_neg_edges)

100%|██████████| 77352/77352 [00:02<00:00, 27221.88it/s]


In [38]:
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

In [39]:
def train(train_features, train_labels):
    """
    Downstream ML task using edge embeddings to classify them 
    """
    
    # --- Build the model and train it ---
    # Fill in the blanks
    clf = RandomForestClassifier(random_state= 1999)
    clf.fit(train_features, train_labels)

    #compute accuracy
    train_preds=clf.predict(train_features)

    print(collections.Counter(np.array(train_preds)))
    acc=accuracy_score(train_labels,train_preds)
    recall=recall_score(train_labels,train_preds)
    precision=precision_score(train_labels,train_preds)

    print('Accuracy:',acc, 'Recall:',recall, 'Precision:',precision)
    
    return clf

In [40]:
RF_model = train(train_features, new_links)

KeyboardInterrupt: 

### We test the model on the test set

In [None]:
def predict(model, test_features, test_labels):
    """
    Downstream ML task using edge embeddings to classify them 
    """

    #compute accuracy
    test_preds=model.predict(test_features)

    print(collections.Counter(np.array(test_preds)))
    acc=accuracy_score(test_labels,test_preds)
    recall=recall_score(test_labels,test_preds)
    precision=precision_score(test_labels,test_preds)

    print('Accuracy:',acc, 'Recall:',recall, 'Precision:',precision)

In [None]:
test_neg_edges = get_neg_edges_bipartite(G_test)
# --- Create feature vector for all edges in training set and test set ---
test_features = feature_extractor(G_test, test_neg_edges)

100%|██████████| 961/961 [00:51<00:00, 18.68it/s]
100%|██████████| 2049112/2049112 [03:35<00:00, 9508.51it/s] 


In [None]:
test_labels = get_edges_values(G_test_gt, test_neg_edges)

In [None]:
test_labels.count(1)

10184

In [None]:
predict(RF_model,test_features,test_labels)

Counter({0: 2017932, 1: 31180})
Accuracy: 0.9808112001686584 Recall: 0.10035349567949725 Precision: 0.03277742142398974
