<center><h1>Music recommendation using graphs</h1>
<h2>MLNS PROJECT</h2>
<h3>Coded by Chloé Daems, Amir Mahmoudi and Anne-Claire Laisney</h3>
</center>

This is the main notebook to create a benchmark of graph based music recommendation systems inspired by the *Katarya, R., Verma, O.P. Efficient music recommender system using context graph and particle swarm. Multimed Tools Appl 77, 2673–2687 (2018).* [paper](URL 'https://link.springer.com/article/10.1007/s11042-017-4447-x'), using data from the user.getRecentTracks of the [Last.fm](URL 'https://www.last.fm/api/show/user.getRecentTracks') API.

In [1]:
#Import the libraries
from os.path import exists
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import datetime
from scipy.sparse import *
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, auc

from IPython.display import clear_output

## Create the graph

**Get the dataset**

In [2]:
user_id_profile = pd.read_csv('lastfm-dataset-1K/userid-profile.tsv', sep = '\t')
if not exists('lastfm-dataset-1K/user_id_logs_v2.tsv'):
    logs_columns = ['userid', 'timestamp', 'artist-id', 'artist-name', 'track-id', 'track-name']
    user_id_logs = pd.read_csv('lastfm-dataset-1K/userid-logs.tsv', sep = '\t', header = None, names =  logs_columns )
    user_id_logs = user_id_logs.dropna(subset=['track-name','artist-name', 'artist-id'])
else : 
    user_id_logs = pd.read_csv('lastfm-dataset-1K/user_id_logs_v2.tsv',index_col=0)
    user_id_logs = user_id_logs.dropna(subset=['track-name','artist-name', 'artist-id'])
    
#user_id_logs['timestamp'] = pd.to_datetime(user_id_logs['timestamp'], format='%Y-%m-%dT%H:%M:%SZ')
user_id_logs['timestamp'] = pd.to_datetime(user_id_logs['timestamp'], format='%Y-%m-%d')# %H:%M:%S')

  mask |= (ar1 == a)


In [3]:
user_id_profile.head()

Unnamed: 0,#id,gender,age,country,registered
0,user_000001,m,,Japan,"Aug 13, 2006"
1,user_000002,f,,Peru,"Feb 24, 2006"
2,user_000003,m,22.0,United States,"Oct 30, 2005"
3,user_000004,f,,,"Apr 26, 2006"
4,user_000005,m,,Bulgaria,"Jun 29, 2006"


In [4]:
user_id_logs.head()

Unnamed: 0,userid,timestamp,artist-id,artist-name,track-id,track-name
0,user_000001,2009-05-04 23:08:57+00:00,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,7369ec4f-b377-5683-86bd-f02897317103,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007
1,user_000001,2009-05-04 13:54:10+00:00,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,8a0799b1-2f64-5e7b-9436-2228c9d65637,Composition 0919 (Live_2009_4_15)
2,user_000001,2009-05-04 13:52:04+00:00,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,44da66dc-6a34-54de-a4d9-686bc38ede0f,Mc2 (Live_2009_4_15)
3,user_000001,2009-05-04 13:42:52+00:00,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,e625acbe-1360-528d-8afe-4ad88424e0c0,Hibari (Live_2009_4_15)
4,user_000001,2009-05-04 13:42:11+00:00,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,fa332ed7-b701-5669-9e8e-0961658cdb43,Mc1 (Live_2009_4_15)


**There are too many track-ids missing, we are going to recreate them using the uuid library**

In [5]:
# Really long : 40 min
import tqdm
import uuid
if not exists('lastfm-dataset-1K/user_id_logs_v2.tsv'):
    for idx, row in tqdm.tqdm(user_id_logs.iterrows()):
        row['track-id'] = uuid.uuid5(uuid.NAMESPACE_DNS, row['artist-name'] + "," + row['track-name'])
        row['artist-id'] = uuid.uuid5(uuid.NAMESPACE_DNS, str(row['artist-name']))
    #We save the file
    user_id_logs.to_csv('lastfm-dataset-1K/user_id_logs_v2.tsv')

**We create a train and test set**

In the test set, we would have only the last month of listening for each users.

In [7]:
try:
    test_user_id_logs = user_id_logs[user_id_logs['timestamp'] > datetime.datetime(2009, 4, 4).replace(tzinfo=datetime.timezone.utc)]
    train_user_id_logs = user_id_logs[user_id_logs['timestamp'] < datetime.datetime(2009, 4, 4).replace(tzinfo=datetime.timezone.utc)]
except:
    test_user_id_logs = user_id_logs[user_id_logs['timestamp'] > datetime.datetime(2009, 4, 4)]
    train_user_id_logs = user_id_logs[user_id_logs['timestamp'] < datetime.datetime(2009, 4, 4)]

In [8]:
print(f'train shape : ({train_user_id_logs.shape} and test shape : ({test_user_id_logs.shape})')

train shape : ((17815729, 6) and test shape : ((682269, 6))


In [10]:
try:
    val_user_id_logs = train_user_id_logs[train_user_id_logs['timestamp'] > datetime.datetime(2009, 3, 4).replace(tzinfo=datetime.timezone.utc)]
    train_user_id_logs = train_user_id_logs[train_user_id_logs['timestamp'] < datetime.datetime(2009, 3, 4).replace(tzinfo=datetime.timezone.utc)]
except:
    val_user_id_logs = train_user_id_logs[train_user_id_logs['timestamp'] > datetime.datetime(2009, 3, 4)]
    train_user_id_logs = train_user_id_logs[train_user_id_logs['timestamp'] < datetime.datetime(2009, 3, 4)]

In [11]:
print(f'train shape : ({train_user_id_logs.shape} and val shape : ({val_user_id_logs.shape})')

train shape : ((17217876, 6) and val shape : ((597852, 6))


**Let's only take the n most listened songs of each users**

In [12]:
def get_only_top(df_logs,df_profile, n_top):
    new_df = pd.DataFrame(columns = ['track-name','artist-name'], dtype= np.str)
    for user_id in df_profile.values:
        test = df_logs[df_logs['userid']== user_id]
        try:
            test['count'] = test.groupby(['track-id'])[['track-id']].transform(lambda x: x.count())['track-id']
            test = test.sort_values(by = 'count', ascending = False)
            test = test.drop('timestamp', axis = 1)
            test = test.drop_duplicates()
            new_df = pd.concat([new_df, test[:n_top]], ignore_index=True)
        except:
            pass
        clear_output(wait = True)
        print("Just finished for",user_id)
    return new_df

In [13]:
if not exists('./saved_data/train_user_top_logs.tsv'):
    train_user_top_logs = get_only_top(train_user_id_logs,user_id_profile['#id'], n_top = 50)
    train_user_top_logs = train_user_top_logs.nlargest(10000,'count')
    train_user_top_logs.to_csv('./saved_data/train_user_top_logs.tsv')

else : 
    train_user_top_logs = pd.read_csv('./saved_data/train_user_top_logs.tsv', index_col=0)

train_user_top_logs.head()

Unnamed: 0,track-name,artist-name,userid,artist-id,track-id,count
0,Music,Cornelius,user_000001,df765d93-621c-437f-99fe-fda9e135f89a,52bef5e2-17b6-5742-b846-09a6b750e857,70.0
1,Gum,Cornelius,user_000001,df765d93-621c-437f-99fe-fda9e135f89a,bb9a7981-016d-596e-b17f-ba07a346d2d4,63.0
2,Mario Basanov & Vidis ‘Test’,Gilles Peterson,user_000001,4c4e3121-4d12-4f7a-a77c-5becd849fb3c,7434fb0f-1245-5a58-b343-cca4d0e2c107,50.0
3,Child Song,The Cinematic Orchestra,user_000001,7c158ea8-c0aa-410e-bdc1-20bba9759577,4562ff4f-b619-5557-8600-87f6d0d9f348,44.0
4,To Build A Home,The Cinematic Orchestra,user_000001,7c158ea8-c0aa-410e-bdc1-20bba9759577,dcf825de-85a9-5b53-9d2b-a9d574b57470,41.0


**Transform the dataset into a bipartite graph**

In [14]:
if not exists('./saved_data/track_df.tsv'):
    track_df = train_user_top_logs.copy()
    track_df.drop(['count', 'userid'], axis = 1, inplace=True)
    track_df.drop_duplicates('track-id', inplace=True)
    track_df.reset_index(drop=True, inplace=True)
    track_df.to_csv('./saved_data/track_df.tsv')
else : 
    track_df = pd.read_csv('./saved_data/track_df.tsv', index_col=0)

track_df.head()

Unnamed: 0,track-name,artist-name,artist-id,track-id
0,Music,Cornelius,df765d93-621c-437f-99fe-fda9e135f89a,52bef5e2-17b6-5742-b846-09a6b750e857
1,Gum,Cornelius,df765d93-621c-437f-99fe-fda9e135f89a,bb9a7981-016d-596e-b17f-ba07a346d2d4
2,Mario Basanov & Vidis ‘Test’,Gilles Peterson,4c4e3121-4d12-4f7a-a77c-5becd849fb3c,7434fb0f-1245-5a58-b343-cca4d0e2c107
3,Child Song,The Cinematic Orchestra,7c158ea8-c0aa-410e-bdc1-20bba9759577,4562ff4f-b619-5557-8600-87f6d0d9f348
4,To Build A Home,The Cinematic Orchestra,7c158ea8-c0aa-410e-bdc1-20bba9759577,dcf825de-85a9-5b53-9d2b-a9d574b57470


In [15]:
if not exists('./saved_data/artists_df.tsv'):
    artist_df = train_user_top_logs.copy()
    artist_df.drop(['count', 'userid', 'track-name', 'track-id'], axis = 1, inplace=True)
    artist_df.drop_duplicates('artist-id', inplace=True)
    artist_df.reset_index(drop=True, inplace=True)
    artist_df.to_csv('./saved_data/artists_df.tsv')
else : 
    artist_df = pd.read_csv('./saved_data/artists_df.tsv', index_col=0)

artist_df.head()

Unnamed: 0,artist-name,artist-id
0,Cornelius,df765d93-621c-437f-99fe-fda9e135f89a
1,Gilles Peterson,4c4e3121-4d12-4f7a-a77c-5becd849fb3c
2,The Cinematic Orchestra,7c158ea8-c0aa-410e-bdc1-20bba9759577
3,Radiohead,a74b1b7f-71a5-4011-9441-d0b5e4122711
4,A Hundred Birds,f5341587-6c20-4e0f-bdfd-62b2122825f2


In [16]:
from networkx.algorithms import bipartite
#We transform the train set into a bipartite graph
G= nx.Graph()
edges_1 = np.array(train_user_top_logs[['userid', 'track-id','count']].values)
edges_2 = np.array(train_user_top_logs[['artist-id', 'track-id']].values)
G.add_nodes_from(user_id_profile['#id'], bipartite=0)
G.add_nodes_from(track_df['track-id'], bipartite=1)
G.add_nodes_from(artist_df['artist-id'], bipartite=2)
     
G.add_weighted_edges_from(edges_1)
#weighted graph
G.add_edges_from(edges_2)
#save the edges
nx.write_gml(G, "saved_data/graph_user_tracks_train.gml")

**Now we transform the validation data set in a bipartite graph**

In [17]:
#We transform the val set into a bipartite graph

#First we select only the music tracks that are in the train set
val_user_id_logs = val_user_id_logs[val_user_id_logs['track-id'].isin(track_df['track-id'])]
val_user_id_logs_top = get_only_top(val_user_id_logs,user_id_profile['#id'], n_top = -1)

Just finished for user_001000


In [19]:
#Then we create the graph

G_val = nx.Graph()
edges_1 = np.array(val_user_id_logs_top[['userid', 'track-id','count']].values)
edges_2 = np.array(train_user_top_logs[['artist-id', 'track-id']].values)

G_val.add_nodes_from(user_id_profile['#id'], bipartite=0)
G_val.add_nodes_from(track_df['track-id'], bipartite=1)
G_val.add_nodes_from(artist_df['artist-id'], bipartite=2)
     
G_val.add_weighted_edges_from(edges_1)

#weighted graph
G_val.add_edges_from(edges_2)
#save the graph
nx.write_gml(G_val, "saved_data/graph_user_tracks_val.gml")

In [58]:
#We transform the val set into a bipartite graph

#First we select only the music tracks that are in the train set
test_user_id_logs = val_user_id_logs[val_user_id_logs['track-id'].isin(track_df['track-id'])]
test_user_id_logs_top = get_only_top(val_user_id_logs,user_id_profile['#id'], n_top = -1)

Just finished for user_001000


In [59]:
#Then we create the graph

G_test_gt = nx.Graph()
edges_1 = np.array(test_user_id_logs_top[['userid', 'track-id','count']].values)
edges_2 = np.array(train_user_top_logs[['artist-id', 'track-id']].values)

G_test_gt.add_nodes_from(user_id_profile['#id'], bipartite=0)
G_test_gt.add_nodes_from(track_df['track-id'], bipartite=1)
G_test_gt.add_nodes_from(artist_df['artist-id'], bipartite=2)
     
G_test_gt.add_weighted_edges_from(edges_1)

#weighted graph
G_test_gt.add_edges_from(edges_2)
#save the graph
nx.write_gml(G_test_gt, "saved_data/graph_user_tracks_test_groundtruth.gml")

## Create the methods
Pearson coefficient, Bellman ford algorithm ...

**Get the graph**

In [21]:
train_G  = nx.read_gml("saved_data/graph_user_tracks_train.gml")
val_G  = nx.read_gml("saved_data/graph_user_tracks_val.gml")

### We try the methods from TP2

#### Unsupervised link prediction

ALL THE METHODS ARE TO BE PUT IN A UTIL.PY FILE

In [22]:
def preferential_attachement(graph, edges=train_G.edges()):
    PA = {}
    
    for edge in edges:
        PA[edge] = graph.degree(edge[0]) * graph.degree(edge[1])
        
    return PA
    
pa = preferential_attachement(train_G)

In [23]:
def Jaccard(graph, edges=train_G.edges()):
    Jaccard = {}
    # Compute Jaccard metric for each non_edge of the graph
    
    for edge in edges: 
        inter_size = len(list(nx.common_neighbors(graph, edge[0], edge[1])))
        union_size = len(set(graph[edge[0]]) | set(graph[edge[1]]))

        if union_size != 0:
            Jaccard[edge] = inter_size / union_size
        else : 
            Jaccard[edge] = 0

    
    return Jaccard

In [24]:
def AdamicAdar(graph, edges=train_G.edges()):
    AdamicAdar = {}
    
    for edge in edges: 
        inter_list = nx.common_neighbors(graph, edge[0], edge[1])
        AdamicAdar[edge] = sum( [1/np.log(graph.degree(node)) for node in inter_list] )
    
    return AdamicAdar

In [25]:
def predict_edges(metric, k):
    
    # Shuffle randomly entries of dictionnary 
    l = list(metric.items())
    np.random.seed(10) # fix random seed to obtain same random shuffling when repeating experiment
    np.random.shuffle(l)
    metric = dict(l)

    # Retrieve top k value 
    metric = dict(sorted(metric.items(), key=lambda x:x[1], reverse=True)[:k])
    print(metric.items())

predict_edges(pa, 10)

dict_items([(('88ff31ff-07d3-5909-b8d4-942377de3c04', 'a74b1b7f-71a5-4011-9441-d0b5e4122711'), 3000), (('1f5924d3-c1b3-5d5e-9ca4-4e062e9019be', 'a74b1b7f-71a5-4011-9441-d0b5e4122711'), 2400), (('90744886-c2db-590b-bfb6-50d409cddabf', 'a74b1b7f-71a5-4011-9441-d0b5e4122711'), 2040), (('ca1bbb67-f89e-5043-889c-e6f46add205f', 'a74b1b7f-71a5-4011-9441-d0b5e4122711'), 1920), (('d8da1e04-5582-561e-8b81-41b67ee46460', 'a74b1b7f-71a5-4011-9441-d0b5e4122711'), 1800), (('4acc0486-b446-5428-87b4-1e7dd46d070d', 'a74b1b7f-71a5-4011-9441-d0b5e4122711'), 1800), (('af21c041-7bb4-5fdc-abbd-efd9b40df73c', '83d91898-7763-47d7-b03b-b92132375c47'), 1674), (('ef283a65-240f-58b5-a127-4a2b44c617aa', '03ad1736-b7c9-412a-b442-82536d63a5c4'), 1638), (('0e938086-1f83-5242-944b-7315de233b57', '0039c7ae-e1a7-4a7d-9b49-0cbc716821a6'), 1608), (('59671b3e-76a0-5da7-af83-d593dc187dd5', 'a74b1b7f-71a5-4011-9441-d0b5e4122711'), 1560)])


In [27]:
import tqdm
#1 hour to compute NOT TRIED -- (but the result in the state of the art is bad)
def evaluation(G_train, G_val):
    R = G_val.copy()
    R.remove_edges_from(e for e in G_val.edges if e in G_train.edges)
    gt = R.edges
    k = len(gt)
    print(k)
    print("Starting predictions")
    # --- Apply each method defined above and calculate its accuracy ---
    methods = ['Jaccard', 'AdamicAdar', 'preferential_attachement']
    
    # For each method, compute the similarity scores between all non-edges
    # Predict k node pairs with highest score 
    # Compute accuracy wrt edges actually removed 
    for method in methods: 
        res = eval(method)(G_train, nx.non_edges(G_train))
        print("predicting 4 real ")
        pred = sorted(res.items(), key = lambda x:x[1], reverse=True)[:k]
        print("I finished predicting")
        pred = [el[0] for el in pred]
        #print('pred', pred)
        #print('gt',gt)
        accuracy = len(set(pred).intersection(set(gt))) / k
        print(method, accuracy)

In [28]:
#The graph is too big to be used for training and validation
# We can remove the low degree nodes
def remove_low_degree_nodes(train_graph, val_graph):
    t_graph, v_graph = train_graph.copy(), val_graph.copy()
    to_be_removed = [x for  x, d in t_graph.nodes(data=True) if (t_graph.degree(x) <= 3 and d['bipartite'] != 2)]
    print(len(to_be_removed))
    t_graph.remove_nodes_from(to_be_removed)
    v_graph.remove_nodes_from(to_be_removed)
    return t_graph, v_graph

In [29]:
# We can remove artist with no more edges
def remove_low_artists(train_graph, val_graph):
    t_graph, v_graph = train_graph.copy(), val_graph.copy()
    to_be_removed = [x for  x, d in t_graph.nodes(data=True) if (t_graph.degree(x) ==0 and d['bipartite'] == 2)]
    t_graph.remove_nodes_from(to_be_removed)
    v_graph.remove_nodes_from(to_be_removed)
    return t_graph, v_graph

In [30]:
train_G,val_G = remove_low_degree_nodes(train_G,val_G)
len(train_G.nodes),len(val_G.nodes)

34568


(11905, 11905)

In [31]:
train_G,val_G = remove_low_artists(train_G,val_G)
len(train_G.nodes),len(val_G.nodes)

(3757, 3757)

In [32]:
evaluation(train_G, val_G)

19162
Starting predictions
predicting 4 real 
I finished predicting
Jaccard 0.0
predicting 4 real 
I finished predicting
AdamicAdar 0.0
predicting 4 real 
I finished predicting
preferential_attachement 0.0036008767352050932


#### Supervized link prediction

In [33]:
from sklearn.metrics import accuracy_score,recall_score,precision_score
import collections
from tqdm import tqdm

In [34]:
def feature_extractor(graph, samples):
    """
    Creates a feature vector for each edge of the graph contained in samples 
    """
    feature_vector = []
    
    # --- Extract manually diverse features relative to each edge contained in samples --- 
    # Fill in the blanks

    # Degree Centrality measure
    deg_centrality = nx.degree_centrality(graph)
    
    # Betweeness centrality measure
    betweeness_centrality = nx.betweenness_centrality(graph)

    for edge in tqdm(samples):
        source_node, target_node = edge[0], edge[1]

        # Degree Centrality
        source_degree_centrality = deg_centrality[source_node]
        target_degree_centrality = deg_centrality[target_node]
        
        # Betweeness centrality measure 
        diff_bt = betweeness_centrality[target_node] - betweeness_centrality[source_node]

        # Preferential Attachement 
        pref_attach = list(nx.preferential_attachment(graph, [(source_node, target_node)]))[0][2]

        # AdamicAdar
        aai = list(nx.adamic_adar_index(graph, [(source_node, target_node)]))[0][2]

        # Jaccard
        jacard_coeff = list(nx.jaccard_coefficient(graph, [(source_node, target_node)]))[0][2]
        
        # Create edge feature vector with all metric computed above
        feature_vector.append(np.array([source_degree_centrality, target_degree_centrality, 
                                        diff_bt, pref_attach, aai, jacard_coeff]) ) 
        
    return feature_vector

In [36]:
def get_sets(graph):
    pos_sample=list(graph.edges)
    neg_sample=list(nx.non_edges(graph))
    labels = [1 for _ in pos_sample] + [0 for _ in neg_sample]
    features=pos_sample+neg_sample
    return features,labels

##### Lets try with only detecting for non edges

In [37]:
def get_neg_edges_bipartite(G):
    top_nodes = set(n for n,d in G.nodes(data=True) if d['bipartite']==0)
    low_nodes = set(n for n,d in G.nodes(data=True) if d['bipartite']==1)
    adj_matrix = bipartite.biadjacency_matrix(G, row_order=top_nodes, column_order=low_nodes)

    negative_edges = []
    top = list(top_nodes)
    low = list(low_nodes)
    for i in tqdm(range(len(top_nodes))):
        for j in range(len(low_nodes)):
            if adj_matrix[i,j]==0:
                negative_edges.append([top[i],low[j]])
    return negative_edges

In [38]:
from networkx.algorithms import bipartite
train_neg_edges = get_neg_edges_bipartite(train_G)

100%|██████████| 961/961 [00:49<00:00, 19.46it/s]


In [39]:
def get_edges_values(G, edges):
    all_positive_edges = G.edges()
    output = []
    for edge in edges:
        if(edge in all_positive_edges): 
            #output.append(G.edges[edge[0],edge[1]]['weight'])
            output.append(1)  
        else:
            output.append(0)

    return output

In [None]:
def labelling(G, edges):
    all_positive_edges = G.edges()
    all_neg_edges=nx.non_edges(G)
    all_edges = all_positive_edges + all_neg_edges
    output=[1 for i in all_positive_edges ] + [0 for i in all_neg_edges ]
   
    return output,all_positive_edges,all_neg_edges

In [40]:
new_links = get_edges_values(val_G, train_neg_edges)
new_links.count(1)

19162

In [41]:
# --- Create feature vector for all edges in training set and test set ---
train_features = feature_extractor(train_G, train_neg_edges)

100%|██████████| 2065151/2065151 [02:09<00:00, 15903.09it/s]


In [42]:
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

In [64]:
def train(train_features, train_labels):
    """
    Downstream ML task using edge embeddings to classify them 
    """
    
    # --- Build the model and train it ---
    # Fill in the blanks
    clf = RandomForestClassifier(random_state= 1999)
    clf.fit(train_features, train_labels)

    #compute accuracy
    train_preds=clf.predict(train_features)

    print(collections.Counter(np.array(train_preds)))
    acc=accuracy_score(train_labels,train_preds)
    recall=recall_score(train_labels,train_preds)
    precision=precision_score(train_labels,train_preds)

    print('Accuracy:',acc, 'Recall:',recall, 'Precision:',precision)
    
    return clf

In [44]:
RF_model = train(train_features, new_links)

Counter({0: 2055024, 1: 10127})
Accuracy: 0.9954274530046471 Recall: 0.5178478238179731 Precision: 0.9798558309469735


### We test the model on the test set

In [48]:
#Concatenate the train and val sets summing the count for the same track and the same user
new_val_user_id_logs_top = pd.concat([train_user_top_logs, val_user_id_logs_top], ignore_index=True)
#sum count column for same track and same user
new_val_user_id_logs_top.groupby(['userid', 'track-id']).sum('count')
new_val_user_id_logs_top

Unnamed: 0,track-name,artist-name,userid,artist-id,track-id,count
0,Music,Cornelius,user_000001,df765d93-621c-437f-99fe-fda9e135f89a,52bef5e2-17b6-5742-b846-09a6b750e857,70.0
1,Gum,Cornelius,user_000001,df765d93-621c-437f-99fe-fda9e135f89a,bb9a7981-016d-596e-b17f-ba07a346d2d4,63.0
2,Mario Basanov & Vidis ‘Test’,Gilles Peterson,user_000001,4c4e3121-4d12-4f7a-a77c-5becd849fb3c,7434fb0f-1245-5a58-b343-cca4d0e2c107,50.0
3,Child Song,The Cinematic Orchestra,user_000001,7c158ea8-c0aa-410e-bdc1-20bba9759577,4562ff4f-b619-5557-8600-87f6d0d9f348,44.0
4,To Build A Home,The Cinematic Orchestra,user_000001,7c158ea8-c0aa-410e-bdc1-20bba9759577,dcf825de-85a9-5b53-9d2b-a9d574b57470,41.0
...,...,...,...,...,...,...
125862,How To Disappear Completely,Radiohead,user_001000,a74b1b7f-71a5-4011-9441-d0b5e4122711,ccad68b7-c5ad-4f29-b169-2531fbf53f63,1.0
125863,Everything In Its Right Place,Radiohead,user_001000,a74b1b7f-71a5-4011-9441-d0b5e4122711,60bd9d53-01ff-4562-8058-eb44b3940317,1.0
125864,Under The Blacklight,Rilo Kiley,user_001000,eaf6a7ca-105d-4a94-ba02-8c3e4040319a,b912987f-8a21-4c7b-8412-f70ccd6dcd4d,1.0
125865,Dreamworld,Rilo Kiley,user_001000,eaf6a7ca-105d-4a94-ba02-8c3e4040319a,d1561c20-50c2-4a48-9f5d-a92572347ddc,1.0


In [51]:
#Then we create the graph

G_test = nx.Graph()
edges_1 = np.array(new_val_user_id_logs_top[['userid', 'track-id','count']].values)
edges_2 = np.array(train_user_top_logs[['artist-id', 'track-id']].values)

G_test.add_nodes_from(user_id_profile['#id'], bipartite=0)
G_test.add_nodes_from(track_df['track-id'], bipartite=1)
G_test.add_nodes_from(artist_df['artist-id'], bipartite=2)
     
G_test.add_weighted_edges_from(edges_1)

#weighted graph
G_test.add_edges_from(edges_2)
#save the graph
nx.write_gml(G_test, "saved_data/graph_user_tracks_test.gml")

In [62]:
def predict(model, test_features, test_labels):
    """
    Downstream ML task using edge embeddings to classify them 
    """

    #compute accuracy
    test_preds=model.predict(test_features)

    print(collections.Counter(np.array(test_preds)))
    acc=accuracy_score(test_labels,test_preds)
    recall=recall_score(test_labels,test_preds)
    precision=precision_score(test_labels,test_preds)

    print('Accuracy:',acc, 'Recall:',recall, 'Precision:',precision)

In [53]:
G_test.remove_nodes_from([n for n in G_test.nodes() if n not in train_G.nodes()])

In [56]:
test_neg_edges = get_neg_edges_bipartite(G_test)
# --- Create feature vector for all edges in training set and test set ---
test_features = feature_extractor(G_test, test_neg_edges)

100%|██████████| 961/961 [00:48<00:00, 19.89it/s]
100%|██████████| 2045989/2045989 [04:04<00:00, 8380.44it/s] 


In [60]:
test_labels = get_edges_values(G_test_gt, test_neg_edges)

In [69]:
test_labels.count(1)

0

In [65]:
train(test_features,test_labels)

Counter({0: 2045989})


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 1.0 Recall: 0.0 Precision: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


RandomForestClassifier(random_state=1999)

In [63]:
predict(RF_model,test_features,test_labels)

Counter({0: 2028691, 1: 17298})


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9915454090906647 Recall: 0.0 Precision: 0.0
