<center><h1>Music recommendation using graphs</h1>
<h2>MLNS PROJECT</h2>
<h3>Coded by Chloé Daems, Amir Mahmoudi and Anne-Claire Laisney</h3>
</center>

This is the main notebook to create a benchmark of graph based music recommendation systems inspired by the *Katarya, R., Verma, O.P. Efficient music recommender system using context graph and particle swarm. Multimed Tools Appl 77, 2673–2687 (2018).* [paper](URL 'https://link.springer.com/article/10.1007/s11042-017-4447-x'), using data from the user.getRecentTracks of the [Last.fm](URL 'https://www.last.fm/api/show/user.getRecentTracks') API.

In [1]:
#Import the libraries
from os.path import exists
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import datetime
from scipy.sparse import *
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, auc

from IPython.display import clear_output

## Create the graph

**Get the dataset**

In [2]:
user_id_profile = pd.read_csv('lastfm-dataset-1K/userid-profile.tsv', sep = '\t')
if not exists('lastfm-dataset-1K/user_id_logs_v2.tsv'):
    logs_columns = ['userid', 'timestamp', 'artist-id', 'artist-name', 'track-id', 'track-name']
    user_id_logs = pd.read_csv('lastfm-dataset-1K/userid-logs.tsv', sep = '\t', header = None, names =  logs_columns )
    user_id_logs = user_id_logs.dropna(subset=['track-name','artist-name', 'artist-id'])
else : 
    user_id_logs = pd.read_csv('lastfm-dataset-1K/user_id_logs_v2.tsv',index_col=0)
    user_id_logs = user_id_logs.dropna(subset=['track-name','artist-name', 'artist-id'])
    
#user_id_logs['timestamp'] = pd.to_datetime(user_id_logs['timestamp'], format='%Y-%m-%dT%H:%M:%SZ')
user_id_logs['timestamp'] = pd.to_datetime(user_id_logs['timestamp'], format='%Y-%m-%d')# %H:%M:%S')

  mask |= (ar1 == a)


In [3]:
user_id_profile.head()

Unnamed: 0,#id,gender,age,country,registered
0,user_000001,m,,Japan,"Aug 13, 2006"
1,user_000002,f,,Peru,"Feb 24, 2006"
2,user_000003,m,22.0,United States,"Oct 30, 2005"
3,user_000004,f,,,"Apr 26, 2006"
4,user_000005,m,,Bulgaria,"Jun 29, 2006"


In [4]:
user_id_logs.head()

Unnamed: 0,userid,timestamp,artist-id,artist-name,track-id,track-name
0,user_000001,2009-05-04 23:08:57,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,7369ec4f-b377-5683-86bd-f02897317103,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007
1,user_000001,2009-05-04 13:54:10,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,8a0799b1-2f64-5e7b-9436-2228c9d65637,Composition 0919 (Live_2009_4_15)
2,user_000001,2009-05-04 13:52:04,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,44da66dc-6a34-54de-a4d9-686bc38ede0f,Mc2 (Live_2009_4_15)
3,user_000001,2009-05-04 13:42:52,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,e625acbe-1360-528d-8afe-4ad88424e0c0,Hibari (Live_2009_4_15)
4,user_000001,2009-05-04 13:42:11,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,fa332ed7-b701-5669-9e8e-0961658cdb43,Mc1 (Live_2009_4_15)


**There are too many track-ids missing, we are going to recreate them using the uuid library**

In [5]:
# Really long : 40 min
import tqdm
import uuid
if not exists('lastfm-dataset-1K/user_id_logs_v2.tsv'):
    for idx, row in tqdm.tqdm(user_id_logs.iterrows()):
        row['track-id'] = uuid.uuid5(uuid.NAMESPACE_DNS, row['artist-name'] + "," + row['track-name'])
        row['artist-id'] = uuid.uuid5(uuid.NAMESPACE_DNS, str(row['artist-name']))
    #We save the file
    user_id_logs.to_csv('lastfm-dataset-1K/user_id_logs_v2.tsv')

**We create a train and test set**

In the test set, we would have only the last month of listening for each users.

In [6]:
try:
    test_user_id_logs = user_id_logs[user_id_logs['timestamp'] > datetime.datetime(2009, 3, 4).replace(tzinfo=datetime.timezone.utc)]
    train_user_id_logs = user_id_logs[user_id_logs['timestamp'] < datetime.datetime(2009, 3, 4).replace(tzinfo=datetime.timezone.utc)]
except:
    test_user_id_logs = user_id_logs[user_id_logs['timestamp'] > datetime.datetime(2009, 3, 4)]
    train_user_id_logs = user_id_logs[user_id_logs['timestamp'] < datetime.datetime(2009, 3, 4)]

In [7]:
print(f'train shape : ({train_user_id_logs.shape} and test shape : ({test_user_id_logs.shape})')

train shape : ((8519568, 6) and test shape : ((613400, 6))


In [8]:
try:
    val_user_id_logs = train_user_id_logs[train_user_id_logs['timestamp'] > datetime.datetime(2009, 1, 4).replace(tzinfo=datetime.timezone.utc)]
    train_user_id_logs = train_user_id_logs[train_user_id_logs['timestamp'] < datetime.datetime(2009, 1, 4).replace(tzinfo=datetime.timezone.utc)]
except:
    val_user_id_logs = train_user_id_logs[train_user_id_logs['timestamp'] > datetime.datetime(2009, 1, 4)]
    train_user_id_logs = train_user_id_logs[train_user_id_logs['timestamp'] < datetime.datetime(2009, 1, 4)]

In [9]:
print(f'train shape : ({train_user_id_logs.shape} and val shape : ({val_user_id_logs.shape})')

train shape : ((7975441, 6) and val shape : ((544126, 6))


**Let's only take the n most listened songs of each users**

In [10]:
def get_only_top(df_logs,df_profile, n_top):
    new_df = pd.DataFrame(columns = ['track-name','artist-name'], dtype= np.str)
    for user_id in df_profile.values:
        test = df_logs[df_logs['userid']== user_id]
        try:
            test['count'] = test.groupby(['track-id'])[['track-id']].transform(lambda x: x.count())['track-id']
            test = test.sort_values(by = 'count', ascending = False)
            test = test.drop('timestamp', axis = 1)
            test = test.drop_duplicates()
            new_df = pd.concat([new_df, test[:n_top]], ignore_index=True)
        except:
            pass
        clear_output(wait = True)
        print("Just finished for",user_id)
    return new_df

In [11]:
if not exists('./saved_data/train_user_top_logs.tsv'):
    train_user_top_logs = get_only_top(train_user_id_logs,user_id_profile['#id'], n_top = 50)
    train_user_top_logs = train_user_top_logs.nlargest(10000,'count')
    train_user_top_logs.to_csv('./saved_data/train_user_top_logs.tsv')

else : 
    train_user_top_logs = pd.read_csv('./saved_data/train_user_top_logs.tsv', index_col=0)

train_user_top_logs.head()

Unnamed: 0,track-name,artist-name,userid,artist-id,track-id,count
43661,Jolene,Cake,user_000949,fa7b9055-3703-473a-8a09-adf2fe031a24,60f0bfa4-8da9-4840-b5fe-23c1fc470f34,1456.0
43662,Staring At The Sun,Tv On The Radio,user_000949,eb872766-98f6-453d-883f-2ae908a18315,64581a21-566d-4b24-99ae-c5f48b75e660,1409.0
6809,Wings Of Words,Chemistry,user_000141,c8524763-e7d7-4225-8a9a-e7b64db5a1e2,2caf09d4-aad9-5f2d-b66d-616f2c2d0e35,1365.0
43663,Heartbeats,The Knife,user_000949,bf710b71-48e5-4e15-9bd6-96debb2e4e98,db4c9220-df76-4b42-b6f5-8bf52cc80f77,1362.0
43664,Anthems For A Seventeen Year Old Girl,Broken Social Scene,user_000949,2eada8f8-056a-4093-bbc2-004909ce743b,91951530-d978-4648-95b1-08b1f49ffba5,1352.0


**Transform the dataset into a bipartite graph**

In [12]:
if not exists('./saved_data/track_df.tsv'):
    track_df = train_user_top_logs.copy()
    track_df.drop(['count', 'userid'], axis = 1, inplace=True)
    track_df.drop_duplicates('track-id', inplace=True)
    track_df.reset_index(drop=True, inplace=True)
    track_df.to_csv('./saved_data/track_df.tsv')
else : 
    track_df = pd.read_csv('./saved_data/track_df.tsv', index_col=0)

track_df.head()

Unnamed: 0,track-name,artist-name,artist-id,track-id
0,Music,Cornelius,df765d93-621c-437f-99fe-fda9e135f89a,52bef5e2-17b6-5742-b846-09a6b750e857
1,Gum,Cornelius,df765d93-621c-437f-99fe-fda9e135f89a,bb9a7981-016d-596e-b17f-ba07a346d2d4
2,Mario Basanov & Vidis ‘Test’,Gilles Peterson,4c4e3121-4d12-4f7a-a77c-5becd849fb3c,7434fb0f-1245-5a58-b343-cca4d0e2c107
3,Child Song,The Cinematic Orchestra,7c158ea8-c0aa-410e-bdc1-20bba9759577,4562ff4f-b619-5557-8600-87f6d0d9f348
4,To Build A Home,The Cinematic Orchestra,7c158ea8-c0aa-410e-bdc1-20bba9759577,dcf825de-85a9-5b53-9d2b-a9d574b57470


In [13]:
if not exists('./saved_data/artists_df.tsv'):
    artist_df = train_user_top_logs.copy()
    artist_df.drop(['count', 'userid', 'track-name', 'track-id'], axis = 1, inplace=True)
    artist_df.drop_duplicates('artist-id', inplace=True)
    artist_df.reset_index(drop=True, inplace=True)
    artist_df.to_csv('./saved_data/artists_df.tsv')
else : 
    artist_df = pd.read_csv('./saved_data/artists_df.tsv', index_col=0)

artist_df.head()

Unnamed: 0,artist-name,artist-id
0,Cornelius,df765d93-621c-437f-99fe-fda9e135f89a
1,Gilles Peterson,4c4e3121-4d12-4f7a-a77c-5becd849fb3c
2,The Cinematic Orchestra,7c158ea8-c0aa-410e-bdc1-20bba9759577
3,Radiohead,a74b1b7f-71a5-4011-9441-d0b5e4122711
4,A Hundred Birds,f5341587-6c20-4e0f-bdfd-62b2122825f2


In [14]:
from networkx.algorithms import bipartite
#We transform the train set into a bipartite graph
G= nx.Graph()
edges_1 = np.array(train_user_top_logs[['userid', 'track-id','count']].values)
edges_2 = np.array(train_user_top_logs[['artist-id', 'track-id']].values)
G.add_nodes_from(user_id_profile['#id'], bipartite=0)
G.add_nodes_from(track_df['track-id'], bipartite=1)
G.add_nodes_from(artist_df['artist-id'], bipartite=2)
     
G.add_weighted_edges_from(edges_1)
#weighted graph
G.add_edges_from(edges_2)
#save the edges
nx.write_gml(G, "saved_data/graph_user_tracks_train.gml")

**Now we transform the validation data set in a bipartite graph**

In [15]:
#We transform the val set into a bipartite graph

#First we select only the music tracks that are in the train set
val_user_id_logs = val_user_id_logs[val_user_id_logs['track-id'].isin(track_df['track-id'])]
val_user_id_logs_top = get_only_top(val_user_id_logs,user_id_profile['#id'], n_top = -1)

Just finished for user_001000


In [17]:
#Then we create the graph

G_val = nx.Graph()
edges_1 = np.array(val_user_id_logs_top[['userid', 'track-id','count']].values)
edges_2 = np.array(train_user_top_logs[['artist-id', 'track-id']].values)

G_val.add_nodes_from(user_id_profile['#id'], bipartite=0)
G_val.add_nodes_from(track_df['track-id'], bipartite=1)
G_val.add_nodes_from(artist_df['artist-id'], bipartite=2)
     
G_val.add_weighted_edges_from(edges_1)

#weighted graph
G_val.add_edges_from(edges_2)
#save the graph
nx.write_gml(G_val, "saved_data/graph_user_tracks_val.gml")

In [18]:
#We transform the test set into a bipartite graph

#First we select only the music tracks that are in the train set
test_user_id_logs = test_user_id_logs[test_user_id_logs['track-id'].isin(track_df['track-id'])]
test_user_id_logs_top = get_only_top(test_user_id_logs,user_id_profile['#id'], n_top = -1)

Just finished for user_001000


In [19]:
#Then we create the graph

G_test_gt = nx.Graph()
edges_1 = np.array(test_user_id_logs_top[['userid', 'track-id','count']].values)
edges_2 = np.array(train_user_top_logs[['artist-id', 'track-id']].values)

G_test_gt.add_nodes_from(user_id_profile['#id'], bipartite=0)
G_test_gt.add_nodes_from(track_df['track-id'], bipartite=1)
G_test_gt.add_nodes_from(artist_df['artist-id'], bipartite=2)
     
G_test_gt.add_weighted_edges_from(edges_1)

#weighted graph
G_test_gt.add_edges_from(edges_2)
#save the graph
nx.write_gml(G_test_gt, "saved_data/graph_user_tracks_test_groundtruth.gml")

## Create the methods
Pearson coefficient, Bellman ford algorithm ...

**Get the graph**

In [20]:
train_G  = nx.read_gml("saved_data/graph_user_tracks_train.gml")
val_G  = nx.read_gml("saved_data/graph_user_tracks_val.gml")

In [21]:
#Concatenate the train and val sets summing the count for the same track and the same user
new_val_user_id_logs_top = pd.concat([train_user_top_logs, val_user_id_logs_top], ignore_index=True)
#sum count column for same track and same user
new_val_user_id_logs_top.groupby(['userid', 'track-id']).sum('count')
new_val_user_id_logs_top.head()

Unnamed: 0,track-name,artist-name,userid,artist-id,track-id,count
0,Jolene,Cake,user_000949,fa7b9055-3703-473a-8a09-adf2fe031a24,60f0bfa4-8da9-4840-b5fe-23c1fc470f34,1456.0
1,Staring At The Sun,Tv On The Radio,user_000949,eb872766-98f6-453d-883f-2ae908a18315,64581a21-566d-4b24-99ae-c5f48b75e660,1409.0
2,Wings Of Words,Chemistry,user_000141,c8524763-e7d7-4225-8a9a-e7b64db5a1e2,2caf09d4-aad9-5f2d-b66d-616f2c2d0e35,1365.0
3,Heartbeats,The Knife,user_000949,bf710b71-48e5-4e15-9bd6-96debb2e4e98,db4c9220-df76-4b42-b6f5-8bf52cc80f77,1362.0
4,Anthems For A Seventeen Year Old Girl,Broken Social Scene,user_000949,2eada8f8-056a-4093-bbc2-004909ce743b,91951530-d978-4648-95b1-08b1f49ffba5,1352.0


In [22]:
#Then we create the graph

G_test = nx.Graph()
edges_1 = np.array(new_val_user_id_logs_top[['userid', 'track-id','count']].values)
edges_2 = np.array(train_user_top_logs[['artist-id', 'track-id']].values)

G_test.add_nodes_from(user_id_profile['#id'], bipartite=0)
G_test.add_nodes_from(track_df['track-id'], bipartite=1)
G_test.add_nodes_from(artist_df['artist-id'], bipartite=2)
     
G_test.add_weighted_edges_from(edges_1)

#weighted graph
G_test.add_edges_from(edges_2)
#save the graph
nx.write_gml(G_test, "saved_data/graph_user_tracks_test.gml")

### We try the methods from TP2

In [23]:
#The graph is too big to be used for training and validation
# We can remove the low degree nodes
def remove_low_degree_nodes(train_graph, val_graph):
    t_graph, v_graph = train_graph.copy(), val_graph.copy()
    to_be_removed = [x for  x, d in t_graph.nodes(data=True) if (t_graph.degree(x) <= 3 and d['bipartite'] != 2)]
    print(len(to_be_removed))
    t_graph.remove_nodes_from(to_be_removed)
    v_graph.remove_nodes_from(to_be_removed)
    return t_graph, v_graph

In [24]:
# We can remove artist with no more edges
def remove_low_degree_nodes2(test_graph, train_graph, val_graph, G_test_gt):
    tst_graph, t_graph, v_graph, G_test_gt = test_graph.copy(), train_graph.copy(), val_graph.copy(), G_test_gt.copy()
    to_be_removed = []
    for  x, d in t_graph.nodes(data=True):
        try:
            if d['bipartite'] != 2:
                if t_graph.degree(x) <= 3:
                    to_be_removed.append(x)
        except:
            to_be_removed.append(x)
    t_graph.remove_nodes_from(to_be_removed)
    v_graph.remove_nodes_from(to_be_removed)
    tst_graph.remove_nodes_from(to_be_removed)
    G_test_gt.remove_nodes_from(to_be_removed)
    return tst_graph, t_graph, v_graph, G_test_gt

In [25]:
# We can remove artist with no more edges
def remove_low_artists(test_graph, train_graph, val_graph,G_test_gt):
    tst_graph, t_graph, v_graph, G_test_gt = test_graph.copy(), train_graph.copy(), val_graph.copy(), G_test_gt.copy()
    to_be_removed = [x for  x, d in t_graph.nodes(data=True) if (t_graph.degree(x) ==0 and d['bipartite'] == 2)]
    t_graph.remove_nodes_from(to_be_removed)
    v_graph.remove_nodes_from(to_be_removed)
    tst_graph.remove_nodes_from(to_be_removed)
    G_test_gt.remove_nodes_from(to_be_removed)
    return tst_graph, t_graph, v_graph,G_test_gt

In [26]:
G_test, train_G,val_G, G_test_gt = remove_low_degree_nodes2(G_test,train_G,val_G, G_test_gt)
len(G_test.nodes),len(train_G.nodes),len(val_G.nodes), len(G_test_gt.nodes)

(9359, 9359, 9359, 9359)

In [27]:
G_test, train_G,val_G,G_test_gt = remove_low_artists(G_test,train_G,val_G, G_test_gt)
len(G_test.nodes),len(train_G.nodes),len(val_G.nodes), len(G_test_gt.nodes)

(686, 686, 686, 686)

#### Unsupervised link prediction

ALL THE METHODS ARE TO BE PUT IN A UTIL.PY FILE

In [28]:
def preferential_attachement(graph, edges=train_G.edges()):
    PA = {}
    
    for edge in edges:
        PA[edge] = graph.degree(edge[0]) * graph.degree(edge[1])
        
    return PA
    
pa = preferential_attachement(train_G)

In [29]:
def Jaccard(graph, edges=train_G.edges()):
    Jaccard = {}
    # Compute Jaccard metric for each non_edge of the graph
    
    for edge in edges: 
        inter_size = len(list(nx.common_neighbors(graph, edge[0], edge[1])))
        union_size = len(set(graph[edge[0]]) | set(graph[edge[1]]))

        if union_size != 0:
            Jaccard[edge] = inter_size / union_size
        else : 
            Jaccard[edge] = 0

    
    return Jaccard

In [30]:
def AdamicAdar(graph, edges=train_G.edges()):
    AdamicAdar = {}
    
    for edge in edges: 
        inter_list = nx.common_neighbors(graph, edge[0], edge[1])
        AdamicAdar[edge] = sum( [1/np.log(graph.degree(node)) for node in inter_list] )
    
    return AdamicAdar

In [31]:
def predict_edges(metric, k):
    
    # Shuffle randomly entries of dictionnary 
    l = list(metric.items())
    np.random.seed(10) # fix random seed to obtain same random shuffling when repeating experiment
    np.random.shuffle(l)
    metric = dict(l)

    # Retrieve top k value 
    metric = dict(sorted(metric.items(), key=lambda x:x[1], reverse=True)[:k])
    print(metric.items())

predict_edges(pa, 10)

dict_items([(('user_000787', '0e938086-1f83-5242-944b-7315de233b57'), 198), (('user_000610', '487023bc-e43f-5739-b62d-d634b0e12346'), 154), (('user_000610', '587e8902-dd97-5f4b-b86a-ecf97cdc24a7'), 154), (('user_000714', '4350fd70-bfce-531c-b9f9-1bd7261697ff'), 133), (('user_000714', 'a863f1e4-9c63-5920-af72-f5ab39179dd8'), 133), (('user_000610', '2b81406d-5d45-5710-b177-598f5b3942ab'), 132), (('user_000610', 'e8f0781d-5c0f-5d78-bc63-9f05ba93f6fd'), 132), (('user_000610', '72368dce-d68a-5e76-95e7-f783751419f2'), 132), (('user_000493', '90744886-c2db-590b-bfb6-50d409cddabf'), 128), (('user_000787', '96b089dc-8abd-58ab-b188-283814072b22'), 126)])


In [32]:
import tqdm
#1 hour to compute NOT TRIED -- (but the result in the state of the art is bad)
def evaluation(G_train, G_val):
    R = G_val.copy()
    R.remove_edges_from(e for e in G_val.edges if e in G_train.edges)
    gt = R.edges
    k = len(gt)
    print(k)
    print("Starting predictions")
    # --- Apply each method defined above and calculate its accuracy ---
    methods = ['Jaccard', 'AdamicAdar', 'preferential_attachement']
    
    # For each method, compute the similarity scores between all non-edges
    # Predict k node pairs with highest score 
    # Compute accuracy wrt edges actually removed 
    for method in methods: 
        res = eval(method)(G_train, nx.non_edges(G_train))
        print("predicting 4 real ")
        pred = sorted(res.items(), key = lambda x:x[1], reverse=True)[:k]
        print("I finished predicting")
        pred = [el[0] for el in pred]
        #print('pred', pred)
        #print('gt',gt)
        accuracy = len(set(pred).intersection(set(gt))) / k
        print(method, accuracy)

In [33]:
evaluation(train_G, val_G)

1275
Starting predictions
predicting 4 real 
I finished predicting
Jaccard 0.0
predicting 4 real 
I finished predicting
AdamicAdar 0.0
predicting 4 real 
I finished predicting
preferential_attachement 0.007058823529411765


#### Supervized link prediction

In [34]:
from sklearn.metrics import accuracy_score,recall_score,precision_score
import collections
from tqdm import tqdm

In [35]:
user_id_profile = pd.get_dummies(user_id_profile, columns = ['gender','country']).drop(['age','registered'], axis = 1 )
user_id_profile.set_index('#id', inplace= True)

In [36]:
def feature_extractor(graph, samples):
    """
    Creates a feature vector for each edge of the graph contained in samples 
    """
    feature_vector = []
    
    # --- Extract manually diverse features relative to each edge contained in samples --- 
    # Fill in the blanks

    # Degree Centrality measure
    deg_centrality = nx.degree_centrality(graph)
    
    # Betweeness centrality measure
    betweeness_centrality = nx.betweenness_centrality(graph)

    # PageRank measure
    page_rank = nx.pagerank(graph)

    # Closeness centrality measure
    closeness_centrality = nx.closeness_centrality(graph)

    # Resource Allocation measure
    #resource_allocation = nx.resource_allocation_index(graph)

    # --- Extract features relative to each edge contained in samples ---

    for edge in tqdm(samples):
        source_node, target_node = edge[0], edge[1]

        # Degree Centrality
        source_degree_centrality = deg_centrality[source_node]
        target_degree_centrality = deg_centrality[target_node]
        
        # Betweeness centrality measure 
        diff_bt = betweeness_centrality[target_node] - betweeness_centrality[source_node]

        # Closeness centrality measure
        diff_cl = closeness_centrality[target_node] - closeness_centrality[source_node]

        # Preferential Attachement 
        pref_attach = list(nx.preferential_attachment(graph, [(source_node, target_node)]))[0][2]

        # Get info on the user
        source_user_info = user_id_profile.loc[source_node].values

        # Distance between the two nodes
        try:
            distance = nx.shortest_path_length(graph, source_node, target_node)
        except:
            distance = 0

        # AdamicAdar
        #aai = list(nx.adamic_adar_index(graph, [(source_node, target_node)]))[0][2]

        # Jaccard
        #jacard_coeff = list(nx.jaccard_coefficient(graph, [(source_node, target_node)]))[0][2]

        # PageRank
        source_page_rank = page_rank[source_node]
        
        # Create edge feature vector with all metric computed above
        feature_vector.append(np.array([source_degree_centrality, target_degree_centrality, 
                                        diff_bt, pref_attach, source_page_rank, diff_cl, distance] + source_user_info.tolist())) 
        
    return feature_vector

In [37]:
def get_sets(graph):
    pos_sample=list(graph.edges)
    neg_sample=list(nx.non_edges(graph))
    labels = [1 for _ in pos_sample] + [0 for _ in neg_sample]
    features=pos_sample+neg_sample
    return features,labels

##### Lets try with only detecting for non edges

In [38]:
def get_neg_edges_bipartite(G):
    top_nodes = set(n for n,d in G.nodes(data=True) if d['bipartite']==0)
    low_nodes = set(n for n,d in G.nodes(data=True) if d['bipartite']==1)
    adj_matrix = bipartite.biadjacency_matrix(G, row_order=top_nodes, column_order=low_nodes)

    negative_edges = []
    top = list(top_nodes)
    low = list(low_nodes)
    for i in tqdm(range(len(top_nodes))):
        for j in range(len(low_nodes)):
            if adj_matrix[i,j]==0:
                negative_edges.append([top[i],low[j]])
    return negative_edges

In [39]:
from networkx.algorithms import bipartite
train_neg_edges = get_neg_edges_bipartite(train_G)

100%|██████████| 352/352 [00:01<00:00, 185.49it/s]


In [40]:
def get_edges_values(G, edges):
    all_positive_edges = G.edges()
    output = []
    for edge in edges:
        if(edge in all_positive_edges): 
            #output.append(G.edges[edge[0],edge[1]]['weight'])
            output.append(1)  
        else:
            output.append(0)

    return output

In [41]:
def labelling(G, edges):
    all_positive_edges = G.edges()
    all_neg_edges=nx.non_edges(G)
    all_edges = all_positive_edges + all_neg_edges
    output=[1 for i in all_positive_edges ] + [0 for i in all_neg_edges ]
   
    return output,all_positive_edges,all_neg_edges

In [42]:
new_links = get_edges_values(val_G, train_neg_edges)
new_links.count(1)

1275

In [43]:
# --- Create feature vector for all edges in training set and test set ---
train_features = feature_extractor(train_G, train_neg_edges)

100%|██████████| 77352/77352 [00:11<00:00, 6576.66it/s]


In [44]:
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier

In [45]:
def train(train_features, train_labels):
    """
    Downstream ML task using edge embeddings to classify them 
    """
    
    # --- Build the model and train it ---
    # Fill in the blanks
    clf = GradientBoostingClassifier(random_state= 1999, n_estimators=100, learning_rate=0.1, max_depth=5)

    clf.fit(train_features, train_labels)

    #compute accuracy
    train_preds=clf.predict(train_features)

    print(collections.Counter(np.array(train_preds)))
    acc=accuracy_score(train_labels,train_preds)
    recall=recall_score(train_labels,train_preds)
    precision=precision_score(train_labels,train_preds)

    print('Accuracy:',acc, 'Recall:',recall, 'Precision:',precision)
    
    return clf

In [46]:
RF_model = train(train_features, new_links)

Counter({0: 77179, 1: 173})
Accuracy: 0.9852104664391353 Recall: 0.1192156862745098 Precision: 0.8786127167630058


### We test the model on the test set

In [47]:
def top_N_experiment(features,edges, N, clf):
    """
    Compute the top N edges in the graph G using the classifier clf
    """
    # --- Predict the labels of the edges ---
    preds = clf.predict_proba(features)

    # --- Get the top N edges ---
    top_N_edges = []
    for i in range(N):
        top_N_edges.append(edges[np.argmax(preds[:,1])])
        preds[np.argmax(preds[:,1])] = -1

    return top_N_edges

In [54]:
test_neg_edges = np.array(get_neg_edges_bipartite(G_test))
precisions = []
recalls = []
for user in tqdm(set(test_neg_edges[:,0])):
    user_edges = test_neg_edges[test_neg_edges[:,0]==user]
    user_labels = get_edges_values(G_test_gt, user_edges)
    if user_labels.count(1) != 0:
        user_features = feature_extractor(G_test, user_edges)
        top_N_edges = top_N_experiment(user_features, user_edges, 50, RF_model)
        top_N_edges = [list(edge) for edge in top_N_edges]
        user_edges = [list(edge) for edge in user_edges]
        '''print(top_N_edges)
        print(user_edges)'''
        #prediction = user_edges[user_edges in top_N_edges].astype(int)
        prediction = []
        for i in range(len(user_edges)):
            if user_edges[i] in top_N_edges:
                prediction.append(1)
            else:
                prediction.append(0)

        print(prediction.count(1), prediction.count(0))


        print(user_labels.count(1))
        precision = precision_score(user_labels, prediction)
        recall = recall_score(user_labels, prediction)
        print("Precision : ",precision," Recall : ", recall)
        precisions.append(precision)
        recalls.append(recall)
    clear_output(wait = True)
    

100%|██████████| 352/352 [04:09<00:00,  1.41it/s]


In [55]:
#3.9 % top 10
#2.7 % top 20
np.mean(precisions)

0.023925233644859812

In [None]:
def predict(model, test_features, test_labels):
    """
    Downstream ML task using edge embeddings to classify them 
    """

    #compute accuracy
    test_preds=model.predict(test_features)

    print(collections.Counter(np.array(test_preds)))
    acc=accuracy_score(test_labels,test_preds)
    recall=recall_score(test_labels,test_preds)
    precision=precision_score(test_labels,test_preds)

    print('Accuracy:',acc, 'Recall:',recall, 'Precision:',precision)
    return test_preds

In [None]:
test_neg_edges = get_neg_edges_bipartite(G_test)
# --- Create feature vector for all edges in training set and test set ---
test_features = feature_extractor(G_test, test_neg_edges)

100%|██████████| 352/352 [00:01<00:00, 180.10it/s]
100%|██████████| 76065/76065 [00:11<00:00, 6490.84it/s]


In [None]:
test_labels = get_edges_values(G_test_gt, test_neg_edges)

In [None]:
test_labels.count(1)

632

In [None]:
preds = predict(RF_model,test_features,test_labels)

Counter({0: 74901, 1: 1164})
Accuracy: 0.976835601130612 Recall: 0.02689873417721519 Precision: 0.014604810996563574


In [None]:
RF_model.predict_proba(test_features)

array([[0.9919923 , 0.0080077 ],
       [0.97712059, 0.02287941],
       [0.9938698 , 0.0061302 ],
       ...,
       [0.85943043, 0.14056957],
       [0.87213615, 0.12786385],
       [0.85943043, 0.14056957]])

In [None]:
train(test_features,test_labels)

Counter({0: 75589, 1: 476})
Accuracy: 0.9978965358574903 Recall: 0.75 Precision: 0.9957983193277311


BaggingClassifier(random_state=1999)

In [None]:
predict(RF_model,test_features,test_labels)

Counter({0: 75693, 1: 372})
Accuracy: 0.9868533491093144 Recall: 0.0031645569620253164 Precision: 0.005376344086021506
