<center><h1>Music recommendation using graphs</h1>
<h2>MLNS PROJECT</h2>
<h3>Coded by Chloé Daems, Amir Mahmoudi and Anne-Claire Laisney</h3>
</center>

This is the main notebook for building a benchmark of graph-based music recommendation systems, inspired by the paper *Katarya, R., Verma, O.P. Efficient music recommender system using context graph and particle swarm. Multimed Tools Appl 77, 2673–2687 (2018)* ([link](https://link.springer.com/article/10.1007/s11042-017-4447-x)). It uses data collected through the [`user.getRecentTracks`](https://www.last.fm/api/show/user.getRecentTracks) method of the Last.fm API.

In [1]:
#Import the libraries
from os.path import exists
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import datetime

from IPython.display import clear_output

## Create the graph

**Get the dataset**

In [2]:
user_id_profile = pd.read_csv('lastfm-dataset-1K/userid-profile.tsv', sep = '\t')

# Prefer the cached copy of the logs (user_id_logs_v2.tsv) when it exists; otherwise
# fall back to the raw dump, which has no header row and may miss track/artist names.
if not exists('lastfm-dataset-1K/user_id_logs_v2.tsv'):
    logs_columns = ['userid', 'timestamp', 'artist-id', 'artist-name', 'track-id', 'track-name']
    user_id_logs = pd.read_csv('lastfm-dataset-1K/userid-logs.tsv', sep = '\t', header = None, names =  logs_columns )
    user_id_logs = user_id_logs.dropna(subset=['track-name','artist-name'])
else:
    user_id_logs = pd.read_csv('lastfm-dataset-1K/user_id_logs_v2.tsv',index_col=0)

# The raw dump stores timestamps as 'YYYY-MM-DDTHH:MM:SSZ', whereas the cached file is
# saved after conversion and therefore holds 'YYYY-MM-DD HH:MM:SS'. A single hard-coded
# format cannot parse both, so read either layout as UTC and then drop the timezone to
# keep the column naive (it is compared against naive datetimes further down).
user_id_logs['timestamp'] = pd.to_datetime(user_id_logs['timestamp'], utc=True).dt.tz_convert(None)


  mask |= (ar1 == a)


In [3]:
# Preview the first five user profiles
user_id_profile.head(n=5)

Unnamed: 0,#id,gender,age,country,registered
0,user_000001,m,,Japan,"Aug 13, 2006"
1,user_000002,f,,Peru,"Feb 24, 2006"
2,user_000003,m,22.0,United States,"Oct 30, 2005"
3,user_000004,f,,,"Apr 26, 2006"
4,user_000005,m,,Bulgaria,"Jun 29, 2006"


In [4]:
# Preview the first five listening events
user_id_logs.head(n=5)

Unnamed: 0,userid,timestamp,artist-id,artist-name,track-id,track-name
0,user_000001,2009-05-04 23:08:57,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,7369ec4f-b377-5683-86bd-f02897317103,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007
1,user_000001,2009-05-04 13:54:10,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,8a0799b1-2f64-5e7b-9436-2228c9d65637,Composition 0919 (Live_2009_4_15)
2,user_000001,2009-05-04 13:52:04,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,44da66dc-6a34-54de-a4d9-686bc38ede0f,Mc2 (Live_2009_4_15)
3,user_000001,2009-05-04 13:42:52,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,e625acbe-1360-528d-8afe-4ad88424e0c0,Hibari (Live_2009_4_15)
4,user_000001,2009-05-04 13:42:11,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,fa332ed7-b701-5669-9e8e-0961658cdb43,Mc1 (Live_2009_4_15)


**Too many track-ids are missing, so we recreate them from the artist and track names using the `uuid` library**

In [5]:
# Slow on the full dataset (the original row-by-row version took about 40 min)
import tqdm
import uuid
if not exists('lastfm-dataset-1K/user_id_logs_v2.tsv'):
    # Rows yielded by DataFrame.iterrows() are copies, so assigning to them never
    # reaches the frame. Build the whole column instead and assign it once.
    name_pairs = zip(user_id_logs['artist-name'], user_id_logs['track-name'])
    # uuid5 is deterministic: the same "artist,track" string always yields the same id.
    # Ids are stored as strings so a fresh run and a run from the cached CSV agree.
    user_id_logs['track-id'] = [
        str(uuid.uuid5(uuid.NAMESPACE_DNS, artist + "," + track))
        for artist, track in tqdm.tqdm(name_pairs, total=len(user_id_logs))
    ]
    #We save the file
    user_id_logs.to_csv('lastfm-dataset-1K/user_id_logs_v2.tsv')

**We create a train and test set**

The test set contains only the last month of listening for each user (listens from 4 April 2009 onwards).

In [6]:
# Cut-off between the training history and the held-out final month
SPLIT_DATE = datetime.datetime(2009, 4, 4)

# `>=` and `<` form an exact partition: with `>` and `<`, a listen timestamped exactly
# at the cut-off would end up in neither set.
test_user_id_logs = user_id_logs[user_id_logs['timestamp'] >= SPLIT_DATE]
train_user_id_logs = user_id_logs[user_id_logs['timestamp'] < SPLIT_DATE]

In [7]:
# Report the (rows, columns) shape of each split; the shape tuples already print with
# their own parentheses, so none are added around them.
print(f'train shape : {train_user_id_logs.shape} and test shape : {test_user_id_logs.shape}')

train shape : ((18391647, 6) and test shape : ((707202, 6))


**Let's keep only the n most listened songs of each user**

In [10]:
def get_only_top_v2(df_logs,df_profile, n_top):
    """Return, for every requested user, their ``n_top`` most listened tracks.

    Parameters
    ----------
    df_logs : pandas.DataFrame
        Listening events. Must contain the columns 'userid', 'timestamp' and
        'track-id'; all other columns are carried through to the result.
    df_profile : pandas.Series or array-like
        User ids to process. Users are emitted in this order; ids without any
        row in ``df_logs`` contribute nothing.
    n_top : int
        Maximum number of distinct rows kept per user.

    Returns
    -------
    pandas.DataFrame
        One row per kept (user, track) combination, without the 'timestamp'
        column and with an extra 'count' column holding the number of plays of
        that track by that user. 'track-name' and 'artist-name' come first
        (when present), followed by the remaining columns in their input order.
    """
    leading_columns = ['track-name', 'artist-name']

    # Locate every user's rows in a single pass instead of re-scanning the whole
    # log frame once per user.
    rows_by_user = df_logs.groupby('userid', sort=False).indices

    top_per_user = []
    for user_id in np.asarray(df_profile):
        positions = rows_by_user.get(user_id)
        if positions is not None:
            user_logs = df_logs.iloc[positions].drop(columns='timestamp')
            play_counts = user_logs.groupby('track-id')['track-id'].transform('count')
            top_per_user.append(
                # assign() returns a new frame, so no write ever targets a slice of df_logs
                user_logs.assign(count=play_counts)
                # a stable sort keeps ties in their original order (deterministic output)
                .sort_values(by='count', ascending=False, kind='stable')
                # without the timestamp, repeated plays of a track collapse into one row
                .drop_duplicates()
                .head(n_top)
            )
        clear_output(wait = True)
        print("Just finished for",user_id)

    if not top_per_user:
        return pd.DataFrame(columns=leading_columns, dtype=str)

    # Concatenate once at the end: growing a frame inside the loop is quadratic.
    result = pd.concat(top_per_user, ignore_index=True)
    ordered_columns = [col for col in leading_columns if col in result.columns]
    ordered_columns += [col for col in result.columns if col not in ordered_columns]
    return result[ordered_columns]

In [11]:
# Keep each user's 50 most played training tracks, then preview the result
user_top_logs = get_only_top_v2(
    df_logs=train_user_id_logs,
    df_profile=user_id_profile['#id'],
    n_top=50,
)
user_top_logs.head(n=5)

Just finished for user_000096


KeyboardInterrupt: 

**Transform the dataset into a multilayer graph**

In [None]:
# Build an undirected graph whose nodes are users, tracks and artists, linked
# user - track and track - artist.
G = nx.Graph()
edges = np.array(user_top_logs[['userid', 'track-id', 'artist-id']].values)

# Map each node to the colour of the first role it appears in. A dict keeps insertion
# order, so its values line up one-to-one with the graph's nodes, and membership tests
# are O(1) instead of a linear scan through a list.
node_colors = {}
lastuser = ""
for user, track, artist in edges:
    node_colors.setdefault(user, 'red')
    node_colors.setdefault(track, 'blue')
    node_colors.setdefault(artist, 'green')
    lastuser = user

G.add_nodes_from(node_colors)
color_list = list(node_colors.values())

# Distinct track and artist ids in order of first appearance
musics = list(dict.fromkeys(edges[:, 1]))
artists = list(dict.fromkeys(edges[:, 2]))

# Columns (0, 1) give the user - track edges, columns (1, 2) the track - artist edges
G.add_edges_from(edges[:,:-1])
G.add_edges_from(edges[:,1:])


In [None]:
import os

# np.save does not create missing folders, so make sure the target directory exists
os.makedirs("./edge_saved", exist_ok=True)
np.save("./edge_saved/edges_list.npy",edges)

In [None]:
# Total number of nodes in the graph
G.number_of_nodes()

47748

In [None]:
# Number of colour entries collected while building the graph; compare with the node
# count displayed above.
len(color_list)

47748

In [None]:
# Disabled drawing code: it is wrapped in a string literal, so nothing is plotted and
# the cell only evaluates to (and echoes) that string.
"""pos = nx.spring_layout(G, k=0.3, iterations = 45)
nx.draw(G,node_color=color_list, with_labels=False, pos = pos, node_size=50)
plt.show()"""

'pos = nx.spring_layout(G, k=0.3, iterations = 45)\nnx.draw(G,node_color=color_list, with_labels=False, pos = pos, node_size=50)\nplt.show()'

In [None]:
from colorama import Fore  # only referenced by the disabled coloured listing below

# Print the track nodes whose degree is above the threshold, from lowest to highest degree.
MIN_DEGREE = 20
music_nodes = set(musics)  # set lookup instead of a linear scan of the list for every node
new_list = sorted(G.degree(), key=lambda pair: pair[1])
for node, degree in new_list:
    if degree > MIN_DEGREE and node in music_nodes:
        print(node)
    # Disabled alternative: print every node with its degree, coloured by node type.
    # if node in artists:
    #     print(Fore.GREEN + "(" + str(node) + "," + str(degree)+")")
    # elif node in musics:
    #     print(Fore.BLUE + "(" + str(node) + "," + str(degree)+")")
    # else:
    #     print(Fore.RED + "(" + str(node) + "," + str(degree)+")")


e2869cc0-d89a-545e-a8c9-c6ec76c529ef
cd44f7af-fac5-5770-aea3-162c3471e0f3
74b403a2-01d9-5a95-a35e-55a5313763fc
0e938086-1f83-5242-944b-7315de233b57
88ff31ff-07d3-5909-b8d4-942377de3c04
779ca2b4-e926-577f-9457-7aa0e82594f7
e8f0781d-5c0f-5d78-bc63-9f05ba93f6fd
fe561758-dfcc-55f4-a550-514924dcccf5
