<a href="https://colab.research.google.com/github/fernandoariel/LastFM_rec_system/blob/main/song_rec_sis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
pip install surprise 

In [5]:
from surprise import Dataset, Reader
from surprise.accuracy import rmse
from surprise.model_selection import cross_validate, train_test_split
from surprise import SVD, accuracy
from surprise import SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
artist_data=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/artists.dat', sep='\t')
artist_data= artist_data.rename(columns={'id': 'artistID'})
artist_data.head()

Unnamed: 0,artistID,name,url,pictureURL
0,1,MALICE MIZER,http://www.last.fm/music/MALICE+MIZER,http://userserve-ak.last.fm/serve/252/10808.jpg
1,2,Diary of Dreams,http://www.last.fm/music/Diary+of+Dreams,http://userserve-ak.last.fm/serve/252/3052066.jpg
2,3,Carpathian Forest,http://www.last.fm/music/Carpathian+Forest,http://userserve-ak.last.fm/serve/252/40222717...
3,4,Moi dix Mois,http://www.last.fm/music/Moi+dix+Mois,http://userserve-ak.last.fm/serve/252/54697835...
4,5,Bella Morte,http://www.last.fm/music/Bella+Morte,http://userserve-ak.last.fm/serve/252/14789013...


In [8]:
user_artist_plays = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/lastfm/user_artists.dat", sep="\t")
user_artist_plays.head()

Unnamed: 0,userID,artistID,weight
0,2,51,13883
1,2,52,11690
2,2,53,11351
3,2,54,10300
4,2,55,8983


In [9]:
tags=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/lastfm/tags.dat', sep='\t', engine='python')
tags.head()

Unnamed: 0,tagID,tagValue
0,1,metal
1,2,alternative metal
2,3,goth rock
3,4,black metal
4,5,death metal


In [10]:
user_taggedartists=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/lastfm/user_taggedartists.dat', sep='\t')
user_taggedartists.head()

Unnamed: 0,userID,artistID,tagID,day,month,year
0,2,52,13,1,4,2009
1,2,52,15,1,4,2009
2,2,52,18,1,4,2009
3,2,52,21,1,4,2009
4,2,52,41,1,4,2009


## 1 - Análisis Exploratorio de datos

In [11]:
top_artist= pd.merge(left=user_artist_plays,right=artist_data, left_on='artistID', right_on='artistID')
top_artist.groupby(by='name').sum().sort_values(by='weight', ascending=False).head(10)

Unnamed: 0_level_0,userID,artistID,weight
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Britney Spears,536585,150858,2393140
Depeche Mode,282739,20304,1301308
Lady Gaga,621257,54379,1291387
Christina Aguilera,397259,118844,1058405
Paramore,416630,198702,963449
Madonna,424473,28743,921198
Rihanna,493104,139392,905423
Shakira,312770,223619,688529
The Beatles,516563,108960,662116
Katy Perry,487287,141900,532545


La artista más escuchada fue Britney Spears con 2393140 reproducciones. Le siguen Depeche Mode y Lady Gaga con 1301308 y 1291387 reproducciones respectivamente.

¿Como es distribución de cantidad de listens per user?

In [12]:
user_artist_plays.groupby('userID').sum().sort_values('weight', ascending=False).head(10)

Unnamed: 0_level_0,artistID,weight
userID,Unnamed: 1_level_1,Unnamed: 2_level_1
757,104544,480039
2000,354767,468409
1418,224538,416349
1642,551588,388251
1094,201719,379125
1942,373698,348527
2071,133895,338400
2031,106389,329980
514,93246,329782
387,156500,322661


El usuario de ID 757 es el que más reproducciones realizo

¿Es posible ver el género más escuchado?

In [13]:
# Se decidió trabajar con el archivo "user_taggedartists.dat"
df_tags2 = tags.copy()
tagID_tagValue = df_tags2.set_index('tagID')['tagValue'].to_dict()

serie_top_ranking_tag = user_taggedartists['tagID'].value_counts(ascending=False)


df_top_ranking_tag = serie_top_ranking_tag.to_frame()
df_top_ranking_tag = df_top_ranking_tag.reset_index()

df_top_ranking_tag.columns = ['tagID','total_listened']
df_top_ranking_tag['tagID'] = df_top_ranking_tag['tagID'].replace(tagID_tagValue)
df_top_ranking_tag.columns = ['type_of_music','total_listened']
df_top_ranking_tag.head(10)

Unnamed: 0,type_of_music,total_listened
0,rock,7503
1,pop,5418
2,alternative,5251
3,electronic,4672
4,indie,4458
5,female vocalists,4228
6,80s,2791
7,dance,2739
8,alternative rock,2631
9,classic rock,2287


## 2 Sistema de Recomendación

### Filtrado colaborativo

En primer lugar preparamos el dataset

In [15]:
# convertimos la columna weight a float
user_artist_plays['weight']=user_artist_plays['weight'].astype(float)
user_artist_plays.dtypes

userID        int64
artistID      int64
weight      float64
dtype: object

In [16]:
#normalización de los datos
user_artist_plays['weight_scaled']=(user_artist_plays['weight']-user_artist_plays['weight'].mean()) /user_artist_plays['weight'].std()
user_plays=  user_artist_plays.drop('weight', 1)
user_plays.head()

Unnamed: 0,userID,artistID,weight_scaled
0,2,51,3.502167
1,2,52,2.917573
2,2,53,2.827205
3,2,54,2.547037
4,2,55,2.195961


In [17]:
user_plays.weight_scaled.describe()

count    9.283400e+04
mean    -1.496697e-17
std      1.000000e+00
min     -1.983951e-01
25%     -1.701384e-01
50%     -1.293528e-01
75%     -3.498605e-02
max      9.382099e+01
Name: weight_scaled, dtype: float64

In [19]:
#eliminamos los usuarios con num_raitings menor a 5
min_ratings=5
grouped_df=user_plays.groupby('artistID', as_index=False).count().iloc[:,lambda df: [0,2]].sort_values('weight_scaled', ascending=False)
grouped_df= grouped_df.loc[grouped_df['weight_scaled'] >= min_ratings]
grouped_df.columns=['artistID', 'num_raitings']
filtered_df=user_plays.merge(grouped_df, how='inner', on = 'artistID') #datos filtrados
filtered_df

Unnamed: 0,userID,artistID,weight_scaled,num_raitings
0,2,51,3.502167,111
1,4,51,-0.137883,111
2,27,51,-0.176003,111
3,28,51,-0.195996,111
4,62,51,-0.057911,111
...,...,...,...,...
71421,1479,14692,-0.108294,6
71422,1737,14692,-0.124821,6
71423,1759,14692,0.072443,6
71424,1890,14692,-0.159475,6


In [20]:
filtered_df.describe()

Unnamed: 0,userID,artistID,weight_scaled,num_raitings
count,71426.0,71426.0,71426.0,71426.0
mean,1032.297861,1527.646627,0.026171,111.22664
std,609.30519,1878.657582,1.130231,130.262313
min,2.0,2.0,-0.198395,5.0
25%,498.0,321.0,-0.166673,19.0
50%,1024.0,808.0,-0.122155,60.0
75%,1559.0,1936.75,-0.017392,150.0
max,2100.0,14692.0,93.820991,611.0


In [21]:
filtered_df=filtered_df.fillna(value=0)
#tabla que combina los datos que utilizaremos
combined_data= pd.merge(left=filtered_df,right=artist_data, left_on='artistID', right_on='artistID')
combined_data=combined_data.drop(['url','pictureURL'],axis= 1)
combined_data.sample(5)


Unnamed: 0,userID,artistID,weight_scaled,num_raitings,name
5075,1200,163,-0.145614,258,Pink Floyd
65721,236,2526,0.22119,10,Sunrise Avenue
13772,1965,292,-0.103762,407,Christina Aguilera
48541,816,1441,-0.196796,14,Eva Simons
57275,1782,2521,-0.172804,85,Robyn


Utilizamos la libreria surprise

In [22]:
reader=Reader(rating_scale=(combined_data.weight_scaled.min(), combined_data.weight_scaled.max()))
df_user=Dataset.load_from_df(combined_data[['userID', 'artistID', 'weight_scaled']], reader) #train data

In [23]:
benchmark = []
# Iteramos sobre los algoritmos de filtrado colaborativo
for algorithm in [ SlopeOne(), NMF(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), CoClustering()]:
    # Hacemos cross validation
    results = cross_validate(algorithm, df_user, measures=['RMSE', 'MAE'], cv=3, verbose=False)
    
    # Obtenemos el resultado y agregamos el nombre del algoritmo 
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse') 

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


Unnamed: 0_level_0,test_rmse,test_mae,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
KNNBaseline,1.111771,0.218508,0.408812,2.828207
SlopeOne,1.123342,0.2299,0.356493,0.778294
KNNWithMeans,1.127642,0.234134,0.320931,2.424576
KNNBasic,1.138541,0.228384,0.292508,2.267067
KNNWithZScore,1.201316,0.222917,0.398788,2.623768
CoClustering,1.27885,0.273461,1.294327,0.160982
NMF,1.927985,0.313631,3.312569,0.160915


El menor RMSE esta dado por KNNBaseline, por lo tanto es el que se usará para recomendar las peliculas.

In [24]:
#Entrenamos el modelo de recomendación: Paso 1
def getSimModle():
  trainset=df_user.build_full_trainset()
  #usamos pearson_baseline para calcular la similitud entre los artistas 
  sim_options={'name':'pearson_baseline', 'user_based':False}
  #usamos KNNBaseline
  algo=KNNBaseline(sim_options=sim_options)
  #entrenamos el modelo
  algo.fit(trainset)
  return algo

In [26]:
#creacion de diccionarios: Paso2
name_to_rid = dict(zip(combined_data['name'],combined_data['artistID']))
rid_to_name = dict(zip(combined_data['artistID'],combined_data['name']))

In [27]:
# Recomendación de artistas relacionados basados en el modelo definido anteriormente: Paso 3
def showSimilarArtist(algo, rid_to_name, target_artist):
    # Obtenemos la raw_id del artista seleccionado
    target_artist_raw_id = name_to_rid.get(target_artist)
    
    #convertimos la raw_id del artista a la ID interna del modelo
    target_artist_inner_id = algo.trainset.to_inner_iid(target_artist_raw_id)
    
    #Obtenemos los artistas recomendados por el modelo, aquí solo 10.
    target_artist_neighbors = algo.get_neighbors(target_artist_inner_id, 10)
    
    #La id interna del modelo es convertida a la id actual del artista
    neighbors_raw_ids = [algo.trainset.to_raw_iid(inner_id) for inner_id in target_artist_neighbors]
    #Obtenemos una lista de artist-id o una lista de recomendaciones
    neighbors_artist = [rid_to_name[raw_id] for raw_id in neighbors_raw_ids]
    print('The 10 nearest neighbors of the artist are:')
    for artist in neighbors_artist:
       print(artist)

In [28]:
#entrenamos el modelo de recomendación
algo=getSimModle()

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [29]:
#mostramos los artistas relacionados
target_artist = 'AC/DC'
showSimilarArtist(algo ,rid_to_name, target_artist)

The 10 nearest neighbors of the artist are:
Scorpions
Aerosmith
Queen
Black Label Society
Motörhead
KISS
The Strokes
Jimi Hendrix
The Killers
R.E.M.


In [30]:
#mostramos los artistas relacionados
target_artist = 'Ramones'
showSimilarArtist(algo ,rid_to_name, target_artist)

The 10 nearest neighbors of the artist are:
Black Sabbath
Matanza
Blur
Judas Priest
Rise Against
Nirvana
Pearl Jam
The Clash
Arcade Fire
Echo & The Bunnymen


In [31]:
#mostramos los artistas relacionados
target_artist = 'Justin Bieber'
showSimilarArtist(algo ,rid_to_name, target_artist)

The 10 nearest neighbors of the artist are:
Black Eyed Peas
Mariah Carey
Selena Gomez & the Scene
Paramore
Ke$ha
McFly
Kelly Clarkson
Hilary Duff
3OH!3
OneRepublic


In [32]:
#mostramos los artistas relacionados
target_artist = 'Joy Division'
showSimilarArtist(algo ,rid_to_name, target_artist)

The 10 nearest neighbors of the artist are:
The Smiths
Arcade Fire
Sonic Youth
Coldplay
New Order
The xx
Martin L. Gore
Metallica
White Lies
Kaiser Chiefs


In [33]:
##mostramos los artistas relacionados
target_artist = 'Muse'
showSimilarArtist(algo ,rid_to_name, target_artist)

The 10 nearest neighbors of the artist are:
Avril Lavigne
Good Charlotte
Ke$ha
Arctic Monkeys
Deftones
Lady Gaga
The Veronicas
The Prodigy
Rise Against
Justin Timberlake


In [34]:
#mostramos los artistas relacionados
target_artist = 'Radiohead'
showSimilarArtist(algo ,rid_to_name, target_artist)

The 10 nearest neighbors of the artist are:
Pixies
Oasis
Red Hot Chili Peppers
Death Cab for Cutie
The Cranberries
The Who
Interpol
Keane
The Mars Volta
David Bowie


## Sistema de recomendación basado en contenido

In [38]:
#datos que utilizaremos en esta parte
useful_artist = artist_data
useful_artist=useful_artist.drop(['url', 'pictureURL'],axis=1)
user_taggedartists=user_taggedartists.drop(['userID','day', 'month', 'year'],axis= 1)

In [39]:
content_vector=pd.merge(left=user_taggedartists,right=tags, left_on='tagID', right_on='tagID')
content_vector.sample(5)

Unnamed: 0,artistID,tagID,tagValue
166595,1519,5309,can't stop listening
160273,17063,3657,independent
40460,1075,79,alternative
138696,997,959,po-kraftwerkski
19067,1014,16,new wave


In [43]:
len(content_vector)

186479

In [40]:
#revisamos los 10 tag que más aparecen
content_vector['tagValue'].value_counts().head(10)

rock                7503
pop                 5418
alternative         5251
electronic          4672
indie               4458
female vocalists    4228
80s                 2791
dance               2739
alternative rock    2631
classic rock        2287
Name: tagValue, dtype: int64

In [41]:
#eliminamos los tags con apariciones menor a 2100 para reducir los generos a 10
sub_df = content_vector[content_vector.groupby('tagValue').tagValue.transform('count')>2100].copy() 

In [42]:
len(sub_df)

41978

In [44]:
df_final= pd.merge(left=sub_df,right=useful_artist, left_on='artistID', right_on='artistID')
#agrupamos los tags para cada artista
df_final=df_final.groupby(['artistID', 'name'], as_index=False)['tagValue'].apply(','.join)

In [45]:
df_final.sample(5)

Unnamed: 0,artistID,name,tagValue
3091,6245,Мумий Тролль,rock
5721,16633,There Will Be Fireworks,indie
1476,2462,Reik,"pop,pop"
909,1343,Queensrÿche,"80s,80s,80s"
1544,2572,Minus the Bear,"electronic,rock,rock,alternative,alternative,a..."


In [46]:
#  Iniciamos tfidf vectorizer
tf=TfidfVectorizer(analyzer='word',ngram_range=(1,1),stop_words='english', min_df=0)
# Fit and transform 
tfidf_matrix = tf.fit_transform(df_final['tagValue'])

In [47]:
tfidf_matrix.shape

(6111, 10)

In [49]:
cos_similarities = cosine_similarity(tfidf_matrix,tfidf_matrix)

In [50]:
df_final_2=df_final.reset_index()
art_name=df_final_2['name']
indices=pd.Series(df_final_2.index, index=df_final_2['name'])

In [51]:
def get_recommendations(artist_name): #armamos un sistema de recomendacion basado en contenido
  idx=indices[artist_name]
  sim_scores=list(enumerate(cos_similarities[idx]))
  sim_scores=sorted(sim_scores, key=lambda x: x[1], reverse= True)
  sim_scores=sim_scores[1:31]
  artist_indices= [i[0] for i in sim_scores]
  return art_name.iloc[artist_indices]

In [52]:
get_recommendations('Nirvana').head(10)

1421                        Creed
294         The Smashing Pumpkins
1200                   Audioslave
1598                     Anberlin
1965               Counting Crows
280     Móveis Coloniais de Acaju
337            30 Seconds to Mars
128           My Chemical Romance
3691                         Live
110             Poets of the Fall
Name: name, dtype: object

In [53]:
get_recommendations('The Strokes').head(10)

2269         Cold War Kids
144          Kings of Leon
2539                   Ash
94           Stereophonics
145            The Killers
4795         Cheap Mondays
5212      Sleeping at Last
5268           Ecos Falsos
1963        Phantom Planet
2624    Ocean Colour Scene
Name: name, dtype: object