## Trabajo Práctico 2 - Recommender Systems

DiploDatos 2018 - Aprendizaje No Supervisado

Mario Ferreyra - Emiliano Kokic

# Music Recommender

https://grouplens.org/datasets/hetrec-2011/ $\Longrightarrow$ Last.FM

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

%matplotlib inline

In [2]:
# Global seaborn styling for any figure drawn later in the notebook:
# 'whitegrid' axes style and the 'talk' plotting-context preset.
sns.set_style('whitegrid')
sns.set_context('talk')

In [3]:
# Load the dataset's readme as a list of raw lines (newline characters kept).
# The file is decoded as ISO-8859-1, the same encoding passed for tags.dat below.
with open('../../Datasets/hetrec2011-lastfm-2k/readme.txt', encoding='ISO-8859-1') as f:
    lines = list(f)

lines

['\n',
 'hetrec2011-lastfm-2k\n',
 '\n',
 '-------\n',
 'Version\n',
 '-------\n',
 '\n',
 'Version 1.0 (May 2011)\n',
 '\n',
 '-----------\n',
 'Description\n',
 '-----------\n',
 '\n',
 '    This dataset contains social networking, tagging, and music artist listening information \n',
 '    from a set of 2K users from Last.fm online music system.\n',
 '    http://www.last.fm \n',
 '\n',
 '    The dataset is released in the framework of the 2nd International Workshop on \n',
 '    Information Heterogeneity and Fusion in Recommender Systems (HetRec 2011) \n',
 '    http://ir.ii.uam.es/hetrec2011 \n',
 '    at the 5th ACM Conference on Recommender Systems (RecSys 2011)\n',
 '    http://recsys.acm.org/2011 \n',
 '\n',
 '---------------\n',
 'Data statistics\n',
 '---------------\n',
 '\n',
 '    1892 users\n',
 '   17632 artists\n',
 '      \n',
 '   12717 bi-directional user friend relations, i.e. 25434 (user_i, user_j) pairs\n',
 '         avg. 13.443 friend relations per user\n',
 '   

In [4]:
# Artist lookup table: keep only the identifier and the display name, and
# rename 'id' to 'artistID' so it matches the join key of the tagging table.
# `index=str` additionally relabels the row index with its string form.
artists_df = (
    pd.read_csv('../../Datasets/hetrec2011-lastfm-2k/artists.dat', sep='\t')
    .loc[:, ['id', 'name']]
    .rename(index=str, columns={'id': 'artistID'})
)

artists_df.head(10)

Unnamed: 0,artistID,name
0,1,MALICE MIZER
1,2,Diary of Dreams
2,3,Carpathian Forest
3,4,Moi dix Mois
4,5,Bella Morte
5,6,Moonspell
6,7,Marilyn Manson
7,8,DIR EN GREY
8,9,Combichrist
9,10,Grendel


In [5]:
# Tag dictionary (tagID -> tagValue). Unlike artists.dat, this file is read
# with an explicit ISO-8859-1 encoding.
tags_df = pd.read_csv('../../Datasets/hetrec2011-lastfm-2k/tags.dat', sep='\t', encoding='ISO-8859-1')
tags_df.head(10)

Unnamed: 0,tagID,tagValue
0,1,metal
1,2,alternative metal
2,3,goth rock
3,4,black metal
4,5,death metal
5,6,industrial metal
6,7,gothic metal
7,8,terror ebm
8,9,electro-industrial
9,10,harsh ebm


In [6]:
# Tag assignments: one row per tagging event. Only the artist, the tag and the
# year are kept; every other column of the file is dropped.
user_tag_art_df = (
    pd.read_csv('../../Datasets/hetrec2011-lastfm-2k/user_taggedartists.dat', sep='\t')
    .loc[:, ['artistID', 'tagID', 'year']]
)

user_tag_art_df.head(10)

Unnamed: 0,artistID,tagID,year
0,52,13,2009
1,52,15,2009
2,52,18,2009
3,52,21,2009
4,52,41,2009
5,63,13,2009
6,63,14,2009
7,63,23,2009
8,63,40,2009
9,73,13,2009


In [7]:
# Attach the artist name to every tagging event (default inner join on
# artistID, so artists without any tag do not appear in the result).
art_tag_df = artists_df.merge(user_tag_art_df, on='artistID')

art_tag_df.head(10)

Unnamed: 0,artistID,name,tagID,year
0,1,MALICE MIZER,552,2008
1,1,MALICE MIZER,1219,2008
2,1,MALICE MIZER,139,2008
3,1,MALICE MIZER,141,2008
4,1,MALICE MIZER,2850,2010
5,1,MALICE MIZER,139,2008
6,1,MALICE MIZER,141,2008
7,1,MALICE MIZER,179,2008
8,1,MALICE MIZER,541,2008
9,1,MALICE MIZER,139,2008


In [8]:
# Resolve tag ids to their text and keep just (artist name, tag text).
# Despite its name, this frame holds artist/tag pairs - one row per tagging
# event, so the same pair can appear several times.
art_song_df = (
    art_tag_df
    .merge(tags_df, on='tagID')
    .loc[:, ['name', 'tagValue']]
)

art_song_df.head(10)

Unnamed: 0,name,tagValue
0,MALICE MIZER,weeabo
1,DIR EN GREY,weeabo
2,宇多田ヒカル,weeabo
3,ASIAN KUNG-FU GENERATION,weeabo
4,MALICE MIZER,jrock
5,DIR EN GREY,jrock
6,DIR EN GREY,jrock
7,DIR EN GREY,jrock
8,UVERworld,jrock
9,UVERworld,jrock


In [9]:
# Spot check: every tag row recorded for one artist.
art_song_df.loc[art_song_df['name'] == 'Daddy Yankee']

Unnamed: 0,name,tagValue
61428,Daddy Yankee,dance
79396,Daddy Yankee,guilty pleasures
82853,Daddy Yankee,pop
117888,Daddy Yankee,good old times
117951,Daddy Yankee,lpa
117991,Daddy Yankee,great moments at et7
121859,Daddy Yankee,world
127273,Daddy Yankee,viaje de egresados
143892,Daddy Yankee,world music
160574,Daddy Yankee,great memories with my siss


In [10]:
# Collapse to one row per artist name; 'tagList' holds the distinct tag texts
# applied to that artist (how often a tag was applied is discarded by unique()).
df = (
    art_song_df
    .groupby('name')['tagValue']
    .unique()
    .reset_index(name='tagList')
)

In [11]:
# Sanity check: number of distinct artist names that carry at least one tag
# (rows) and the two columns built above, followed by a preview.
print("Shape DF = {}".format(df.shape))
df.head(10)

Shape DF = (12133, 2)


Unnamed: 0,name,tagList
0,!!!,"[seen live, electronic, alternative, electro, ..."
1,#####,"[seen live, alternative, hardcore, metalcore, ..."
2,$lick,"[sexy, hip-hop, rap, hip hop, gangsta rap, gan..."
3,(hed) Planet Earth,"[metal, rock, favorite, american, nu metal, 00..."
4,*NSYNC,"[seen live, male vocalists, 90s, dance, beauti..."
5,+44,"[rock, great lyrics, alternative, punk, love, ..."
6,+\-,"[experimental, mathcore, chaotic hardcore]"
7,...And The Earth Swarmed With Them,"[post-rock, shoegaze]"
8,...And You Will Know Us by the Trail of Dead,"[rock, alternative rock, alternative, emo, usa..."
9,.38 Special,"[rock, hard rock, 80s, classic rock, 70s, grea..."


In [12]:
# Disabled experiment: keep only artists whose name is purely alphabetic.
# NOTE(review): str.isalpha() is False for any name containing a space, digit
# or punctuation mark, so enabling this would also drop multi-word artist
# names such as 'Daddy Yankee'. Left commented out on purpose.
#df = df[df.name.str.isalpha()]
#print("Shape DF = {}".format(df.shape))
#df.head(10)

In [13]:
# Lower-case every tag and store each artist's tags as ONE string (the repr of
# the tag list, e.g. "['rock', 'hard rock']"), which is the text document that
# TfidfVectorizer tokenises later; its default word tokeniser treats the
# brackets, quotes and commas as separators.
#
# The isinstance guard makes the cell safe to re-run: once 'tagList' already
# holds strings, mapping str.lower over a value would iterate it character by
# character and corrupt the column, so already-converted rows are passed through.
#
# No fillna() is needed: groupby(...).unique() yields an array for every
# artist, and a missing value would have raised inside apply() anyway.
df['tagList'] = df['tagList'].apply(
    lambda tags: tags if isinstance(tags, str) else str([tag.lower() for tag in tags])
).astype('str')

In [14]:
# Preview after normalisation: 'tagList' is now a single lower-cased string per
# artist (the textual form of the tag list) rather than an array of tags.
df.head(20)

Unnamed: 0,name,tagList
0,!!!,"['seen live', 'electronic', 'alternative', 'el..."
1,#####,"['seen live', 'alternative', 'hardcore', 'meta..."
2,$lick,"['sexy', 'hip-hop', 'rap', 'hip hop', 'gangsta..."
3,(hed) Planet Earth,"['metal', 'rock', 'favorite', 'american', 'nu ..."
4,*NSYNC,"['seen live', 'male vocalists', '90s', 'dance'..."
5,+44,"['rock', 'great lyrics', 'alternative', 'punk'..."
6,+\-,"['experimental', 'mathcore', 'chaotic hardcore']"
7,...And The Earth Swarmed With Them,"['post-rock', 'shoegaze']"
8,...And You Will Know Us by the Trail of Dead,"['rock', 'alternative rock', 'alternative', 'e..."
9,.38 Special,"['rock', 'hard rock', '80s', 'classic rock', '..."


In [15]:
# Spot check: the aggregated row (and its row label) for one artist.
df.loc[df['name'] == 'Daddy Yankee']

Unnamed: 0,name,tagList
2496,Daddy Yankee,"['dance', 'guilty pleasures', 'pop', 'good old..."


In [16]:
# Represent every artist by the TF-IDF weights of the words and two-word
# sequences found in its tag text (English stop words removed).
#
# min_df=1 replaces the former integer min_df=0. Both keep every term, because
# a term in the vocabulary occurs in at least one document by construction,
# but an integer min_df below 1 is rejected by the parameter validation of
# recent scikit-learn releases, which made this cell fail on a fresh install.
tfidf_vect = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=1,
    stop_words='english'
)

# Rows follow the row order of df; columns are the learned uni/bi-gram vocabulary.
tfidf_matrix = tfidf_vect.fit_transform(df['tagList'])

print("Shape TfIdf Matrix = {}".format(tfidf_matrix.shape))

Shape TfIdf Matrix = (12133, 47221)


In [17]:
# Pairwise artist-to-artist similarity. linear_kernel is a plain dot product;
# it equals cosine similarity here because the TfidfVectorizer above does not
# override `norm`, whose documented default ('l2') scales each row to unit length.
# NOTE(review): the result is a dense (n_artists x n_artists) array - about
# 1.2 GB for the 12133 artists in this run, assuming float64 - so it will not
# scale to much larger catalogues.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

print("Shape Cosine_Sim = {}".format(cosine_sim.shape))
display(cosine_sim[:4, :4])

Shape Cosine_Sim = (12133, 12133)


array([[1.        , 0.04290558, 0.        , 0.01644941],
       [0.04290558, 1.        , 0.        , 0.05483301],
       [0.        , 0.        , 1.        , 0.        ],
       [0.01644941, 0.05483301, 0.        , 1.        ]])

In [18]:
# Lookup helpers used by tag_recommendations:
#   artists - Series of artist names, addressed by row position in df
#   indices - Series mapping artist name -> row label of df, which is also the
#             row/column of that artist in cosine_sim (df has a fresh 0..n-1
#             index from reset_index and the TF-IDF rows follow df's order).
artists = df['name']
indices = pd.Series(df.index, index=df['name'])

display(indices.head(10))

name
!!!                                             0
#####                                           1
$lick                                           2
(hed) Planet Earth                              3
*NSYNC                                          4
+44                                             5
+\-                                             6
...And The Earth Swarmed With Them              7
...And You Will Know Us by the Trail of Dead    8
.38 Special                                     9
dtype: int64

In [19]:
def tag_recommendations(title, n=20):
    """
    Get artist recommendations based on the
    cosine similarity score of music tags.

    Relies on three module-level objects built earlier in the notebook:
    ``indices`` (artist name -> row number), ``cosine_sim`` (square
    similarity matrix) and ``artists`` (row number -> artist name).

    Parameters
    ----------
    title : str
        Artist name, spelled exactly as in ``df['name']``.
    n : int, optional
        Maximum number of recommendations to return. Defaults to 20, the
        value that used to be hard-coded.

    Returns
    -------
    pandas.Series
        Up to ``n`` artist names ordered from most to least similar; ties
        keep ascending row order. The queried artist is never part of the
        result.

    Raises
    ------
    KeyError
        If ``title`` is not a known artist name.
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative, got {}".format(n))

    idx = indices[title]
    scores = np.asarray(cosine_sim[idx])

    # A stable sort on the negated scores gives descending similarity while
    # leaving equal scores in ascending row order, i.e. the same ordering the
    # previous sorted(..., reverse=True) produced.
    ranked = np.argsort(-scores, kind='stable')

    # Remove the queried artist by row number instead of assuming it is ranked
    # first: an artist with an identical tag profile also scores 1.0 and, if it
    # sits on an earlier row, would be ranked ahead of the query - the old
    # "skip the first hit" logic then dropped that artist and recommended the
    # queried artist to itself.
    ranked = ranked[ranked != idx][:n]

    return artists.iloc[ranked]

In [20]:
# Example query. tag_recommendations already returns at most 20 rows, so
# .head(20) does not truncate the result any further.
tag_recommendations('Daddy Yankee').head(20)

7139                               Natalia Oreiro
6826                                     Miranda!
8351                                 Ricky Martin
2551                              Daniela Herrero
2006                                     Chayanne
2364                              Cristian Castro
11314                              Wisin & Yandel
8268                                         Reik
3425                              Eros Ramazzotti
8433                                Rogue Traders
2114     Christina Aguilera, Lil' Kim, Mya & Pink
8319                               Ricardo Arjona
8218                                    Ray Smith
3003                                     Don Omar
3781                                 Four Seasons
8128                                          RBD
1036                    Baaba Maal & Mansour Seck
4949                             Janet & Jak Esim
5689                                 Kultur Shock
11962                                    אביב גפן


In [21]:
# Example query for an artist that itself appears in the 'Daddy Yankee' results.
tag_recommendations('Don Omar').head(20)

7139                        Natalia Oreiro
9624                         Tego Calderon
11150                               Voltio
11526                                 Zion
2551                       Daniela Herrero
192                                AbradAb
686                          Animal Nation
773                          Apocalipse 16
776                           Apollo Brown
1218                               Benzino
1368                            Black Milk
1383                          Black Violin
2408                              Curren$y
2475                           DJ Risk One
2601                            Das Racist
3067                        Dream Warriors
3697     Fisz Emade jako Tworzywo Sztuczne
4250                                   Guf
5590                         King Geedorah
6861                               Mod Sun
Name: name, dtype: object

In [22]:
# Example query for another artist taken from the 'Daddy Yankee' results.
tag_recommendations('Wisin & Yandel').head(20)

7538               Orishas
10670      Tito El Bambino
2496          Daddy Yankee
5324       Julieta Venegas
8767         Sean Kingston
2750     Delinquent Habits
9624         Tego Calderon
11150               Voltio
11526                 Zion
2364       Cristian Castro
2006              Chayanne
6826              Miranda!
7906               Pitbull
9304           Stereo MC's
975               Aventura
192                AbradAb
686          Animal Nation
773          Apocalipse 16
776           Apollo Brown
1218               Benzino
Name: name, dtype: object

In [23]:
# Example query for an artist that does not appear in the result lists above.
tag_recommendations('Eminem').head(20)

8492      Royce da 5'9"
7430         Obie Trice
2448                D12
9382           Styles P
7659            Papoose
166                  AZ
43                 2Pac
1901            Cassidy
7125                Nas
4985              Jay-Z
6017     Little Brother
133              A-Team
11386            Xzibit
8225         Re-Up Gang
10881            Twista
6258             M.O.P.
5128         Joe Budden
11450        Young Buck
8256             Redman
9656        Termanology
Name: name, dtype: object