# Music Recommendation System

This is a project for the Aplica- course.

Year 2017, first period.

Students:
- Diego Vargas
- Andre Pando
- Ronie Arauco

## Making familiar with the dataset

In [1]:
import numpy as np
import pandas as pd
import codecs
# import matplotlib.pyplot as plt
# %matplotlib inline

artists = pd.read_table("./lastfm-data/artists.dat", encoding = 'latin1')
tags = pd.read_table("./lastfm-data/tags.dat", encoding = 'latin1')
user_artists = pd.read_table("./lastfm-data/user_artists.dat", encoding = 'latin1')
user_taggedartists = pd.read_table("./lastfm-data/user_taggedartists.dat",encoding = 'latin1', usecols=['userID', 'artistID', 'tagID'])
user_friends = pd.read_table("./lastfm-data/user_friends.dat",encoding = 'latin1')


# Information taken from
#    Last.fm website, http://www.lastfm.com
#
#    @inproceedings{Cantador:RecSys2011,
#       author = {Cantador, Iv\'{a}n and Brusilovsky, Peter and Kuflik, Tsvi},
#       title = {2nd Workshop on Information Heterogeneity and Fusion in Recommender Systems (HetRec 2011)},
#       booktitle = {Proceedings of the 5th ACM conference on Recommender systems},
#       series = {RecSys 2011},
#       year = {2011},
#       location = {Chicago, IL, USA},
#       publisher = {ACM},
#       address = {New York, NY, USA},
#       keywords = {information heterogeneity, information integration, recommender systems},
#    } 

In [2]:
# Contains information about the artists that has been listened and tagged
# by the users
# id \t name \t url \t pictureURL
artists.sample(3)

Unnamed: 0,id,name,url,pictureURL
1156,1165,Luky,http://www.last.fm/music/Luky,http://userserve-ak.last.fm/serve/252/43709.jpg
9944,10215,Manfred HÃ¼bler & Siegfried Schwab,http://www.last.fm/music/Manfred%2BH%25C3%25BC...,http://userserve-ak.last.fm/serve/252/35825269...
6072,6197,The Young Veins,http://www.last.fm/music/The+Young+Veins,http://userserve-ak.last.fm/serve/252/48394907...


In [3]:
# The tags available in the dataset
# tagID \t tagValue
# tags.shape
tags.sample(3)

Unnamed: 0,tagID,tagValue
1497,1520,meow
11420,12074,butterfly boucher
377,384,modern melodic death metal


In [5]:
# Contains the artists listened by each user, providing also
# the listening count for each [user, artist] pair
# userID \t artistID \t weight
# user_artists.shape
user_artists.sample(3)
#user_artists

Unnamed: 0,userID,artistID,weight
87364,1978,2347,508
68690,1546,439,213
6175,130,285,197


In [6]:
# Tag assignments of artists provided by each particular user
# as well with the time of when was the tag assigned by the user
# userID \t artistID \t tagID \t day \t month \t year
# user_taggedartists.shape
user_taggedartists.sample(3)

Unnamed: 0,userID,artistID,tagID
131471,1517,903,9731
158757,1800,4025,494
177229,1974,6282,1


In [7]:
# Contains the friend relations between users in the database
# userID \t friendID
# user_friends.shape
user_friends.sample(3)

Unnamed: 0,userID,friendID
5830,410,1825
10437,778,636
4469,321,268


Obj| shape 
--- | ---
artists | (17632, 4)
tags | (11946, 2)
user_artists | (92834, 3)
user_taggedartists | (186479, 6)
user_friends | (25434, 2)

In [8]:
# What is the artist with most and least listeners?

# - Most listeners
listeners_agg = user_artists[['artistID','userID']].groupby('artistID', sort=False).agg(['count'])
print("artists with least followers")
print(listeners_agg['userID'].sort_values('count').head(3)) #-- least 9201
print("--------------------")
print("artists with most followers")
print(listeners_agg['userID'].sort_values('count').tail(3)) #-- most 89

# And how many plays do they make?
listens_agg = user_artists[['artistID', 'weight']].groupby(['artistID']).agg(['sum'])
print("--------------------")
print("Amount of plays for the artist with least followers")
print(listens_agg.filter(regex='^9201$',axis=0)) # -- least 139 plays
print("--------------------")
print("Amount of plays for the artist with most followers")
print(listens_agg.filter(regex='^89$',axis=0)) # -- most 1291387 plays
# What are the tags made by those users?



# What is the artist with most and the least listen counts? 
# (the least can't be 0, according with the description of the artist dataset)
print("artist with least plays")
print(listens_agg['weight'].sort_values('sum').head(3)) # -- least 14371
print("--------------------")
print("artist with Most plays")
print(listens_agg['weight'].sort_values('sum').tail(3)) # -- most 2393140

# and how many users makes those listen counts?
print("--------------------")
print("Amount of users for the artist with least plays")
print(listeners_agg.filter(regex='^14371$',axis=0)) # -- artist with less 
print("--------------------")
print("Amount of users for the artist with most plays")
print(listeners_agg.filter(regex='^289$',axis=0)) # -- artist with moee

# What is the most and the least used tag?
# What is the most and the least tagged artists?
# What is the user that tagges the most and tagges the least? 

artists with least followers
          count
artistID       
9201          1
12363         1
12366         1
--------------------
artists with most followers
          count
artistID       
288         484
289         522
89          611
--------------------
Amount of plays for the artist with least followers
         weight
            sum
artistID       
9201        139
--------------------
Amount of plays for the artist with most followers
           weight
              sum
artistID         
89        1291387
artist with least plays
          sum
artistID     
14371       1
11746       1
9493        1
--------------------
artist with Most plays
              sum
artistID         
89        1291387
72        1301308
289       2393140
--------------------
Amount of users for the artist with least plays
         userID
          count
artistID       
14371         1
--------------------
Amount of users for the artist with most plays
         userID
          count
artistID       
289 

### The problem
The database doesn't contain any rating/rate column, rather a _weight_ for each artists by user which works as a _listen_ counter. That said, there's going to be artists that has a high amount of plays, but little users - and viceversa.

So, for this solution, the amount of plays has to be converted to a relative along to the amount of users. 

The following graph shows how the data is being shown.

![Graph](graph.png)

One is using the **Content-Based Filtering**, since the data set we currently have is a set of users and a set of categories (keywords or tags). The similarity between the two will be the keywords extracted from the artists tags. Each user should have a degree of interest in certain tags, which can be retrieved using the most tagged item in the most frequent artists the user hears (See table 1). That said, we can only recommend artists to the already given set of users.


| Tag  | $U_1$ | $U_2$ | $U_3$ | $U_x$ |
|------|-----|-----|-----|-----|
| $Tag_1$ |  3  |  2  |     |     |
| $Tag_2$ |  5  |  3  |  3  |     |
| $Tag_3$ |     |  3  |  5  |  4  |
| $Tag_4$ |  1  |     |  5  |  4  |


#### How to retrieve the Interest (Ideas)
The interest can be retrieved from the following table (which belongs for one user):

<table>
    <thead>
        <tr>
            <th>Plays $P_i$</th>
            <th>Artist $A_i$</th>
            <th>Tag $T_j$</th>
            <th>Weight $W_{ij}$</th>
        </tr>
    </thead>
    <tbody>
        <tr>
             <td rowspan="2">$P_1$</td>
             <td rowspan="2">$A_1$</td>
             <td>$T_1$</td>
             <td>$W_{11}$</td>
        </tr>
        <tr>
             <td>$T_2$</td>
             <td>$W_{12}$</td>
        </tr>
        <tr>
             <td rowspan="2">$P_2$</td>
             <td rowspan="2">$A_2$</td>
             <td>$T_2$</td>
             <td>$W_{22}$</td>
        </tr>
        <tr>
             <td>$T_3$</td>
             <td>$W_{23}$</td>
        </tr>
        <tr>
             <td rowspan="4">$P_3$</td>
             <td rowspan="4">$A_3$</td>
             <td>$T_2$</td>
             <td>$W_{32}$</td>
        </tr>
        <tr>
             <td>$T_3$</td>
             <td>$W_{33}$</td>
        </tr>
        <tr>
             <td>$T_4$</td>
             <td>$W_{34}$</td>
        </tr>
        <tr>
             <td>$T_5$</td>
             <td>$W_{35}$</td>
        </tr>
        
    </tbody>
</table>

Being $P_i$ the amount of times the user has played the artist $A_i$ (found as _weight_); $W_i$ the amount of users that has tagged the artist $A_i$ with the tag $T_j$.

From the table we now an Artist has been listened: 

$$listenShare_i = \frac{P_i}{\sum_{i = 1}^{N}P_i}$$

And for the tag

$$tagShare_j = \sum_{i = 1}^{N}\frac{W_{ij}*listenShare_i}{\sum_{z=1}^{M}W_{iz}}$$

We can then, retrieve the interest from 0 to 5, capping the result of the $tagShare_j$ asigning 5 to the maximum value $max(tagShare_j)$. $N$ is the number of artist, $M$ is the number of tags

As an example, say we have the following data for the user $U_x$

<table>
    <thead>
        <tr>
            <th>Plays $P_i$</th>
            <th>Artist $A_i$</th>
            <th>Tag $T_j$</th>
            <th>Weight $W_{ij}$</th>
        </tr>
    </thead>
    <tbody>
        <tr>
             <td rowspan="2">$150$</td>
             <td rowspan="2">$A_1$</td>
             <td>$T_1$</td>
             <td>$30$</td>
        </tr>
        <tr>
             <td>$T_2$</td>
             <td>$15$</td>
        </tr>
        <tr>
             <td rowspan="2">$45$</td>
             <td rowspan="2">$A_2$</td>
             <td>$T_2$</td>
             <td>$13$</td>
        </tr>
        <tr>
             <td>$T_3$</td>
             <td>$7$</td>
        </tr>
        <tr>
             <td rowspan="4">$15$</td>
             <td rowspan="4">$A_3$</td>
             <td>$T_2$</td>
             <td>$45$</td>
        </tr>
        <tr>
             <td>$T_3$</td>
             <td>$15$</td>
        </tr>
        <tr>
             <td>$T_4$</td>
             <td>$16$</td>
        </tr>
        <tr>
             <td>$T_5$</td>
             <td>$6$</td>
        </tr>
        
    </tbody>
</table>


Using the formula $tagShare$ we can get the interest on the user $U_x$ on the tags:

| Tag | $tagShare$ | Interest |
| -- | -- | -- |
| $T_1$ | $0.476$ | $5.000$ |
| $T_2$ | $0.417$ | $4.374$ |
| $T_3$ | $0.088$ | $0.925$ |
| $T_4$ | $0.014$ | $0.146$ |
| $T_5$ | $0.005$ | $0.055$ |

Tenemos: dado un usuario (Ux), sus intereses representados por Tags (T1.T2...Tn) y su importancia representado 
en un rango de valore del 0 al 5. También, tenemos artistas(A1,A2,...An) que están representados por tags
brindados por los usuarios. Entonces, lo que se debe hacer es:

1. dado un Usuario (U1), calcular el interés intrínseco asociado a sus tags.
2. Realizar por artista un ponderamiento entre los top 3 tags del usuario U1 que estén incluidos en el artista
   A1, de manera que el total de tags asociados al artista A1 sean la unidad, y que el resultados de los tags 1,
   2,3 representen, por ejemplo, el 40% del total.
3. Finalmente, se recomendará el artista cuya ponderación de los 3 tags principales del usuario sea la más alta
   del dataset.
    
psdt: De haberse agotado el dataset de artistas y que la ponderación no haya sido lo suficientemente alta(ese
      40% no haya alcanzado el mínimo requerido para sugerir, digamos 50%) se procederá a sugerir el artista
      cuyos tags esten estrechamente relacionados con los top 3 tags del usuario. 
      ¿Cómo se calculará esto?
      Con la similitud de cosenos. Se hallará, por ejemplo, la similitud entre el tag 1 y el tag 2. De estar
      ambos estrechamente relacionados(similitud mayor al 80%), y que los top 3 tags este el tag 1 pero NO el 2,
      y de cumplirse lo anteriormente señalado, se recomendará un artista que tenga el tag 2 siguiendo los pasos
      2 y 3.

In [9]:
tagsXArtist = {}
#user_taggedartists.query('artistID == 289')
#tags x Artist
# dic = {}
# dic['289'] = [23,39,24]
# dic['289'].append(20)
# lista = dic['289']
# print (dic['289'])
# print ("Lista: ", lista[0])

#artists.query('id == 289')

#iterable = user_taggedartists.query('artistID == 289')

#print(iterable)
# for index, row in iterable.iterrows():
#      print(row['tagID'])
# i = 0
# i+=1
#     if(i == 15):
#         break
for index, row in artists.iterrows():
    idArt = row['id']
    #print (idArt)
    cad = '' + str(idArt)
    tagsXArtist[cad] = []
    auxQuery = 'artistID == ' + str(idArt)
    iterable = user_taggedartists.query(auxQuery)
    for index2, row2 in iterable.iterrows():
        tagsXArtist[cad].append(row2['tagID'])
    
print(tagsXArtist['13'])
#print(tagsXArtist['14'])
#A este punto tenemos todos los tags que le corresponden a sus respectivos artistas

[9, 86, 565, 61, 565, 61, 86, 565, 9, 61, 86, 86, 565, 18, 61, 565, 10, 86, 4328, 61, 86, 12]


In [10]:
#Ahora procederé a transformas el tagsXArtist a una tupla, cuyo primer parametro sera el tag y el segundo
#la ocurrencia
import collections
counter = collections.Counter(tagsXArtist['1'])
print(counter.most_common(5))
newList = counter.most_common(5)
#De esta forma accedes al idTag newList[0][0], y de esta newList[0][1] a su frecuencia. con len(tagsXArtist['289'])
#sacas el total de tags en la lista
frecTagsXArtist = {}
# print(key, value)
#     break
for key, value in tagsXArtist.items():
    newkey = '' + str(key)
    #     frecTagsXArtist[newkey] = 
    counter = collections.Counter(value)
    frecTagsXArtist[int(newkey)] = counter.most_common(5)
    
#frecTagsXArtist

[(139, 5), (141, 3), (179, 2), (541, 2), (552, 1)]


In [12]:
frecTagsXArtist

{1: [(139, 5), (141, 3), (179, 2), (541, 2), (552, 1)],
 2: [(575, 8), (30, 5), (179, 5), (127, 2), (61, 1)],
 3: [(4, 3), (3706, 2), (2092, 1), (4117, 1), (4122, 1)],
 4: [(139, 6), (541, 4), (141, 3), (179, 3), (1, 2)],
 5: [(575, 3), (179, 1), (190, 1), (612, 1), (1097, 1)],
 6: [(7, 6), (4, 4), (82, 4), (1482, 1), (127, 1)],
 7: [(1, 25), (61, 23), (73, 18), (6, 15), (79, 9)],
 8: [(139, 8), (141, 4), (1, 4), (541, 3), (1219, 3)],
 9: [(61, 10), (86, 8), (565, 3), (127, 2), (562, 1)],
 10: [(86, 9), (565, 5), (61, 4), (18, 2), (563, 1)],
 11: [(86, 5), (61, 4), (565, 4), (563, 1), (567, 1)],
 12: [(5, 7), (4, 4), (742, 2), (2096, 2), (743, 1)],
 13: [(86, 6), (565, 5), (61, 5), (9, 2), (18, 1)],
 15: [(4, 12), (1532, 8), (3343, 4), (5, 2), (2139, 1)],
 16: [(179, 6), (61, 4), (190, 4), (1478, 2), (575, 2)],
 17: [(61, 5), (565, 4), (86, 3), (10, 2), (12, 2)],
 18: [(575, 7), (179, 6), (61, 4), (18, 3), (86, 2)],
 19: [(61, 8), (86, 5), (18, 3), (565, 3), (959, 1)],
 20: [(4, 3), (8

In [27]:
def tag_share(user_artist_plays, all_artist_tags):
    tagshare = []
    
    artists = []
    total_weight = 0
    for index, row in user_artist_plays.iterrows():
        artistID = row['artistID']
        weight = row['weight']
        total_weight += weight
        artists.append((artistID, weight))
    #print("total artist:", len(artists))
    #even_total = 0
    tag_sum = {}
    for artistID, weight in artists:
        artist_tags = all_artist_tags[artistID]
        total_tag_weight = 0
        listen_share = weight/total_weight
        # find the total tag weight for each artist
        for tagid, weight in artist_tags:
            total_tag_weight += weight

        # get the sum of tags
        #total_keys = 0
        for tagid, weight in artist_tags:
            if tagid not in tag_sum.keys():
                tag_sum[tagid] = weight*listen_share/total_tag_weight
                #total_keys += 1
            else:
                tag_sum[tagid] += weight*listen_share/total_tag_weight
        #print(artistID, total_keys)
        #even_total += total_keys
    #print(even_total)
    
    tagshare = {}
    max_weight = max(tag_sum.values())
    ratio = 5/max_weight
    for i,x in tag_sum.items():
        tagshare[i] = int(x*ratio)
    
    return tagshare
        

In [28]:
from sklearn.feature_extraction import DictVectorizer
import operator 

vec = DictVectorizer()
K = 10
users_tag_interest = {}
users_tag_interest_novec = {}
for index, row in user_artists.iterrows():
    userid = row['userID']
    if int(userid) not in users_tag_interest.keys():
        query = 'userID==' + str(userid)
        tagdata = user_artists.query(query)
        
        v = tag_share(tagdata, frecTagsXArtist)
        sorted_v = sorted(v.items(), key=operator.itemgetter(1), reverse=True)
        sorted_dic = {}
        i = 0
        for tid, x in sorted_v:
            if i == 5: break
            sorted_dic[tid] = x
            i += 1
        users_tag_interest_novec[int(userid)] = sorted_dic
        users_tag_interest[int(userid)] = vec.fit_transform(sorted_dic)

In [44]:
[tags for _, tags in frecTagsXArtist.items()][0]

[(139, 5), (141, 3), (179, 2), (541, 2), (552, 1)]

In [62]:
frecTagsXArtist_dic = {}
frecTagsXArtist_vectorized = {}
for artistID, tags in frecTagsXArtist.items():
    frecTagsXArtist_dic[artistID] = {}
    for tag, w in tags:
        maxval = max([i for _,i in tags])
        frecTagsXArtist_dic[artistID][tag] = int(w * 5 / maxval)

    frecTagsXArtist_vectorized[artistID] = vec.fit_transform(frecTagsXArtist_dic[artistID])

In [63]:
print("ArtistID = 2, tags vectorized")
print(frecTagsXArtist_vectorized[2])
print("ArtistID = 2, tags no vectorized")
print(frecTagsXArtist[2])
print("UserID = 2 interests vectorized")
print(users_tag_interest[2])

print("UserID = 2 interest no vectorized ")
print(users_tag_interest_novec[2])

ArtistID = 2, tags vectorized
  (0, 0)	3.0
  (0, 2)	1.0
  (0, 3)	3.0
  (0, 4)	5.0
ArtistID = 2, tags no vectorized
[(575, 8), (30, 5), (179, 5), (127, 2), (61, 1)]
UserID = 2 interests vectorized
  (0, 0)	2.0
  (0, 1)	2.0
  (0, 2)	4.0
  (0, 3)	2.0
  (0, 4)	3.0
UserID = 2 interest no vectorized 
{18: 4, 25: 3, 16: 2, 21: 2, 13: 2}


In [64]:
print(type(frecTagsXArtist_vectorized[2].toarray()))
print(frecTagsXArtist_vectorized[2].shape)

<class 'numpy.ndarray'>
(1, 5)


In [65]:
print(type(users_tag_interest[2]))
print(users_tag_interest[2].T)

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 0)	2.0
  (1, 0)	2.0
  (2, 0)	4.0
  (3, 0)	2.0
  (4, 0)	3.0


In [66]:
import numpy
a = numpy.dot(frecTagsXArtist_vectorized[2].toarray(),users_tag_interest[2].T.toarray())
b = frecTagsXArtist_vectorized[2].toarray()+users_tag_interest[2].toarray()
a/b

array([[  6.2  ,  15.5  ,   6.2  ,   6.2  ,   3.875]])

In [21]:
#Andre: correlación entre artistas dado usuarios
M = user_artists.pivot_table(index = ['userID'], columns = ['artistID'], values = 'weight')
print(M.shape)
#print(M)
print(M.sample(3))

(1892, 17632)
artistID  1      2      3      4      5      6      7      8      9      \
userID                                                                    
830         NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
2034        NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
263         NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   

artistID  10     ...    18736  18737  18738  18739  18740  18741  18742  \
userID           ...                                                      
830         NaN  ...      NaN    NaN    NaN    NaN    NaN    NaN    NaN   
2034        NaN  ...      NaN    NaN    NaN    NaN    NaN    NaN    NaN   
263         NaN  ...      NaN    NaN    NaN    NaN    NaN    NaN    NaN   

artistID  18743  18744  18745  
userID                         
830         NaN    NaN    NaN  
2034        NaN    NaN    NaN  
263         NaN    NaN    NaN  

[3 rows x 17632 columns]


In [22]:
def pearson(s1, s2):
    #take two pd.Series objects and return a perarson correlation 
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    return np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c ** 2) * np.sum(s2_c ** 2)) 

In [23]:
def get_recs(artist_id, M, num):
    review = []
    for tit in M.columns:
        title = int(tit)
        if (title == artist_id):
            continue
        cor = pearson(M[artist_id], M[title])
        if np.isnan(cor):
            continue
        else:
            review.append((title, cor))
    review.sort(key = lambda tup: tup[1], reverse = True)
    return review[:num]

In [24]:
recs = get_recs(10, M, 10)



In [25]:
recs[:10]

[(1270, 0.84625166362640236),
 (6870, 0.79973610999270717),
 (31, 0.60541983452097059),
 (17, 0.60140355270117807),
 (6873, 0.58777609737501391),
 (10997, 0.58584335326227865),
 (11, 0.56624050673298409),
 (1299, 0.55629341727635284),
 (1914, 0.50591322681864803),
 (171, 0.41072864956425126)]

In [26]:
print(artists.query('id == 10'))
print(artists.query('id == 1270'))

   id     name                               url  \
9  10  Grendel  http://www.last.fm/music/Grendel   

                                          pictureURL  
9  http://userserve-ak.last.fm/serve/252/5872875.jpg  
        id           name                                     url  \
1261  1270  Tactical Sekt  http://www.last.fm/music/Tactical+Sekt   

                                            pictureURL  
1261  http://userserve-ak.last.fm/serve/252/158232.jpg  


### Bibliography

1. Robillard, M., Maalej, W., Walker, R. J., & Zimmermann, T. (Eds.). (2014). Recommendation Systems in Software Engineering. Springer Berlin Heidelberg. Cap. 2 p. 20-21 https://doi.org/10.1007/978-3-642-45135-5
2. Jannach, D., Zanker, M., Felfernig, A., & Friedrich, G. (2011). Recommender systems: an introduction. Cambridge University Press (Vol. 40). https://doi.org/10.1017/CBO9780511763113