# Last.fm Dataset

Let's take a first look at the data.

In [1]:
import pandas as pd
from IPython.display import display

In [3]:
# Every table lives in a file called '<name>.dat' in the working directory;
# read each one with pd.read_table and key the resulting frame by <name>.
TABLE_NAMES = [
    'artists',
    'tags',
    'user_artists',
    'user_friends',
    'user_taggedartists',
    'user_taggedartists-timestamps',
]
tables = {name: pd.read_table(name + '.dat') for name in TABLE_NAMES}

In [4]:
# Show the first five rows of every loaded table, with a ruler line printed
# before the first table and after each one.
# print(...) with a single argument produces the same output on Python 2 and
# Python 3, whereas the bare `print x` statement is a SyntaxError on Python 3.
separator = '\n__________________________________________________________\n'
print(separator)
for tableName, table in tables.items():
    print(tableName)
    display(table.head(5))
    print(separator)


__________________________________________________________

user_taggedartists-timestamps


Unnamed: 0,userID,artistID,tagID,timestamp
0,2,52,13,1238536800000
1,2,52,15,1238536800000
2,2,52,18,1238536800000
3,2,52,21,1238536800000
4,2,52,41,1238536800000



__________________________________________________________

tags


Unnamed: 0,tagID,tagValue
0,1,metal
1,2,alternative metal
2,3,goth rock
3,4,black metal
4,5,death metal



__________________________________________________________

user_friends


Unnamed: 0,userID,friendID
0,2,275
1,2,428
2,2,515
3,2,761
4,2,831



__________________________________________________________

artists


Unnamed: 0,id,name,url,pictureURL
0,1,MALICE MIZER,http://www.last.fm/music/MALICE+MIZER,http://userserve-ak.last.fm/serve/252/10808.jpg
1,2,Diary of Dreams,http://www.last.fm/music/Diary+of+Dreams,http://userserve-ak.last.fm/serve/252/3052066.jpg
2,3,Carpathian Forest,http://www.last.fm/music/Carpathian+Forest,http://userserve-ak.last.fm/serve/252/40222717...
3,4,Moi dix Mois,http://www.last.fm/music/Moi+dix+Mois,http://userserve-ak.last.fm/serve/252/54697835...
4,5,Bella Morte,http://www.last.fm/music/Bella+Morte,http://userserve-ak.last.fm/serve/252/14789013...



__________________________________________________________

user_artists


Unnamed: 0,userID,artistID,weight
0,2,51,13883
1,2,52,11690
2,2,53,11351
3,2,54,10300
4,2,55,8983



__________________________________________________________

user_taggedartists


Unnamed: 0,userID,artistID,tagID,day,month,year
0,2,52,13,1,4,2009
1,2,52,15,1,4,2009
2,2,52,18,1,4,2009
3,2,52,21,1,4,2009
4,2,52,41,1,4,2009



__________________________________________________________



Create dictionaries that map artist IDs and tag IDs to their values, using the tables 'artists' and 'tags':

In [5]:
# artist id -> artist name, taken from the 'artists' table
artistDictionary = tables['artists'].set_index('id')['name'].to_dict()

# Not every ID between 1 and the largest artist id has a row in 'artists',
# and looking up such an ID afterwards raises an error.
# Give each missing ID in that range the placeholder name 'unknown'.
missingArtists = {
    artistID: 'unknown'
    for artistID in range(1, max(artistDictionary) + 1)
    if artistID not in artistDictionary
}
artistDictionary.update(missingArtists)

# tag id -> tag text, taken from the 'tags' table
tagDictionary = tables['tags'].set_index('tagID')['tagValue'].to_dict()

In [6]:
# Copy of 'user_artists' in which the numeric artist ID column is replaced by
# an 'artist' column holding the artist name.
usersArtists = tables['user_artists'].copy()

# Replace artist IDs by their names.
# artistDictionary is only gap-filled for IDs in 1..max(id), so an ID outside
# that range would raise KeyError with a plain [] lookup; .get() maps it to
# the same 'unknown' placeholder instead.
usersArtists['artistID'] = usersArtists['artistID'].map(
    lambda artistID: artistDictionary.get(artistID, 'unknown'))
usersArtists = usersArtists.rename(columns={'artistID': 'artist'})

# The ten most frequent values of the 'artist' column, with their row counts
usersArtists['artist'].value_counts().head(10)

Lady Gaga             611
Britney Spears        522
Rihanna               484
The Beatles           480
Katy Perry            473
Madonna               429
Avril Lavigne         417
Christina Aguilera    407
Muse                  400
Paramore              399
Name: artist, dtype: int64

In [10]:
# Copy of 'user_taggedartists' with readable 'artist' and 'tag' columns and a
# single normalised 'time' column replacing 'day', 'month' and 'year'.
taggedartists = tables['user_taggedartists'].copy()

# Replace artist and tag IDs by their values.
# Artist IDs that are absent from artistDictionary (it is only gap-filled for
# 1..max(id)) map to 'unknown' instead of raising KeyError.
taggedartists['artistID'] = taggedartists['artistID'].map(
    lambda artistID: artistDictionary.get(artistID, 'unknown'))
taggedartists['tagID'] = taggedartists['tagID'].map(lambda tagID: tagDictionary[tagID])
taggedartists = taggedartists.rename(columns={'artistID': 'artist', 'tagID': 'tag'})

# We remove those tags associated to artistIDs not included in the table 'artists.dat'.
# The count is reported before any other filtering is applied.
isUnknownArtist = taggedartists['artist'] == 'unknown'
# Single-argument print(...) works on both Python 2 and Python 3.
print('%d tags removed' % isUnknownArtist.sum())

# Also remove rows whose tag year is smaller than 2000 (outliers?).
# Both filters are applied in one step, and .copy() makes the result an
# independent frame so that adding the 'time' column below does not trigger
# SettingWithCopyWarning.
isRecent = taggedartists['year'] >= 2000
taggedartists = taggedartists[~isUnknownArtist & isRecent].copy()

# Join the time in 1 column: an approximate day count using 365-day years and
# 30-day months (the day of the month is not added), min-max normalised so the
# earliest remaining row is 0 and the latest is 1.
days = taggedartists['year'] * 365 + (taggedartists['month'] - 1) * 30
taggedartists['time'] = (days - days.min()) / (days.max() - days.min())
taggedartists = taggedartists.drop(['day', 'month', 'year'], axis=1)

# The 1000 most frequent tags, with their row counts
taggedartists['tag'].value_counts().head(1000)

1538 tags removed


rock                               7459
pop                                5401
alternative                        5223
electronic                         4616
indie                              4422
female vocalists                   4207
80s                                2782
dance                              2725
alternative rock                   2619
classic rock                       2282
british                            2085
indie rock                         2053
singer-songwriter                  1826
hard rock                          1784
metal                              1722
experimental                       1702
ambient                            1660
90s                                1604
new wave                           1589
seen live                          1426
chillout                           1375
hip-hop                            1366
folk                               1297
punk                               1288
electronica                        1268
