# Last.fm Dataset

Let's take a first look of the data

In [1]:
import pandas as pd
from IPython.display import display

In [2]:
tables = {
    'artists': pd.read_table('../data/artists.dat'),
    'tags': pd.read_table('../data/tags.dat'),
    'user_artists': pd.read_table('../data/user_artists.dat'),
    'user_friends': pd.read_table('../data/user_friends.dat'),
    'user_taggedartists': pd.read_table('../data/user_taggedartists.dat'),
    'user_taggedartists-timestamps': pd.read_table('../data/user_taggedartists-timestamps.dat')
}

In [3]:
len(set(tables['tags']['tagValue']))

11946

In [4]:
def tagPreprocessing(tag):
    #write every tag in lower case
    tag = tag.lower()
    
    #remove punctuation marks, symbols, blank spaces
    punct_to_remove = ['\r', '\n', '\t'] + list(' #%&\*+/<=>-\\^{|}~()[]:;\'`¡.,¿?!')
    for ch in punct_to_remove:
        tag = tag.replace(ch, '')
    
    return tag

In [5]:
tables['tags']['tagValue'] = tables['tags']['tagValue'].map(lambda x: tagPreprocessing(x))
tables['tags'].head()

Unnamed: 0,tagID,tagValue
0,1,metal
1,2,alternativemetal
2,3,gothrock
3,4,blackmetal
4,5,deathmetal


In [6]:
dictionarytagIDs = {}
for tagValueIDs in tables['tags'].groupby(['tagValue']).groups.values():
    for tagID in tagValueIDs:
        dictionarytagIDs[tagID]=tagValueIDs[0]

In [7]:
tables['tags'].groupby(['tagValue']).groups

{'ecologicalpowermetal': [4420],
 'billycorgan': [4017],
 'reversedguitar': [9608],
 'sonja': [4332],
 'makesmereminisce': [11337],
 'beardcore': [2674],
 'jihad': [10763],
 'ohsobeautiful': [872],
 'dudakesnay': [10439],
 'divinayyo1401091745sala2twilightlll': [5311],
 'desnudate': [6475],
 'whitenoize': [9822],
 'whattheyworselive': [7973],
 'highschoolmemories': [7572],
 'shura': [5655],
 'lowwhistle': [8382],
 'unanswered': [8537],
 'xtc': [3651],
 'crooner': [4484],
 'imissyou': [4134],
 'bestmusicof2011': [7076],
 'funkyfresh': [11917],
 'digit': [10722],
 'thebestofmetallica': [6724],
 'regional': [8839],
 'pauladeanda': [4854],
 'veronicamars': [4652],
 'hardtoexplainwhatifeelwhenilisten': [8877],
 'dels': [3982],
 'intensesadness': [6889],
 'bestrapperalive': [7448],
 'disturb': [3828],
 'bestvoice': [8918],
 'minimalista': [529],
 'loveisweird': [1451],
 'powerfullyrics': [6729],
 'ptuthrashmetal': [4365],
 'amyrichter': [8384],
 'devymetal': [11556],
 'iseeyou': [10946, 1094

In [8]:
tables['user_taggedartists'].head()

Unnamed: 0,userID,artistID,tagID,day,month,year
0,2,52,13,1,4,2009
1,2,52,15,1,4,2009
2,2,52,18,1,4,2009
3,2,52,21,1,4,2009
4,2,52,41,1,4,2009


In [9]:
len(set(tables['user_taggedartists']['tagID']))

9749

In [10]:
def applyDictionaryTagIDs(tagID, dictionarytagIDs):
    try:
        return dictionarytagIDs[tagID]
    except:
        return tagID

In [11]:
tables['user_taggedartists']['tagID'] = tables['user_taggedartists']['tagID'].apply(lambda x: applyDictionaryTagIDs(x, dictionarytagIDs))

In [12]:
len(set(tables['user_taggedartists']['tagID']))

9491

In [13]:
tables['user_taggedartists']

Unnamed: 0,userID,artistID,tagID,day,month,year
0,2,52,13,1,4,2009
1,2,52,15,1,4,2009
2,2,52,18,1,4,2009
3,2,52,21,1,4,2009
4,2,52,16,1,4,2009
5,2,63,13,1,4,2009
6,2,63,14,1,4,2009
7,2,63,23,1,4,2009
8,2,63,40,1,4,2009
9,2,73,13,1,4,2009


In [18]:
len([aid for aid in tables['artists']['id']])

17632

In [21]:
len([tid for tid in tables['tags']['tagID']])

11946

In [23]:
for i, j in enumerate(tables['artists']['id'][:10]):
    print i,j

0 1
1 2
2 3
3 4
4 5
5 6
6 7
7 8
8 9
9 10


In [3]:
print '\n__________________________________________________________\n'
for tableName in tables.keys():
    print tableName
    display(tables[tableName].head(5))
    print '\n__________________________________________________________\n'


__________________________________________________________

user_taggedartists-timestamps


Unnamed: 0,userID,artistID,tagID,timestamp
0,2,52,13,1238536800000
1,2,52,15,1238536800000
2,2,52,18,1238536800000
3,2,52,21,1238536800000
4,2,52,41,1238536800000



__________________________________________________________

tags


Unnamed: 0,tagID,tagValue
0,1,metal
1,2,alternative metal
2,3,goth rock
3,4,black metal
4,5,death metal



__________________________________________________________

user_friends


Unnamed: 0,userID,friendID
0,2,275
1,2,428
2,2,515
3,2,761
4,2,831



__________________________________________________________

artists


Unnamed: 0,id,name,url,pictureURL
0,1,MALICE MIZER,http://www.last.fm/music/MALICE+MIZER,http://userserve-ak.last.fm/serve/252/10808.jpg
1,2,Diary of Dreams,http://www.last.fm/music/Diary+of+Dreams,http://userserve-ak.last.fm/serve/252/3052066.jpg
2,3,Carpathian Forest,http://www.last.fm/music/Carpathian+Forest,http://userserve-ak.last.fm/serve/252/40222717...
3,4,Moi dix Mois,http://www.last.fm/music/Moi+dix+Mois,http://userserve-ak.last.fm/serve/252/54697835...
4,5,Bella Morte,http://www.last.fm/music/Bella+Morte,http://userserve-ak.last.fm/serve/252/14789013...



__________________________________________________________

user_artists


Unnamed: 0,userID,artistID,weight
0,2,51,13883
1,2,52,11690
2,2,53,11351
3,2,54,10300
4,2,55,8983



__________________________________________________________

user_taggedartists


Unnamed: 0,userID,artistID,tagID,day,month,year
0,2,52,13,1,4,2009
1,2,52,15,1,4,2009
2,2,52,18,1,4,2009
3,2,52,21,1,4,2009
4,2,52,41,1,4,2009



__________________________________________________________



Create dictionaries of artists and tags IDs using the table 'artists' and 'tags':

In [93]:
taggedartists = tables['user_taggedartists'].copy()
def detectUnknownArtists(artistID, knownArtists):
    try:
        return knownArtists[artistID]
    except:
        return -1

In [94]:
artistDictionary = pd.Series(tables['artists'].id.values, index=tables['artists'].id).to_dict()

In [95]:
# Replace artist and tag IDs by their values
taggedartists['artistID'] = taggedartists['artistID'].apply(lambda x: detectUnknownArtists(x, artistDictionary))

# We remove those tags associated to artistIDs not included in the table 'artists.dat'
print str(taggedartists[taggedartists.artistID == -1].shape[0])+' tags removed'
taggedartists = taggedartists[taggedartists.artistID != -1]

# remove rows whose tag year is smaller than 2000 (outliers?)
taggedartists = taggedartists[taggedartists['year']>=2000]

1538 tags removed


In [88]:
taggedartists.head()

Unnamed: 0,userID,artistID,tagID,day,month,year
0,2,52,13,1,4,2009
1,2,52,15,1,4,2009
2,2,52,18,1,4,2009
3,2,52,21,1,4,2009
4,2,52,16,1,4,2009


In [81]:
artistDictionary = pd.Series(tables['artists'].name.values, index=tables['artists'].id).to_dict()
# There are gaps of pairs (artistID, name) that arise some errors afterwards
# We fill these gaps in our dictionary putting 'unknown'
for i in range(1, max(artistDictionary.keys())+1):
    if i not in artistDictionary: 
        artistDictionary[i] = 'unknown'

tagDictionary = pd.Series(tables['tags'].tagValue.values, index=tables['tags'].tagID).to_dict()

In [85]:
artistDictionary

{1: 1,
 2: 2,
 3: 3,
 4: 4,
 5: 5,
 6: 6,
 7: 7,
 8: 8,
 9: 9,
 10: 10,
 11: 11,
 12: 12,
 13: 13,
 14: -1,
 15: 15,
 16: 16,
 17: 17,
 18: 18,
 19: 19,
 20: 20,
 21: 21,
 22: 22,
 23: 23,
 24: 24,
 25: 25,
 26: 26,
 27: 27,
 28: 28,
 29: -1,
 30: 30,
 31: 31,
 32: 32,
 33: -1,
 34: 34,
 35: -1,
 36: 36,
 37: 37,
 38: 38,
 39: 39,
 40: 40,
 41: 41,
 42: -1,
 43: 43,
 44: 44,
 45: 45,
 46: 46,
 47: 47,
 48: 48,
 49: 49,
 50: 50,
 51: 51,
 52: 52,
 53: 53,
 54: 54,
 55: 55,
 56: 56,
 57: 57,
 58: 58,
 59: 59,
 60: 60,
 61: 61,
 62: 62,
 63: 63,
 64: 64,
 65: 65,
 66: 66,
 67: 67,
 68: 68,
 69: 69,
 70: 70,
 71: 71,
 72: 72,
 73: 73,
 74: 74,
 75: 75,
 76: 76,
 77: 77,
 78: 78,
 79: 79,
 80: 80,
 81: 81,
 82: 82,
 83: 83,
 84: 84,
 85: 85,
 86: 86,
 87: 87,
 88: 88,
 89: 89,
 90: 90,
 91: 91,
 92: 92,
 93: 93,
 94: 94,
 95: 95,
 96: 96,
 97: 97,
 98: 98,
 99: 99,
 100: 100,
 101: 101,
 102: 102,
 103: 103,
 104: 104,
 105: 105,
 106: 106,
 107: 107,
 108: 108,
 109: 109,
 110: 110,
 111: 

In [6]:
usersArtists = tables['user_artists'].copy()

# Replace artist IDs by its values
usersArtists['artistID'] = usersArtists['artistID'].apply(lambda x: artistDictionary[x])
usersArtists = usersArtists.rename(columns={'artistID':'artist'})

usersArtists['artist'].value_counts()[0:10]

Lady Gaga             611
Britney Spears        522
Rihanna               484
The Beatles           480
Katy Perry            473
Madonna               429
Avril Lavigne         417
Christina Aguilera    407
Muse                  400
Paramore              399
Name: artist, dtype: int64

In [7]:
taggedartists = tables['user_taggedartists'].copy()

# Replace artist and tag IDs by their values
taggedartists['artistID'] = taggedartists['artistID'].apply(lambda x: artistDictionary[x])
taggedartists['tagID'] = taggedartists['tagID'].apply(lambda x: tagDictionary[x])
taggedartists = taggedartists.rename(columns={'artistID':'artist', 'tagID':'tag'})

# We remove those tags associated to artistIDs not included in the table 'artists.dat'
print str(taggedartists[taggedartists.artist == 'unknown'].shape[0])+' tags removed'
taggedartists = taggedartists[taggedartists.artist != 'unknown']

# remove rows whose tag year is smaller than 2000 (outliers?)
taggedartists = taggedartists[taggedartists['year']>=2000]

# join the time in 1 column, passing everything to days and normalizing between 0 and 1
taggedartists['time'] = taggedartists['year']*365 + (taggedartists['month']-1)*30 #+ taggedartists['day']
taggedartists['time'] = (taggedartists['time']- taggedartists['time'].min())/(taggedartists['time'].max()-taggedartists['time'].min())
taggedartists.drop(['day','month', 'year'], axis=1, inplace=True)

taggedartists['tag'].value_counts()[0:1000]

1538 tags removed


rock                               7459
pop                                5401
alternative                        5223
electronic                         4616
indie                              4422
female vocalists                   4207
80s                                2782
dance                              2725
alternative rock                   2619
classic rock                       2282
british                            2085
indie rock                         2053
singer-songwriter                  1826
hard rock                          1784
metal                              1722
experimental                       1702
ambient                            1660
90s                                1604
new wave                           1589
seen live                          1426
chillout                           1375
hip-hop                            1366
folk                               1297
punk                               1288
electronica                        1268


In [10]:
sum(taggedartists['tag'].value_counts())

184936

In [11]:
len(taggedartists['tag'].value_counts())

9717

In [123]:
class LastfmNetwork(object):
        
    def tagPreprocessing(self, tag):
        #write every tag in lower case
        tag = tag.lower()
        #remove punctuation marks, symbols, whitespaces
        punct_to_remove = ['\r', '\n', '\t'] + list(' #%&\*+/<=>-\\^{|}~()[]:;\'`¡.,¿?!')
        for ch in punct_to_remove:
            tag = tag.replace(ch, '')
        return tag
    
    def applyDictionaryTagIDs(self, tagID, dictionarytagIDs):
        try:
            return dictionarytagIDs[tagID]
        except:
            return tagID
    
    def applyDictionaryTagIDs(self, tagID, dictionarytagIDs):
        try:
            return dictionarytagIDs[tagID]
        except:
            return tagID
        
    def detectUnknownArtists(self, artistID, knownArtists):
        try:
            return knownArtists[artistID]
        except:
            return -1

    def dataPreprocessing(self, artists, tags, user_taggedartists):
        # There are artistIDs that appear at user_taggedartists but not in artists; 
        #we detect and remove these unknown artistIDs from user_taggedartists
        knownArtists = pd.Series(artists.id.values, index=artists.id).to_dict()
        user_taggedartists['artistID'] = user_taggedartists['artistID'].apply(lambda x: self.detectUnknownArtists(x, knownArtists))
        user_taggedartists = user_taggedartists[user_taggedartists.artistID != -1]

        # remove rows whose tag year is smaller than 2000 (outliers?)
        user_taggedartists = user_taggedartists[user_taggedartists['year']>=2000]
        
        # tag preprocessing, removing whitespaces and symbols
        tags['tagValue'] = tags['tagValue'].map(lambda x: self.tagPreprocessing(x))
        
        # group the tagIDs of the tags that ended up being the same into a single tagID
        # we create a dictionary that performs this mapping
        dictionaryTagIDs = {}
        for tagValueIDs in tags.groupby(['tagValue']).groups.values():
            for tagID in tagValueIDs:
                dictionaryTagIDs[tagID]=tagValueIDs[0]
        # apply the dictionary to user_taggedartists 
        user_taggedartists['tagID'] = user_taggedartists['tagID'].apply(lambda x: self.applyDictionaryTagIDs(x, dictionaryTagIDs))
        
        return artists, tags, user_taggedartists
        
    def __init__(self, artists, tags, user_friends, user_artists, user_taggedartists, r=0.1, preprocessing=True):
        if preprocessing:
            artists, tags, user_taggedartists = self.dataPreprocessing(artists, tags, user_taggedartists)
        self.artists = artists
        self.tags = tags
        self.user_taggedartists = user_taggedartists
        

In [124]:
artists=pd.read_table('../data/artists.dat')
tags=pd.read_table('../data/tags.dat')
user_friends=pd.read_table('../data/user_friends.dat')
user_artists=pd.read_table('../data/user_artists.dat')
user_taggedartists=pd.read_table('../data/user_taggedartists.dat')

In [126]:
a = LastfmNetwork(artists, tags, user_friends, user_artists, user_taggedartists)
len(set(a.user_taggedartists['tagID']))

9460

In [127]:
b = LastfmNetwork(artists, tags, user_friends, user_artists, user_taggedartists, preprocessing=False)
len(set(b.user_taggedartists['tagID']))

9749

In [129]:
tags_artists = a.user_taggedartists.groupby(['tagID', 'artistID']).groups

for (tag, artist) in tags_artists:

    print tags_artists[(tag, artist)]

[93613]
[16764, 43096, 47386, 49466, 60675, 80429, 90694, 108965, 123568, 131624, 134277, 138264, 141170, 158478, 160462, 165496, 170989, 180456]
[47820]
[184539]
[143489]
[4735, 76373]
[78594]
[91591]
[87604]
[14747]
[172337]
[45045]
[86177]
[178050]
[146092, 163970]
[32129]
[162217]
[60411]
[1823]
[224, 80845]
[177194]
[89465]
[89074]
[158632]
[48282]
[151175]
[143683]
[146061]
[16732, 110674]
[85883]
[181652]
[141861]
[95736]
[136795]
[83854]
[112939]
[101601]
[115968]
[81186]
[56057]
[20606]
[74521]
[84357]
[157080]
[16312]
[10651, 64166]
[91004]
[120514]
[78149]
[58815]
[29240]
[122171, 142947]
[47814, 112295]
[164633]
[43461]
[11324, 113885, 125065]
[27620, 84358, 180060]
[172929]
[33837, 41516, 150862]
[162237]
[66679]
[93178]
[4254]
[46825, 161493]
[299]
[49915, 51784, 64687, 151826, 158068, 165751, 167442]
[161439]
[43022]
[60033, 97117, 163341, 176532]
[30656, 60144, 72562, 182815, 184036]
[181232]
[179270]
[177757]
[184032]
[74034]
[32397]
[56951, 64439, 72268, 85526, 86838,