In [0]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

# Render floats with exactly three decimals rather than scientific notation.
pd.set_option('display.float_format', '{:.3f}'.format)

In [0]:
# Load the (user, artist, plays) listening records. The TSV is read without a
# header row, so column names are supplied here and only the columns used
# later are kept.
# nrows is an explicit int (20 million): pandas documents the parameter as an
# integer, and a float such as 2e7 is only accepted through lenient coercion.
user_data = pd.read_table('usersha1-artmbid-artname-plays.tsv',
                          header = None, nrows = 20000000,
                          names = ['users', 'musicbrainz-artist-id', 'artist-name', 'plays'],
                          usecols = ['users', 'artist-name', 'plays'])
# Load the user profiles, keeping only the user id and country columns.
user_profiles = pd.read_table('usersha1-profile.tsv',
                          header = None,
                          names = ['users', 'gender', 'age', 'country', 'signup'],
                          usecols = ['users', 'country'])

In [3]:
# Preview the first five rows of the play-count table.
user_data.head(5)


Unnamed: 0,users,artist-name,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137.0
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099.0
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897.0
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717.0
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706.0


In [0]:
# Preview the first five rows of the user profile table.
user_profiles.head(5)

In [0]:
# Discard rows that have no artist name; the frame is left untouched when
# nothing is missing.
artist_missing = user_data['artist-name'].isna()
if artist_missing.any():
    user_data = user_data.dropna(subset = ['artist-name'])

In [7]:
# Total play count per artist, summed over all users. The result has exactly
# two columns: 'artist-name' and 'total_artist_plays'.
artist_plays = (
    user_data
    .groupby('artist-name')['plays']
    .sum()
    .rename('total_artist_plays')
    .reset_index()
)

# Attach each artist's total to every (user, artist) row. A left join keeps
# every row of user_data; both frames share the key column, so `on` suffices.
user_data_with_artist_plays = user_data.merge(artist_plays, on = 'artist-name', how = 'left')
user_data_with_artist_plays.head()

Unnamed: 0,users,artist-name,plays,total_artist_plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137.0,6456.0
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099.0,835919.0
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897.0,42805.0
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717.0,104473.0
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706.0,21702.0


In [8]:
# Keep only rows whose artist reaches the minimum total play count.
popularity_threshold = 40000
is_popular = user_data_with_artist_plays['total_artist_plays'] >= popularity_threshold
user_data_popular_artists = user_data_with_artist_plays[is_popular]
user_data_popular_artists.head()

Unnamed: 0,users,artist-name,plays,total_artist_plays
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099.0,835919.0
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897.0,42805.0
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717.0,104473.0
5,00000c289a1829a808ac09c00daf10bc3c4e223b,red hot chili peppers,691.0,3221675.0
7,00000c289a1829a808ac09c00daf10bc3c4e223b,the black dahlia murder,507.0,232786.0


In [9]:
# Add each user's country (left join keeps every listening row), then restrict
# the data to users whose country is 'United States'.
combined = user_data_popular_artists.merge(user_profiles, on = 'users', how = 'left')
usa_data = combined.query("country == 'United States'")
usa_data.head()

Unnamed: 0,users,artist-name,plays,total_artist_plays,country
123,00007a47085b9aab8af55f52ec8846ac479ac4fe,devendra banhart,456.0,620383.0,United States
124,00007a47085b9aab8af55f52ec8846ac479ac4fe,boards of canada,407.0,1500652.0,United States
125,00007a47085b9aab8af55f52ec8846ac479ac4fe,cocorosie,386.0,544809.0,United States
126,00007a47085b9aab8af55f52ec8846ac479ac4fe,aphex twin,213.0,1057383.0,United States
127,00007a47085b9aab8af55f52ec8846ac479ac4fe,animal collective,203.0,873387.0,United States


In [10]:
# Remove repeated (user, artist) pairs and report how many rows were dropped.
# Nothing is printed or changed when no duplicates exist.
duplicate_pairs = usa_data.duplicated(['users', 'artist-name'])
if duplicate_pairs.any():
    initial_rows = usa_data.shape[0]
    print("Initial dataframe shape {0}".format(usa_data.shape))

    usa_data = usa_data.drop_duplicates(['users', 'artist-name'])
    current_rows = usa_data.shape[0]
    print("New dataframe shape {0}".format(usa_data.shape))
    print("Removed {0} rows".format(initial_rows - current_rows))


Initial dataframe shape (563154, 5)
New dataframe shape (563152, 5)
Removed 2 rows


In [0]:
# Artist-name column of the filtered data (one entry per remaining row).
artists = usa_data['artist-name']

In [0]:
# Read the artist tag table. The CSV is read without a header row, so the
# three column names are supplied here.
tags_data = pd.read_csv(
    'tags_data2.csv',
    encoding = "ISO-8859-1",
    header = None,
    names = ['artist', 'similars', 'tags'],
)

In [17]:
# Preview the first five rows of the tag table.
tags_data.head(5)

Unnamed: 0,artist,similars,tags
0,Computer Truck,[],[]
1,Lack of Limits,[],[]
2,The Kinks,"[['TRMLOXQ12903CF06BB', 1], ['TRCOWHF128F93216...","[['classic rock', '100'], ['60s', '76'], ['bri..."
3,Everclear,[],[]
4,Paula Abdul,"[['TRALFWK128F1458532', 1], ['TRRMRMZ128F14585...","[['pop', '100'], ['female vocalists', '66'], [..."
