In [13]:
import os
import numpy as np
import pandas as pd
import subprocess

from scipy.sparse import coo_matrix, csr_matrix
from sklearn.preprocessing import normalize

Dataset: Last.fm 360K (user / artist / play-count triplets), described at http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/lastfm-360K.html

In [28]:
# download the dataset if it isn't in the same folder
file_dir = 'lastfm-dataset-360K'
file_path = os.path.join(file_dir, 'usersha1-artmbid-artname-plays.tsv')
if not os.path.isdir(file_dir):
    # check_call raises CalledProcessError on a non-zero exit status, so a failed
    # download or extraction stops here instead of surfacing later as a
    # missing-file error from read_csv
    subprocess.check_call(['curl', '-O', 'http://mtg.upf.edu/static/datasets/last.fm/lastfm-dataset-360K.tar.gz'])
    # http://askubuntu.com/questions/25347/what-command-do-i-need-to-unzip-extract-a-tar-gz-file
    subprocess.check_call(['tar', '-xvzf', 'lastfm-dataset-360K.tar.gz'])

# read in triplets of user/artist/playcount from the input dataset;
# the file has no header row, so the three selected columns are named explicitly
col_names = ['user', 'artist', 'plays']
data = pd.read_csv(file_path, sep = '\t', usecols = [0, 2, 3], names = col_names)

# a couple of rows contain NaN values, simply drop them
data = data.dropna(axis = 0)
print(data.shape)
data.head()

(17535653, 3)


Unnamed: 0,user,artist,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706


In [29]:
# turn users and artists into categoricals so every distinct value
# gets its own integer code
for column in ('user', 'artist'):
    data[column] = data[column].astype('category')

# assemble the sparse matrix from (values, (row index, column index)):
# one row per artist code, one column per user code
play_counts = data['plays'].astype(np.float64)
artist_codes = data['artist'].cat.codes
user_codes = data['user'].cat.codes
plays = coo_matrix((play_counts, (artist_codes, user_codes)))
plays

<292364x358868 sparse matrix of type '<class 'numpy.float64'>'
	with 17535653 stored elements in COOrdinate format>

In [43]:
# toy 4x4 COO matrix for inspecting the .row / .col / .data attributes.
# The stored values are kept in `values` rather than `data`, so the
# play-count DataFrame named `data` is not overwritten by this cell.
row = np.array([0, 3, 1, 0])
col = np.array([0, 3, 1, 2])
values = np.array([4, 5, 7, 9])
test = coo_matrix((values, (row, col)), shape=(4, 4))
test.row

array([0, 3, 1, 0], dtype=int32)

In [44]:
# column index of each stored entry of the toy matrix, in the order the entries were supplied
test.col

array([0, 3, 1, 2], dtype=int32)

In [45]:
# stored values of the toy COO matrix; `test` is the matrix built in the
# earlier cell, not an attribute of the numpy module
test.data

array([4, 5, 7, 9])

In [38]:
def bm25_weight(data, K1=100, B=0.8):
    """Weigh every stored entry of a sparse matrix with BM25.

    Rows are treated as documents (artists) and columns as terms (users).

    Parameters
    ----------
    data : sparse matrix (or anything ``coo_matrix`` accepts)
        Document/term counts. The input object is left unchanged.
    K1 : float
        Saturation constant: as a count grows, its weight approaches
        ``(K1 + 1) * idf`` of its column.
    B : float
        Strength of the row-length normalisation; with ``B = 0`` the
        length of a row has no influence.

    Returns
    -------
    coo_matrix
        Matrix with the same shape and stored positions as ``data``
        whose values are the BM25 weights.
    """
    # work on a COO view so .row / .col exist whatever sparse format was passed in
    ret = coo_matrix(data)

    # calculate idf per term (user); minlength keeps one idf slot per column
    N = float(ret.shape[0])
    idf = np.log(N / (1 + np.bincount(ret.col, minlength=ret.shape[1])))

    # calculate length_norm per document (artist); ravel (unlike squeeze)
    # keeps the result 1-d even for a single-row matrix, so it stays indexable
    row_sums = np.asarray(ret.sum(1)).ravel()
    average_length = row_sums.sum() / N
    length_norm = (1.0 - B) + B * row_sums / average_length

    # weight matrix rows by bm25; this rebinds ret.data to a new array,
    # so the caller's value array is not modified
    ret.data = ret.data * (K1 + 1.0) / (K1 * length_norm[ret.row] + ret.data) * idf[ret.col]
    return ret


def bm25(plays):
    """Return the product of the BM25-weighted matrix with its own transpose.

    `plays` is first re-weighted by `bm25_weight`; the result of
    `weighted.dot(weighted.T)` has one row and one column per row of `plays`.
    """
    weighted = bm25_weight(plays)
    transposed = weighted.T
    return weighted.dot(transposed)

array([     0,      0,      0, ..., 358867, 358867, 358867], dtype=int32)

In [37]:
# calculate idf per term (user): the same first step bm25_weight performs,
# run here directly on the plays matrix
N = float(plays.shape[0])
idf = np.log(N / (1 + np.bincount(plays.col)))

# number of rows (artists) in the plays matrix, i.e. the N used above
plays.shape[0]

292364

In [None]:
# two sample integer vectors (not referenced by the other visible cells)
v1 = [3, 45, 7, 2]
v2 = [2, 54, 13, 15]
def cosine(plays):
    """Return the product of the normalized matrix with its own transpose.

    `plays` is passed through sklearn's `normalize` with its default
    settings (presumably unit L2 norm per row — confirm against the
    installed sklearn version), so the result holds the pairwise
    similarities between rows of `plays`.
    """
    unit_rows = normalize(plays)
    transposed = unit_rows.T
    return unit_rows.dot(transposed)


In [33]:
# lookup table: integer category code -> artist name
artists = {code: name for code, name in enumerate(data['artist'].cat.categories)}

In [36]:
# number of rows (user/artist pairs) recorded for each artist
rows_by_artist = data.groupby('artist')
user_count = rows_by_artist.size()
user_count

artist
 04)]                                                     1
 2                                                        4
 58725ab=>                                                1
 80lİ yillarin tÜrkÇe sÖzlÜ aŞk Şarkilari                 1
 amy winehouse                                            1
 cours de la somme                                        1
 fatboy slim                                              1
 kanye west                                               1
 mala rodriguez                                           1
 mohamed lamine                                           1
 oliver shanti & friends                                  1
 the ranting gryphon                                      6
! bruno fergani / ftp !                                   1
! music for donations !                                   1
! www.polskie-mp3.tk ! breakout                           1
! www.polskie-mp3.tk ! czan                               1
! www.polskie-mp3.tk ! lista prze

In [19]:

    
    
    # NOTE(review): this statement is indented although no enclosing block is
    # visible in this cell — presumably lifted from a function body; dedent it
    # if the cell is meant to run on its own.
    #
    # Order the artist codes from most to fewest rows. The keys of `artists`
    # are integer category codes while `user_count` is indexed by artist name,
    # so the lookup is made explicitly positional with .iloc instead of relying
    # on the deprecated integer fallback of Series[...].
    # Assumes user_count rows follow category order (groupby on the
    # categorical 'artist' column) — confirm.
    to_generate = sorted(artists, key=lambda code: -user_count.iloc[code])

ValueError: negative row index found