In [2]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict
from numpy import zeros, log, array
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import pairwise_distances
import scipy.spatial.distance as dist
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Location of the raw plays TSV; expanduser() replaces the leading "~" with
# the current user's home directory.
path = os.path.expanduser("~/Google Drive/CSVs/artname-plays.tsv")

In [4]:
class MusicData(object):
    """In-memory view of a user / artist / play-count TSV file.

    Attributes populated by the constructor:
      data          -- DataFrame with columns user, artist, plays, userid
      artist_sets   -- dict mapping artist -> set of users on that artist's rows
      user_sets     -- dict mapping user -> set of artists on that user's rows
      artists       -- dict mapping artist -> 1 x n_users csr_matrix of plays
      idf           -- list of 1 + log(N / (1 + rows)) per userid, where N is
                       the number of artists
      average_plays -- sum of all plays divided by the number of artists
    """

    def __init__(self, filename):
        # keep file columns 0, 2 and 3 and name them user / artist / plays
        frame = pd.read_table(filename,
                              usecols=[0, 2, 3],
                              names=['user', 'artist', 'plays'])
        self.data = frame

        # membership sets in both directions
        self.artist_sets = {artist: set(users)
                            for artist, users in frame.groupby('artist')['user']}
        self.user_sets = {user: set(artists)
                          for user, artists in frame.groupby('user')['artist']}

        # number users in first-seen order: looking up an unseen key stores
        # (and returns) the current size of the mapping
        userids = defaultdict(lambda: len(userids))
        frame['userid'] = frame['user'].map(userids.__getitem__)
        user_total = len(userids)

        # one single-row sparse vector per artist, columns addressed by userid
        self.artists = {}
        for artist, group in frame.groupby('artist'):
            values = array(group['plays'])
            row_index = zeros(len(group))
            self.artists[artist] = csr_matrix(
                (values, (row_index, group['userid'])),
                shape=[1, user_total])

        N = len(self.artists)
        rows_per_user = frame.groupby('userid').size()
        self.idf = [1. + log(N / (1. + p)) for p in rows_per_user]
        self.average_plays = frame['plays'].sum() / float(N)


def clean_dataset(filename):
    """Write a sanitized copy of ``filename`` to ``filename + ".cleaned"``.

    The raw dataset contains malformed lines that trip up pandas, so the file
    is read one line at a time and only well-formed lines are copied. A line
    is dropped (with a message printed to stdout) when:

      * it does not split into exactly 4 tab-separated tokens,
      * its 4th token (the play count) is not all digits, or
      * its 3rd token (the artist) is the literal two-character string '""'.

    Carriage returns are removed from every line that is kept.

    Parameters
    ----------
    filename : str
        Path of the raw tab-separated file.
    """
    # newline="\n" turns off universal-newline handling: with the default, a
    # bare '\r' in the middle of a record is treated as a line break, the
    # record is cut in two, and both halves are rejected as malformed -- so
    # the '\r' stripping below could never take effect.
    # Both files are opened by the `with` statement so both are closed, even
    # if an error occurs part-way through.
    with open(filename, newline="\n") as source, \
            open(filename + ".cleaned", "w") as output:
        for i, line in enumerate(source):
            tokens = line.strip().split("\t")
            if len(tokens) != 4:
                print("wrong # of tokens", i)
                continue

            if not tokens[3].isdigit():
                print("non integer play count", i)
                continue

            if tokens[2] == '""':
                print("invalid artist id", tokens[2])
                continue

            # some lines contain carriage returns (without newlines), which
            # randomly messes pandas up
            output.write(line.replace('\r', ''))

In [5]:
#mdata = MusicData(path)

In [6]:
#clean_dataset(path)

In [7]:
def read_data(filename):
    """Load the last.fm plays file.

    Returns a ``(data, plays)`` tuple: ``data`` is a DataFrame with columns
    user / artist / plays (user and artist stored as pandas categoricals) and
    ``plays`` is a sparse COO matrix of play counts whose row index is the
    artist category code and whose column index is the user category code.
    """
    # columns 0, 2 and 3 of the file hold the user / artist / playcount triple
    data = pd.read_table(filename,
                         usecols=[0, 2, 3],
                         names=['user', 'artist', 'plays'])

    # categoricals give every distinct user and artist an integer code
    for column in ('user', 'artist'):
        data[column] = data[column].astype("category")

    artist_codes = data['artist'].cat.codes.copy()
    user_codes = data['user'].cat.codes.copy()
    counts = data['plays'].astype(float)

    # sparse artist x user matrix of play counts
    plays = coo_matrix((counts, (artist_codes, user_codes)))

    return data, plays

In [8]:
def overlap(a, b):
    """Return how many elements of set ``a`` also occur in ``b``."""
    shared = a.intersection(b)
    return len(shared)

In [9]:
#musicD = MusicData(path)

In [10]:
def cosine1(plays):
    """Row-by-row cosine similarity of ``plays``.

    The matrix is passed through sklearn's ``normalize`` with its default
    settings and the result is multiplied by its own transpose.
    """
    unit_rows = normalize(plays)
    similarity = unit_rows.dot(unit_rows.T)
    return similarity

In [11]:
def bhattacharya(plays):
    """Cosine similarity computed on the square roots of the play counts.

    Parameters
    ----------
    plays : scipy sparse matrix
        Play-count matrix; it is left unmodified.

    Returns
    -------
    The result of ``cosine1`` applied to the square-rooted matrix.
    """
    # Work on a copy: assigning to plays.data directly would overwrite the
    # caller's matrix, so every later metric computed from the same `plays`
    # (and any re-run of this cell) would see square-rooted counts.
    rooted = plays.copy()
    rooted.data = np.sqrt(rooted.data)
    # `cosine1` is the cosine helper this notebook defines; the name `cosine`
    # used previously is not defined anywhere in it.
    return cosine1(rooted)

In [12]:
def ochiai(plays):
    """Cosine similarity computed on a binarized copy of ``plays``.

    Every stored entry of the matrix is replaced by 1.0 before the similarity
    is computed, so only whether an entry is present matters, not its play
    count.
    """
    # csr_matrix() builds a new matrix object, so rebinding .data below does
    # not touch the caller's matrix.
    binary = csr_matrix(plays)
    binary.data = np.ones(len(binary.data))
    # `cosine1` is the cosine helper this notebook defines; the name `cosine`
    # used previously is not defined anywhere in it.
    return cosine1(binary)

In [13]:
def bm25_weight(data, K1=100, B=0.8):
    """Weigh each row of the matrix ``data`` by BM25 weighting.

    Rows are treated as documents (artists) and columns as terms (users).

    Parameters
    ----------
    data : sparse matrix (any scipy format) or anything ``coo_matrix`` accepts
        Matrix of raw counts. It is not modified.
    K1 : number, default 100
        Saturation constant in the BM25 formula.
    B : float, default 0.8
        Strength of the row-length normalization.

    Returns
    -------
    scipy.sparse.coo_matrix
        New matrix with the same sparsity pattern and re-weighted values.
    """
    # Convert up front so that .row / .col exist whatever format was passed
    # in (previously a CSR/CSC argument failed on `data.col`).
    ret = coo_matrix(data)

    # calculate idf per term (user)
    N = float(ret.shape[0])
    idf = np.log(N / (1 + np.bincount(ret.col)))

    # calculate length_norm per document (artist)
    # ravel() always yields a 1-d array; np.squeeze() turned the sums of a
    # single-row matrix into a 0-d array, which cannot be indexed by ret.row.
    row_sums = np.asarray(ret.sum(1)).ravel()
    average_length = row_sums.sum() / N
    length_norm = (1.0 - B) + B * row_sums / average_length

    # weight matrix rows by bm25 (assigning a new array leaves the input's
    # own data array untouched)
    ret.data = ret.data * (K1 + 1.0) / (K1 * length_norm[ret.row] + ret.data) * idf[ret.col]
    return ret

In [14]:
def bm25(plays):
    """Similarity matrix from BM25-weighted plays.

    Applies ``bm25_weight`` (with its default parameters) to ``plays`` and
    returns the product of the weighted matrix with its own transpose.
    """
    weighted = bm25_weight(plays)
    return weighted.dot(weighted.T)

In [15]:
# Re-point `path` at the ".cleaned" copy -- the file name clean_dataset()
# writes its output to.
path = os.path.expanduser("~/Google Drive/CSVs/artname-plays.tsv.cleaned")

In [16]:
# data: DataFrame of user/artist/plays; plays: sparse artist x user matrix.
data, plays = read_data(path)

In [17]:
def get_largest(row, N=10):
    """Return the ``N`` largest stored entries of a single sparse row.

    Parameters
    ----------
    row : sparse row exposing ``data``, ``indices`` and ``nnz`` (e.g. one row
        of a CSR matrix)
    N : int, default 10
        Maximum number of entries to return.

    Returns
    -------
    list of ``(value, column_index)`` tuples sorted in descending order
    (ties on value are ordered by descending column index). Empty when
    ``N <= 0``.
    """
    # Without this guard N == 0 returned *every* entry: -0 == 0, so the
    # slice [-N:] below selects the whole array instead of nothing.
    if N <= 0:
        return []

    if N >= row.nnz:
        best = zip(row.data, row.indices)
    else:
        # argpartition places the N largest values in the last N slots
        # without fully sorting the row
        ind = np.argpartition(row.data, -N)[-N:]
        best = zip(row.data[ind], row.indices[ind])
    return sorted(best, reverse=True)


def calculate_similar_artists(similarity, artists, artistid):
    """List the strongest neighbours of one artist.

    Takes row ``artistid`` of ``similarity``, keeps its largest entries (as
    selected by ``get_largest`` with its default ``N``) and returns a list of
    ``(artists[column], score, rank)`` tuples, where ``rank`` starts at 0 for
    the highest score.
    """
    ranked = get_largest(similarity[artistid])
    results = []
    for position, (score, other) in enumerate(ranked):
        results.append((artists[other], score, position))
    return results

In [18]:
# Artist x artist cosine similarity. `cosine1` is the cosine helper defined in
# this notebook; the bare name `cosine` is not defined anywhere in it and
# raises NameError on a fresh kernel.
similarity = cosine1(plays)

In [19]:
#similarity = bm25(plays)

In [20]:
#similarity = cosine2(plays)

In [20]:
# category code -> artist name, in the category order of data['artist']
artists = dict(enumerate(data['artist'].cat.categories))

# Number of rows per artist. observed=False keeps one entry per category, in
# category order, so position i in this Series belongs to category code i.
user_count = data.groupby('artist', observed=False).size()

# Artist codes ordered from most rows to fewest. user_count is indexed by
# artist *name*, so it must be addressed by position (.iloc); plain
# user_count[code] only worked through pandas' deprecated fallback that
# treats integer keys as positions on a non-integer index.
to_generate = sorted(artists, key=lambda code: -user_count.iloc[code])

In [62]:
# One [name, other, score, rank] record per (artist, similar artist) pair,
# visiting artists in the order given by to_generate.
l = []
for artist in to_generate:
    name = artists[artist]
    neighbours = calculate_similar_artists(similarity, artists, artist)
    l.extend([name, other, score, rank] for other, score, rank in neighbours)

In [63]:
# Tabulate the collected records; one row per (artist, similar artist) pair.
sim = pd.DataFrame(l, columns=['name', 'other', 'score', 'rank'])

In [1]:
# Preview the first five rows of `sim`.
# NOTE(review): the saved output for this cell is a NameError -- it was run
# (execution count 1) before the cell that defines `sim`; re-run in order.
sim.head(5)

NameError: name 'sim' is not defined

In [8]:
# Scratch example: pairing two equal-length lists element by element.
numberList = [1, 2, 3]
strList = ['one', 'two', 'three']

# Two iterables are passed
result = zip(numberList, strList)

In [9]:
# Print each (number, string) pair produced by zip.
for i in result:
    print(i)

(1, 'one')
(2, 'two')
(3, 'three')


In [12]:
# Scratch example: enumerate pairs each element with its 0-based position.
en = enumerate(numberList)

In [13]:
# Collect the (index, value) pairs into a set for display.
set(en)

{(0, 1), (1, 2), (2, 3)}