## load data

In [1]:
import time
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, diags
from scipy.sparse.linalg import spsolve

# Load the Last.fm 360K listening log. Only columns 0, 2 and 3 of the TSV are
# read and they are named user / artist / plays; column 1 is skipped.
# na_filter=False turns off pandas' NA detection, so every field is kept as read.
df = pd.read_table("~/music-recommend/dataset/lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv", 
                         usecols=[0, 2, 3], 
                         names=['user', 'artist', 'plays'],
                         na_filter=False)

# Categorical dtype gives each distinct user / artist an integer code
# (its position in `.cat.categories`); those codes are used as matrix indices below.
df['user'] = df['user'].astype('category')
df['artist'] = df['artist'].astype('category')

# Sparse matrix of play counts: row index = user code, column index = artist code.
plays = csr_matrix((df['plays'].astype(float), 
                   (df['user'].cat.codes, 
                    df['artist'].cat.codes)))

print ('user count ', plays.shape[0])
print ('artist count ', plays.shape[1])
# NOTE: this figure counts only the `data` array of the CSR matrix;
# the `indices` and `indptr` arrays are not included.
print ('plays matrix memory usage: %d MB.' % (plays.data.nbytes/1024/1024))

def get_row_index_by_user(user):
    """Return the row index of `user` in the `plays` matrix.

    The row index is the position of `user` in ``df['user'].cat.categories``,
    which is the same integer code used to build `plays`.

    Parameters
    ----------
    user : str
        User identifier exactly as it appears in the 'user' column.

    Returns
    -------
    int or None
        The position of `user` among the categories, or None when the user
        is not present.
    """
    try:
        # Categories are unique, so get_loc is a single hash lookup that
        # returns an integer position; this replaces a Python-level scan
        # over every category.
        return df['user'].cat.categories.get_loc(user)
    except (KeyError, TypeError):
        return None

def get_col_index_by_artist(artist):
    """Return the column index of `artist` in the `plays` matrix.

    The column index is the position of `artist` in
    ``df['artist'].cat.categories``, which is the same integer code used to
    build `plays`.

    Parameters
    ----------
    artist : str
        Artist name exactly as it appears in the 'artist' column.

    Returns
    -------
    int or None
        The position of `artist` among the categories, or None when the
        artist is not present.
    """
    try:
        # Categories are unique, so get_loc is a single hash lookup that
        # returns an integer position; this replaces a Python-level scan
        # over every category.
        return df['artist'].cat.categories.get_loc(artist)
    except (KeyError, TypeError):
        return None
    
def get_sparse_matrix_item(i, j):
    """Return the value of the `plays` matrix at row `i`, column `j`.

    Parameters
    ----------
    i : int
        Row index into `plays`.
    j : int
        Column index into `plays`.

    Returns
    -------
    float
        The stored value, or 0.0 when the matrix holds no entry at (i, j).
    """
    # Scalar indexing yields 0.0 for positions without a stored entry, whereas
    # reading `.data[0]` of the 1x1 slice raises IndexError in that case.
    return plays[i, j]

user count  358868
artist count  292365
plays matrix memory usage: 133 MB.


## check

In [2]:
# Sanity check: resolve one user and one artist to their matrix indices and
# print the value stored for that pair in the plays matrix.
user1_index = get_row_index_by_user('00000c289a1829a808ac09c00daf10bc3c4e223b')
artist1_index = get_col_index_by_artist('red hot chili peppers')
print ('00000c289a1829a808ac09c00daf10bc3c4e223b listened red hot chili peppers count: ', get_sparse_matrix_item(user1_index, artist1_index))

00000c289a1829a808ac09c00daf10bc3c4e223b listened red hot chili peppers count:  691.0


## picture and formula

![](https://ws1.sinaimg.cn/large/0073xHwmly1g0gm9z9a4wj31im0p2abk.jpg)

loss function:
$$
\min _ { x _ { * } , y _ { * } } \sum _ { u , i } c _ { u i } \left( p _ { u i } - x _ { u } ^ { T } y _ { i } \right) ^ { 2 } + \lambda \left( \sum _ { u } \left\| x _ { u } \right\| ^ { 2 } + \sum _ { i } \left\| y _ { i } \right\| ^ { 2 } \right)
$$

confidence:
$$
c _ { u i } = 1 + \alpha r _ { u i }
$$

ALS:
$$
x _ { u } = \left( Y ^ { T } C ^ { u } Y + \lambda I \right) ^ { - 1 } Y ^ { T } C ^ { u } p ( u ) , \qquad \text{where} \quad Y ^ { T } C ^ { u } Y = Y ^ { T } Y + Y ^ { T } \left( C ^ { u } - I \right) Y
$$


## translate formula

In [None]:
def weighted_alternating_least_squares(plays, factors, alpha=40, regularization=0.1, iterations=20):
    """Factorize a sparse plays matrix with alternating least squares.

    Parameters
    ----------
    plays : sparse matrix
        Matrix whose first axis is treated as users and second as items.
    factors : int
        Number of latent factors per user / item.
    alpha : number
        Multiplier applied to `plays` before it is handed to `least_squares`.
    regularization : float
        Regularization weight forwarded to `least_squares`.
    iterations : int
        Number of alternating sweeps (one user update plus one item update).

    Returns
    -------
    (X, Y) : tuple of ndarray
        User factors of shape (users, factors) and item factors of shape
        (items, factors).
    """
    user_item = (plays * alpha).astype('double')
    n_users, n_items = user_item.shape

    # Small random starting point: uniform samples scaled by 0.01.
    X = np.random.rand(n_users, factors) * 0.01
    Y = np.random.rand(n_items, factors) * 0.01

    # Transposed copy in CSR form so the item update can slice rows too.
    item_user = user_item.T.tocsr()

    for _ in range(iterations):
        # Update user factors with items fixed, then items with users fixed.
        least_squares(user_item, X, Y, regularization)
        least_squares(item_user, Y, X, regularization)

    return X, Y

def least_squares(Cui, X, Y, regularization):
    """Solve for every row of X in place, holding Y fixed.

    Direct translation of
        x_u = (Y^T C^u Y + regularization * I)^-1  Y^T C^u p(u)
    where, for row u, the confidence is ``1 + Cui[u, i]`` and the preference
    p(u)_i is 1 where ``Cui[u, i]`` is non-zero and 0 elsewhere.

    Parameters
    ----------
    Cui : scipy sparse matrix, shape (rows of X, rows of Y)
        Scaled interaction values (1 is added here to obtain the confidence).
    X : ndarray, shape (users, factors)
        Factors to be updated; overwritten row by row.
    Y : ndarray, shape (items, factors)
        Fixed factors of the other side.
    regularization : float
        Weight of the identity term added to the normal equations.

    Returns
    -------
    None
        X is modified in place.
    """
    users, factors = X.shape
    ridge = regularization * np.eye(factors)

    for u in range(users):
        # Dense 1-D copy of row u.
        row = np.asarray(Cui[u, :].toarray()).ravel()
        pref = (row != 0).astype(float)
        conf = row + 1

        # Y.T * conf scales column i of Y.T by conf[i], which equals
        # Y.T @ diag(conf) without materialising an items x items matrix.
        YtCu = Y.T * conf
        A = YtCu.dot(Y) + ridge
        b = YtCu.dot(pref)

        # A is a small dense (factors x factors) system, so use a dense solver.
        X[u] = np.linalg.solve(A, b)

## implicit translate formula

In [None]:
def nonzeros(m, row):
    """Yield (column index, value) for each stored entry in `row` of CSR matrix `m`."""
    # indptr[row] .. indptr[row+1] delimits this row's slice of indices/data.
    start, stop = m.indptr[row], m.indptr[row+1]
    yield from zip(m.indices[start:stop], m.data[start:stop])

def least_squares(Cui, X, Y, regularization):
    """Update every row of X in place, holding Y fixed, touching only stored entries.

    Uses Y^T C^u Y = Y^T Y + Y^T (C^u - I) Y so that Y^T Y is computed once
    and each row only adds a correction for its stored entries.

    NOTE: the stored values of `Cui` are used directly as the confidence here
    (the correction is weighted by ``confidence - 1`` and the right-hand side
    by ``confidence``); no 1 is added inside this function.

    Parameters
    ----------
    Cui : CSR matrix, shape (rows of X, rows of Y)
    X : ndarray, shape (users, factors) -- overwritten row by row
    Y : ndarray, shape (items, factors) -- read only
    regularization : float
    """
    n_rows, n_factors = X.shape
    gram = Y.T.dot(Y)

    for u in range(n_rows):
        # Progress report every 10000 rows (row 0 is skipped).
        if u > 0 and u % 10000 == 0:
            print (u)

        # Left-hand side starts as YtY + regularization * I ...
        lhs = gram + regularization * np.eye(n_factors)
        # ... and the right-hand side (YtCuPu) starts at zero.
        rhs = np.zeros(n_factors)

        for i, confidence in nonzeros(Cui, u):
            y_i = Y[i]
            lhs += (confidence - 1) * np.outer(y_i, y_i)
            rhs += confidence * y_i

        # Xu = (YtCuY + regularization * I)^-1 (YtCuPu)
        X[u] = np.linalg.solve(lhs, rhs)

## use implicit

In [3]:
from implicit.nearest_neighbours import bm25_weight
from implicit.als import AlternatingLeastSquares

# Fit implicit's ALS model. The plays matrix (rows = users, columns = artists)
# is transposed, converted to CSR and BM25-weighted before being passed to fit().
# NOTE(review): the orientation fit() expects depends on the installed
# `implicit` version -- confirm that the transpose matches it.
model = AlternatingLeastSquares(factors=50, regularization=0.01, iterations = 50)
model.fit(bm25_weight(plays.T.tocsr()))

# Keep the learned factor matrices under notebook-level names for later cells.
user_factors = model.user_factors
artist_factors = model.item_factors

100%|██████████| 50.0/50 [04:22<00:00,  5.37s/it]


## annoy

In [4]:
from annoy import AnnoyIndex
import random

# Build an approximate nearest-neighbour index over the artist factor vectors.
# The vector length is read from artist_factors, so it cannot drift from the
# model's `factors` setting. The metric is passed explicitly because Annoy
# deprecates relying on its implicit default ('angular').
artist_nn_index = AnnoyIndex(artist_factors.shape[1], 'angular')
for i in range(artist_factors.shape[0]):
    # Item id i is the row of artist_factors, which later cells map back to
    # df['artist'].cat.categories[i].
    artist_nn_index.add_item(i, artist_factors[i])

# Build the index with 25 trees; the call's return value is the cell output.
artist_nn_index.build(25)

True

In [5]:
def get_similar_artists(artist, n = 20):
    """Return the names of the `n` nearest neighbours of an artist.

    Parameters
    ----------
    artist : int
        Item id of the artist in `artist_nn_index`.
    n : int
        Number of neighbours requested from the index.

    Returns
    -------
    list
        Artist names looked up in ``df['artist'].cat.categories``, in the
        order returned by the index.
    """
    artist_names = df['artist'].cat.categories
    neighbour_ids = artist_nn_index.get_nns_by_item(artist, n)
    return [artist_names[neighbour] for neighbour in neighbour_ids]

# Resolve a handful of artist names to their column indices; these indices are
# passed to get_similar_artists as the item ids to query.
yes = get_col_index_by_artist('yes')
the_clash = get_col_index_by_artist('the clash')
the_smiths = get_col_index_by_artist('the smiths')
pink_floyd = get_col_index_by_artist('pink floyd')
blur = get_col_index_by_artist('blur')

# Print the neighbours of each artist, separated by a dashed line.
print ('yes similar artists:\n', get_similar_artists(yes))
print ('----------')
print ('the_clash similar artists:\n', get_similar_artists(the_clash))
print ('----------')
print ('the_smiths similar artists:\n', get_similar_artists(the_smiths))
print ('----------')
print ('pink_floyd similar artists:\n', get_similar_artists(pink_floyd))
print ('----------')
print ('blur similar artists:\n', get_similar_artists(blur))

yes similar artists:
 ['yes', 'emerson, lake & palmer', 'genesis', 'rush', 'king crimson', 'jethro tull', 'the moody blues', 'gentle giant', 'camel', 'kansas', 'frank zappa', 'david gilmour', 'focus', 'jeff beck', 'roger waters', 'steely dan', 'marillion', 'van der graaf generator', 'led zeppelin', 'cream']
----------
the_clash similar artists:
 ['the clash', 'ramones', 'pixies', 'iggy pop', 'david bowie', 'the specials', 'the smiths', 'dead kennedys', 'the pogues', 'the white stripes', 'the beatles', 'lou reed', 'the velvet underground', 'the cure', 'johnny cash', 'violent femmes', 'joy division', 'the kinks', 'misfits', 'bob dylan']
----------
the_smiths similar artists:
 ['the smiths', 'morrissey', 'the cure', 'joy division', 'david bowie', 'new order', 'pixies', 'echo & the bunnymen', 'the clash', 'the jesus and mary chain', 'pulp', 'elliott smith', 'blur', 'blondie', 'r.e.m.', 'nick cave and the bad seeds', 'talking heads', 'cocteau twins', 'beck', 'manic street preachers']
------