# Examples

http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/lastfm-360K.html

https://github.com/lyst/lightfm/issues/174

https://github.com/lyst/lightfm/tree/master/examples/dataset

In [1]:
import pandas as pd
import numpy as np
import tarfile
from lightfm import LightFM
import scipy.sparse as sp



In [2]:
import os.path
ndata_name = 'normalized_data'
ndata =ndata_name+'.npz'
if not os.path.isfile(ndata):
    # uncompressed dataset
    fname = 'D:\Sistemas\datasets\lastfm-dataset-360K.tar.gz'
    if (fname.endswith("tar.gz")):
        tar = tarfile.open(fname, "r:gz")
        tar.extractall()
        tar.close()
    elif (fname.endswith("tar")):
        tar = tarfile.open(fname, "r:")
        tar.extractall()
        tar.close()
        
    # read uncompressed data

    headers = ["user_id", "artist_id", "artist_name", "plays"]

    data = pd.read_csv("lastfm-dataset-360K\\usersha1-artmbid-artname-plays.tsv", sep="\t", header=None, names=headers)
    data = np.array(data)
    
    # normalize data, based on the amount of listenings of each user
    # if the user has listened more times that artist is understood that it likes it more, 
    # so it gets a higher score

    # position 0: user index
    # position 1: artist index
    users = set(data[np.arange(data.shape[0]),0])
    user2ui = dict(zip(users,range(len(users))))
    items = set(data[np.arange(data.shape[0]),1])
    items2ui = dict(zip(items,range(len(items))))

    user2Views = dict()
    for user in users:
        user2Views[user] = []
    for value in data:
        user2Views[value[0]].append(value)
    for user in users:
        user2Views[user] = np.array(user2Views[user])
    for key, value in user2Views.items():
        filas_user = np.arange(len(value))
        actual = value[filas_user,3]

        value[filas_user,3] = actual/max(actual)*5

    data = []
    for key, value in user2Views.items():
        data.extend(value)

    data = np.array(data) 

    # save data
    np.savez(ndata_name, data)

In [3]:
# if enough memory continue, if not restart kernel
npzfile = np.load(ndata)
data = npzfile['arr_0']

In [4]:
def isNaN(num):
    return num != num

In [5]:
# delete all rows with a nan id

toDelete = []
for index, row in enumerate(data):
    if isNaN(row[0]) or isNaN(row[1]) or isNaN(row[2]):
        toDelete.append(index)
        
data = np.delete(data, toDelete, 0)        

In [6]:
users = set()
items = set()

# cast to string due to object comparison (it's faster)
for idx,row in enumerate(data):
    users.add(str(row[0]))
    items.add(str(row[1]))

user2ui = dict(zip(users,range(len(users))))
items2ui = dict(zip(items,range(len(items))))

In [7]:
n_train = int(0.8*data.shape[0])
n_test = int(0.2*data.shape[0])

In [8]:
train_data = data[np.random.choice(n_train,n_train, replace=False)]
test_data = data[np.random.choice(n_test,n_test, replace=False)]

In [9]:
n_users = len(users)
n_items = len(items)

In [10]:
def _build_interaction_matrix(rows, cols, data, user2ui, items2ui , min_plays):

    mat = sp.lil_matrix((rows, cols), dtype=np.int32)

    for uid, iid, _, plays in data:
        mat[user2ui[uid], items2ui[iid]] = plays

    return mat.tocoo()

In [11]:
train = _build_interaction_matrix(n_users, n_items, train_data, user2ui, items2ui, 3)
test = _build_interaction_matrix(n_users, n_items, test_data, user2ui, items2ui, 3)

In [12]:
id_features = sp.identity(n_items, format="csr", dtype=np.float32)
id_feature_labels = np.empty(n_items, dtype=np.object)

In [13]:
for uid,iid,imeta,_ in data:
    id_feature_labels[items2ui[iid]] = imeta

In [14]:
xdata = {
    "train": train,
    "test": test,
    "item_features": id_features,
    "item_feature_labels": id_feature_labels,
    "item_labels": id_feature_labels,
}

In [15]:
#print training and testing data
print(repr(xdata['train']))
print(repr(xdata['test']))

<358858x160111 sparse matrix of type '<class 'numpy.int32'>'
	with 6261451 stored elements in COOrdinate format>
<358858x160111 sparse matrix of type '<class 'numpy.int32'>'
	with 1564393 stored elements in COOrdinate format>


In [16]:
def get_model(loss,data,epochs=30,num_threads=4):
    #create model
    model = LightFM(loss=loss)
    #train model
    model.fit(data['train'], epochs=epochs, num_threads=num_threads)
    return model

In [17]:
def sample_recommendation(model, data, user_ids):

    #number of users and artist in training data
    n_users, n_items = data['train'].shape

    #generate recommendations for each user we input
    for user_id in user_ids:
        #artist they already like
        known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices]

        #artist our model predicts they will like
        scores = model.predict(user_id, np.arange(n_items))
        #rank them in order of most liked to least
        top_items = data['item_labels'][np.argsort(-scores)]

        #print out the results
        print("User %s" % user_id)
        print("     Known positives:")

        for x in known_positives[:3]:
            print("        %s" % x)

        print("     Recommended:")

        for x in top_items[:3]:
            print("        %s" % x)

In [18]:
import pickle
def save_model(model,fname):
    with open(fname, 'wb') as fle:
        pickle.dump(model, fle, protocol=pickle.HIGHEST_PROTOCOL)

In [19]:
#CHALLENGE part 2 of 3 - use 3 different loss functions (so 3 different models), compare results, print results for
#the best one. - Available loss functions are warp, logistic, bpr, and warp-kos.

warp_model = get_model("warp",xdata,num_threads=8)
save_model(warp_model,"warp_model.pickle")

In [20]:
logistic_model = get_model("logistic",xdata,num_threads=8)
save_model(logistic_model,"logistic_model.pickle")

In [21]:
bpr_model = get_model("bpr",xdata,num_threads=8)
save_model(bpr_model,"bpr_model.pickle")

In [22]:
wkos_model = get_model("warp-kos",xdata,num_threads=8)
save_model(wkos_model,"wkos_model.pickle")

In [31]:
sample_recommendation(warp_model, xdata, [2000,20000,30000,40000])

User 2000
     Known positives:
        barış akarsu
        salvatore adamo
        aleš brichta
     Recommended:
        enya
        loreena mckennitt
        clannad
User 20000
     Known positives:
        nine inch nails
        zемфира
        massive attack
     Recommended:
        pink floyd
        metallica
        iron maiden
User 30000
     Known positives:
        jason mraz
        diana krall
        red hot chili peppers
     Recommended:
        kanye west
        jack johnson
        the beatles
User 40000
     Known positives:
        red hot chili peppers
        system of a down
        regina spektor
     Recommended:
        radiohead
        the beatles
        coldplay


In [32]:
sample_recommendation(logistic_model, xdata, [2000,20000,30000,40000])

User 2000
     Known positives:
        barış akarsu
        salvatore adamo
        aleš brichta
     Recommended:
        radiohead
        the beatles
        coldplay
User 20000
     Known positives:
        nine inch nails
        zемфира
        massive attack
     Recommended:
        radiohead
        the beatles
        coldplay
User 30000
     Known positives:
        jason mraz
        diana krall
        red hot chili peppers
     Recommended:
        radiohead
        the beatles
        coldplay
User 40000
     Known positives:
        red hot chili peppers
        system of a down
        regina spektor
     Recommended:
        radiohead
        the beatles
        coldplay


In [33]:
sample_recommendation(bpr_model, xdata, [2000,20000,30000,40000])

User 2000
     Known positives:
        barış akarsu
        salvatore adamo
        aleš brichta
     Recommended:
        bad boys blue
        yanni
        hevia
User 20000
     Known positives:
        nine inch nails
        zемфира
        massive attack
     Recommended:
        pink floyd
        queen
        u2
User 30000
     Known positives:
        jason mraz
        diana krall
        red hot chili peppers
     Recommended:
        john mayer
        jack johnson
        jason mraz
User 40000
     Known positives:
        red hot chili peppers
        system of a down
        regina spektor
     Recommended:
        radiohead
        the beatles
        bloc party


In [34]:
sample_recommendation(wkos_model, xdata, [2000,20000,30000,40000])

User 2000
     Known positives:
        barış akarsu
        salvatore adamo
        aleš brichta
     Recommended:
        the beatles
        simon & garfunkel
        bob dylan
User 20000
     Known positives:
        nine inch nails
        zемфира
        massive attack
     Recommended:
        deep purple
        queen
        genesis
User 30000
     Known positives:
        jason mraz
        diana krall
        red hot chili peppers
     Recommended:
        kanye west
        eminem
        michael jackson
User 40000
     Known positives:
        red hot chili peppers
        system of a down
        regina spektor
     Recommended:
        muse
        placebo
        franz ferdinand
