In [18]:
import os
from glob import glob

import dask
import dask.bag as db
import dask.dataframe as dd
import dask.array as da
from distributed import Client
from dask_jobqueue import SLURMCluster

from IPython.display import display
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.sparse as sp

In [2]:
from lightfm import LightFM
from lightfm.data import Dataset

In [19]:
LOCAL = False

if LOCAL:
    # This line creates a single-machine dask client
    client = Client()
else:    
    # This line creates a SLURM cluster dask and dask client
    # Logging outputs will be stored in /scratch/{your-netid}
    
    cluster = SLURMCluster(memory='4GB', cores=2, python='/scratch/work/public/dask/bin/python', 
                               local_directory='/tmp/{}/'.format(os.environ['SLURM_JOB_USER']),
                               job_extra=['--output=/scratch/{}/slurm-%j.out'.format(os.environ['SLURM_JOB_USER'])])

    cluster.submit_command = 'slurm'
    cluster.scale(100)

    display(cluster)
    client = Client(cluster)

display(client)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 45131 instead


VBox(children=(HTML(value='<h2>SLURMCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    …

0,1
Client  Scheduler: tcp://10.32.33.10:44205  Dashboard: http://10.32.33.10:45131/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [20]:
#read in train and test dataset to calculate accuracy
#train_test = dd.read_csv("../train_test_small.csv")
#test_modified = dd.read_csv("../test_modified_small.csv")
train_test = dd.read_csv("../train_test_large.csv")
test_modified = dd.read_csv("../test_modified_large.csv")

In [21]:
#partition the dataframe
train_test = train_test.repartition(npartitions=100)
test_modified = test_modified.repartition(npartitions=100)

In [22]:
#modify data to input into lightfm dataset
train = train_test[['userId', 'movieId', 'rating']]
train_bag = train.to_bag()
train_movie = train['movieId']
train_user = train['userId']
train_movie_bag = train_movie.to_bag()
train_user_bag = train_user.to_bag()

In [23]:
#modify data to input into lightfm dataset
test = test_modified[['userId', 'movieId', 'rating']]
test_bag = test.to_bag()

In [24]:
#create lightfm dataset
train_dataset = Dataset()

In [25]:
#fit dataset
train_dataset.fit(train_user_bag, train_movie_bag)

In [26]:
#build train interactions
(train_interactions, train_weights) = train_dataset.build_interactions(train_bag)

In [27]:
#build test interactions
(test_interactions, test_weights) = train_dataset.build_interactions(test_bag)

In [12]:
#check the size of the interactions
train_weights

<137109x35232 sparse matrix of type '<class 'numpy.float32'>'
	with 8363468 stored elements in COOrdinate format>

In [11]:
#build the model with tuned parameters
#small_model = LightFM(loss='warp', no_components=10, item_alpha=0.02, user_alpha=0.02)

In [30]:
large_model = LightFM(loss='warp', no_components=50, item_alpha=0.05, user_alpha=0.05)

In [12]:
#fit the model
#%time small_model.fit(train_weights, epochs=20, num_threads=2)

CPU times: user 311 ms, sys: 6.71 ms, total: 318 ms
Wall time: 331 ms


<lightfm.lightfm.LightFM at 0x14897656b5e0>

In [31]:
%time large_model.fit(train_weights, epochs=20, num_threads=10)

CPU times: user 6min 40s, sys: 4.02 s, total: 6min 44s
Wall time: 7min


<lightfm.lightfm.LightFM at 0x14893568c100>

In [13]:
#create predicted rank based on existing test interactions
small_ranks = small_model.predict_rank(test_interactions)

In [14]:
#transfer sparse matrix to array
small_ranks_array = small_ranks.toarray()

In [15]:
#calculate average precision for every user
precision = list()
for i in range(len(small_ranks_array)):
    ranks_i = small_ranks_array[i][small_ranks_array[i]!=0]
    ranks_i.sort()
    count = 0
    precision_i = 0
    for rank in ranks_i:
        count += 1
        precision_i += count/rank
        if rank > 100:
            break
    if count==0:
        precision.append(0)
    else:
        precision.append(precision_i/count)

In [17]:
print('MAP of the small dataset is', np.mean(precision))

MAP of the small dataset is 0.182347329892548


In [32]:
#create and save predicted rank matrix
#large_ranks = large_model.predict_rank(test_interactions)

In [19]:
#sp.save_npz("large_ranks.npz", large_ranks)

Transfering large sparse matrix to array kills the kernel. Therefore, we save the ranks matrix and compute MAP in the local machine. Please see the MAP_large.ipynb for reference. Overall, we have a MAP of 0.112 for the large dataset.

In [None]:
from lightfm.evaluation import precision_at_k

In [14]:
print("precision at 10 of the small dataset is", precision_at_k(small_model, test_interactions, k=100).mean())

precision at 10 of the small dataset is 0.10931146


In [17]:
print("precision at 10 of the large dataset is", precision_at_k(large_model, test_interactions, k=100).mean())

precision at 10 of the large dataset is 0.06818348
