## Matrix Factorisation

In [1]:
import os, sys, gzip
from surprise import Dataset
from surprise import Reader
from surprise import SVD
import pickle as pkl
import numpy as np

In [2]:
from tools import calc_RPrecision_HitRate

In [3]:
TOPs = [5, 10, 20, 30, 50, 100, 200, 300, 500, 1000]
datasets = ['aotm2011', '30music']
data_dir = 'data'

In [4]:
dix = 1
dataset_name = datasets[dix]
dataset_name

'30music'

In [5]:
# Reader(name=None, line_format='user item rating', sep=None, rating_scale=(1, 5), skip_lines=0)

In [6]:
fname = os.path.join(data_dir, '%s/setting2/mftrain_%s.csv' % (dataset_name, dataset_name))
reader = Reader(line_format='user item rating', sep=',')
data_train = Dataset.load_from_file(fname, reader=reader)

In [14]:
# http://surprise.readthedocs.io/en/stable/matrix_factorization.html
algo = SVD(n_factors=500, verbose=True)
trainset = data_train.build_full_trainset()
algo.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ff4e75549b0>

In [15]:
# algo.predict(uid, iid, r_ui=None, clip=True, verbose=False)

In [16]:
base_dir = 'data/%s/setting2' % dataset_name
Y = pkl.load(gzip.open(os.path.join(base_dir, 'Y.pkl.gz'), 'rb'))
PU_test = pkl.load(gzip.open(os.path.join(base_dir, 'PU_test.pkl.gz'), 'rb'))

In [17]:
Y_test = Y[:, -PU_test.shape[1]:]
print(Y_test.shape)
#Y_test.sum(axis=0)

(45468, 933)


In [18]:
N, K = Y.shape
ustrs = ['U%d' % i for i in range(N)]
istrs = ['P%d' % j for j in range(K)]

In [19]:
rps_mf = []
hitrates_mf = {top: [] for top in TOPs}

assert Y_test.shape == PU_test.shape
offset = Y.shape[1] - PU_test.shape[1]
for j in range(Y_test.shape[1]):
    if (j+1) % 10 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    y1 = Y_test[:, j].toarray().reshape(-1)
    y2 = PU_test[:, j].toarray().reshape(-1)
    indices = np.where(0 == y2)[0]
    y_true = y1[indices]
    y_pred = np.asarray([algo.predict(ustrs[i], istrs[j + offset]).est for i in indices]).reshape(-1)
    
    rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
    rps_mf.append(rp)
    for top in TOPs:
        hitrates_mf[top].append(hr_dict[top])

930 / 933

In [20]:
mf_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_mf), 
                                   'Hit-Rate': {top: np.mean(hitrates_mf[top]) for top in hitrates_mf}}}}
mf_perf

{'30music': {'Test': {'Hit-Rate': {5: 0.0019514303764383827,
    10: 0.003880947498172262,
    20: 0.008069349825425384,
    30: 0.011854011998981699,
    50: 0.019093180623916052,
    100: 0.035565454327025695,
    200: 0.06437593417769719,
    300: 0.08656623466576946,
    500: 0.12432728071281228,
    1000: 0.19566627159459635},
   'R-Precision': 0.004668726886529791}}}

In [21]:
fperf_mf = os.path.join(base_dir, 'perf-mf.pkl')
print(fperf_mf)
pkl.dump(mf_perf, open(fperf_mf, 'wb'))
pkl.load(open(fperf_mf, 'rb'))

data/30music/setting2/perf-mf.pkl


{'30music': {'Test': {'Hit-Rate': {5: 0.0019514303764383827,
    10: 0.003880947498172262,
    20: 0.008069349825425384,
    30: 0.011854011998981699,
    50: 0.019093180623916052,
    100: 0.035565454327025695,
    200: 0.06437593417769719,
    300: 0.08656623466576946,
    500: 0.12432728071281228,
    1000: 0.19566627159459635},
   'R-Precision': 0.004668726886529791}}}

## Example

In [25]:
# pip install scikit-surprise
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

In [26]:
# Load the movielens-100k dataset (download it if needed),
data = Dataset.load_builtin('ml-100k')

In [27]:
# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data, test_size=.25)

In [28]:
# We'll use the famous SVD algorithm.
algo = SVD()

In [29]:
# Train the algorithm on the trainset
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fd75e197cc0>

In [30]:
#algo.test?

In [31]:
# Predict ratings for the testset
predictions = algo.test(testset)

In [32]:
type(predictions)

list

In [33]:
predictions[2]

Prediction(uid='654', iid='462', r_ui=4.0, est=3.701551581789513, details={'was_impossible': False})

In [35]:
predictions[2].est

3.701551581789513

In [34]:
# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 0.9385


0.9385069358051978