###  Matrix factorization

In [10]:
import numpy as np
import pandas as pd
from surprise import Dataset
from surprise import SVD
from surprise import evaluate
from surprise import GridSearch

In [2]:
path = '~/data/'

In [4]:
import os
os.getcwd()

'/home/yamila/repos/matrixFactorization/codigo'

### Data set
Movie Lens data set

In [13]:
data = Dataset.load_builtin('ml-100k')
# si no tenes el data set, lo descarga y lo guarda en la carpeta de data sets de surprise.
# Tambien se puede bajar de http://files.grouplens.org/datasets/movielens/ml-100k.zip

In [21]:
names = ['user_id','item_id','rating','timestamp']
df = pd.read_csv( 'data/u.data', sep ='\t',names = names)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [23]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print('users:'+str(n_users))
print('movies:'+str(n_items))

users:943
movies:1682


### Medidas del error

Root mean squared error
$$RMSE = \sqrt{\frac{1}{n} \sum_{(i,j)} (r_{ij}-\hat{r}_{ij})^2}$$
Mean Absolute error
$$MAE = \frac{1}{n} \sum_{(i,j)}|r_{ij}-\hat{r}_{ij}|$$

In [24]:
data.split(n_folds=2)


In [25]:
help(SVD)

Help on class SVD in module surprise.prediction_algorithms.matrix_factorization:

class SVD(surprise.prediction_algorithms.algo_base.AlgoBase)
 |  The famous *SVD* algorithm, as popularized by `Simon Funk
 |  <http://sifter.org/~simon/journal/20061211.html>`_ during the Netflix
 |  Prize. When baselines are not used, this is equivalent to Probabilistic
 |  Matrix Factorization :cite:`salakhutdinov2008a` (see :ref:`note
 |  <unbiased_note>` below).
 |  
 |  The prediction :math:`\hat{r}_{ui}` is set as:
 |  
 |  .. math::
 |      \hat{r}_{ui} = \mu + b_u + b_i + q_i^Tp_u
 |  
 |  If user :math:`u` is unknown, then the bias :math:`b_u` and the factors
 |  :math:`p_u` are assumed to be zero. The same applies for item :math:`i`
 |  with :math:`b_i` and :math:`q_i`.
 |  
 |  For details, see equation (5) from :cite:`Koren:2009`. See also
 |  :cite:`Ricci:2010`, section 5.3.1.
 |  
 |  To estimate all the unknown, we minimize the following regularized squared
 |  error:
 |  
 |  .. math::
 |

In [35]:
algo = SVD(n_factors = 50,
          n_epochs = 20,
          biased = False,
          init_mean = 0,
          init_std_dev = 0.1,
          lr_all = 0.005,
          reg_all = 0.02)
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])


Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.9897
MAE:  0.7805
------------
Fold 2
RMSE: 0.9804
MAE:  0.7712
------------
------------
Mean RMSE: 0.9850
Mean MAE : 0.7759
------------
------------


In [36]:
help(evaluate)

Help on function evaluate in module surprise.evaluate:

evaluate(algo, data, measures=['rmse', 'mae'], with_dump=False, dump_dir=None, verbose=1)
    Evaluate the performance of the algorithm on given data.
    
    Depending on the nature of the ``data`` parameter, it may or may not
    perform cross validation.
    
    Args:
        algo(:obj:`AlgoBase             <surprise.prediction_algorithms.algo_base.AlgoBase>`):
            The algorithm to evaluate.
        data(:obj:`Dataset <surprise.dataset.Dataset>`): The dataset on which
            to evaluate the algorithm.
        measures(list of string): The performance measures to compute. Allowed
            names are function names as defined in the :mod:`accuracy
            <surprise.accuracy>` module. Default is ``['rmse', 'mae']``.
        with_dump(bool): If True, the predictions and the algorithm will be
            dumped for later further analysis at each fold (see :ref:`FAQ
            <serialize_an_algorithm>`). The fil

In [None]:
algo = SVD(n_factors = 100,
          n_epochs = 20,
          biased = False,
          reg_all = 0.02)
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])

In [None]:
print(perf)

In [37]:
param_grid = {'n_factors':[10,50,70,100,200], 'n_epochs' : [20,40,60], 'biased' : [True,False], 'reg_all' : [0.01,0.02,0.03]}
grid_search = GridSearch(SVD, param_grid, measures = ['RMSE','MAE'])

[{'n_factors': 10, 'n_epochs': 20, 'biased': True, 'reg_all': 0.01}, {'n_factors': 10, 'n_epochs': 20, 'biased': True, 'reg_all': 0.02}, {'n_factors': 10, 'n_epochs': 20, 'biased': True, 'reg_all': 0.03}, {'n_factors': 10, 'n_epochs': 20, 'biased': False, 'reg_all': 0.01}, {'n_factors': 10, 'n_epochs': 20, 'biased': False, 'reg_all': 0.02}, {'n_factors': 10, 'n_epochs': 20, 'biased': False, 'reg_all': 0.03}, {'n_factors': 10, 'n_epochs': 40, 'biased': True, 'reg_all': 0.01}, {'n_factors': 10, 'n_epochs': 40, 'biased': True, 'reg_all': 0.02}, {'n_factors': 10, 'n_epochs': 40, 'biased': True, 'reg_all': 0.03}, {'n_factors': 10, 'n_epochs': 40, 'biased': False, 'reg_all': 0.01}, {'n_factors': 10, 'n_epochs': 40, 'biased': False, 'reg_all': 0.02}, {'n_factors': 10, 'n_epochs': 40, 'biased': False, 'reg_all': 0.03}, {'n_factors': 10, 'n_epochs': 60, 'biased': True, 'reg_all': 0.01}, {'n_factors': 10, 'n_epochs': 60, 'biased': True, 'reg_all': 0.02}, {'n_factors': 10, 'n_epochs': 60, 'biased

In [38]:
grid_search.evaluate(data)

------------
Parameters combination 1 of 90
params:  {'n_factors': 10, 'n_epochs': 20, 'biased': True, 'reg_all': 0.01}
------------
Mean RMSE: 0.9483
Mean MAE : 0.7491
------------
------------
Parameters combination 2 of 90
params:  {'n_factors': 10, 'n_epochs': 20, 'biased': True, 'reg_all': 0.02}
------------
Mean RMSE: 0.9493
Mean MAE : 0.7503
------------
------------
Parameters combination 3 of 90
params:  {'n_factors': 10, 'n_epochs': 20, 'biased': True, 'reg_all': 0.03}
------------
Mean RMSE: 0.9489
Mean MAE : 0.7505
------------
------------
Parameters combination 4 of 90
params:  {'n_factors': 10, 'n_epochs': 20, 'biased': False, 'reg_all': 0.01}
------------
Mean RMSE: 0.9770
Mean MAE : 0.7653
------------
------------
Parameters combination 5 of 90
params:  {'n_factors': 10, 'n_epochs': 20, 'biased': False, 'reg_all': 0.02}
------------
Mean RMSE: 0.9792
Mean MAE : 0.7684
------------
------------
Parameters combination 6 of 90
params:  {'n_factors': 10, 'n_epochs': 20, '

------------
Mean RMSE: 1.0226
Mean MAE : 0.8057
------------
------------
Parameters combination 47 of 90
params:  {'n_factors': 70, 'n_epochs': 40, 'biased': False, 'reg_all': 0.02}
------------
Mean RMSE: 1.0023
Mean MAE : 0.7902
------------
------------
Parameters combination 48 of 90
params:  {'n_factors': 70, 'n_epochs': 40, 'biased': False, 'reg_all': 0.03}
------------
Mean RMSE: 0.9904
Mean MAE : 0.7814
------------
------------
Parameters combination 49 of 90
params:  {'n_factors': 70, 'n_epochs': 60, 'biased': True, 'reg_all': 0.01}
------------
Mean RMSE: 1.0195
Mean MAE : 0.7996
------------
------------
Parameters combination 50 of 90
params:  {'n_factors': 70, 'n_epochs': 60, 'biased': True, 'reg_all': 0.02}
------------
Mean RMSE: 0.9974
Mean MAE : 0.7833
------------
------------
Parameters combination 51 of 90
params:  {'n_factors': 70, 'n_epochs': 60, 'biased': True, 'reg_all': 0.03}
------------
Mean RMSE: 0.9814
Mean MAE : 0.7714
------------
------------
Paramete

In [39]:
grid_search.best_params['RMSE']


{'biased': True, 'n_epochs': 20, 'n_factors': 10, 'reg_all': 0.01}

In [40]:
grid_search.best_params['MAE']

{'biased': True, 'n_epochs': 20, 'n_factors': 10, 'reg_all': 0.01}

In [41]:
results = pd.DataFrame.from_dict(grid_search.cv_results)


In [43]:
results.to_csv('data/results.csv')

In [46]:
results = pd.read_csv('data/results.csv')
results

Unnamed: 0.1,Unnamed: 0,MAE,RMSE,n_epochs,n_factors,params,scores
0,0,0.749063,0.948313,20,10,"{'n_factors': 10, 'n_epochs': 20, 'biased': Tr...","{'RMSE': 0.94831340289822119, 'MAE': 0.7490631..."
1,1,0.750258,0.949310,20,10,"{'n_factors': 10, 'n_epochs': 20, 'biased': Tr...","{'RMSE': 0.94930958352407002, 'MAE': 0.7502579..."
2,2,0.750506,0.948872,20,10,"{'n_factors': 10, 'n_epochs': 20, 'biased': Tr...","{'RMSE': 0.9488721774402662, 'MAE': 0.75050611..."
3,3,0.765346,0.977037,20,10,"{'n_factors': 10, 'n_epochs': 20, 'biased': Fa...","{'RMSE': 0.97703720565431973, 'MAE': 0.7653462..."
4,4,0.768364,0.979233,20,10,"{'n_factors': 10, 'n_epochs': 20, 'biased': Fa...","{'RMSE': 0.97923343576997302, 'MAE': 0.7683635..."
5,5,0.768952,0.979095,20,10,"{'n_factors': 10, 'n_epochs': 20, 'biased': Fa...","{'RMSE': 0.97909459030187329, 'MAE': 0.7689516..."
6,6,0.753730,0.961738,40,10,"{'n_factors': 10, 'n_epochs': 40, 'biased': Tr...","{'RMSE': 0.96173807686370871, 'MAE': 0.7537295..."
7,7,0.751323,0.955722,40,10,"{'n_factors': 10, 'n_epochs': 40, 'biased': Tr...","{'RMSE': 0.95572219250989821, 'MAE': 0.7513228..."
8,8,0.751270,0.953942,40,10,"{'n_factors': 10, 'n_epochs': 40, 'biased': Tr...","{'RMSE': 0.95394181118022869, 'MAE': 0.7512704..."
9,9,0.767020,0.979931,40,10,"{'n_factors': 10, 'n_epochs': 40, 'biased': Fa...","{'RMSE': 0.97993059262316762, 'MAE': 0.7670200..."
