# SVD baseline algorithm using surprise package:

This notebook uses the `matrix_factorization.SVD` algorithm from the Surprise prediction algorithms package: http://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html

In [12]:
import pandas as pd
import numpy as np
from scipy import stats

import helpers
from surprise_helpers import CustomReader, get_ratings_from_predictions
from surprise import Reader, Dataset
from surprise.model_selection.search import RandomizedSearchCV
from surprise.prediction_algorithms.matrix_factorization import SVD

## Data loading

In [13]:
# Parse the training ratings file with the project's custom reader; the same
# `reader` instance is reused later for the test file.
filepath = helpers.get_train_file_path()
reader = CustomReader()
data = Dataset.load_from_file(file_path=filepath, reader=reader)

## Search over params


In [None]:
# Fixed seed so the randomly sampled hyper-parameter candidates are the same
# on every run. NOTE(review): this seeds the candidate sampling only; CV fold
# shuffling and SVD's own initialisation remain unseeded.
SEED = 42

# Search space for SVD.
# scipy's `stats.uniform(loc, scale)` samples from [loc, loc + scale] (the
# second argument is a width, not an upper bound) and `stats.randint(low, high)`
# samples integers from [low, high).
param_grid = {'n_epochs': stats.randint(5, 20),        # integers 5..19
              'lr_all': stats.uniform(0.002, 0.005),   # [0.002, 0.007]
              'reg_all': stats.uniform(0.02, 0.6),     # [0.02, 0.62]
              'n_factors': stats.randint(50, 150),     # integers 50..149
             }

# 100 random candidates, each scored by 10-fold cross-validated RMSE,
# using all available cores.
gs = RandomizedSearchCV(algo_class=SVD, param_distributions=param_grid, measures=['rmse'],
                        cv=10, joblib_verbose=100, n_jobs=-1, n_iter=100,
                        random_state=SEED)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

In [None]:
# Tabulate the cross-validation results of the search, one column per
# entry of `gs.cv_results`.
cv_results_df = pd.DataFrame(gs.cv_results)
cv_results_df

## Results: params 

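Note: the searches were run on the Leonhard cluster (20 cores, 22 GB of memory) with `cv=10`.
Each pair below is the best RMSE of one search run, followed by the parameters that achieved it.
Some values (e.g. `n_factors` of 5, 9 or 42) lie outside the search space defined above, so those runs presumably used different parameter distributions.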

0.996617863993
{'lr_all': 0.0080655611939959484, 'n_epochs': 19, 'n_factors': 9, 'reg_all': 0.042201220509606799}

0.998777808857
{'n_factors': 5}

1.00085021593
{'lr_all': 0.0035314408264436933, 'n_epochs': 19, 'n_factors': 50, 'reg_all': 0.027105037999075404}

1.00534587695
{'lr_all': 0.0034656840329879137, 'n_epochs': 10, 'n_factors': 42, 'reg_all': 0.12231592623013628}

1.00104676332
{'lr_all': 0.0066032381482039656, 'n_epochs': 17, 'n_factors': 107, 'reg_all': 0.036362623151074552}

1.00382744957
{'lr_all': 0.0045664408289589759, 'n_epochs': 12, 'n_factors': 9, 'reg_all': 0.04029560227746723}

## Train

In [15]:
# Hyper-parameters of the best-scoring run listed in the results above
# (RMSE 0.9966).
best_params = {
    'n_epochs': 19,
    'lr_all': 0.0080655611939959484,
    'reg_all': 0.042201220509606799,
    'n_factors': 9,
}
algo = SVD(**best_params)

# Fit on the trainset built from the whole training dataset.
full_trainset = data.build_full_trainset()
algo.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x13bef8940>

## Predicting
We load the test data and predict a rating for each of its entries.

In [16]:
# Read the test file with the same reader that parsed the training data.
test_data = Dataset.load_from_file(helpers.get_test_file_path(), reader=reader)

# Turn the raw entries into a testset that `algo.test` accepts.
testset = test_data.construct_testset(test_data.raw_ratings)

# Predict every test entry with the trained model; show the first prediction
# as a sanity check.
predictions = algo.test(testset)
predictions[0]

Prediction(uid=36, iid=0, r_ui=3.0, est=3.3141439284719922, details={'was_impossible': False})

We need to convert the predictions into the right format.

In [17]:
# Convert the Surprise predictions into the structure passed to
# `helpers.write_submission` below.
# NOTE(review): the exact output format is defined in surprise_helpers — confirm there.
ratings = get_ratings_from_predictions(predictions)

Now we can write the file.

In [18]:
# Write the submission file and keep the returned content for inspection.
submission_name = 'submission_surprise_SVD_0.csv'
output = helpers.write_submission(ratings, submission_name)

# Preview only the beginning of the submission.
print(output[:100])

Id,Prediction
r37_c1,3.314144
r73_c1,3.072523
r156_c1,3.743513
r160_c1,3.346467
r248_c1,3.582211
r25
