# Baseline using scikit-surprise

In [1]:
import helpers
from surprise_helpers import CustomReader, get_ratings_from_predictions
from surprise import Reader, Dataset

## Data loading
We load the data using our custom reader.
See: http://surprise.readthedocs.io/en/stable/getting_started.html#use-a-custom-dataset

In [2]:
reader = CustomReader()
filepath = helpers.get_train_file_path()
data = Dataset.load_from_file(filepath, reader=reader)

## Parameter search
We search for good values of parameters of the chosen algorithm.

In [57]:
from surprise import BaselineOnly
from surprise.model_selection import RandomizedSearchCV

algo = BaselineOnly
param_grid = {
    'bsl_options': {
        'method': ['als', 'sgd'],
        'n_epochs': [5, 10, 20],
        'reg': [0.2, 0.02, 0.002],
        'learning_rate': [0.05, 0.005, 0.0005],
        'reg_i': [5, 10, 15],
        'reg_u': [5, 15, 25]
    }
}
algo = RandomizedSearchCV(BaselineOnly,
                          param_grid,
                          n_iter=20,
                          measures=['rmse'], cv=3, n_jobs=-1,
                          refit=True # so we can use test() directly
                          joblib_verbose=1)

algo.fit(data)

print('Best score {} with parameters:'.format(algo.best_score['rmse']))
pd.DataFrame.from_dict(best_params['rmse']['bsl_options'])

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using als...
Estimating biases using als...
Estimati

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.6min


Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  2.7min finished


Best score 1.000172548115941 with parameters:


AttributeError: module 'pandas' has no attribute 'pd'

In [60]:
pd.DataFrame.from_dict(algo.best_params['rmse'])

Unnamed: 0,bsl_options
learning_rate,0.0005
method,als
n_epochs,20
reg,0.2
reg_i,10
reg_u,15


In [67]:
algo.best_params['rmse']

{'bsl_options': {'method': 'als',
  'n_epochs': 20,
  'reg': 0.2,
  'learning_rate': 0.0005,
  'reg_i': 10,
  'reg_u': 15}}

In [61]:
import pandas as pd
res = algo.cv_results
params = res['param_bsl_options']
short_res = {key: [params[i][key] for i in range(len(params))] for key in params[0].keys()}
short_res['rank'] = res['rank_test_rmse']
short_res['rmse'] = res['mean_test_rmse']
results_df = pd.DataFrame.from_dict(short_res)
results_df

Unnamed: 0,method,n_epochs,reg,learning_rate,reg_i,reg_u,rank,rmse
0,als,20,0.002,0.0005,15,5,11,1.000643
1,sgd,10,0.02,0.005,10,5,13,1.004419
2,sgd,10,0.2,0.005,15,5,15,1.009682
3,sgd,5,0.002,0.005,15,5,14,1.005124
4,als,10,0.2,0.0005,5,25,4,1.000366
5,als,10,0.02,0.0005,10,15,2,1.000184
6,als,20,0.2,0.0005,10,15,1,1.000173
7,als,20,0.02,0.05,10,25,6,1.000439
8,als,5,0.002,0.005,5,25,5,1.000405
9,sgd,10,0.002,0.05,5,15,17,1.03218


## Predicting
We load the test data to predict.

In [64]:
test_file_path = helpers.get_test_file_path()
test_data = Dataset.load_from_file(test_file_path, reader=reader)
testset = test_data.construct_testset(test_data.raw_ratings)
predictions = algo.test(testset)
predictions[0]

Estimating biases using als...


Prediction(uid=36, iid=0, r_ui=3.0, est=3.3015024741648467, details={'was_impossible': False})

We need to convert the predictions into the right format.

In [65]:
ratings = get_ratings_from_predictions(predictions)

Now we can write the file.

In [66]:
output = helpers.write_submission(ratings, 'submission_surprise_baseline.csv')
print(output[0:10])

Id,Predict
