# KNN
baseline algorithm using surprise package:

using `knns.KNNBaseline` algorithm from http://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html

In [2]:
import pandas as pd
import numpy as np
from scipy import stats

from surprise.model_selection import cross_validate
from surprise import Reader, Dataset
from surprise.model_selection.search import RandomizedSearchCV
from surprise.prediction_algorithms.knns import KNNBaseline

import helpers
from surprise_helpers import CustomReader, get_ratings_from_predictions

## Data loading

In [3]:
reader = CustomReader()
filepath = helpers.get_train_file_path()
data = Dataset.load_from_file(filepath, reader=reader)

## Search over params


In [None]:
param_grid = {
    'k': stats.randint(5,100),
    'sim_options': {
        'name': ['pearson_baseline'],
        'shrinkage':[0,100],
    }
}
gs = RandomizedSearchCV(algo_class=KNNBaseline, param_distributions=param_grid, measures=['rmse'], 
                        cv=10, joblib_verbose=100, n_jobs=-1, n_iter=5)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

In [None]:
pd.DataFrame.from_dict(gs.cv_results)

## Results: params 

note: run on Leonhard cluster (20 cores and 22GB mem) <br/>
cv=10

1.008268129160658
{}

0.991818290107
{'k': 99, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.991848814897
{'k': 98, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.991922408271
{'k': 95, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.992005703139
{'k': 96, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.992005703139
{'k': 96, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.992342619036
{'k': 89, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.992483581052
{'k': 85, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.992472079676
{'k': 87, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.992750060565
{'k': 79, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.993845243292
{'k': 65, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.997544923863
{'k': 96, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 0, 'user_based': True}}

0.998283812548
{'k': 86, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 0, 'user_based': True}}


## Train

In [3]:
# choose optimal params from above
algo = 

# train 
algo.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x109d6ccc0>

## Predicting
We load the test data to predict.

In [4]:
test_file_path = helpers.get_test_file_path()
test_data = Dataset.load_from_file(test_file_path, reader=reader)
testset = test_data.construct_testset(test_data.raw_ratings)
predictions = algo.test(testset)
predictions[0]

Prediction(uid=36, iid=0, r_ui=3.0, est=3.3858988120492683, details={'was_impossible': False})

We need to convert the predictions into the right format.

In [5]:
ratings = get_ratings_from_predictions(predictions)

Now we can write the file.

In [6]:
output = helpers.write_submission(ratings, 'submission_surprise_SVDpp_0.csv')
print(output[0:100])

Id,Prediction
r37_c1,3.385899
r73_c1,3.124768
r156_c1,3.766790
r160_c1,3.321574
r248_c1,3.561472
r25
