# KNN
Baseline k-NN algorithm using the Surprise package.

We use the `knns.KNNBaseline` algorithm, documented at http://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html

In [1]:
import pandas as pd
import numpy as np
from scipy import stats

from surprise.model_selection import KFold, cross_validate
from surprise import Reader, Dataset
from surprise.model_selection.search import RandomizedSearchCV
from surprise.prediction_algorithms.knns import KNNBaseline

import data_handler
from surprise_extensions import CustomReader, get_ratings_from_predictions

## Data loading

In [2]:
# Locate the training ratings file and parse it with the project's CustomReader.
# `reader` is kept around because the test file is parsed with it later on.
filepath = data_handler.get_train_file_path()
reader = CustomReader()
data = Dataset.load_from_file(file_path=filepath, reader=reader)

## Search over params


In [None]:
# Seed shared by the hyper-parameter sampling and the CV fold shuffling, so
# that re-running this cell evaluates the same candidates on the same folds.
SEED = 42

param_grid = {
    # k is sampled from the integers 5..99 (scipy's upper bound is exclusive)
    'k': stats.randint(5, 100),
    'sim_options': {
        'name': ['pearson_baseline'],
        # only these two shrinkage values are candidates
        'shrinkage': [0, 100],
    }
}
# 10-fold CV as before, but with seeded folds and seeded parameter sampling.
gs = RandomizedSearchCV(algo_class=KNNBaseline, param_distributions=param_grid, measures=['rmse'],
                        cv=KFold(n_splits=10, random_state=SEED), random_state=SEED,
                        joblib_verbose=100, n_jobs=-1, n_iter=5)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

In [None]:
# Show the per-candidate cross-validation results as a table.
pd.DataFrame(gs.cv_results)

## Results: params 

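Note: the searches were run on the Leonhard cluster (20 cores and 22 GB memory) with `cv=10`. <br/>
Each entry below is an RMSE score followed by the parameters that produced it (lower is better). Many entries use `k` and `shrinkage` values outside the ranges in the search cell above, so they presumably come from earlier runs with different ranges.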

0.990511947895
{'k': 237, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 279, 'user_based': True}}

0.9905256316
{'k': 244, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 256, 'user_based': True}}

0.990541323231
{'k': 283, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 489, 'user_based': True}}

0.990582491183
{'k': 303, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 396, 'user_based': True}}

0.990559809349
{'k': 287, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 420, 'user_based': True}}

0.990600352326
{'k': 239, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 455, 'user_based': True}}

0.990620245289
{'k': 286, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 285, 'user_based': True}}

0.990653184151
{'k': 215, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 254, 'user_based': True}}

0.990740953266
{'k': 269, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 293, 'user_based': True}}

------

0.990745916194
{'k': 190, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 191, 'user_based': True}}

0.990751578428
{'k': 175, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 166, 'user_based': True}}

0.990777929504
{'k': 174, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 159, 'user_based': True}}

0.9907865737
{'k': 174, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 144, 'user_based': True}}

0.990789001481
{'k': 165, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 91, 'user_based': True}}

0.990866512039
{'k': 157, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 175, 'user_based': True}}

0.990881215984
{'k': 193, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 108, 'user_based': True}}

0.990881263871
{'k': 186, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 71, 'user_based': True}}

0.990905733161
{'k': 197, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 61, 'user_based': True}}

0.990904062625
{'k': 176, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 168, 'user_based': True}}

0.990961856449
{'k': 183, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 63, 'user_based': True}}

0.991028325888
{'k': 147, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.991081017802
{'k': 136, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.99108224188
{'k': 149, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.991085028421
{'k': 139, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.991131609141
{'k': 161, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 54, 'user_based': True}}

0.99112466769
{'k': 140, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.991160936299
{'k': 136, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.991818290107
{'k': 99, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.991848814897
{'k': 98, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.991922408271
{'k': 95, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.992005703139
{'k': 96, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.992005703139
{'k': 96, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.992342619036
{'k': 89, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.992483581052
{'k': 85, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.992472079676
{'k': 87, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.992750060565
{'k': 79, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.993845243292
{'k': 65, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 100, 'user_based': True}}

0.997544923863
{'k': 96, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 0, 'user_based': True}}

0.998283812548
{'k': 86, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 0, 'user_based': True}}

1.008268129160658
{}


## Train

In [3]:
# Hyper-parameters copied from the best-scoring (lowest RMSE) entry of the search results:
# {'k': 237, 'sim_options': {'name': 'pearson_baseline', 'shrinkage': 279, 'user_based': True}}
sim_options = dict(name='pearson_baseline', shrinkage=279, user_based=True)
algo = KNNBaseline(k=237, sim_options=sim_options)

# Fit on the trainset built from the complete training data.
full_trainset = data.build_full_trainset()
algo.fit(full_trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x1025fb0f0>

## Predicting
We load the test data and predict a rating for each of its entries.

In [4]:
# Read the test file with the same reader that parsed the training data.
test_file_path = data_handler.get_test_file_path()
test_data = Dataset.load_from_file(file_path=test_file_path, reader=reader)

# Build a testset from the raw rating rows and run the fitted model on it.
testset = test_data.construct_testset(test_data.raw_ratings)
predictions = algo.test(testset)

# Peek at the first prediction as a sanity check.
predictions[0]

Prediction(uid=36, iid=0, r_ui=3.0, est=3.2370282594995112, details={'actual_k': 186, 'was_impossible': False})

We need to convert the predictions into the right format.

In [5]:
# Convert the surprise predictions into the `ratings` value that is handed to
# data_handler.write_submission below. The helper is project-local; its exact
# output format is not visible here -- TODO confirm against surprise_extensions.
ratings = get_ratings_from_predictions(predictions)

Now we can write the file.

In [6]:
# Write the submission under a fixed file name, then preview the start of
# whatever write_submission returned (first 100 characters).
submission_name = 'submission_surprise_KNNBaseline_1.csv'
output = data_handler.write_submission(ratings, submission_name)
print(output[:100])

Id,Prediction
r37_c1,3.237028
r73_c1,2.963425
r156_c1,3.719131
r160_c1,3.311598
r248_c1,3.266735
r25
