# NMF baseline algorithm using surprise package:

This notebook uses the `matrix_factorization.NMF` algorithm from the Surprise library; see http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.NMF


In [1]:
import pandas as pd
import numpy as np
from scipy import stats

import helpers
from surprise_helpers import CustomReader, get_ratings_from_predictions
from surprise import Reader, Dataset
from surprise.model_selection.search import RandomizedSearchCV
from surprise.prediction_algorithms.matrix_factorization import NMF

## Data Loading

In [2]:
# Load the training ratings into a Surprise Dataset.
# CustomReader comes from the project's surprise_helpers module and the file
# path from the project's helpers module; the file format itself is defined
# there, not in this notebook.
reader = CustomReader()
filepath = helpers.get_train_file_path()
data = Dataset.load_from_file(filepath, reader=reader)

## Search over Parameters

In [None]:
# Randomized hyper-parameter search for NMF, scored by 10-fold cross-validated RMSE.
#
# scipy's frozen distributions are parameterised as:
#   * stats.randint(low, high)  -> integers in [low, high)      (high is exclusive)
#   * stats.uniform(loc, scale) -> floats in [loc, loc + scale]
# The second argument of `uniform` is a WIDTH, not an upper bound, so every
# regularisation range below is written as uniform(low, high - low).
#
# NOTE(review): per the Surprise documentation, reg_bu / reg_bi are only
# relevant when NMF is created with biased=True (the default is False), so as
# written these two dimensions presumably have no effect on the score -- confirm
# and either drop them or add 'biased': [True].
SEARCH_SEED = 0  # fixes which candidate combinations are sampled on a re-run

param_grid = {
    'n_epochs': stats.randint(230, 290),
    'n_factors': stats.randint(1, 30),
    'reg_pu': stats.uniform(0.1, 0.1),      # [0.10, 0.20]
    'reg_qi': stats.uniform(0.1, 0.1),      # [0.10, 0.20]
    'reg_bu': stats.uniform(0.25, 0.20),    # [0.25, 0.45]
    'reg_bi': stats.uniform(0.85, 0.15),    # [0.85, 1.00]
}

# n_iter=20 candidates x cv=10 folds = 200 fits, spread over all cores (n_jobs=-1).
gs = RandomizedSearchCV(algo_class=NMF, param_distributions=param_grid, measures=['rmse'],
                        cv=10, joblib_verbose=100, n_jobs=-1, n_iter=20,
                        random_state=SEARCH_SEED)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

In [None]:
# Tabulate the per-candidate cross-validation results of the search
# (one column per key of gs.cv_results).
pd.DataFrame(gs.cv_results)

## Train

In [4]:
# Hyper-parameters chosen from the search results above, kept together so the
# final model's configuration can be read at a glance.
# NOTE(review): per the Surprise documentation reg_bu / reg_bi only matter when
# biased=True, which is not set here -- confirm whether they are intended to apply.
best_params = dict(
    n_epochs=256,
    n_factors=15,
    reg_pu=0.1686198480906289,
    reg_qi=0.13702113849142605,
    reg_bu=0.34284468705230597,
    reg_bi=0.93748069281861202,
)
algo = NMF(**best_params)

# Fit on the full training set built from `data` (no hold-out split).
full_trainset = data.build_full_trainset()
algo.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x7f8aae36cfd0>

## Predicting
We load the test data and predict a rating for each of its entries.

In [5]:
# Load the test file with the same reader that was used for the training data.
test_file_path = helpers.get_test_file_path()
test_data = Dataset.load_from_file(test_file_path, reader=reader)
# Build a testset from the raw rows of the test file and run the fitted model on it.
testset = test_data.construct_testset(test_data.raw_ratings)
predictions = algo.test(testset)
# Display the first prediction as a quick sanity check.
predictions[0]

Prediction(uid=36, iid=0, r_ui=3.0, est=3.1898341221489006, details={'was_impossible': False})

We need to convert the predictions into the right format.

In [6]:
# Convert the Surprise predictions into the structure that
# helpers.write_submission consumes (project helper from surprise_helpers;
# the exact output shape is defined there).
ratings = get_ratings_from_predictions(predictions)

Now we can write the file.

In [7]:
# Name of the submission file handed to the project's writer helper.
submission_filename = 'submission_surprise_NMF_0.csv'
output = helpers.write_submission(ratings, submission_filename)

# Preview only the first 100 elements of the returned content
# (presumably the CSV text, judging by the printed output).
print(output[:100])

Id,Prediction
r37_c1,3.189834
r73_c1,2.928078
r156_c1,3.547578
r160_c1,3.170538
r248_c1,3.208088
r25
