In [1]:
from surprise import KNNBasic, BaselineOnly, NormalPredictor, KNNWithMeans, KNNBaseline, SVDpp, SVD, NMF
from surprise import Dataset
from surprise import evaluate, print_perf
from surprise.model_selection import cross_validate, GridSearchCV
from surprise import Reader
import os
import io

In [2]:
# load MovieLens dataset
dirname = os.path.dirname("__file__")
filename = os.path.join(dirname, 'dataset/ml-100k/u.data')
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file(filename, reader)

Train model using different algorithms in Surprise

1.KNNBasic 

In [3]:
sim_options = {'name': 'pearson_baseline', 'user_based': False}

alg = KNNBasic(sim_options=sim_options)
perf = cross_validate(alg, data, measures=['RMSE'], cv=3)
print_perf(perf)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Mean    
TEST_RMSE1.0082  1.0107  1.0030  1.0073  
FIT_TIME5.8820  5.6620  5.5680  5.7040  
TEST_TIME14.5570 15.3710 13.7760 14.5680 


2.KNNWithMeans

In [4]:
sim_options = {'name': 'pearson_baseline', 'user_based': False}

alg = KNNWithMeans(sim_options=sim_options)
perf = cross_validate(alg, data, measures=['RMSE'], cv=3)
print_perf(perf)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Mean    
TEST_RMSE0.9373  0.9360  0.9298  0.9343  
FIT_TIME5.0590  5.8020  5.3520  5.4043  
TEST_TIME16.1350 16.2540 15.8750 16.0880 


3.KNNBaseline

In [5]:
sim_options = {'name': 'pearson_baseline', 'user_based': False}

alg = KNNBaseline(sim_options=sim_options)
perf = cross_validate(alg, data, measures=['RMSE'], cv=3)
print_perf(perf)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
        Fold 1  Fold 2  Fold 3  Mean    
TEST_RMSE0.9244  0.9299  0.9275  0.9272  
FIT_TIME5.7800  5.9480  5.6190  5.7823  
TEST_TIME18.0030 19.0140 18.2310 18.4160 


4.BaselineOnly

In [6]:
bsl_options = {'method': ['sgd'], 'reg': [0.02, 0.04, 0.06], 'learning_rate': [0.002, 0.005, 0.008]}
param_grid = {'bsl_options': bsl_options}

#alg = BaselineOnly()
gs = GridSearchCV(BaselineOnly, param_grid, measures=['rmse'], cv=3)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])


Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
0.944231389941042
{'bsl_options': {'method': 'sgd', 'reg': 0.02, 'learning_rate': 0.008}}


5.NormalPredictor

In [7]:
alg = NormalPredictor()
perf = cross_validate(alg, data, measures=['RMSE'], cv=3)
print_perf(perf)

        Fold 1  Fold 2  Fold 3  Mean    
TEST_RMSE1.5247  1.5175  1.5150  1.5191  
FIT_TIME0.2610  0.2680  0.2600  0.2630  
TEST_TIME0.7280  0.6060  0.6140  0.6493  


6.SVD

In [8]:
param_grid = {'n_epochs': [5, 10, 15], 'lr_all': [0.002, 0.004, 0.008], 'reg_all': [0.2, 0.5, 0.8]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])


0.9481902425711644
{'n_epochs': 15, 'lr_all': 0.008, 'reg_all': 0.2}


7.SVDpp

In [3]:
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.004], 'reg_all': [0.2, 0.5]}
gs = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])


0.956871414282793
{'n_epochs': 10, 'lr_all': 0.004, 'reg_all': 0.2}


8.NMF

In [4]:
param_grid = {'n_epochs': [5, 10, 15]}
gs = GridSearchCV(NMF, param_grid, measures=['rmse'], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])


1.002760792834376
{'n_epochs': 10}


In [3]:
# movie id and name mapping
def read_item_names():
    dirname = os.path.dirname("__file__")
    filename = os.path.join(dirname, 'dataset/ml-100k/u.item')
    rid_to_name = {}
    name_to_rid = {}
    with io.open(filename, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]
    return rid_to_name, name_to_rid


In [4]:
# show topN similar movies as given movie_name
def showSimilarMovies(alg, rid_to_name, name_to_rid, movie_name, topN):
    # get movie raw_id
    movie_raw_id = name_to_rid[movie_name]
    # convert raw_id to inner_id
    movie_inner_id = alg.trainset.to_inner_iid(movie_raw_id)
    # get topN similar movies
    neighbor_movie_ids = alg.get_neighbors(movie_inner_id, topN)
    neighbors_raw_ids = [alg.trainset.to_raw_iid(inner_id) for inner_id in neighbor_movie_ids]
    neighbors_movies = [rid_to_name[raw_id] for raw_id in neighbors_raw_ids]
    print("The " + str(topN)  + " nearest neighbors of " + movie_name + " are: ")
    print()
    for movie in neighbors_movies:
        print(movie)

Choose KNNBaseline alg to build model

In [5]:
# Use KNNBaseline to build model
sim_options = {'name': 'pearson_baseline', 'user_based': False}
trainset = data.build_full_trainset()
alg = KNNBaseline(sim_options=sim_options)
alg.fit(trainset)


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x914d668>

In [6]:
rid_to_name, name_to_rid = read_item_names()
movie_name = "Twelve Monkeys (1995)"

# get topN recommondation based on given movie name
showSimilarMovies(alg, rid_to_name, name_to_rid, movie_name, 5)


The 5 nearest neighbors of Twelve Monkeys (1995) are: 

Bob Roberts (1992)
Star Trek: The Wrath of Khan (1982)
Dead Man Walking (1995)
Clockwork Orange, A (1971)
Brazil (1985)
