### We are going to build movie recommendations on MovieLens (Netflix-style) ratings data.

In [None]:
# install surprise package
# https://surprise.readthedocs.io/en/stable/index.html

!pip install surprise

In [None]:
from surprise import KNNBaseline, SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import accuracy

import io  # needed because of weird encoding of u.item file


## Step 1 - Read Data

In [None]:
# Fetch the built-in MovieLens 100k ratings; Surprise downloads them on first use.
data = Dataset.load_builtin(name='ml-100k')
print("downloaded data")

## Step 2 - Train

In [None]:
%%time

## Train the algo

trainset = data.build_full_trainset()

sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)


algo.fit(trainset)

In [None]:
# Run 5-fold cross-validation and print results.
#
# cross_validate() calls .fit() on the estimator it is given once per fold, so
# passing `algo` itself would leave it trained on the last fold's 80% split
# instead of the full trainset. Evaluate a fresh, identically configured
# instance so the fitted `algo` stays intact for the steps below.
cv_algo = KNNBaseline(sim_options=sim_options)
cross_validate(cv_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

## Step 3 - Calculate The RMSE 

We want to see how our model does.

An RMSE below 0.5 stars should be considered a success: on a scale of one to five, our predictions are then typically less than half a star away from the true rating.

In [None]:
# Turn every rating of the full trainset back into a test set and score the
# model on it. These ratings come from the training data, so the estimate is
# optimistic (biased) and the RMSE should be low.
testset = trainset.build_testset()
predictions = algo.test(testset)
accuracy.rmse(predictions=predictions, verbose=True)



## Step 4 - Parse Data

In [None]:
def read_item_names(file_name=None):
    """Read a MovieLens 100k ``u.item`` file and build id <-> title lookups.

    Args:
        file_name: Optional path to a ``u.item``-formatted file: ISO-8859-1
            encoded, pipe-separated, with the raw item id in the first field
            and the movie title in the second. When omitted, the copy inside
            Surprise's dataset directory is used.

    Returns:
        A ``(rid_to_name, name_to_rid)`` tuple of dicts. The first maps raw
        item id (str) to movie title, the second maps movie title to raw
        item id (str).
    """
    if file_name is None:
        # Imported locally so the function does not depend on which notebook
        # cell happened to import get_dataset_dir before it is called.
        import os
        from surprise import get_dataset_dir

        file_name = os.path.join(get_dataset_dir(),
                                 'ml-100k', 'ml-100k', 'u.item')

    rid_to_name = {}
    name_to_rid = {}
    # u.item is not UTF-8; it has to be decoded as ISO-8859-1.
    with open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            fields = line.split('|')
            if len(fields) < 2:
                # Blank or malformed line: nothing to map, so skip it
                # instead of failing with an IndexError.
                continue
            raw_id, title = fields[0], fields[1]
            rid_to_name[raw_id] = title
            name_to_rid[title] = raw_id

    return rid_to_name, name_to_rid

In [None]:
from surprise import get_dataset_dir

# Build the raw id <-> movie title lookup tables.
rid_to_name, name_to_rid = read_item_names()

# Show the first ten entries of each table as a quick sanity check,
# with a blank line separating the two listings.
previews = [("rid_to_name:", rid_to_name), ("name_to_rid:", name_to_rid)]
for position, (label, mapping) in enumerate(previews):
    if position:
        print()
    print(label)
    entries = iter(mapping.items())
    for row in range(10):
        print(next(entries))

## Step 5 - Do Recommendations

Find similar movies


In [None]:
# Movie to find look-alikes for.
movie_name = 'Toy Story (1995)'
# movie_name = 'Get Shorty (1995)'

# Translate title -> raw id -> the inner id used by the fitted trainset.
movie_raw_id = name_to_rid[movie_name]
movie_inner_id = algo.trainset.to_inner_iid(movie_raw_id)

# Ask the model for the 10 nearest neighbors; they come back as inner ids.
neighbor_inner_ids = algo.get_neighbors(movie_inner_id, k=10)

# Map each neighbor back: inner id -> raw id -> movie title.
neighbor_names = [
    rid_to_name[algo.trainset.to_raw_iid(inner_id)]
    for inner_id in neighbor_inner_ids
]

print()
print('The 10 nearest neighbors for :  ', movie_name)
for title in neighbor_names:
    print(title)
