# Because You Watched

Using surprise SVD++ with matrix factorization to recommend movies

## Setup And Exploration

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from surprise import Dataset

# Set default seed
np.random.seed(2802)

In [2]:
ratings_df = pd.read_csv('datasets/movielens_100k/ratings.csv', usecols=['userId', 'movieId', 'rating'])

# Surprise dataset must have the columns in this order
ratings_df.columns = ['user_id', 'movie_id', 'rating']

ratings_df.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [3]:
movies = pd.read_csv('datasets/movielens_100k/movies.csv')

movies.columns = ['movie_id', 'title', 'genres']
movies = movies.set_index('movie_id')

movies.head()

Unnamed: 0_level_0,title,genres
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings = Dataset.load_from_df(ratings_df, rating_scale=(0.5, 5.0))

## Model Selection

In [None]:
from surprise import SVDpp

from surprise.model_selection import RandomizedSearchCV

param_distributions = {'n_factors': [150, 200],
                       'n_epochs': [150, 200],
                       'lr_all': [0.005, 0.008],
                       'reg_all': [0.1, 0.08]}

search = RandomizedSearchCV(SVDpp, param_distributions, measures=['rmse', 'mae'], cv=10, n_iter=5, n_jobs=-1)
search.fit(ratings)

print('RMSE Best Score', round(search.best_score['rmse'], 4))
print('RMSE Best Params', search.best_params['rmse'])

print('\nMAE Best Score', round(search.best_score['mae'], 4))
print('MAE Best Params', search.best_params['mae'])

In [7]:
from surprise.model_selection import cross_validate

model = search.best_estimator['rmse']

cv_results = cross_validate(model, ratings, measures=['rmse', 'mae'], cv=2, verbose=True)
cv_results

# Recommending Movies

## Predict Movie Ratings According a User

In [8]:
def to_prediction_info(prediction):
    movie_id, rating = prediction
    try:
        title = movies.loc[movie_id]['title']
        return (title, round(float(rating), 4))
    except:
        return None
        
def recommend_by_similar_users(user_id, n_recomendations=10):
    movie_ids = ratings.df['movie_id'].unique()
    user_movies_ids = ratings.df[ratings.df['user_id'] == user_id]['movie_id']   
    
    user_unwatched_movies_ids = np.setdiff1d(movie_ids, user_movies_ids)
    
    testset = [[user_id, movie_id, 0.0] # Parameter r_ui is unused in this context
               for movie_id in user_unwatched_movies_ids]
    
    predictions = model.test(testset)
    
    found_movies = [(pred.iid, pred.est) for pred in predictions]
    
    found_movies = map(to_prediction_info, found_movies)
    found_movies = filter(lambda prediction: prediction is not None, found_movies)
    found_movies = sorted(found_movies, key=lambda prediction: prediction[1], reverse=True)
    found_movies = list(found_movies[:n_recomendations])
    
    return found_movies

In [9]:
user_id = 10

recommendations = recommend_by_similar_users(user_id)
recommendations

[('Shawshank Redemption, The (1994)', 3.7052),
 ('Forrest Gump (1994)', 3.6853),
 ('To Kill a Mockingbird (1962)', 3.6802),
 ("One Flew Over the Cuckoo's Nest (1975)", 3.6518),
 ('Terminator 2: Judgment Day (1991)', 3.645),
 ('Apollo 13 (1995)', 3.6443),
 ('Odessa File, The (1974)', 3.644),
 ('Rocketeer, The (1991)', 3.6428),
 ('Fight Club (1999)', 3.6427),
 ('Groundhog Day (1993)', 3.638)]

## Checking Recommendations

In [10]:
user_movie_ratings = ratings.df[ratings.df['user_id'] == user_id].nlargest(10, 'rating')
user_movie_ratings = user_movie_ratings.reset_index()
user_movie_ratings = user_movie_ratings.set_index('movie_id')

user_movie_ratings = user_movie_ratings.join(movies)
user_movie_ratings = user_movie_ratings[['user_id', 'title', 'rating']]

user_movie_ratings

Unnamed: 0_level_0,user_id,title,rating
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
86548,10,Water for Elephants (2011),3.0


*Notes:*

This model fix some limitations of traditional models like:

- **Popularity bias:** When only the most popular movies will be considered on the recommendation.
- **Could start:** When there aren't many movies watched, the recommendation tends not to be very accurate.
    
When we factor the matrix, the popularity of the movie has no effect on the recommendations, so the amount of ratings is insignificant, solving both problems at once.

# Saving Model

In [11]:
import pickle

def save_model(filepath, fitted_model):
    try:
        with open(filepath, 'wb') as f:
            pickle.dump(fitted_model, f)
    except Exception as e:
        print(e)

        
def load_model(filepath):
    fitted_model = None
    try:
        with open(filepath, 'rb') as f:
            fitted_model = pickle.load(f)
            
        return fitted_model
    except Exception as e:
        print(e)

In [12]:
filename = 'svdpp_movies_recommender_matrix_factorization.pkl'
filepath = f'models/{filename}'

save_model(filepath, model)

[Errno 2] No such file or directory: 'models/svdpp_movies_recommender_matrix_factorization.pkl'
