In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

Install Surprise library with either pip or conda:

```
pip install scikit-surprise
```
OR
```
conda install -c conda-forge scikit-surprise
```

In [2]:
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split, RandomizedSearchCV
from surprise import accuracy
from surprise import SVD, KNNWithMeans, SlopeOne, BaselineOnly
from surprise import get_dataset_dir
from surprise.model_selection import cross_validate

In [3]:
## movies
movies = pd.read_csv('movies.csv',dtype={'movieid':str})
id_to_name = movies.set_index('movieid')['title'].to_dict()

In [4]:
movies.head()

Unnamed: 0,movieid,title,genres
0,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
1,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
2,7153,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy
3,3578,Gladiator (2000),Action|Adventure|Drama
4,4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...


In [None]:
class_ratings = pd.read_csv('class_ratings.csv')
user_ids = class_ratings['userid'].unique()
n_ratings = class_ratings.groupby('userid').size()
users = zip(user_ids, n_ratings)

In [19]:
## MovieLens ratings
df = pd.read_csv('ratings_subset.csv', dtype={'userid': str, 'movieid': str, 'rating': float})

In [20]:
df.head()

Unnamed: 0,userid,movieid,rating
0,13,4993,5.0
1,16,106916,4.5
2,21,68157,4.5
3,37,68358,4.5
4,38,3996,5.0


In [21]:
## combine class ratings with ratings from MovieLens
df = pd.concat([df, class_ratings])

In [22]:
df.shape

(1060583, 3)

In [25]:
df.tail(2)

Unnamed: 0,userid,movieid,rating
3281,Emily,78499,3.5
3282,Jarom,5218,4.5


In [24]:
print(f'Unique users: {df["userid"].nunique()}')
print(f'Unique movies: {df["movieid"].nunique()}')
print(f'Average ratings/user: {df.groupby("userid").size().mean()}')

Unique users: 13638
Unique movies: 600
Average ratings/user: 77.76675465610793


**Train a model**

In [36]:
## Create a new data object with the updated data
reader = Reader(rating_scale=(0.5,5))
data = Dataset.load_from_df(df[['userid','movieid','rating']],reader)

In [37]:
# train-test split
trainset, testset = train_test_split(data, test_size=.25)

In [38]:
## Matrix factorization SVD
model_svd = SVD(n_factors=100)
model_svd.fit(trainset)


# ## KNN with mean adjustment
# ## Knn model take 3-3.5 min
# model_knn = KNNWithMeans(k=10)
# model_knn.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x32c4dfe00>

In [39]:

# ## Baseline
model_base = BaselineOnly()
model_base.fit(trainset)
base_predictions = model_base.test(testset)

# ## SlopeOne
model_so = SlopeOne()
model_so.fit(trainset)


Estimating biases using als...


<surprise.prediction_algorithms.slope_one.SlopeOne at 0x32c4d7470>

In [41]:
np.var(df['rating'])

1.0211489414899861

In [42]:
p_svd = model_svd.test(testset)
accuracy.rmse(p_svd)

RMSE: 0.7798


0.7797532940262601

In [43]:
## predicting from the model takes about 3 minutes
p_knn = model_knn.test(testset)
accuracy.rmse(p_knn)

NameError: name 'model_knn' is not defined

In [44]:
base_user, base_movie = model_base.compute_baselines()
p_base = model_base.test(testset)
accuracy.rmse(p_base)

RMSE: 0.8319


0.8318783064847846

In [45]:
p_so = model_so.test(testset)
accuracy.rmse(p_so)

RMSE: 0.8280


0.8280154360892522

In [None]:
## Tune hyperparameters for the SVD model
 ## This code takes about 8 mintues to run (on my computer at least)
 ## GridSearchCV will take much longer

# param_grid = {'n_factors': [50, 100, 150],
#               'n_epochs': [20, 30, 40],
#               'lr_all': [0.002, 0.008, 0.012],
#               'reg_all': [0.02, 0.06, 0.1, 0.15]}

# start_time = datetime.now()
# grid_search = RandomizedSearchCV(SVD, param_grid, measures=['rmse'], cv=5)
# grid_search.fit(data)
# end_time = datetime.now()

# min_to_run = (end_time-start_time).seconds/60
# print(f"Best RMSE score: {grid_search.best_score['rmse']}")
# print(f"Best parameters: {grid_search.best_params['rmse']}")
# print(f"Minutes to Run: {min_to_run}")

In [None]:
df.head()

**Get actual recommendations**

In [46]:
def get_top_recommendations(user, fitted_model, n_recommendations=10):
    unrated_movies = df[~df['movieid'].isin(df[df['userid'] == user]['movieid'])]
    predicted_ratings = []
    for movie in unrated_movies['movieid'].unique():
        predicted_ratings.append((movie, fitted_model.predict(user, movie).est))
    predicted_ratings.sort(key=lambda x: x[1], reverse=True)
    top_recommendations = predicted_ratings[:n_recommendations]
    print(f'Top {n_recommendations} recommendations for {user}')
    print('-----')
    for m,r in top_recommendations:
        print(f'{id_to_name[m]} ({round(r,3)})')

In [49]:
## Give recommendations for our class
get_top_recommendations('Caleb Christensen', model_svd, n_recommendations=10)

Top 10 recommendations for Caleb Christensen
-----
Spirited Away (Sen to Chihiro no kamikakushi) (2001) (4.31)
Lord of the Rings: The Return of the King, The (2003) (4.304)
Memento (2000) (4.12)
Eternal Sunshine of the Spotless Mind (2004) (4.08)
Old Boy (2003) (4.075)
Departed, The (2006) (3.964)
Dark Knight, The (2008) (3.922)
City of God (Cidade de Deus) (2002) (3.899)
Lives of Others, The (Das leben der Anderen) (2006) (3.881)
Interstellar (2014) (3.857)


In [48]:
user_ids

array(['Caleb Christensen', 'Emma Ouzts', 'James Christensen', 'Jessa',
       'Madison', 'Rebz27', 'Ryan Corry', 'Savage', 'Spencer Wilson',
       'TalmageA', 'Tyler Zaugg', 'Xela Marchant', 'bradyheinig',
       'brandon-keele', 'brycemartin', 'daphne', 'razedori', 'rebz27',
       'shannon'], dtype=object)