In [1]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
%store -r anime_ratings

### Baseline Model

In [3]:
# Import Surprise
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import cross_validate, train_test_split
from surprise.prediction_algorithms import SVD
from surprise.model_selection import GridSearchCV

In [4]:
# Get data ready for Surprise
# With collaborative filtering, only user, item, and rating are needed
data = anime_ratings[['user_id', 'MAL_ID', 'rating']]

reader = Reader(line_format='user item rating', sep=',')
data = Dataset.load_from_df(data, reader=reader)

In [5]:
# Create a train test split
trainset, testset = train_test_split(data, test_size=.2, random_state=42)

In [6]:
# Instatiate svd
svd = SVD(n_factors=15, n_epochs=10, lr_all=0.005, reg_all=0.02, random_state=42)

# Fit to the training set
svd.fit(trainset)

# Create predictions
predictions = svd.test(testset)

# Measure the accuracy of the predictions
rmse = accuracy.rmse(predictions)
print("RMSE:", rmse)

RMSE: 3.2239
RMSE: 3.223900678618706


__Insights:__ The baseline model's predictions were 3 points off from the actual ratings. This is a fairly large spread.

### Model 2

In [20]:
# Get data ready for Surprise
# With collaborative filtering, only user, item, and rating are needed
data = anime_ratings[['user_id', 'MAL_ID', 'rating']]

# Take a sample of data to use in a grid search
num_samples = 100000
sampled_data = data.sample(n=num_samples, random_state=42)

# Instatiate reader and load sampled_data using surprise 
reader = Reader(line_format='user item rating', sep=',')
sampled_data = Dataset.load_from_df(sampled_data, reader=reader)

In [21]:
param_grid = {
    'n_factors': [5, 15, 20],
    'n_epochs': [5, 10, 25],
    'lr_all': [0.002, 0.005, 0.01],
    'reg_all': [0.02, 0.04, 0.06]
}

In [22]:
from surprise.model_selection import KFold

In [23]:
# Define the parameter grid
param_grid = {
    'n_factors': [5, 15, 20],
    'n_epochs': [5, 10, 25],
    'lr_all': [0.002, 0.005, 0.01],
    'reg_all': [0.02, 0.04, 0.06]
}

# Create a 5-fold cross-validation object
kf = KFold(n_splits=5, random_state=42)

# Create a GridSearchCV object
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5, n_jobs=-1, joblib_verbose=6)

# Fit the grid search to the sampled data
grid_search.fit(sampled_data)

# Print the best RMSE and corresponding hyperparameters
print("Best RMSE: {:.4f}".format(grid_search.best_score['rmse']))
print("Best Parameters:", grid_search.best_params['rmse'])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   19.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   42.6s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:  1.4min


Best RMSE: 3.2304
Best Parameters: {'n_factors': 5, 'n_epochs': 25, 'lr_all': 0.01, 'reg_all': 0.06}


[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  2.1min finished


In [24]:
# Get data ready for Surprise
# With collaborative filtering, only user, item, and rating are needed
data = anime_ratings[['user_id', 'MAL_ID', 'rating']]

# Instatiate reader and load sampled_data using surprise 
reader = Reader(line_format='user item rating', sep=',')
data = Dataset.load_from_df(data, reader=reader)

In [None]:
# Define the parameter grid
param_grid = {
    'n_factors': [5, 15, 20],
    'n_epochs': [5, 10, 25],
    'lr_all': [0.002, 0.005, 0.01],
    'reg_all': [0.02, 0.04, 0.06]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5, n_jobs=-1, joblib_verbose=6)

# Fit the grid search to the sampled data
grid_search.fit(data)

# Print the best RMSE and corresponding hyperparameters
print("Best RMSE: {:.4f}".format(grid_search.best_score['rmse']))
print("Best Parameters:", grid_search.best_params['rmse'])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
IOStream.flush timed out
