In [None]:
import pandas as pd

In [None]:
# Load the raw ratings table from the project's data directory.
ratings = pd.read_csv('../data/ratings.csv')

In [None]:
# Preview the first rows to check which columns were loaded.
ratings.head()

In [None]:
# Which users have rated the fewest movies?
# Count the rated movies per user, expose the count as `numRatings`, then keep
# every user tied for the minimum.
ratingsByUser = (
    ratings
    .groupby('userId', as_index=False)['movieId']
    .count()
    .rename(columns={'movieId': 'numRatings'})
)
ratingsByUser.loc[lambda counts: counts['numRatings'] == counts['numRatings'].min()]

In [None]:
# Which users have rated the most movies? (every user tied for the maximum)
ratingsByUser.loc[lambda counts: counts['numRatings'] == counts['numRatings'].max()]

In [None]:
# Which movies have received the fewest ratings?
# Count the rating users per movie, expose the count as `numRatings`, then keep
# every movie tied for the minimum.
ratingsByMovie = (
    ratings
    .groupby('movieId', as_index=False)['userId']
    .count()
    .rename(columns={'userId': 'numRatings'})
)
ratingsByMovie.loc[lambda counts: counts['numRatings'] == counts['numRatings'].min()]

In [None]:
# Which movies have the highest number of ratings?
# Display the matching rows themselves (every movie tied for the maximum),
# in the same way the other min/max cells do.
ratingsByMovie.loc[ratingsByMovie['numRatings'] == ratingsByMovie['numRatings'].max()]

# Movie title converter

In [None]:
movies = pd.read_csv('../data/movies.csv', usecols=[0,1], index_col=0, squeeze=True).to_dict()

In [None]:
movies[1]

# Surprise SciKit

In [None]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from collections import defaultdict

In [None]:
# Peek at the first lines of the raw CSV to see its layout before the Reader is configured.
!head ../data/ratings.csv

In [None]:
%%time

ratings_path = '../data/ratings.csv'

reader = Reader(line_format='user item rating timestamp', sep=',', rating_scale = (0.5,5.0), skip_lines=1)

data = Dataset.load_from_file(ratings_path, reader=reader)

#trainset, testset = train_test_split(data, test_size=0.2)
trainset = data.build_full_trainset() # cross-validation will be applied for the evaluation

SVD_model = SVD(lr_all=0.005, reg_all=0.02)

# Estimate the generalisation error with 5-fold cross-validation first.
# cross_validate() calls fit() on the estimator it receives for every fold, so
# it must run before the final fit below; otherwise the model would be left
# trained on the last fold only.
results = cross_validate(
    algo=SVD_model, data=data, measures=['RMSE'],
    cv=5, return_train_measures=True
)
results['test_rmse'].mean()

# Final fit on the complete trainset, so that later SVD_model.predict() calls
# use a model trained on every available rating.
SVD_model.fit(trainset)

# Estimated rating of user '1' for item '6' (ids are passed as strings).
SVD_model.predict(uid='1', iid='6')

# `movies` is the plain dict built in the "Movie title converter" section
# (movie id -> title), so titles are looked up by key; it has no .loc accessor
# and no 'movieId' column.
movies[6]

# Ratings of exactly 5 given by user 1.
ratings.loc[(ratings['userId'] == 1) & (ratings['rating'] == 5)]

# Titles for movie ids 47 and 50 (ids missing from the lookup are skipped).
{movie_id: movies[movie_id] for movie_id in (47, 50) if movie_id in movies}

In [None]:
# Hyper-parameter grid for the SVD search. Each list currently holds a single
# value; the wider candidate lists are kept next to each entry.
param_grid = dict(
    n_factors=[5],    # [10, 100, 500]
    n_epochs=[5],     # [5, 20, 50]
    lr_all=[0.005],   # [0.001, 0.005, 0.02]
    reg_all=[0.02],   # [0.005, 0.02, 0.1]
)

In [None]:
# Grid search over param_grid for SVD, using all cores (n_jobs=-1), verbose
# joblib progress output, and train-set measures kept alongside test measures.
gs_model = GridSearchCV(
    SVD,
    param_grid,
    n_jobs=-1,
    joblib_verbose=25,
    return_train_measures=True,
)

In [None]:
# Run the grid search over param_grid on the full dataset.
gs_model.fit(data)

# Best parameter combination found (presumably keyed by measure name — TODO confirm).
gs_model.best_params

In [None]:
# Take the estimator selected for the 'rmse' measure and train it on the full trainset.
# NOTE(review): presumably best_estimator holds an unfitted instance unless
# refit is enabled on the search, hence the explicit fit — confirm.
best_SVD = gs_model.best_estimator['rmse']
best_SVD.fit(trainset)

In [None]:
# Mean test RMSE for each parameter combination that was tried.
gs_model.cv_results['mean_test_rmse']

In [None]:
# Report the first combination's mean test RMSE, rounded to 3 decimals.
print(f"Test RMSE = {gs_model.cv_results['mean_test_rmse'][0].round(3)}")

# Get recommendations

In [None]:
# User to build recommendations for. Kept as a string, matching how user ids
# are passed to predict() in this notebook.
TEST_USER = '2'

%%time
ratings = pd.read_csv('../data/ratings.csv')
movies = pd.read_csv('../data/movies.csv')
unrated_movie_ids = [movieId for movieId in movies['movieId'] if not movieId in ratings.loc[(ratings['userId'] == TEST_USER), 'movieId'].tolist()]
unrated_movie_ids

# Placeholder list holding a single empty tuple; no later cell in this notebook reads it.
predictions = [()]

# Single-prediction smoke test: estimated rating of TEST_USER for item '1'.
best_SVD.predict(uid = str(TEST_USER), iid = '1')

In [None]:
def get_top_n(predictions, n=10, titles=None):
    """Return the top-N recommended movie titles for each user.

    Args:
        predictions: Iterable of 5-field prediction records, each unpacking as
            (user id, item id, true rating, estimated rating, details), as
            returned by the test method of an algorithm.
        n(int): The number of recommendations to keep for each user. Default
            is 10.
        titles(dict): Optional mapping from integer movie id to title. When
            omitted, the notebook-level ``movies`` mapping is used.

    Returns:
        A defaultdict(list) where keys are user (raw) ids and values are lists
        of tuples [(movie title, rating estimation), ...] holding at most n
        entries, sorted by estimation in descending order.

    Raises:
        KeyError: If a predicted item id has no entry in the title mapping.
    """
    if titles is None:
        # Fall back to the notebook-level lookup keyed by integer movie id.
        titles = movies

    # First map the predictions to each user. Item ids arrive in raw form and
    # are converted with int() to match the integer keys of the title lookup.
    top_n = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        top_n[uid].append((titles[int(iid)], est))

    # Then sort the predictions for each user and keep the n highest ones.
    # Only existing keys are reassigned, so iterating over items() stays safe.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda pair: pair[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [None]:
%%time
# Then predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()

#It's important to reduce the testset for the given user only, otherwise it takes too long to predict

testset = [user for user in testset if user[0] == TEST_USER]

In [None]:
%%time
# Score every candidate triple in testset with the tuned model.
full_predictions = best_SVD.test(testset)

In [None]:
# Shows the complete prediction list as the cell output.
full_predictions

In [None]:
# Number of predictions produced.
len(full_predictions)

In [None]:
# Class of a single prediction record.
type(full_predictions[0])

In [None]:
# User id carried by the first prediction.
full_predictions[0].uid

In [None]:
# TODO: once predictions for every user are loaded, this is the step that
# narrows them down to the requested user.
# Keep only the predictions whose uid equals TEST_USER.
user_predictions = list(filter(lambda pred: pred.uid == TEST_USER, full_predictions))

In [None]:
# How many predictions remain for TEST_USER.
len(user_predictions)

In [None]:
%%time
# Rank TEST_USER's predictions and keep the ten with the highest estimate.
top_ratings = get_top_n(user_predictions, n=10)

In [None]:
# First element of each ranked tuple for TEST_USER, in ranked order.
recommended_movies = [movie[0] for movie in top_ratings[TEST_USER]]
recommended_movies

# Save model

In [None]:
from surprise import dump

In [None]:
# Show the working directory; the relative dump path below resolves against it.
!pwd

In [None]:
# Persist the fitted model together with its predictions in a file named 'modelo'.
dump.dump('modelo', algo=best_SVD, predictions=full_predictions)

In [None]:
# List the directory contents to confirm the dump file was written.
!ls -lah

In [None]:
# Uncomment to delete the dump file.
#!rm modelo

# Load model

In [None]:
# Restore the predictions and the model from the 'modelo' dump file.
# NOTE(review): surprise.dump is presumably pickle-based — only load dump
# files from a trusted source; confirm against the Surprise documentation.
full_predictions, mi_modelo = dump.load('modelo')

In [None]:
# Re-derive TEST_USER's predictions from the reloaded list.
user_predictions = [prediction for prediction in full_predictions if prediction.uid == TEST_USER]

In [None]:
# Compare the total number of reloaded predictions with TEST_USER's share.
print(len(full_predictions))
print(len(user_predictions))