## Different Hit Rates for SVD algorithm

Load packages and data

In [31]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from collections import defaultdict


In [32]:
from surprise import Dataset
from surprise import Reader

#from surprise import BaselineOnly
#from surprise import KNNBasic
#from surprise import KNNWithMeans
from surprise import SVD
from surprise import accuracy
#from surprise import SlopeOne
#from surprise import SVDpp
#from surprise import NMF
#from surprise import NormalPredictor
from surprise import KNNBaseline
#from surprise import KNNWithZScore
#from surprise import CoClustering

from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection import LeaveOneOut

from surprise import accuracy

RSEED = 42

In [33]:
movies = pd.read_csv('../data/ml-latest-small/movies.csv')
ratings = pd.read_csv('../data/ml-latest-small/ratings.csv')
ratings.movieId.nunique()

9724

In [34]:
df = pd.read_csv('../data/df_features.csv')
movieIds = df.drop_duplicates('movieId').movieId.to_list()
len(movieIds)

9543

In [35]:
ratings = ratings[ratings['movieId'].isin(movieIds)]
ratings.movieId.nunique()

9525

In [36]:
ratings.shape

(100329, 4)

Start Surprise

In [37]:
reader = Reader(rating_scale=(0.5,5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

Rating Predictions

In [38]:
trainset, testset = train_test_split(data, test_size=.25, random_state=RSEED)
algo = SVD(random_state=RSEED)
algo.fit(trainset)
predictions = algo.test(testset)

In [39]:
def MAE(predictions):
        return accuracy.mae(predictions, verbose=False)
def RMSE(predictions):
        return accuracy.rmse(predictions, verbose=False)
    
print("RMSE: ", RMSE(predictions))
print("MAE: ", MAE(predictions))

RMSE:  0.8740598389134607
MAE:  0.673521674727199


Top N

In [40]:
# Top N Recommendations

def GetTopN(predictions, n=10):
    topN = defaultdict(list)
    for userID, movieID, actualRating, estimatedRating, _ in predictions:
        topN[int(userID)].append((int(movieID), estimatedRating))

    for userID, ratings in topN.items():
        ratings.sort(key=lambda x: x[1], reverse=True)
        topN[int(userID)] = ratings[:n]

    return topN

LeaveOneOut

In [41]:
LOOCV = LeaveOneOut(n_splits=1, random_state=RSEED)

for trainset, testset in LOOCV.split(data):
    # Train model without left-out ratings
    algo.fit(trainset)
    # Predicts ratings for left-out ratings only
    leftOutPredictions = algo.test(testset)
    # Build predictions for all ratings not in the training set
    bigTestSet = trainset.build_anti_testset()
    allPredictions = algo.test(bigTestSet)
    # Compute top 10 recs for each user
    topNPredicted = GetTopN(allPredictions, n=10)

In [42]:
len(allPredictions)

5703821

In [43]:
topNPredicted = GetTopN(allPredictions, n=10)
topNPredicted[3]

[(1245, 3.7596136198656778),
 (1204, 3.6815787336523),
 (1104, 3.619943133381922),
 (3275, 3.4739792217709113),
 (2692, 3.452763400494233),
 (3201, 3.447145535347274),
 (34405, 3.4470058539466764),
 (1235, 3.437819931210512),
 (8950, 3.4287542360243615),
 (5902, 3.426820878970853)]

In [44]:
topNPredicted[4]

[(5618, 4.799785212691094),
 (1252, 4.453375731464654),
 (6283, 4.43796002089187),
 (1041, 4.4226816633546395),
 (954, 4.402875759279252),
 (1245, 4.388742553432238),
 (8207, 4.354396111283244),
 (913, 4.339730041881197),
 (4848, 4.323874272336767),
 (924, 4.322100514598407)]

Hit Rate

In [45]:
def HitRate(topNPredicted, leftOutPredictions):
    hits = 0
    total = 0

 # For each left-out rating
    for leftOut in leftOutPredictions:
        userID = leftOut[0]
        leftOutMovieID = leftOut[1]
        # Is it in the predicted top 10 for this user?
        hit = False
        for movieID, predictedRating in topNPredicted[int(userID)]:
            if (int(leftOutMovieID) == int(movieID)):
                hit = True
                break
        if (hit) :
            hits += 1

        total += 1

    # Compute overall precision
    return hits/total
print("\nHit Rate: ", HitRate(topNPredicted, leftOutPredictions))


Hit Rate:  0.03114754098360656


Hit Rate by Rating Value

In [46]:

def RatingHitRate(topNPredicted, leftOutPredictions):
    hits = defaultdict(float)
    total = defaultdict(float)
    # For each left-out rating
    for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
        # Is it in the predicted top N for this user?
        hit = False
        for movieID, predictedRating in topNPredicted[int(userID)]:
            if (int(leftOutMovieID) == movieID):
                hit = True
                break
        if (hit) :
            hits[actualRating] += 1
        total[actualRating] += 1

    # Compute overall precision
    for rating in sorted(hits.keys()):
        print(rating, hits[rating] / total[rating])
print("Hit Rate by Rating value: ")
RatingHitRate(topNPredicted, leftOutPredictions)

Hit Rate by Rating value: 
3.0 0.024793388429752067
3.5 0.037037037037037035
4.0 0.03867403314917127
4.5 0.0196078431372549
5.0 0.05172413793103448


Cumulative Hit Rate

In [47]:
def CumulativeHitRate(topNPredicted, leftOutPredictions, ratingCutoff=0):
    hits = 0
    total = 0
    # For each left-out rating
    for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
        # Only look at ability to recommend things the users actually liked...
        if (actualRating >= ratingCutoff):
            # Is it in the predicted top 10 for this user?
            hit = False
            for movieID, predictedRating in topNPredicted[int(userID)]:
                if (int(leftOutMovieID) == movieID):
                    hit = True
                    break
            if (hit) :
                hits += 1
            total += 1

        # Compute overall precision
    return hits/total
print("Cumulative Hit Rate (rating >= 4): ", CumulativeHitRate(topNPredicted, leftOutPredictions, 4.0))

Cumulative Hit Rate (rating >= 4):  0.040229885057471264


Average Reciprocal Hit Ranking (ARHR)

In [48]:
def AverageReciprocalHitRank(topNPredicted, leftOutPredictions):
    summation = 0
    total = 0
        # For each left-out rating
    for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
        # Is it in the predicted top N for this user?
        hitRank = 0
        rank = 0
        for movieID, predictedRating in topNPredicted[int(userID)]:
            rank = rank + 1
            if (int(leftOutMovieID) == movieID):
                hitRank = rank
                break
        if (hitRank > 0) :
                summation += 1.0 / hitRank

        total += 1

    return summation / total

print("Average Reciprocal Hit Rank: ", AverageReciprocalHitRank(topNPredicted, leftOutPredictions))

Average Reciprocal Hit Rank:  0.011677725735102784


## Extra:

### Grid Search for SVD

In [49]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import GridSearchCV

# Use movielens-100K
#data = Dataset.load_builtin('ml-100k')

param_grid = {'n_epochs': [50, 60], 'lr_all': [0.01],
              'reg_all': [0.1], 'n_factors':[150, 200]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

0.857237700877555
{'n_epochs': 60, 'lr_all': 0.01, 'reg_all': 0.1, 'n_factors': 200}
