In [1]:
from surprise import accuracy
from collections import defaultdict

In [2]:
class RecommenderMetrics:
    """Evaluation metrics for rating predictions and top-N recommendation lists.

    Every metric is a static method, so it can be called on the class
    (``RecommenderMetrics.HitRate(...)``) or on an instance.

    Conventions, as established by how the methods unpack their arguments:

    * a *prediction* is a 5-item sequence
      ``(userID, movieID, actualRating, estimatedRating, details)``;
    * a *top-N mapping* maps ``int`` user IDs to lists of
      ``(movieID, estimatedRating)`` pairs, as produced by ``GetTopN``.
    """

    @staticmethod
    def MAE(predictions):
        """Return ``accuracy.mae(predictions, verbose=True)``."""
        return accuracy.mae(predictions, verbose=True)

    @staticmethod
    def RMSE(predictions):
        """Return ``accuracy.rmse(predictions, verbose=True)``."""
        return accuracy.rmse(predictions, verbose=True)

    @staticmethod
    def _hitRank(topNPredicted, userID, leftOutMovieID):
        """Return the 1-based position of a movie in a user's top-N list, or 0.

        Both IDs are normalised with ``int()`` so string and integer IDs
        compare equal. ``.get`` is used so that a user without any
        recommendations counts as a miss instead of raising ``KeyError`` (plain
        dict) or silently growing the mapping (``defaultdict``).
        """
        target = int(leftOutMovieID)
        recommendations = topNPredicted.get(int(userID), ())
        for rank, (movieID, _) in enumerate(recommendations, start=1):
            if int(movieID) == target:
                return rank
        return 0

    @staticmethod
    def GetTopN(predictions, n=10, minumumRating=0.5):
        """Build each user's top-``n`` recommendations from raw predictions.

        Args:
            predictions: iterable of 5-item predictions (see class docstring).
            n: maximum number of entries kept per user.
            minumumRating: predictions whose estimated rating is below this
                value are discarded. (The parameter name is kept as-is for
                callers that pass it by keyword.)

        Returns:
            ``defaultdict(list)`` mapping ``int`` user ID to a list of
            ``(int movieID, estimatedRating)`` tuples sorted by estimated
            rating, highest first, truncated to ``n`` entries.
        """
        topN = defaultdict(list)

        for userID, movieID, actualRating, estimatedRating, _ in predictions:
            if estimatedRating >= minumumRating:
                topN[int(userID)].append((int(movieID), estimatedRating))

        # Re-assigning existing keys does not change the dict's size, so it is
        # safe to do while iterating over items().
        for userID, ratings in topN.items():
            ratings.sort(key=lambda pair: pair[1], reverse=True)
            topN[userID] = ratings[:n]

        return topN

    @staticmethod
    def HitRate(topNpredicted, leftOutPredictions):
        """Fraction of left-out predictions whose movie is in the user's top-N.

        Only the first two fields (user ID, movie ID) of each left-out entry
        are read. Returns ``0.0`` when ``leftOutPredictions`` is empty.
        """
        hits = 0
        total = 0

        for leftOut in leftOutPredictions:
            if RecommenderMetrics._hitRank(topNpredicted, leftOut[0], leftOut[1]) > 0:
                hits += 1
            total += 1

        return hits / total if total else 0.0

    @staticmethod
    def cumulativeHitRate(topNpredicted, leftOutPredictions, ratingCutOff=0):
        """Hit rate restricted to left-out items the user actually liked.

        Same as ``HitRate``, but a left-out prediction is only counted (as a
        hit or a miss) when its actual rating is at least ``ratingCutOff``.
        Returns ``0.0`` when no left-out prediction passes the cut-off.
        """
        hits = 0
        total = 0

        for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
            if actualRating < ratingCutOff:
                continue
            if RecommenderMetrics._hitRank(topNpredicted, userID, leftOutMovieID) > 0:
                hits += 1
            total += 1

        return hits / total if total else 0.0

    @staticmethod
    def RatingHitRate(topNPredicted, leftOutPredictions):
        """Print the hit rate broken down by actual rating value.

        For every actual rating that produced at least one hit, prints one
        line ``<rating> <hits / total>`` in ascending rating order. Ratings
        with no hits are not printed. Returns ``None``.
        """
        hits = defaultdict(float)
        total = defaultdict(float)

        for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
            if RecommenderMetrics._hitRank(topNPredicted, userID, leftOutMovieID) > 0:
                hits[actualRating] += 1
            total[actualRating] += 1

        for rating in sorted(hits.keys()):
            # Every key of `hits` was also counted in `total`, so this never
            # divides by zero.
            print(rating, hits[rating] / total[rating])

    @staticmethod
    def AverageReciprocalHitRank(topNPredicted, leftOutPredictions):
        """Mean of ``1 / rank`` of each left-out movie in its user's top-N.

        A left-out movie that is not in the list contributes ``0``. Ranks are
        1-based. Returns ``0.0`` when ``leftOutPredictions`` is empty.
        """
        summation = 0
        total = 0

        for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
            hitRank = RecommenderMetrics._hitRank(topNPredicted, userID, leftOutMovieID)
            if hitRank > 0:
                summation += 1.0 / hitRank
            total += 1

        return summation / total if total else 0.0

    @staticmethod
    def UserCoverage(topNPredicted, numUsers, ratingThreshold=0):
        """Fraction of ``numUsers`` with at least one good-enough recommendation.

        A user is covered when any entry in their top-N list has an estimated
        rating of at least ``ratingThreshold``.
        """
        covered = sum(
            1
            for recommendations in topNPredicted.values()
            if any(predictedRating >= ratingThreshold
                   for _, predictedRating in recommendations)
        )

        return covered / numUsers

    @staticmethod
    def Diversity(topNPredicted, simAlgo):
        """Placeholder: not implemented yet, always returns ``None``."""
        # TODO: implement; the intended use of `simAlgo` is not defined here.
        pass

    @staticmethod
    def novelty(topNPredicted, rankings):
        """Mean popularity rank of every recommended movie across all users.

        Args:
            topNPredicted: top-N mapping (see class docstring).
            rankings: mapping from movie ID to its popularity rank.

        Returns:
            The average of ``rankings[movieID]`` over all recommended entries,
            or ``0.0`` when there are no recommendations at all.
        """
        ranks = [
            rankings[rating[0]]
            for recommendations in topNPredicted.values()
            for rating in recommendations
        ]

        return sum(ranks) / len(ranks) if ranks else 0.0



In [None]:
import os
import pandas as pd
import sys
from surprise import Dataset, Reader
from collections import defaultdict
import numpy as np
import re

class MovieLens:
    """Loader and feature helpers for the MovieLens ``ml-latest-small`` CSV files.

    ``loadDataset`` must be called before any other method: it populates
    ``df_ratings``, ``df_movies`` and the title/ID lookup tables that the
    remaining methods read.
    """

    # movieId -> title and title -> movieId lookups, filled by loadDataset().
    ID2name = {}
    name2ID = {}
    # Paths are relative to the current working directory.
    ratingsPath = 'ml-latest-small/ratings.csv'
    moviesPath = 'ml-latest-small/movies.csv'
    # DataFrames read from the two CSV files; None until loadDataset() runs.
    df_ratings = None
    df_movies = None
    # Genre label -> position in the multi-hot vector built by getGenre().
    df_genre = {
        "Action": 0,
        "Adventure": 1,
        "Animation": 2,
        "Children": 3,
        "Comedy": 4,
        "Crime": 5,
        "Documentary": 6,
        "Drama": 7,
        "Fantasy": 8,
        "Film-Noir": 9,
        "Horror": 10,
        "Musical": 11,
        "Mystery": 12,
        "Romance": 13,
        "Sci-Fi": 14,
        "Thriller": 15,
        "War": 16,
        "Western": 17
    }

    def loadDataset(self):
        """Read both CSV files, rebuild the title lookups, and load the ratings.

        Returns:
            The object returned by ``Dataset.load_from_file`` for the ratings
            file, parsed as ``user item rating timestamp`` with the header
            line skipped.
        """
        # pd.read_csv already returns a DataFrame; no re-wrapping is needed.
        self.df_ratings = pd.read_csv(self.ratingsPath)
        self.df_movies = pd.read_csv(self.moviesPath)

        # Rebind per instance so repeated loads (or several instances) do not
        # accumulate entries in the dicts shared at class level.
        self.ID2name = {}
        self.name2ID = {}
        # The lookups are name <-> ID, so they must be built from the 'title'
        # column, not from 'genres'.
        for movieID, movieName in zip(self.df_movies['movieId'], self.df_movies['title']):
            self.ID2name[movieID] = movieName
            self.name2ID[movieName] = movieID

        reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
        return Dataset.load_from_file(self.ratingsPath, reader=reader)

    def getUserRatings(self, user):
        """Return ``[(movieId, rating), ...]`` for one user, in file order.

        An unknown ``user`` yields an empty list.
        """
        ratings = self.df_ratings
        userRows = ratings[ratings['userId'] == user]
        return list(zip(userRows['movieId'], userRows['rating']))

    def getPopularityRank(self):
        """Rank movies by how many ratings they received.

        Returns:
            ``defaultdict(int)`` mapping movieId to rank, where rank 1 is the
            most-rated movie. Movies with the same count keep the order in
            which they first appear in the ratings file (the sort is stable).
        """
        counts = defaultdict(int)
        for movieID in self.df_ratings['movieId']:
            counts[movieID] += 1

        rankings = defaultdict(int)
        byCountDesc = sorted(counts.items(), key=lambda item: item[1], reverse=True)
        for rank, (movieID, _) in enumerate(byCountDesc, start=1):
            rankings[movieID] = rank

        return rankings

    def getGenre(self):
        """Return a multi-hot genre vector for every movie.

        Returns:
            dict mapping movieId to a list of ``len(df_genre)`` zeros and
            ones; position ``df_genre[label]`` is 1 when the movie's
            ``|``-separated genre string contains ``label``. Labels that are
            not keys of ``df_genre`` are ignored.
        """
        genres = {}
        num_genre = len(self.df_genre)

        for movieID, genreField in zip(self.df_movies['movieId'], self.df_movies['genres']):
            genre_encode = [0] * num_genre
            for label in genreField.split(sep="|"):
                position = self.df_genre.get(label)
                if position is not None:
                    genre_encode[position] = 1
            genres[movieID] = genre_encode

        return genres

    def getYears(self):
        """Return ``{movieId: year}`` parsed from a trailing ``(YYYY)`` in the title.

        Movies whose title does not end in a four-digit year in parentheses
        (optionally followed by whitespace) are left out of the result.
        """
        years = {}
        # The year group is optional and the pattern is anchored at the end,
        # so search() always matches; group(1) is None when there is no year.
        p = re.compile(r"(?:\((\d{4})\))?\s*$")

        for movieID, movieName in zip(self.df_movies['movieId'], self.df_movies['title']):
            year = p.search(movieName).group(1)
            if year:
                years[movieID] = int(year)

        return years

    def getMiseEnScene(self):
        """Placeholder: not implemented yet, always returns ``None``."""
        pass

    def getMovieName(self, movieID):
        """Return the title for ``movieID``, or ``""`` when it is unknown."""
        return self.ID2name.get(movieID, "")

    def getMovieID(self, movieName):
        """Return the movieId for ``movieName``, or ``0`` when it is unknown."""
        return self.name2ID.get(movieName, 0)



In [None]:
# Create the loader and read ratings.csv / movies.csv from the paths set on
# the class. loadDataset() also returns the ratings dataset object, which is
# not kept here.
movieLens = MovieLens()
movieLens.loadDataset()
    
user_id = 1  # Example user ID
# Last expression of the cell, so the notebook displays the returned mapping
# of movieId -> popularity rank (1 = movie with the most ratings).
movieLens.getPopularityRank()
# user_ratings = movieLens.getUserRatings(user_id)
# print(f"User {user_id} ratings: {user_ratings}")

defaultdict(int,
            {356: 1,
             318: 2,
             296: 3,
             593: 4,
             2571: 5,
             260: 6,
             480: 7,
             110: 8,
             589: 9,
             527: 10,
             2959: 11,
             1: 12,
             1196: 13,
             50: 14,
             2858: 15,
             47: 16,
             780: 17,
             150: 18,
             1198: 19,
             4993: 20,
             1210: 21,
             858: 22,
             457: 23,
             592: 24,
             2028: 25,
             5952: 26,
             7153: 27,
             588: 28,
             608: 29,
             2762: 30,
             380: 31,
             32: 32,
             364: 33,
             1270: 34,
             377: 35,
             3578: 36,
             4306: 37,
             1580: 38,
             590: 39,
             648: 40,
             344: 41,
             4226: 42,
             367: 43,
             58559: 44,
           

: 

In [24]:
# Scratch check: split one pipe-delimited genre string into its labels.
a = "Adventure|Animation|Children|Comedy|Fantasy".split(sep='|')
print(a)

['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']


In [36]:
# Inspect the bound method on the MovieLens instance. In a top-to-bottom run
# `a` is a list of genre strings at this point and has no `loadDataset`
# attribute, so the instance created earlier is referenced by name instead.
print(movieLens.loadDataset)

<bound method MovieLens.loadDataset of <__main__.MovieLens object at 0x00000172DFFBC910>>


In [26]:
# Preview the raw genre strings. pd.read_csv already returns a DataFrame, so
# it is used directly instead of being re-wrapped in pd.DataFrame(...).
df = pd.read_csv("ml-latest-small/movies.csv")
df["genres"]

0       Adventure|Animation|Children|Comedy|Fantasy
1                        Adventure|Children|Fantasy
2                                    Comedy|Romance
3                              Comedy|Drama|Romance
4                                            Comedy
                           ...                     
9737                Action|Animation|Comedy|Fantasy
9738                       Animation|Comedy|Fantasy
9739                                          Drama
9740                               Action|Animation
9741                                         Comedy
Name: genres, Length: 9742, dtype: object