In [19]:
import pandas as pd
import numpy as np
from collections import namedtuple
from scipy.sparse import coo_matrix, csr_matrix

from sklearn.decomposition import NMF
from scipy.sparse import coo_matrix


In [20]:
CONFIG = {
    "MOVIE_LENS_MOVIES_PATH": "./movielens-1m-dataset/movies.csv",
    "MOVIE_LENS_USERS_PATH": "./movielens-1m-dataset/users.csv",
    "TRAIN_PATH": "./movielens-1m-dataset/train.csv",
    "TEST_PATH": "./movielens-1m-dataset/test.csv",
    "MOVIE_LENS_SAMPLE_SIZE": 1000,
    "RANDOM_STATE": 42,
    "DEFAULT_TEST_SIZE": 0.2,
    "DEFAULT_TRAIN_SIZE": 0.8,
}

In [21]:
MV_users = pd.read_csv(CONFIG['MOVIE_LENS_USERS_PATH'])
MV_movies = pd.read_csv(CONFIG['MOVIE_LENS_MOVIES_PATH'])
train = pd.read_csv(CONFIG['TRAIN_PATH'])
test = pd.read_csv(CONFIG['TEST_PATH'])

In [22]:

Data = namedtuple('Data', ['users','movies','train','test'])
data = Data(MV_users, MV_movies, train, test)

In [23]:
data.users

Unnamed: 0,uID,gender,age,accupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [24]:
data.movies

Unnamed: 0,mID,title,year,Doc,Com,Hor,Adv,Wes,Dra,Ani,...,Chi,Cri,Thr,Sci,Mys,Rom,Fil,Fan,Act,Mus
0,1,Toy Story,1995,0,1,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
2,3,Grumpier Old Men,1995,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,1995,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II,1995,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents,2000,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3879,3949,Requiem for a Dream,2000,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3880,3950,Tigerland,2000,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3881,3951,Two Family House,2000,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
data.train

Unnamed: 0,uID,mID,rating
0,744,1210,5
1,3040,1584,4
2,1451,1293,5
3,5455,3176,2
4,2507,3074,5
...,...,...,...
700141,1184,2916,3
700142,137,1372,5
700143,195,2514,3
700144,1676,2566,3


In [26]:
data.test

Unnamed: 0,uID,mID,rating
0,2233,440,4
1,4274,587,5
2,2498,454,3
3,2868,2336,5
4,1636,2686,5
...,...,...,...
300058,810,247,4
300059,1193,3210,4
300060,6039,2289,4
300061,5397,429,3


In [35]:

class RecSys:
    def __init__(self, data):
        self.data = data
        self.allusers = list(self.data.users["uID"])
        self.allmovies = list(self.data.movies["mID"])
        self.genres = list(self.data.movies.columns.drop(["mID", "title", "year"]))
        self.mid2idx = dict(
            zip(self.data.movies.mID, list(range(len(self.data.movies))))
        )
        self.uid2idx = dict(zip(self.data.users.uID, list(range(len(self.data.users)))))
        self.Mr = self.rating_matrix()
        self.Mm = None
        self.sim = np.zeros((len(self.allmovies), len(self.allmovies)))

    def rating_matrix(self):
        """
        Convert the rating matrix to numpy array of shape (#allusers,#allmovies)
        """
        ind_movie = [self.mid2idx[x] for x in self.data.train.mID]
        ind_user = [self.uid2idx[x] for x in self.data.train.uID]
        rating_train = list(self.data.train.rating)

        return np.array(
            coo_matrix(
                (rating_train, (ind_user, ind_movie)),
                shape=(len(self.allusers), len(self.allmovies)),
            ).toarray()
        )

    def predict_everything_to_3(self):
        """
        Predict everything to 3 for the test data
        """
        # Generate an array with 3s against all entries in test dataset
        return np.full(len(self.data.test), 3.0)

    def predict_to_user_average(self):
        """
        Predict to average rating for the user.
        Returns numpy array of shape (#users,)
        """
        # Generate an array as follows:
        # 1. Calculate all avg user rating as sum of ratings of user across all movies/number of movies whose rating > 0
        # 2. Return the average rating of users in test data
        avg_rating_per_user = {}

        for uidx, uid in enumerate(self.allusers):
            ratings = self.Mr[uidx, :]
            rated = ratings > 0
            if rated.sum() > 0:
                avg_rating_per_user[uid] = ratings[rated].mean()
            else:
                avg_rating_per_user[uid] = 3.0  # fallback

        return np.array([avg_rating_per_user[uid] for uid in self.data.test.uID])

    def predict_from_sim(self, uid, mid):
        """
        Predict a user rating on a movie given userID and movieID
        """
        # Predict user rating as follows:
        # 1. Get entry of user id in rating matrix
        # 2. Get entry of movie id in sim matrix
        # 3. Employ 1 and 2 to predict user rating of the movie
        uidx = self.uid2idx[uid]
        midx = self.mid2idx[mid]

        user_ratings = self.Mr[uidx, :]
        similarities = self.sim[midx, :]

        mask = user_ratings > 0
        if mask.sum() == 0:
            return np.nan

        numerator = np.dot(similarities[mask], user_ratings[mask])
        denominator = np.sum(np.abs(similarities[mask]))

        if denominator == 0:
            return np.nan
        return numerator / denominator

    def predict(self):
        """
        Predict ratings in the test data. Returns predicted rating in a numpy array of size (# of rows in testdata,)
        """
        # your code here
        preds = []
        for uid, mid in zip(self.data.test.uID, self.data.test.mID):
            preds.append(self.predict_from_sim(uid, mid))
        return np.array(preds)

    def rmse(self, yp):
        yp[np.isnan(yp)] = (
            3  # In case there is nan values in prediction, it will impute to 3.
        )
        yt = np.array(self.data.test.rating)
        return np.sqrt(((yt - yp) ** 2).mean())


class ContentBased(RecSys):
    def __init__(self, data):
        super().__init__(data)
        self.data = data
        self.Mm = self.calc_movie_feature_matrix()

    def calc_movie_feature_matrix(self):
        """
        Create movie feature matrix in a numpy array of shape (#allmovies, #genres)
        """
        return self.data.movies[self.genres].values.astype(int)

    def calc_item_item_similarity(self):
        """
        Create item-item similarity using Jaccard similarity
        """
        # Update the sim matrix by calculating item-item similarity using Jaccard similarity
        # Jaccard Similarity: J(A, B) = |A∩B| / |A∪B|
        n_movies = len(self.allmovies)
        for i in range(n_movies):
            for j in range(i, n_movies):
                intersection = np.logical_and(self.Mm[i], self.Mm[j]).sum()
                union = np.logical_or(self.Mm[i], self.Mm[j]).sum()
                if union == 0:
                    sim_value = 0
                else:
                    sim_value = intersection / union
                self.sim[i, j] = sim_value
                self.sim[j, i] = sim_value  # symmetry


class Collaborative(RecSys):
    def __init__(self, data):
        super().__init__(data)

    def calc_item_item_similarity(self, simfunction, *X):
        """
        Create item-item similarity using similarity function.
        X is an optional transformed matrix of Mr
        """
        # General function that calculates item-item similarity based on the sim function and data inputed
        if len(X) == 0:
            self.sim = simfunction()
        else:
            self.sim = simfunction(
                X[0]
            )  # *X passes in a tuple format of (X,), to X[0] will be the actual transformed matrix

    def cossim(self):
        """
        Calculates item-item similarity for all pairs of items using cosine similarity (values from 0 to 1) on utility matrix
        Returns a cosine similarity matrix of size (#all movies, #all movies)
        """
        # Return a sim matrix by calculating item-item similarity for all pairs of items using Jaccard similarity
        # Cosine Similarity: C(A, B) = (A.B) / (||A||.||B||)
        Mr = self.Mr.copy().astype(float)
        user_sum = Mr.sum(axis=1)
        user_count = (Mr > 0).sum(axis=1)
        user_mean = np.divide(
            user_sum,
            user_count,
            out=np.zeros_like(user_sum, dtype=float),
            where=user_count != 0,
        )
        # Fill missing
        for u in range(Mr.shape[0]):
            Mr[u, Mr[u, :] == 0] = user_mean[u]
        # Subtract user mean to center
        X = Mr - user_mean[:, np.newaxis]

        # Step 2: Cosine similarity
        X = X.T  # now items x users
        numerators = X @ X.T
        norms = np.linalg.norm(X, axis=1).reshape(-1, 1)
        denominators = norms @ norms.T
        sim = np.divide(
            numerators,
            denominators,
            out=np.zeros_like(numerators),
            where=denominators != 0,
        )

        # Step 3: Rescale to 0~1
        sim = (sim + 1) / 2

        # Fill diagonal (zero norms become zero sim, so ensure self-similarity = 1)
        np.fill_diagonal(sim, 1.0)

        return sim

    def jacsim(self, Xr):
        """
        Calculates item-item similarity for all pairs of items using jaccard similarity (values from 0 to 1)
        Xr is the transformed rating matrix.
        """
        # Return a sim matrix by calculating item-item similarity for all pairs of items using Jaccard similarity
        # Jaccard Similarity: J(A, B) = |A∩B| / |A∪B|
        Xb = csr_matrix((Xr > 0).astype(int))
        intersection = (Xb.T @ Xb).toarray()
        item_counts = np.array(Xb.sum(axis=0)).ravel()
        union = item_counts[:, None] + item_counts[None, :] - intersection

        with np.errstate(divide="ignore", invalid="ignore"):
            sim = intersection / union
            sim[union == 0] = 0.0
        np.fill_diagonal(sim, 1.0)
        return sim

class NMFRecommender(RecSys):
    def __init__(self, data, n_components=20):
        super().__init__(data)
        self.n_components = n_components
        self.W = None
        self.H = None
        self.Mr_hat = None
        
    def fill_missing(self):
        """
        Fill missing values in the rating matrix with user average ratings to create a dense non-negative matrix.
        """
        Mr_filled = self.Mr.copy()
        avg_user_rating = Mr_filled.sum(axis=1) / np.maximum((Mr_filled > 0).sum(axis=1), 1)
        for u in range(Mr_filled.shape[0]):
            Mr_filled[u, Mr_filled[u, :] == 0] = avg_user_rating[u]
        return Mr_filled
    
    def calc_nmf(self):
        """
        Fit NMF on the filled matrix and compute predicted full matrix Mr_hat.
        """
        Mr_filled = self.fill_missing()
        nmf_model = NMF(n_components=self.n_components, init='random', random_state=42, max_iter=3000, tol=1e-3)
        self.W = nmf_model.fit_transform(Mr_filled)
        self.H = nmf_model.components_
        self.Mr_hat = np.dot(self.W, self.H)
        
    def predict(self):
        """
        Predict ratings on the test data using the reconstructed matrix from NMF.
        Returns numpy array of predicted ratings.
        """
        preds = []
        for uid, mid in zip(self.data.test.uID, self.data.test.mID):
            uidx = self.uid2idx[uid]
            midx = self.mid2idx[mid]
            preds.append(self.Mr_hat[uidx, midx])
        return np.array(preds)

In [36]:
rs = RecSys(data)
yp = rs.predict_everything_to_3()
print(rs.rmse(yp))

1.2585510334053043


In [37]:
cb = ContentBased(data)
cb.calc_item_item_similarity()
yp = cb.predict()
rmse = cb.rmse(yp)
print(f"content based rmse: {rmse}")

content based rmse: 1.0128116783754684


In [38]:
nmf_rs = NMFRecommender(data, n_components=20)
nmf_rs.calc_nmf()
yp_nmf = nmf_rs.predict()
print("RMSE (NMF):", nmf_rs.rmse(yp_nmf))

RMSE (NMF): 1.0443069723276244
