In [1]:
from sklearn.decomposition import NMF
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix, csr_matrix
from scipy.spatial.distance import jaccard, cosine 
from pytest import approx

In [2]:
MV_users = pd.read_csv('data/users.csv')
MV_movies = pd.read_csv('data/movies.csv')
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
MV_users.head(5)

Unnamed: 0,uID,gender,age,accupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [4]:
MV_movies.head()

Unnamed: 0,mID,title,year,Doc,Com,Hor,Adv,Wes,Dra,Ani,...,Chi,Cri,Thr,Sci,Mys,Rom,Fil,Fan,Act,Mus
0,1,Toy Story,1995,0,1,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
2,3,Grumpier Old Men,1995,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,1995,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II,1995,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
from collections import namedtuple
Data = namedtuple('Data', ['users','movies','train','test'])
data = Data(MV_users, MV_movies, train, test)

In [4]:
class RecSys():
    def __init__(self,data):
        self.data=data
        self.allusers = list(self.data.users['uID'])
        self.allmovies = list(self.data.movies['mID'])
        self.genres = list(self.data.movies.columns.drop(['mID', 'title', 'year']))
        self.mid2idx = dict(zip(self.data.movies.mID,list(range(len(self.data.movies)))))
        self.uid2idx = dict(zip(self.data.users.uID,list(range(len(self.data.users)))))
        self.Mr=self.rating_matrix()
        self.Mm=None 
        self.sim=np.zeros((len(self.allmovies),len(self.allmovies)))
        
    def rating_matrix(self):
        """
        Convert the rating matrix to numpy array of shape (#allusers,#allmovies)
        """
        ind_movie = [self.mid2idx[x] for x in self.data.train.mID] 
        ind_user = [self.uid2idx[x] for x in self.data.train.uID]
        rating_train = list(train.rating)
        #Matrix where the row index is the user index and the col index is the movie index, the rating is the value
        return np.array(coo_matrix((rating_train, (ind_user, ind_movie)), shape=(len(self.allusers), len(self.allmovies))).toarray())



# Intro: Non Negative Matrix Factorization

Not much EDA needs to be done as we previously reviewed this dataset in the other part of the homework assignment. We are using the datasets and class methods from the week 3 programming assignment to get our rating matrix. From there we can process Nonnegative Matrix Factorization.

In [5]:
rf = RecSys(data)
print(len(rf.genres))
rf.Mr

18


array([[5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [3, 0, 0, ..., 0, 0, 0]])

Because we have 18 genre types of movies, I'll use 18 components for the NMF model. This allows us to find the strengths of how each user rates each genre of movie. Using these ratings, NMF allows to fill in the missing ratings for each user.

In [6]:
nmf_model = NMF(n_components=18)
feature_mx = nmf_model.fit_transform(rf.Mr)
component_mx = nmf_model.components_
final_rating_mx = np.dot(feature_mx, component_mx)



In [7]:
final_rating_mx.shape

(6040, 3883)

With our new matrix, we can find our prediction for how each user would rate each movie.

In [8]:
predictions = np.array([final_rating_mx[rf.uid2idx[i], rf.mid2idx[j]] 
                        for i,j in zip(
                            rf.data.test['uID'], rf.data.test['mID'])])

rf.data.test['predictions'] = predictions
rf.data.test

Unnamed: 0,uID,mID,rating,predictions
0,2233,440,4,1.669059
1,4274,587,5,0.373454
2,2498,454,3,0.300891
3,2868,2336,5,0.919306
4,1636,2686,5,0.978893
...,...,...,...,...
300058,810,247,4,0.093123
300059,1193,3210,4,0.842434
300060,6039,2289,4,0.671820
300061,5397,429,3,0.088549


In [9]:
final_rating_mx

array([[1.92472614e+00, 5.78581146e-01, 2.80662932e-02, ...,
        1.18293516e-02, 4.31324180e-03, 7.86849120e-02],
       [1.19334072e+00, 4.03015040e-01, 6.06924515e-02, ...,
        1.74141535e-02, 1.63808665e-05, 6.82180362e-02],
       [9.97092922e-01, 1.30124427e-01, 1.60564657e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [6.31001211e-01, 2.68689673e-02, 3.63032412e-03, ...,
        9.23281456e-04, 1.04313072e-03, 5.72002134e-03],
       [1.37138886e+00, 2.92030763e-01, 9.31304803e-02, ...,
        5.27441046e-02, 0.00000000e+00, 0.00000000e+00],
       [1.15295313e+00, 9.31975314e-02, 1.35167388e-02, ...,
        8.85274198e-02, 8.88952691e-02, 4.44227432e-01]])

We can see the performance of this model is terrible. In fact, it performed much word than all of our metrics from the programming assignment. This is due to the sparsity of the matrix. The ratings were essentially imputed for each user without much data to work with. Our models from week 3 took the weighted mean of the already rated movies for each user. However, in this case, unrated movies were used in rating evaluation.

The best way to fix the issue is to impute more of the missing values in some way. Whether that's by user average, baseline, etc. The less sparse the matrix is the better the NMF implementation will work.

In [10]:
np.sqrt(np.mean((predictions - rf.data.test['rating'])**2))

2.8560364273595185