In [200]:
'''
Problem Statement:
In this part of the exercise, you will implement the collaborative filtering learning algorithm and apply it 
to a dataset of movie ratings. This dataset consists of ratings on a scale of 1 to 5. 
The dataset has nu = 943 users, and nm = 1682 movies.
'''

import pandas as  pd
import numpy as np
import scipy.io as sio
from scipy.optimize import minimize
import matplotlib.pyplot as plt
from matplotlib import rcParams
%matplotlib inline

import warnings 
warnings.filterwarnings('ignore')

# Part 1: Loading Movie Ratings Dataset

In [169]:
# We will start by loading the movie ratings dataset to understand the structure of the data.

print("Loading Movie ratings dataset.\n")

dataM = sio.loadmat('ex8_movies.mat')

Y1 = dataM['Y']
R1 = dataM['R']

'''
Y1 is a 1682x943 matrix, containing ratings (1-5) of 1682 movies on 943 users

R1 is a 1682x943 matrix, where R1(i,j) = 1 if and only if user j gave arating to movie i
'''

# From the matrix, we can compute statistics like average rating.
print('Average rating for movie 1 (Toy Story) is {:.3f} / 5\n'.format(np.mean(Y1[0, R1[0, :]==1])))


Loading Movie ratings dataset.

Average rating for movie 1 (Toy Story) is 3.878 / 5



# Part 2 : Collaborative Filtering Cost Function & Gradient

In [170]:
def cofiCostFunc(params, Y, R, num_users, num_movies, num_features, learning_rate):
    
    '''
    COFICOSTFUNC Collaborative filtering cost function returns the cost and gradient for the
    collaborative filtering problem.
    '''
    
    # Unfold the U and W matrices from params
    X = np.matrix(np.reshape(params[0:num_movies*num_features], (num_movies, num_features)))
    Theta = np.matrix(np.reshape(params[num_movies*num_features:], (num_users, num_features)))


    '''
    Notes:  
    
    X : num_movies  x num_features matrix of movie features
    Theta : num_users  x num_features matrix of user features
    Y : num_movies x num_users matrix of user ratings of movies
    R : num_movies x num_users matrix, where R(i, j) = 1 if the i-th movie was rated by the j-th user
    
    You should set the following variables correctly:
    
    X_grad : num_movies x num_features matrix, containing the partial derivatives w.r.t. to each element of X
    
    Theta_grad : num_users x num_features matrix, containing the partial derivatives w.r.t. to each element of Theta
    '''
    
    J1 = (1.0/2)*np.sum(np.multiply(R,np.power((np.dot(X,Theta.T)-Y),2)))  # cost term without considering regularization 

    Jreg = (float(learning_rate)/2)*np.sum(np.power(X,2))+(float(learning_rate)/2)*np.sum(np.power(Theta,2))
    
    J = J1+Jreg
    
    X_grad = np.dot(np.multiply(R,(np.dot(X,Theta.T)-Y)),Theta) + float(learning_rate)*X
    Theta_grad = np.dot(np.multiply(R.T,(np.dot(X,Theta.T)-Y).T),X) + float(learning_rate)*Theta
    
    grad = np.hstack([X_grad.ravel(),Theta_grad.ravel()])
    
    return J,grad


In [171]:
'''
We will now implement the cost function for collaborative filtering.
To help debug our cost function, we have included set of weights that we trained on that.
'''

# Load pre-trained weights (X, Theta, num_users, num_movies, num_features)
Params = sio.loadmat('ex8_movieParams.mat')
X = Params['X']
Theta = Params['Theta']
num_users = Params['num_users']
num_movies = Params['num_movies']
num_features = Params['num_features']

# Reduce the data set size so that this runs faster
num_users = 4
num_movies = 5
num_features = 3
X = X[0:num_movies, 0:num_features]
Theta = Theta[0:num_users, 0:num_features]
Y = Y[0:num_movies, 0:num_users]
R = R[0:num_movies, 0:num_users]

# Evaluate cost function for regularization rate of 0
J,grad = cofiCostFunc(np.hstack([X.ravel(),Theta.ravel()]), Y, R, num_users, \
                 num_movies, num_features, 0)

print('Cost at loaded parameters: {:.2f} \n (this value should be about 22.22)\n'.format(J))

# Evaluate cost function for regularization rate of 1.5
J,grad = cofiCostFunc(np.hstack([X.ravel(),Theta.ravel()]), Y, R, num_users, \
                 num_movies, num_features, 1.5)

print('Cost at loaded parameters (lambda = 1.5): {:.2f} \n (this value should be about 31.34)\n'.format(J))

Cost at loaded parameters: 20.52 
 (this value should be about 22.22)

Cost at loaded parameters (lambda = 1.5): 29.64 
 (this value should be about 31.34)



# Part 3: Entering Ratings for a new User

In [177]:
'''
Before we will train the collaborative filtering model, we will first add ratings that correspond to 
a new user that we just observed. This part of the code will also allow you to put in your own ratings for the
movies in our dataset!
'''

# Read the fixed movieulary list
fid = open('movie_ids.txt',encoding = "ISO-8859-1")

n = 1682  # Total number of movies 

movieList = []
i = 0 

for line in fid:
    
    movie = line.strip().split(' ')
    movieList.append(str(' '.join(movie[1:])))

fid.close()

# Initialize my ratings
my_ratings = np.zeros((1682, 1))

# Check the file movie_idx.txt for id of each movie in our dataset
# For example, Toy Story (1995) has ID 0, so to rate it "4", you can set
my_ratings[0] = 4

# Or suppose did not enjoy Silence of the Lambs (1991), you can set
my_ratings[97] = 2

# We have selected a few movies we liked / did not like and the ratings we gave are as follows:
my_ratings[6] = 3
my_ratings[11]= 5
my_ratings[53] = 4
my_ratings[63]= 5
my_ratings[65]= 3
my_ratings[68] = 5
my_ratings[182] = 4
my_ratings[225] = 5
my_ratings[354]= 5

print('\n\nNew user ratings:\n')

for i in range(len(my_ratings)):
    if my_ratings[i] > 0.: 
        print('Rated {} for {}'.format(int(my_ratings[i]),movieList[i]))



New user ratings:

Rated 4 for Toy Story (1995)
Rated 3 for Twelve Monkeys (1995)
Rated 5 for Usual Suspects, The (1995)
Rated 4 for Outbreak (1995)
Rated 5 for Shawshank Redemption, The (1994)
Rated 3 for While You Were Sleeping (1995)
Rated 5 for Forrest Gump (1994)
Rated 2 for Silence of the Lambs, The (1991)
Rated 4 for Alien (1979)
Rated 5 for Die Hard 2 (1990)
Rated 5 for Sphere (1998)


# Part 4 : Learning Movie Ratings

In [260]:
''' 
Now, you will train the collaborative filtering model on a movie rating dataset of 1682 movies and 943 users
''' 

'''
We will use the movie data loaded in Part 1 where,
Y1 is a 1682x943 matrix, containing ratings (1-5) of 1682 movies by 943 users
&
R1 is a 1682x943 matrix, where R1(i,j) = 1 if and only if user j gave a
rating to movie i 
'''

# Add our own ratings to the data matrix
Y = np.hstack([my_ratings,Y1])
R = np.hstack([(my_ratings != 0),R1])


#  Normalize Ratings
Ymean = np.zeros((Y.shape[0],1))
Ynorm = np.zeros(Y.shape)

for i in range(Y.shape[0]):
    idx = np.where(R[i,:]==1)
    Ymean[i] = np.mean(Y[i,idx])
    Ynorm[i,idx] = Y[i,idx]-Ymean[i]

# Useful Values
num_movies, num_users = Y.shape
num_features = 10

# Set Initial Parameters (Theta, X)
X = np.random.randn(num_movies, num_features)
Theta = np.random.randn(num_users, num_features)

# Ravel the parameters X and Theta to be passed in our cost function
initial_parameters = np.hstack([X.ravel(),Theta.ravel()])

# Set Regularization
learning_rate = 1.5

fmin = minimize(fun = cofiCostFunc, x0= initial_parameters, args=(Ynorm, R, num_users, num_movies, num_features, learning_rate), \
              jac = True, method = 'TNC', options = {'maxiter':300})

theta = fmin.x

X = np.matrix(np.reshape(theta[0:num_movies*num_features], (num_movies, num_features)))
Theta = np.matrix(np.reshape(theta[num_movies*num_features:], (num_users, num_features)))

print('Recommender system learning completed.\n')

Recommender system learning completed.



# Recommendation For You

In [261]:
# After training the model, you can now make recommendations by computing the predictions matrix.

p = np.dot(X, Theta.T)
my_predictions = p[:,0] + Ymean

idx = np.argsort(my_predictions, axis=0)[::-1]  # Index of sorted predictions in 'descending' oreder

print('\nTop 10 recommendations for you:\n')

for i in range(10):
    j = int(idx[i])
    print('Predicting rating {:.1f} for movie {}'.format(float(my_predictions[j]),movieList[j]))

print('\n\nOriginal ratings provided:\n')

for i in range(len(my_ratings)):
    if my_ratings[i] > 0:
        print('Rated {:.1f} for {}'.format(float(my_ratings[i]),movieList[i]))



Top 10 recommendations for you:

Predicting rating 5.2 for movie Pillow Book, The (1995)
Predicting rating 5.0 for movie Saint of Fort Washington, The (1993)
Predicting rating 5.0 for movie Prefontaine (1997)
Predicting rating 5.0 for movie They Made Me a Criminal (1939)
Predicting rating 5.0 for movie Marlene Dietrich: Shadow and Light (1996)
Predicting rating 5.0 for movie Santa with Muscles (1996)
Predicting rating 5.0 for movie Aiqing wansui (1994)
Predicting rating 5.0 for movie Entertaining Angels: The Dorothy Day Story (1996)
Predicting rating 5.0 for movie Great Day in Harlem, A (1994)
Predicting rating 5.0 for movie Someone Else's America (1995)


Original ratings provided:

Rated 4.0 for Toy Story (1995)
Rated 3.0 for Twelve Monkeys (1995)
Rated 5.0 for Usual Suspects, The (1995)
Rated 4.0 for Outbreak (1995)
Rated 5.0 for Shawshank Redemption, The (1994)
Rated 3.0 for While You Were Sleeping (1995)
Rated 5.0 for Forrest Gump (1994)
Rated 2.0 for Silence of the Lambs, The (1