In [1]:
# imports

# libraries
import numpy as np
from scipy import io
from scipy.optimize import fmin_cg

# files
import utils

## 1. Take a look at the dataset

In [2]:
# Load the movie ratings dataset from a MATLAB .mat file.
# io.loadmat returns a dict that maps the stored variable names to arrays.
matDict = io.loadmat('ex8_movies.mat')

# display the variable names contained in the file
matDict.keys()

dict_keys(['__header__', '__version__', '__globals__', 'Y', 'R'])

In [3]:
# Y: 1682x943 matrix of ratings (1-5) given to 1682 movies by 943 users
# R: 1682x943 indicator matrix, R[i, j] = 1 iff user j rated movie i
Y, R = matDict['Y'], matDict['R']

Y.shape, R.shape

((1682, 943), (1682, 943))

In [4]:
# sparsity: (no. ratings)/(no. possible ratings),
# i.e. the fraction of (movie, user) entries of R that are filled in
R.sum() / R.size

0.063046693642245313

## 2. Use pre-trained weights to check the implementation of the cost function

In [5]:
# Load pre-trained weights (X, Theta, num_users, num_movies, num_features)
# used to check the cost function.
# NOTE: this rebinds matDict, so it no longer refers to the ratings data
# loaded from 'ex8_movies.mat'.
matDict = io.loadmat('ex8_movieParams.mat')

# display the variable names contained in the file
matDict.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'Theta', 'num_users', 'num_movies', 'num_features'])

In [6]:
# X:     num_movies x num_features, movie features
# Theta: num_users  x num_features, user features
X, Theta = matDict['X'], matDict['Theta']

# problem sizes as stored in the file
num_users, num_movies, num_features = (
    matDict[key] for key in ('num_users', 'num_movies', 'num_features')
)

In [7]:
# Shrink the problem to a handful of users/movies/features so that the
# cost and gradient checks run quickly.
num_users, num_movies, num_features = 4, 5, 3

X = X[:num_movies, :num_features]
Theta = Theta[:num_users, :num_features]
Y = Y[:num_movies, :num_users]
R = R[:num_movies, :num_users]

# unroll X followed by Theta into a single flat parameter vector
params = np.concatenate((X, Theta)).reshape(-1)

### 2.1 Check cost

In [8]:
# Check the cost returned by utils.costFunc without regularization
# (last argument, the regularization strength, is 0).
J = utils.costFunc(params, Y, R, num_users, num_movies, num_features, 0)

# Fail loudly if the cost drifts from the reference value instead of
# relying on a visual comparison with the comment below.
np.testing.assert_allclose(J[0], 22.22, atol=1e-2)

J[0]  # cost should be about 22.22

22.224603725685672

In [9]:
# Check the cost returned by utils.costFunc with regularization
# (regularization strength lambd = 1.5).
J = utils.costFunc(params, Y, R, num_users, num_movies, num_features, 1.5)

# Fail loudly if the cost drifts from the reference value instead of
# relying on a visual comparison with the comment below.
np.testing.assert_allclose(J[0], 31.34, atol=1e-2)

J[0]  # cost should be about 31.34

31.344056244274213

### 2.2 Check gradient

In [10]:
# Check the cost function's gradient without regularization.
# utils.checkCostFunction prints the numerical gradient (left column) next
# to the analytical gradient (right column); the two should agree closely.
# Called with no argument here — presumably the regularization strength
# defaults to 0 (confirm in utils).
utils.checkCostFunction()

[[ -1.5638525   -1.5638525 ]
 [  0.33423115   0.33423115]
 [  1.0550018    1.0550018 ]
 [  2.64924374   2.64924374]
 [ -0.83173464  -0.83173464]
 [ -4.9792135   -4.9792135 ]
 [ -8.90731964  -8.90731964]
 [ -3.09165216  -3.09165216]
 [ 11.4471485   11.4471485 ]
 [ -0.21630701  -0.21630701]
 [ -0.18860546  -0.18860546]
 [  1.56076113   1.56076113]
 [  1.24258394   1.24258394]
 [ -1.67035246  -1.67035246]
 [ -7.01206157  -7.01206157]
 [  1.23356929   1.23356929]
 [  1.1286928    1.1286928 ]
 [  2.35758739   2.35758739]
 [  0.48251309   0.48251309]
 [  2.33111503   2.33111503]
 [  9.32006748   9.32006748]
 [  1.31446895   1.31446895]
 [ -1.53047295  -1.53047295]
 [ -7.1829814   -7.1829814 ]
 [ -1.27656996  -1.27656996]
 [ -1.15664849  -1.15664849]
 [ -2.4241663   -2.4241663 ]]
The above two columns should be very similar.
(Left-Numerical Gradient, Right-Analytical Gradient)


If the cost function implementation is correct, then 
the relative difference will be small (less than 1e-9). 

Rel

In [11]:
# Check the cost function's gradient with regularization (lambd = 1.5).
# As above, the printed columns are the numerical (left) and analytical
# (right) gradients and should agree closely.
utils.checkCostFunction(1.5)

[[ -1.46960632  -1.46960632]
 [  2.44468863   2.44468863]
 [  8.08429971   8.08429971]
 [  3.05704021   3.05704021]
 [  5.16908099   5.16908099]
 [ -9.02433266  -9.02433266]
 [  2.07765463   2.07765463]
 [ -0.10045448  -0.10045448]
 [ -2.39082951  -2.39082951]
 [  3.14239509   3.14239509]
 [  3.83539926   3.83539926]
 [  1.45818309   1.45818309]
 [  0.43873725   0.43873725]
 [  2.22756227   2.22756227]
 [ -1.10909823  -1.10909823]
 [  6.37909193   6.37909193]
 [  2.62482604   2.62482604]
 [-15.5092568  -15.5092568 ]
 [  4.30988383   4.30988383]
 [ -0.64570775  -0.64570775]
 [ -0.198471    -0.198471  ]
 [ -2.58738193  -2.58738193]
 [ -2.57459302  -2.57459302]
 [  0.26782615   0.26782615]
 [  0.6964374    0.6964374 ]
 [  3.84689109   3.84689109]
 [  7.01193678   7.01193678]]
The above two columns should be very similar.
(Left-Numerical Gradient, Right-Analytical Gradient)


If the cost function implementation is correct, then 
the relative difference will be small (less than 1e-9). 

Rel

## 3. Set ratings for a new user

In [12]:
# Read the movie titles from the text file.
# Each line has the form "<id> <title>": everything up to the first space
# is dropped and the trailing newline is removed, so movieList[k] holds the
# title of the k-th line of the file.
# The context manager guarantees the file handle is closed, and no
# single-letter helper names are left behind in the notebook namespace.
with open('movie_ids.txt', 'r', encoding='ISO-8859-1') as movie_file:
    movieList = [line.split(' ', 1)[1].rstrip('\n') for line in movie_file]

len(movieList)

1682

In [13]:
# start the new user with one zero entry per movie (0 = not rated yet)
ratings = np.zeros(len(movieList), dtype=np.uint8)
ratings.size

1682

In [14]:
# Movie IDs are listed in movie_ids.txt; the array index is ID - 1.
# For example, Toy Story (1995) has ID 1, so rating it "4" means index 0 -> 4.
new_user_ratings = {
    0: 4,
    97: 2,
    6: 3,
    11: 5,
    53: 4,
    63: 5,
    65: 3,
    68: 5,
    182: 4,
    225: 5,
    354: 5,
}
for movie_idx, stars in new_user_ratings.items():
    ratings[movie_idx] = stars

# list every movie the new user has rated, in index order
print("new user ratings:\n")
for movie_idx in np.flatnonzero(ratings):
    print("rated {} for {}".format(ratings[movie_idx], movieList[movie_idx]))

new user ratings:

rated 4 for Toy Story (1995)
rated 3 for Twelve Monkeys (1995)
rated 5 for Usual Suspects, The (1995)
rated 4 for Outbreak (1995)
rated 5 for Shawshank Redemption, The (1994)
rated 3 for While You Were Sleeping (1995)
rated 5 for Forrest Gump (1994)
rated 2 for Silence of the Lambs, The (1991)
rated 4 for Alien (1979)
rated 5 for Die Hard 2 (1990)
rated 5 for Sphere (1998)


## 4. Train the collaborative filtering model

In [15]:
# Load the movie ratings dataset again: Y and R were cut down to a tiny
# subset for the cost-function checks, and matDict was rebound to the
# pre-trained parameters, so the full data has to be read afresh.
matDict = io.loadmat('ex8_movies.mat')

# display the variable names contained in the file
matDict.keys()

dict_keys(['__header__', '__version__', '__globals__', 'Y', 'R'])

In [16]:
# Y: 1682x943 matrix of ratings (1-5) given to 1682 movies by 943 users
# R: 1682x943 indicator matrix, R[i, j] = 1 iff user j rated movie i
Y, R = matDict['Y'], matDict['R']

Y.shape, R.shape

((1682, 943), (1682, 943))

In [17]:
# the three arrays are concatenated below, so their dtypes should match
tuple(arr.dtype for arr in (Y, R, ratings))

(dtype('uint8'), dtype('uint8'), dtype('uint8'))

In [18]:
# turn ratings into an (n, 1) column vector so it can be joined to Y and R
ratings = ratings.reshape(-1, 1)

In [19]:
# Add the new user's ratings as the first column of Y and R.
# The matrices are built from the freshly loaded arrays in matDict rather
# than from Y and R themselves, so re-running this cell does not append the
# new user a second time.
Y = np.concatenate((ratings, matDict['Y']), axis=1)
# R marks an entry as rated wherever the new user's rating is non-zero
R = np.concatenate(((ratings != 0).astype('uint8'), matDict['R']), axis=1)

Y.shape, R.shape  # both should have +1 on the 2nd axis

((1682, 944), (1682, 944))

In [20]:
# Normalize the ratings. utils.normalizeRatings returns the normalized
# matrix Ynorm (used for training) and Ymean, which is added back to the
# model output when predictions are made later on.
# Presumably Ymean is the per-movie mean over rated entries — confirm in utils.
Ynorm, Ymean = utils.normalizeRatings(Y,R)

In [21]:
# problem sizes taken from the ratings matrix (movies x users)
num_users = Y.shape[1]
num_movies = Y.shape[0]
num_features = 10

# Seed the global RNG so that the random initialization below, and with it
# the trained model and the recommendations, is reproducible across runs.
np.random.seed(0)

# set initial parameters (X, Theta) and unroll them into one flat vector:
# X first, Theta second
X = np.random.randn(num_movies, num_features)
Theta = np.random.randn(num_users, num_features)
initial_parameters = np.concatenate((X, Theta)).ravel()

# set regularization
lambd = 10


# utils.costFunc returns an indexable result: element 0 is used as the
# cost and element 1 as the gradient. The optimizer needs them as two
# separate callables of the flat parameter vector.
def f(t):
    """Return the regularized cost for the flat parameter vector t."""
    return utils.costFunc(t, Ynorm, R, num_users, num_movies, num_features, lambd)[0]


def fprime(t):
    """Return the gradient of the regularized cost at the flat parameter vector t."""
    return utils.costFunc(t, Ynorm, R, num_users, num_movies, num_features, lambd)[1]

In [22]:
# Minimize the cost f with scipy's nonlinear conjugate-gradient optimizer
# (fmin_cg), using fprime as its gradient and stopping after at most 100
# iterations. The result is a flat vector with the learned X and Theta
# unrolled in the same layout as initial_parameters.
theta = fmin_cg(f=f, x0=initial_parameters, fprime=fprime, maxiter=100)

         Current function value: 38977.352339
         Iterations: 100
         Function evaluations: 149
         Gradient evaluations: 149


In [23]:
# Split the flat solution back into its two matrices: the first
# num_movies * num_features entries are X, the remainder is Theta.
split_at = num_movies * num_features
X = theta[:split_at].reshape(num_movies, num_features)
Theta = theta[split_at:].reshape(num_users, num_features)

X.shape, Theta.shape

((1682, 10), (944, 10))

## 5. Make recommendations

In [24]:
# predicted (normalized) ratings for every movie/user pair
p = X @ Theta.T
# column 0 is the new user; add Ymean back to undo the normalization
predictions = p[:, 0] + Ymean

predictions.shape[0]

1682

In [25]:
# Read the movie titles from the text file.
# Each line has the form "<id> <title>": everything up to the first space
# is dropped and the trailing newline is removed, so movieList[k] holds the
# title of the k-th line of the file.
# The context manager guarantees the file handle is closed, and the file
# object is not bound to `f`, so the cost function of that name is not
# overwritten.
with open('movie_ids.txt', 'r', encoding='ISO-8859-1') as movie_file:
    movieList = [line.split(' ', 1)[1].rstrip('\n') for line in movie_file]

len(movieList)

1682

In [26]:
# ix lists the movie indices from highest to lowest predicted rating:
# argsort gives ascending order, which is then reversed
ascending = np.argsort(predictions)
ix = ascending[::-1]

predictions[ix]

array([ 5.00001694,  5.00001142,  5.00000841, ...,  0.99997597,
        0.9999731 ,  0.99996951])

In [27]:
# the ten movies with the highest predicted rating for the new user
print("Top recommendations for the new user:")
for j in ix[:10]:
    rounded = np.rint(predictions[j])
    print("Predicting rating {} for movie {}".format(rounded, movieList[j]))

# the ratings the new user supplied (ratings is an (n, 1) column here)
print("\nOriginal ratings provided:")
for i, stars in enumerate(ratings[:, 0]):
    if stars > 0:
        print('Rated {} for {}'.format(stars, movieList[i]))

Top recommendations for the new user:
Predicting rating 5.0 for movie Entertaining Angels: The Dorothy Day Story (1996)
Predicting rating 5.0 for movie They Made Me a Criminal (1939)
Predicting rating 5.0 for movie Aiqing wansui (1994)
Predicting rating 5.0 for movie Great Day in Harlem, A (1994)
Predicting rating 5.0 for movie Someone Else's America (1995)
Predicting rating 5.0 for movie Marlene Dietrich: Shadow and Light (1996) 
Predicting rating 5.0 for movie Star Kid (1997)
Predicting rating 5.0 for movie Prefontaine (1997)
Predicting rating 5.0 for movie Santa with Muscles (1996)
Predicting rating 5.0 for movie Saint of Fort Washington, The (1993)

Original ratings provided:
Rated 4 for Toy Story (1995)
Rated 3 for Twelve Monkeys (1995)
Rated 5 for Usual Suspects, The (1995)
Rated 4 for Outbreak (1995)
Rated 5 for Shawshank Redemption, The (1994)
Rated 3 for While You Were Sleeping (1995)
Rated 5 for Forrest Gump (1994)
Rated 2 for Silence of the Lambs, The (1991)
Rated 4 for Alie