In [14]:
# Download the MovieLens "ml-latest-small" dataset archive from GroupLens.
# First remove everything in the working directory whose name starts with
# "ml-latest-small" (the glob covers the zip as well as the extracted folder),
# so the download and extraction below start from a clean state.
!rm -rf ml-latest-small*
# Fetch the zip archive into the current working directory.
!wget https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
# Unpack it; the archive extracts into ./ml-latest-small/ (movies.csv, ratings.csv, ...).
!unzip ml-latest-small.zip

--2023-08-14 05:26:34--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2023-08-14 05:26:37 (749 KB/s) - ‘ml-latest-small.zip’ saved [978202/978202]

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [48]:
import tensorflow as tf
import numpy as np
import pandas as pd
# `display` and `HTML` are part of the public IPython.display API; importing
# them from IPython.core.display is deprecated and emits a DeprecationWarning.
from IPython.display import display, HTML

  from IPython.core.display import display, HTML


In [None]:
# Load the two MovieLens tables used by this notebook.
df_movies = pd.read_csv('ml-latest-small/movies.csv')

# The ratings table is read and its 'timestamp' column is discarded in one
# expression, leaving the remaining columns untouched.
df_ratings = (
    pd.read_csv('ml-latest-small/ratings.csv')
    .drop(columns=['timestamp'])
)


In [57]:
# Rating matrix: one row per movieId, one column per userId, cell = rating.
# (movie, user) pairs that have no rating come out of the pivot as NaN and are
# replaced by 0.
ratings = (
    df_ratings
    .pivot(index='movieId', columns='userId', values='rating')
    .fillna(0)
)

# Work on a copy so that df_ratings keeps the real rating values while the
# copy is turned into rated / not-rated flags for the r(i,j) matrix.
df_r_ratings = df_ratings.copy()

# Overwrite every positive rating with 1 using boolean-mask assignment, see
# https://stackoverflow.com/questions/49161120/set-value-of-one-pandas-column-based-on-value-in-another-column
df_r_ratings.loc[df_r_ratings['rating'].gt(0), 'rating'] = 1

# r(i,j) indicator matrix with the same layout as `ratings`
# (movieId rows, userId columns); cells hold 1 where a rating exists.
r_ratings = df_r_ratings.pivot(index='movieId', columns='userId', values='rating')

# Unrated pairs are NaN after the pivot; set them to 0 so those cells are
# masked out when the cost function multiplies the error by this matrix.
r_ij = r_ratings.fillna(0)
r_ij

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
# Render a heading followed by the image of the cost-function equation.
display(HTML(
    '<div><h2>Create the cost function based on the equation below.</h2></div>'
    '<img src="images/cost_function_cf.png" alt="Alternative text" />'
))

In [59]:
def cost_function(X, W, b, Y, R, lambda_):
    """
    Regularized squared-error cost for collaborative filtering.

    The predicted rating of movie i by user j is X[i] . W[j] + b[0, j]. Only
    entries with R(i, j) = 1 contribute to the error term; X and W (but not b)
    are penalized by the L2 regularization term.

    Args:
      X (ndarray (num_movies, num_features)): matrix of item features
      W (ndarray (num_users, num_features)): matrix of user parameters
      b (ndarray (1, num_users)): vector of user bias parameters
      Y (ndarray (num_movies, num_users)): matrix of user ratings of movies
      R (ndarray (num_movies, num_users)): matrix, where R(i, j) = 1 if the
        i-th movie was rated by the j-th user
      lambda_ (float): regularization parameter
    Returns:
      J (float): Cost
    """
    # Predictions for every (movie, user) pair; b broadcasts over the movie rows.
    predicted = tf.linalg.matmul(X, tf.transpose(W)) + b

    # Multiplying by R zeroes the error wherever no rating exists.
    masked_error = (predicted - Y) * R
    fit_term = 0.5 * tf.reduce_sum(masked_error**2)

    # L2 penalty over the item features and the user weights.
    penalty_term = (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))

    return fit_term + penalty_term

In [60]:
# Problem size: only the first `num_movies` rows (movies) of the pivoted
# tables are used, and each movie / user gets `num_features` learned features.
num_movies = 30
num_features = 5

# Optional normalisation of the ratings, currently disabled:
#ratings=(ratings-ratings.mean())/ratings.std()

# Y holds the ratings and R the rated / not-rated flags for the selected movies.
Y = tf.constant(ratings.iloc[:num_movies].to_numpy())
R = tf.constant(r_ij.iloc[:num_movies].to_numpy())
num_users = Y.shape[1]

# Randomly initialised parameters, wrapped in tf.Variable so they can be
# tracked and updated. The seed is fixed for consistent results; W, X and b
# are drawn in this order.
tf.random.set_seed(1234)
W = tf.Variable(tf.random.normal(shape=(num_users, num_features), dtype=tf.float64), name='W')
X = tf.Variable(tf.random.normal(shape=(num_movies, num_features), dtype=tf.float64), name='X')
b = tf.Variable(tf.random.normal(shape=(1, num_users), dtype=tf.float64), name='b')

# Adam optimizer used to update the parameters.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-1)


In [61]:
# Sanity check: shapes of the data tensors and of the trainable parameters.
print(f"Y {Y.shape} R {R.shape}")
print(f"X {X.shape}")
print(f"W {W.shape}")
print(f"b {b.shape}")

# Problem dimensions.
print(f"num_features {num_features}")
print(f"num_movies {num_movies}")
print(f"num_users {num_users}")

# Contents of the rating tensor followed by the rated / not-rated tensor.
print(Y, R, sep="\n")

Y (30, 610) R (30, 610)
X (30, 5)
W (610, 5)
b (1, 610)
num_features 5
num_movies 30
num_users 610
tf.Tensor(
[[4.  0.  0.  ... 2.5 3.  5. ]
 [0.  0.  0.  ... 2.  0.  0. ]
 [4.  0.  0.  ... 2.  0.  0. ]
 ...
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]], shape=(30, 610), dtype=float64)
tf.Tensor(
[[1. 0. 0. ... 1. 1. 1.]
 [0. 0. 0. ... 1. 0. 0.]
 [1. 0. 0. ... 1. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]], shape=(30, 610), dtype=float64)


In [62]:
iterations = 200
lambda_ = 1

# The loop variable is named `step` rather than `iter`: in a notebook the loop
# variable lives in the global namespace, so `iter` would shadow the builtin
# iter() for every cell that runs afterwards.
for step in range(iterations):

    # Use TensorFlow’s GradientTape
    # to record the operations used to compute the cost
    with tf.GradientTape() as tape:

        # Compute the cost (forward pass included in cost)
        cost_value = cost_function(X, W, b, Y, R, lambda_)

    # Use the gradient tape to automatically retrieve
    # the gradients of the trainable variables with respect to the loss
    grads = tape.gradient(cost_value, [X, W, b])

    # Run one optimizer step, updating the variables in place
    # to reduce the loss.
    optimizer.apply_gradients(zip(grads, [X, W, b]))

    # Log periodically. The value printed is the cost computed before this
    # step's update was applied.
    if step % 20 == 0:
        print(f"Training loss at iteration {step}: {cost_value:0.1f}")


Training loss at iteration 0: 16078.9
Training loss at iteration 20: 1536.8
Training loss at iteration 40: 694.1
Training loss at iteration 60: 420.3
Training loss at iteration 80: 330.6
Training loss at iteration 100: 293.2
Training loss at iteration 120: 269.8
Training loss at iteration 140: 250.1
Training loss at iteration 160: 232.5
Training loss at iteration 180: 217.4


In [70]:
# Make a prediction using trained weights and biases
p = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()
print(p.shape)

# prediction of the first user in the list
my_predictions = p[:,0]
print(my_predictions)

(30, 610)
[4.06537611 3.37549034 3.81210399 3.14439812 3.60138258 3.94035267
 3.60522032 3.3125285  3.22103391 3.69329973 3.80978694 3.16999284
 3.2193819  3.41199525 3.33185065 3.92644187 3.33776146 3.92732181
 2.84367123 3.14895458 3.17553924 3.79098289 3.4853064  3.34288173
 3.43145639 3.51717496 3.19468999 3.49347199 3.67276244 3.14962299]
