In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
df_ratings = pd.read_csv('Datasets/ml-latest-small/ratings.csv')
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
df_movies = pd.read_csv('Datasets/ml-latest-small/movies.csv')
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movie_list = df_movies["title"].to_list()
new_movie_list = []
allowed_year_list = [str(i) for i in range(2000, 2024)]
for i in movie_list:
    year = i.split(" ")[-1][1:-1]
    if (year in allowed_year_list):
        new_movie_list.append(i)
new_df_movies = df_movies[df_movies["title"].isin(new_movie_list)]
new_df_movies = new_df_movies.reset_index(drop=True)
new_df_movies.head()

Unnamed: 0,movieId,title,genres
0,2769,"Yards, The (2000)",Crime|Drama
1,3177,Next Friday (2000),Comedy
2,3190,Supernova (2000),Adventure|Sci-Fi|Thriller
3,3225,Down to You (2000),Comedy|Romance
4,3273,Scream 3 (2000),Comedy|Horror|Mystery|Thriller


In [5]:
nm = new_df_movies['title'].shape[0]
nu = df_ratings['userId'].unique().shape[0]
print("Number of movies:", nm)
print("Number of users:", nu)

Number of movies: 4773
Number of users: 610


In [6]:
num_features = 10
X = np.zeros((nm, num_features))
W = np.zeros((nu, num_features))
b = np.zeros((nu,))
R = np.zeros((nm, nu))
Y = np.zeros((nm, nu))
userId = df_ratings['userId']
movieId = df_ratings['movieId']
rating = df_ratings['rating']
for i in range(len(userId)):
    if (movieId[i] > nm):
        continue
    m_Id = movieId[i]
    u_Id = userId[i]
    Y[m_Id - 1][u_Id - 1] = rating[i]
    R[m_Id - 1][u_Id - 1] = 1

In [7]:
# this function is not computationaly efficient in calculating the cost
def cofi_cost_func(X, W, b, Y, R, lambda_):
    nm, nu = Y.shape
    J = 0

    for j in range(nu):
        for i in range(nm):
            J += R[i][j]*(np.dot(W[j],X[i])+b[0][j] - Y[i][j])**2
    J = 0.5*J
    
    reg_w = 0
    for j in range(nu):
        for k in range(X.shape[1]):
            reg_w += W[j, k]**2
    J += lambda_*0.5*reg_w
            
    reg_x = 0
    for i in range(nm):
        for k in range(X.shape[1]):
            reg_x += X[i, k]**2
    J += lambda_*0.5*reg_x

    return J

In [8]:
# this function is computationaly efficient in calculating the cost as it uses vectorization
def cofi_cost_func_v(X, W, b, Y, R, lambda_):
    j = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y)*R
    J = 0.5 * tf.reduce_sum(j**2) + (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return J

In [9]:
# normalizing the ratings to get a better computational speeds
def normalizeRatings(Y, R):
    nm, nu = Y.shape
    Ymean = np.zeros((nm,))
    for i in range(nm):
        val = 0
        for j in range(nu):
            if (R[i][j] == 1):
                val += Y[i][j]
        if (sum(R[i]) != 0):
            val /= sum(R[i])
            Ymean[i] = val
    Ynorm = np.transpose(Y) - Ymean
    Ynorm = np.transpose(Ynorm)
    return Ynorm, Ymean

In [10]:
movieList = new_df_movies['title']
movieList_df = new_df_movies

my_ratings = np.zeros(nm)

my_ratings[2700] = 5   # Ricky Gervais Live 3: Fame (2007)
my_ratings[2609] = 2   # Girl Who Played with Fire, The (Flickan som lekte med elden) (2009)
my_ratings[929]  = 5   # Lord of the Rings: The Return of the King, The
my_ratings[246]  = 5   # Shrek (2001)
my_ratings[2716] = 3   # Grown Ups (2010)
my_ratings[1150] = 5   # Incredibles, The (2004)
my_ratings[382]  = 2   # Amelie (Fabuleux destin d'Amélie Poulain, Le)
my_ratings[366]  = 5   # Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
my_ratings[622]  = 5   # Harry Potter and the Chamber of Secrets (2002)
my_ratings[988]  = 3   # Eternal Sunshine of the Spotless Mind (2004)
my_ratings[2925] = 1   # Louis Theroux: Law & Disorder (2008)
my_ratings[2937] = 1   # Nothing to Declare (Rien à déclarer)
my_ratings[793]  = 5   # Pirates of the Caribbean: The Curse of the Black Pearl (2003)
my_rated = [i for i in range(len(my_ratings)) if my_ratings[i] > 0]

print('\nNew user ratings:\n')
for i in range(len(my_ratings)):
    if (my_ratings[i] > 0):
        print(f'Rated {my_ratings[i]} for  {movieList_df.loc[i,"title"]}')


New user ratings:

Rated 5.0 for  Shrek (2001)
Rated 5.0 for  Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Rated 2.0 for  Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
Rated 5.0 for  Harry Potter and the Chamber of Secrets (2002)
Rated 5.0 for  Pirates of the Caribbean: The Curse of the Black Pearl (2003)
Rated 5.0 for  Lord of the Rings: The Return of the King, The (2003)
Rated 3.0 for  Eternal Sunshine of the Spotless Mind (2004)
Rated 5.0 for  Incredibles, The (2004)
Rated 2.0 for  Girl Who Played with Fire, The (Flickan som lekte med elden) (2009)
Rated 5.0 for  Ricky Gervais Live 3: Fame (2007)
Rated 3.0 for  Grown Ups (2010)
Rated 1.0 for  Louis Theroux: Law & Disorder (2008)
Rated 1.0 for  Nothing to Declare (Rien à déclarer) (2010)


In [11]:
# adding new user data to Y and R matrices, normalizing Y matrix and getting the list of average ratings
Y = np.c_[my_ratings, Y]
R = np.c_[(my_ratings != 0).astype(int), R]
Ynorm, Ymean = normalizeRatings(Y, R)

In [12]:
num_movies, num_users = Y.shape
num_features = 100

# Set Initial Parameters (W, X), use tf.Variable to track these variables
tf.random.set_seed(1234) # for consistent results
W = tf.Variable(tf.random.normal((num_users,  num_features),dtype=tf.float64),  name='W')
X = tf.Variable(tf.random.normal((num_movies, num_features),dtype=tf.float64),  name='X')
b = tf.Variable(tf.random.normal((1,          num_users),   dtype=tf.float64),  name='b')

# Instantiate an optimizer.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-1)

In [13]:
iterations = 200
lambda_ = 1
for iter in range(iterations):
    # Use TensorFlow’s GradientTape
    # to record the operations used to compute the cost 
    with tf.GradientTape() as tape:

        # Compute the cost (forward pass included in cost)
        cost_value = cofi_cost_func_v(X, W, b, Ynorm, R, lambda_)

    # Use the gradient tape to automatically retrieve
    # the gradients of the trainable variables with respect to the loss
    grads = tape.gradient( cost_value, [X,W,b] )

    # Run one step of gradient descent by updating
    # the value of the variables to minimize the loss.
    optimizer.apply_gradients( zip(grads, [X,W,b]) )

    # Log periodically.
    if (iter == 0):
        print(f"Training loss at iteration {iter}: {cost_value:0.1f}")
    elif (iter + 1) % 20 == 0:
        print(f"Training loss at iteration {iter + 1}: {cost_value:0.1f}")

Training loss at iteration 0: 3388777.4
Training loss at iteration 20: 152680.2
Training loss at iteration 40: 62217.4
Training loss at iteration 60: 32577.5
Training loss at iteration 80: 19552.5
Training loss at iteration 100: 12922.3
Training loss at iteration 120: 9229.8
Training loss at iteration 140: 7058.9
Training loss at iteration 160: 5733.4
Training loss at iteration 180: 4898.3
Training loss at iteration 200: 4357.1


In [14]:
# shapes of X, W and b
X_np = X.numpy()
print(X_np.shape)
W_np = np.transpose(W.numpy())
print(W_np.shape)
b_np = b.numpy()
print(b_np.shape)


(4773, 100)
(100, 611)
(1, 611)


In [15]:
# A = X.numpy()
# B = W.numpy()
# P = np.zeros((A.shape[0], B.shape[0]))
# n, m = P.shape
# for i in range(n):
#     for j in range(m):
#         P[i][j] = np.dot(A[i], B[j])

In [16]:
# A = X.numpy()
# B = W.numpy()
# P = np.zeros((A.shape[0], B.shape[1]))
# n, m = P.shape
# for i in range(n):
#     for j in range(m):
#         P[i] = A[i] @ B[j]

In [17]:
A = X.numpy()
B = np.transpose(W.numpy())
P = np.zeros((A.shape[0], B.shape[1]))
n, m = P.shape
for i in range(n):
    for j in range(m)
        l1 = A[i] @ B[:, :100]
        l_ = np.concatenate((l1, ))
    P[i] = l_

: 

In [None]:
p = P + b.numpy()
Ymean = np.reshape(Ymean, (4773, 1))
pm = p + Ymean

my_predictions = pm[:,0]

ix = tf.argsort(my_predictions, direction='DESCENDING')

for i in range(10):
    j = ix[i]
    if (j not in my_rated):
        print(f'Predicting rating {my_predictions[j]:0.2f} for movie {movieList[int(j)]}')

print('\n\nOriginal vs Predicted ratings:\n')
for i in range(len(my_ratings)):
    if my_ratings[i] > 0:
        print(f'Original {my_ratings[i]}, Predicted {my_predictions[i]:0.2f} for {movieList[i]}')

Predicting rating 5.68 for movie Rush Hour 2 (2001)
Predicting rating 5.11 for movie Lucky Break (2001)
Predicting rating 5.11 for movie Watchmen (2009)
Predicting rating 5.11 for movie Takers (2010)
Predicting rating 5.09 for movie Tarnation (2003)
Predicting rating 5.08 for movie The Jungle Book (2016)
Predicting rating 5.07 for movie Sherlock Holmes: A Game of Shadows (2011)
Predicting rating 5.07 for movie Friends with Kids (2011)
Predicting rating 5.07 for movie Grudge, The (2004)
Predicting rating 5.07 for movie Up at the Villa (2000)


Original vs Predicted ratings:

Original 5.0, Predicted 4.74 for Shrek (2001)
Original 5.0, Predicted 4.87 for Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Original 2.0, Predicted 2.22 for Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
Original 5.0, Predicted 5.02 for Harry Potter and the Chamber of Secrets (2002)
Original 5.0, Predicted 5.02 for Pirates of the Caribbean: The Curse of the Bla

In [None]:
n = R.shape[0]
num_ratings = np.zeros(n)
for i in range(n):
    num_ratings[i] = sum(R[i])
movieList_df["number of ratings"] = num_ratings.astype(np.int64)
movieList_df["mean rating"] = Ymean

In [None]:
filter=(movieList_df["number of ratings"] > 20)
movieList_df["pred"] = my_predictions
movieList_df = movieList_df.reindex(columns=["pred", "mean rating", "number of ratings", "title"])
movieList_df.loc[ix[:300]].loc[filter].sort_values("mean rating", ascending=False)

Unnamed: 0,pred,mean rating,number of ratings,title
317,4.892263,4.429022,317,Rock Star (2001)
921,4.485231,4.333333,27,In America (2002)
245,4.523769,4.293103,29,Angel Eyes (2001)
1234,4.469397,4.288462,26,"Twilight Samurai, The (Tasogare Seibei) (2002)"
929,4.823760,4.285714,21,"Lord of the Rings: The Return of the King, The..."
...,...,...,...,...
2907,4.287309,3.551282,39,Hanna (2011)
379,4.374803,3.497191,178,Baran (2001)
1369,4.313443,3.404412,68,Palindromes (2004)
366,4.872404,3.196203,158,Harry Potter and the Sorcerer's Stone (a.k.a. ...


In [None]:
# getting simillar movies by calculating the distance between feature vactors
movie_idx = 246
num = X_np.shape[0]
distance_list = np.zeros(num)
for i in range(num):
    distance_list[i] = sum((X_np[i] - X_np[movie_idx])**2)

new_df = pd.DataFrame({"movie":movieList, "distance":distance_list})
sorted_by_distance_df = new_df.sort_values(by='distance')
sorted_by_distance_df.head(20)


Unnamed: 0,movie,distance
246,Shrek (2001),0.0
186,Left Behind: The Movie (2000),1.027566
4740,Jurassic World: Fallen Kingdom (2018),1.068009
4258,Nowitzki: The Perfect Shot (2014),1.13282
3988,Ghost in the Shell Arise - Border 1: Ghost Pai...,1.13471
4696,The Greatest Showman (2017),1.136554
237,Driven (2001),1.146133
730,Cowboy Bebop: The Movie (Cowboy Bebop: Tengoku...,1.153043
1160,"SpongeBob SquarePants Movie, The (2004)",1.156235
205,15 Minutes (2001),1.157684


16.546048143114863