In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras


movies_dataset = pd.read_csv("movies.csv")
rating_dataset = pd.read_csv("ratings.csv")
columns_movies_dataset = ["movieId", "title"]
columns_rating_dataset = ["userId", "movieId", "rating"]
movies_dataset_cleaned = movies_dataset[columns_movies_dataset]
rating_dataset_cleaned = rating_dataset[columns_rating_dataset]
print(movies_dataset_cleaned.head())
print(rating_dataset_cleaned.head())

   movieId                               title
0        1                    Toy Story (1995)
1        2                      Jumanji (1995)
2        3             Grumpier Old Men (1995)
3        4            Waiting to Exhale (1995)
4        5  Father of the Bride Part II (1995)
   userId  movieId  rating
0       1        1     4.0
1       1        3     4.0
2       1        6     4.0
3       1       47     5.0
4       1       50     5.0


In [36]:
no_of_users = rating_dataset_cleaned['userId'].unique().shape[0]
print(no_of_users)
#find movies without any user giving rating and remove them
movies_rated = set(rating_dataset_cleaned['movieId'].unique())
total_movies = movies_dataset_cleaned['movieId']
movies_unrated = [movie for movie in total_movies if movie not in movies_rated]
#remove movies_unrated from movies_dataset_cleaned
movies_dataset_refined = movies_dataset_cleaned[~movies_dataset_cleaned['movieId'].isin(movies_unrated)]
movies_dataset_refined.to_csv("refined_movies.csv")
no_of_movies = movies_dataset_refined['movieId'].shape[0]
print(no_of_movies)

610
9724


In [37]:
movies_dataset_refined = pd.read_csv("refined_movies_final.csv")
movies_dataset_refined = movies_dataset_refined.reset_index(drop=True)
movies_dataset_refined["movieIndex"] = movies_dataset_refined.index
movies_dataset_refined.to_csv("final_movies_dataset.csv", index=False)
movie_id_to_index = dict(zip(movies_dataset_refined["movieId"], movies_dataset_refined["movieIndex"]))
rating_dataset_cleaned["movieIndex"] = rating_dataset_cleaned["movieId"].apply(lambda x:movie_id_to_index[x])
print(rating_dataset_cleaned.iloc[:30,:])

    userId  movieId  rating  movieIndex
0        1        1     4.0           0
1        1        3     4.0           2
2        1        6     4.0           5
3        1       47     5.0          43
4        1       50     5.0          46
5        1       70     3.0          62
6        1      101     5.0          89
7        1      110     4.0          97
8        1      151     5.0         124
9        1      157     5.0         130
10       1      163     5.0         136
11       1      216     5.0         184
12       1      223     3.0         190
13       1      231     5.0         197
14       1      235     4.0         201
15       1      260     5.0         224
16       1      296     3.0         257
17       1      316     3.0         275
18       1      333     5.0         291
19       1      349     4.0         307
20       1      356     4.0         314
21       1      362     5.0         320
22       1      367     4.0         325
23       1      423     3.0         367


In [38]:
#we need to form 2 2-d arrays. one is output rating array and one is array R where Rij = 1 if rating is there else Rij = 0
y_actual = np.zeros((no_of_movies,no_of_users))
rating_array = np.zeros((no_of_movies,no_of_users))
for index,row in rating_dataset_cleaned.iterrows():
    movie_index = int(row['movieIndex'])
    user_index = int(row['userId']) - 1
    rating_array[movie_index,user_index] = 1
    y_actual[movie_index,user_index] = row['rating']
print(y_actual[0])
    

[4.  0.  0.  0.  4.  0.  4.5 0.  0.  0.  0.  0.  0.  0.  2.5 0.  4.5 3.5
 4.  0.  3.5 0.  0.  0.  0.  0.  3.  0.  0.  0.  5.  3.  3.  0.  0.  0.
 0.  0.  0.  5.  0.  0.  5.  3.  4.  5.  0.  0.  0.  3.  0.  0.  0.  3.
 0.  0.  5.  0.  0.  0.  0.  0.  5.  4.  0.  4.  0.  2.5 0.  0.  5.  0.
 4.5 0.  0.  0.5 0.  4.  0.  0.  0.  2.5 0.  0.  0.  4.  0.  0.  3.  3.
 4.  0.  3.  0.  0.  5.  0.  4.5 0.  0.  0.  0.  4.  0.  0.  0.  4.  0.
 0.  0.  0.  3.  0.  0.  0.  0.  0.  0.  3.5 0.  4.  0.  0.  4.  0.  0.
 0.  0.  0.  3.  0.  2.  0.  3.  4.  0.  4.  0.  0.  3.  4.  0.  0.  3.5
 5.  0.  0.  0.  0.  0.  5.  0.  2.  0.  3.  4.  0.  0.  4.5 4.  4.  0.
 0.  0.  0.  5.  3.5 0.  4.5 0.  5.  0.  0.  0.  0.  0.  5.  4.  4.  0.
 0.  4.  0.  0.  4.  4.  0.  0.  0.  0.  4.  0.  2.  0.  0.  0.  0.  0.
 0.  3.5 5.  4.  0.  0.  0.  5.  0.  0.  0.  0.  0.  0.  3.5 3.  0.  3.
 4.  0.  3.5 5.  0.  0.  3.5 0.  0.  3.5 0.  0.  5.  0.  0.  3.5 3.  5.
 0.  0.  0.  0.  4.  5.  0.  0.  0.  0.  0.  0.  5.  0.  4.  0

In [39]:
#we have our output values ready
# now we need our X and w matrices. X matrix shape should be no_of_movies x features and w should be no_of_users x features so when x and wt are
# multiplied we get y as no_of_movies x no_of_users which is our y. lets take 10 features
# w = np.random.randn(no_of_users+1, 10) * 0.01
# X = np.random.randn(no_of_movies,10) * 0.01
# b = np.zeros((no_of_movies, 1))

w = tf.Variable(np.random.randn(no_of_users + 1, 20) * 0.01, dtype=tf.float32)
X = tf.Variable(np.random.randn(no_of_movies, 20) * 0.01, dtype=tf.float32)
b = tf.Variable(np.zeros((no_of_movies, 1)), dtype=tf.float32)

#lets print all shapes to verify if we are correct
print("Y shape is ", y_actual.shape)
print("R shape is ", rating_array.shape)
print("w shape is ", w.shape)
print("X shape is ", X.shape)

Y shape is  (9724, 610)
R shape is  (9724, 610)
w shape is  (611, 20)
X shape is  (9724, 20)


In [40]:
# def compute_cost(X, W, b, Y, R, lambda_):
#     y_pred = X @ W.T + b  
#     error = (y_pred - Y) * R 
#     J = 0.5 * np.sum(error ** 2) 
#     J += 0.5 * lambda_ * (np.sum(W ** 2) + np.sum(X ** 2))
#     return J

def compute_cost(X, W, b, Y, R, lambda_):
    y_pred = tf.matmul(X, tf.transpose(W)) + b
    error = (y_pred - Y) * R
    J = 0.5 * tf.reduce_sum(tf.square(error))
    J += 0.5 * lambda_ * (tf.reduce_sum(tf.square(W)) + tf.reduce_sum(tf.square(X)))
    return J  


In [41]:
#now lets add our ratings. we need to increase no of users by 1 which indicate us
random_movies_indices = np.random.choice(no_of_movies, 30, replace=False)
print(random_movies_indices)
random_movies_selected = movies_dataset_refined.loc[random_movies_indices,:]
print(random_movies_selected)

[3688 6298 1982 1158 4356 6732 7026 4413 4112 6611 1681 1706 3715 2975
 4597 4809 8714 3738 8300 3191 7992 5022 6479 5860 5767 5344 8567 6554
 2693  341]
      movieId                                              title  movieIndex
3688     5093                           Collateral Damage (2002)        3688
6298    48516                               Departed, The (2006)        6298
1982     2633                                  Mummy, The (1932)        1982
1158     1529                                     Nowhere (1997)        1158
4356     6379                                  Wrong Turn (2003)        4356
6732    59429  American Pie Presents Beta House (American Pie...        6732
7026    69122                               Hangover, The (2009)        7026
4413     6528             Start the Revolution Without Me (1970)        4413
4112     5903                                 Equilibrium (2002)        4112
6611    56169                                       Awake (2007)        6611

In [42]:
my_ratings = np.zeros(no_of_movies)
# random_ratings = np.random.randint(1,5,30)
# for i, (_, movie) in enumerate(random_movies_selected.iterrows()):
#     movie_index = movie['movieIndex']
#     my_ratings[movie_index] = random_ratings[i]
my_ratings[1] = 4
my_ratings[6182] = 3
my_ratings[6191] = 4
my_ratings[6234] = 3.5
my_ratings[6241] = 4.5
my_ratings[6248] = 4
my_ratings[6299] = 3.5
my_ratings[6314] = 4.5
my_ratings[6364] = 2.5
my_ratings[6450] = 3.5
my_ratings[6693] = 5
my_ratings[6729] = 3.5
my_ratings[6736] = 4.5
my_ratings[6743] = 4
my_ratings[6975] = 2.5
my_ratings[7069] = 3
my_ratings[7114] = 3.5
my_ratings[7226] = 5
my_ratings[7241] = 4.5
my_ratings[7305] = 1.5
my_ratings[7396] = 3.5
my_ratings[7637] = 4.5
my_ratings[7650] = 2.5
my_ratings[7707] = 3.5
my_ratings[7750] = 3
my_ratings[7756] = 2.5
my_ratings[7827] = 3.5
my_ratings[7937] = 2.5
my_ratings[8008] = 3

In [43]:
y_actual = np.c_[my_ratings, y_actual]
print(y_actual.shape)

(9724, 611)


In [44]:
my_rating_exists_array = (my_ratings > 0).astype(int)
rating_array = np.c_[my_rating_exists_array,rating_array]
print(rating_array.shape)

(9724, 611)


In [45]:
#normalize the rating by calc mean rating for every movie
def normalize_function(Y,R):
    Ymean = (np.sum(Y*R,axis=1)/(np.sum(R, axis=1)+1e-12)).reshape(-1,1)
    Ynorm = Y - np.multiply(Ymean, R) 
    return Ynorm, Ymean

In [46]:
y_actual_normalized, y_mean = normalize_function(y_actual,rating_array)


In [47]:
# iterations = 1500
# lambda_ = 0.1
# learning_rate = 0.01
# for iter in range(iterations):
#     print(iter)
#     with tf.GradientTape() as tape:
#         cost_value = compute_cost(X, w, b, y_actual_normalized, rating_array, lambda_)
#     grads = tape.gradient( cost_value, [X,w,b] )
#     dX, dW, db = grads
#     X.assign_sub(learning_rate * dX)
#     w.assign_sub(learning_rate * dW)
#     b.assign_sub(learning_rate * db)
#     if iter % 20 == 0:
#         print(f"Training loss at iteration {iter}: {cost_value:0.1f}")

import tensorflow as tf

iterations = 1000   
lambda_ = 0.05      
learning_rate = 0.01  
min_delta = 1e-5  
previous_loss = float('inf')

for iter in range(iterations):
    with tf.GradientTape() as tape:
        cost_value = compute_cost(X, w, b, y_actual_normalized, rating_array, lambda_)

    grads = tape.gradient(cost_value, [X, w, b])    
    grads = [tf.clip_by_value(g, -0.5, 0.5) for g in grads]  
    dX, dW, db = grads
    X.assign_sub(learning_rate * dX)
    w.assign_sub(learning_rate * dW)
    b.assign_sub(learning_rate * db)
    if iter % 20 == 0:
        print(f"Training loss at iteration {iter}: {cost_value.numpy():.4f}")    
    if abs(previous_loss - cost_value.numpy()) < min_delta:
        print(f"Stopping early at iteration {iter}. Loss stabilized at {cost_value.numpy():.4f}")
        break

    previous_loss = cost_value.numpy()


Training loss at iteration 0: 38714.7031
Training loss at iteration 20: 37264.7891
Training loss at iteration 40: 31005.8926
Training loss at iteration 60: 24461.7656
Training loss at iteration 80: 18670.5234
Training loss at iteration 100: 14346.0527
Training loss at iteration 120: 11510.0635
Training loss at iteration 140: 9747.6113
Training loss at iteration 160: 8622.5098
Training loss at iteration 180: 7864.7993
Training loss at iteration 200: 7333.6455
Training loss at iteration 220: 6946.8159
Training loss at iteration 240: 6655.7451
Training loss at iteration 260: 6430.3003
Training loss at iteration 280: 6251.7139
Training loss at iteration 300: 6107.3638
Training loss at iteration 320: 5988.5063
Training loss at iteration 340: 5889.0679
Training loss at iteration 360: 5804.7769
Training loss at iteration 380: 5732.4380
Training loss at iteration 400: 5669.7417
Training loss at iteration 420: 5614.8936
Training loss at iteration 440: 5566.4858
Training loss at iteration 460: 5

In [48]:
y_pred = tf.matmul(X, tf.transpose(w)) + b 
y_pred_array = y_pred.numpy()
y_pred_array += y_mean
print("Predictions for the last user:")
print(y_pred_array[52,0])  

Predictions for the last user:
1.9117352


In [49]:
print("Actual vs Predicted Ratings:")
print(y_pred_array.shape)
for i, rating in enumerate(my_ratings):
    if rating > 0:
        print(f"Movie Index {i}: Actual = {rating}, Predicted = {y_pred_array[i][0]:.2f}")

Actual vs Predicted Ratings:
(9724, 611)
Movie Index 1: Actual = 4.0, Predicted = 3.93
Movie Index 6182: Actual = 3.0, Predicted = 3.08
Movie Index 6191: Actual = 4.0, Predicted = 3.97
Movie Index 6234: Actual = 3.5, Predicted = 3.48
Movie Index 6241: Actual = 4.5, Predicted = 4.53
Movie Index 6248: Actual = 4.0, Predicted = 4.00
Movie Index 6299: Actual = 3.5, Predicted = 3.50
Movie Index 6314: Actual = 4.5, Predicted = 4.40
Movie Index 6364: Actual = 2.5, Predicted = 2.50
Movie Index 6450: Actual = 3.5, Predicted = 3.50
Movie Index 6693: Actual = 5.0, Predicted = 4.85
Movie Index 6729: Actual = 3.5, Predicted = 3.49
Movie Index 6736: Actual = 4.5, Predicted = 4.49
Movie Index 6743: Actual = 4.0, Predicted = 4.03
Movie Index 6975: Actual = 2.5, Predicted = 2.51
Movie Index 7069: Actual = 3.0, Predicted = 3.00
Movie Index 7114: Actual = 3.5, Predicted = 3.49
Movie Index 7226: Actual = 5.0, Predicted = 5.00
Movie Index 7241: Actual = 4.5, Predicted = 4.41
Movie Index 7305: Actual = 1.5,

In [50]:
import tensorflow as tf

def find_similar_users(w, top_k=5):
    user_vec = w[0]
    
    w_norm = tf.nn.l2_normalize(w, axis=1)
    user_vec_norm = tf.nn.l2_normalize(user_vec, axis=0)
    
    similarity = tf.linalg.matmul(w_norm, tf.expand_dims(user_vec_norm, 1))
    similarity = tf.squeeze(similarity)

    similar_users = tf.argsort(similarity, direction='DESCENDING').numpy()
    similar_users = [i for i in similar_users if i != 0][:top_k]

    scores = similarity.numpy()[similar_users]
    return list(zip(similar_users, scores))

similar_users = find_similar_users(w, top_k=5)
print("Top similar users:", similar_users)


Top similar users: [(318, 0.6792879), (227, 0.6421401), (125, 0.62869483), (274, 0.5592607), (363, 0.5578882)]


In [51]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

def find_similar_users_knn(w, top_k=5):
    w_np = w.numpy() 
    user_vec = w_np[0].reshape(1, -1)
    
    knn = NearestNeighbors(n_neighbors=top_k+1, metric="cosine")
    knn.fit(w_np)

    distances, indices = knn.kneighbors(user_vec)
    similar_users = indices[0][1:]
    similarity_scores = 1 - distances[0][1:]

    return list(zip(similar_users, similarity_scores))

similar_users_knn = find_similar_users_knn(w, top_k=5)
print("Top similar users using KNN:", similar_users_knn)


Top similar users using KNN: [(318, 0.6792879), (227, 0.64214003), (125, 0.6286947), (274, 0.5592607), (363, 0.5578882)]
