In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances

from sklearn.preprocessing import StandardScaler


In [3]:
relevent_threshold = 3.5

**Load the dataset**

In [4]:
ratings_df = pd.read_csv('ratings.csv')
movies_df = pd.read_csv('movies.csv')
merged_df = pd.merge(ratings_df, movies_df, on='movieId')
# Filter out users who have rated fewer movies (optional)
user_counts = ratings_df['userId'].value_counts()
active_users = user_counts[user_counts >= 20].index
ratings_df = ratings_df[ratings_df['userId'].isin(active_users)]

ratings_df.head()
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


#Split the training set and test set

In [5]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(ratings_df, test_size=0.2, random_state=42)

In [5]:
train_set.head()

Unnamed: 0,userId,movieId,rating,timestamp
37865,273,5816,4.5,1466946328
46342,339,2028,4.5,1446663181
64614,461,3895,0.5,1093224965
41974,300,3578,4.5,1086010878
50236,369,292,3.0,847465462


In [6]:
test_set.head()

Unnamed: 0,userId,movieId,rating,timestamp
19090,128,1028,5.0,1049690908
99678,665,4736,1.0,1010197684
18455,120,4002,3.0,1167420604
35755,257,1274,4.0,1348544094
66536,468,6440,4.0,1296191715


**Create the User-Item Matrix**

In [6]:
user_item_matrix = train_set.pivot_table(index='userId', columns='movieId', values='rating', fill_value=0)
user_item_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,160656,160718,161084,161155,161594,161830,161918,161944,162542,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,...,0.0,0,0.0,0.0,0,0,0.0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,4.0,...,0.0,0,0.0,0.0,0,0,0.0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,...,0.0,0,0.0,0.0,0,0,0.0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,...,0.0,0,0.0,0.0,0,0,0.0,0,0,0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,...,0.0,0,0.0,0.0,0,0,0.0,0,0,0


In [74]:
# Calculate User Similarity
user_similarity = cosine_similarity(user_item_matrix)
np.fill_diagonal(user_similarity, 0)  # Set diagonal elements to 0 to avoid self-similarity
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

user_similarity_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.085679,0.019716,0.0,0.084181,0.0,0.014354,0.0,...,0.0,0.0,0.003314,0.035803,0.0,0.0,0.0,0.075423,0.0,0.020265
2,0.0,0.0,0.092037,0.081137,0.072216,0.0,0.149972,0.084527,0.110684,0.05102,...,0.374325,0.069295,0.075608,0.149802,0.462915,0.339362,0.042671,0.02181,0.112026,0.074662
3,0.0,0.092037,0.0,0.050344,0.065398,0.085793,0.061324,0.143879,0.104469,0.017533,...,0.052034,0.029244,0.129791,0.128113,0.121741,0.023173,0.064155,0.025504,0.065198,0.081216
4,0.085679,0.081137,0.050344,0.0,0.103764,0.092846,0.251964,0.153572,0.035802,0.098074,...,0.086965,0.035391,0.109564,0.21565,0.102951,0.060254,0.085024,0.102528,0.050497,0.204885
5,0.019716,0.072216,0.065398,0.103764,0.0,0.086933,0.060305,0.119289,0.080419,0.040818,...,0.177942,0.024758,0.111415,0.177003,0.130934,0.062006,0.054312,0.024675,0.053662,0.181957


#User-based rating prediction && evaluation metrics


**Rating Predictions Approach**

In [8]:
test_ratings = test_set.pivot_table(index='userId', columns='movieId', values='rating', fill_value=0)
train_ratings = train_set.pivot_table(index='userId', columns='movieId', values='rating', fill_value=0)

In [75]:
def calculate_rmse(predicted_ratings, test_ratings):
    squared_errors = []
    predicted_ratings.fillna(0)
    test_ratings.fillna(0)
    for user_id in test_ratings.index:
        if user_id in predicted_ratings.index:
          common_movies = predicted_ratings.loc[user_id][predicted_ratings.loc[user_id] > 0].index.intersection(test_ratings.loc[user_id][test_ratings.loc[user_id] > 0].index)

          squared_diff = (predicted_ratings.loc[user_id, common_movies] - test_ratings.loc[user_id, common_movies]) ** 2
          squared_errors.extend(squared_diff.tolist())

    mean_squared_error = np.mean(squared_errors)
    rmse = np.sqrt(mean_squared_error)

    return rmse


In [70]:
# Get the mean_ratings
mean_rating = ratings_df['rating'].mean()
mean_ratings = ratings_df.groupby('movieId')['rating'].mean()
num_ratings = ratings_df.groupby('movieId')['rating'].count()

mean_ratings[num_ratings <= 50] = mean_rating

print(mean_ratings)

def generate_user_based_predictions_test(user_id, top_n=5):
    # Find similar users in the user neighborhood
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:top_n+1].index

    # Get movies rated by the target user
    user_movies = user_item_matrix.loc[user_id]

    # Calculate the weighted average of ratings based on similar users' ratings
    similarity_sum = user_similarity_df.loc[user_id, similar_users].sum()
    predicted_ratings = np.dot(user_item_matrix.loc[similar_users].T, user_similarity_df.loc[user_id, similar_users])
    predicted_ratings /= similarity_sum


    predicted_ratings = pd.Series(predicted_ratings, index = user_movies.index)

    # Fill missing predicted ratings with the mean rating of the movie
    missing_ratings = predicted_ratings[predicted_ratings == 0]
    missing_item_ids = missing_ratings.index
    default_ratings = mean_ratings[missing_item_ids]
    predicted_ratings[missing_item_ids] = default_ratings

    # Exclude movies that the target user has already rated
    predicted_ratings = predicted_ratings[user_movies == 0]

    # Sort the predicted ratings in descending order
    predicted_ratings = predicted_ratings.sort_values(ascending=False)

    predicted_ratings = predicted_ratings[~predicted_ratings.index.duplicated()]


    return predicted_ratings

# Example usage
user_id = 12
top_n = 8
predicted_ratings = generate_user_based_predictions_test(user_id, top_n)

test_ratings_user =  test_ratings[test_ratings.index ==  2]
# a = test_ratings_user[]
a = ~test_ratings.loc[1].isnull()
index = predicted_ratings.index.intersection(test_ratings.loc[user_id][test_ratings.loc[user_id] > 0].index)
print(index.tolist())
print(len(index))
print(test_ratings.loc[user_id, index].tolist())
print(predicted_ratings.loc[index].tolist())
squared_diff = (predicted_ratings.loc[index] - test_ratings.loc[user_id, index]) ** 2
print(np.mean(squared_diff.tolist()))

movieId
1         3.872470
2         3.401869
3         3.161017
4         3.543608
5         3.267857
            ...   
161944    3.543608
162376    3.543608
162542    3.543608
162672    3.543608
163949    3.543608
Name: rating, Length: 9066, dtype: float64
[2668, 1235, 3841, 3827, 3864, 3770, 608, 3793, 3176, 1197, 3871, 3809, 736]
13
[2.0, 5.0, 2.0, 2.0, 3.0, 3.0, 2.0, 5.0, 3.0, 1.0, 2.0, 3.0, 4.0]
[3.543608255669773, 3.543608255669773, 3.543608255669773, 3.543608255669773, 3.543608255669773, 3.543608255669773, 2.550162551549139, 1.7629064844423823, 0.8691407353676126, 0.8403946486577681, 0.8164770876878025, 0.653181670150242, 0.20892721689918234]
3.5760215434328515


**n=5**

In [76]:
# Test set rmse
predicted_ratings = pd.DataFrame(index=test_ratings.index, columns=train_ratings.columns.union(test_ratings.columns)).fillna(0)
user_num = len(predicted_ratings.index)
index = 1
for user_id in test_ratings.index:
    print("progress: ", index, '/', user_num)
    index = index + 1
    predicted_ratings.loc[user_id] = generate_user_based_predictions_test(user_id=user_id, top_n=5)

rmse = calculate_rmse(predicted_ratings, test_ratings)
print(f"RMSE: {rmse}")

progress:  1 / 671
progress:  2 / 671
progress:  3 / 671
progress:  4 / 671
progress:  5 / 671
progress:  6 / 671
progress:  7 / 671
progress:  8 / 671
progress:  9 / 671
progress:  10 / 671
progress:  11 / 671
progress:  12 / 671
progress:  13 / 671
progress:  14 / 671
progress:  15 / 671
progress:  16 / 671
progress:  17 / 671
progress:  18 / 671
progress:  19 / 671
progress:  20 / 671
progress:  21 / 671
progress:  22 / 671
progress:  23 / 671
progress:  24 / 671
progress:  25 / 671
progress:  26 / 671
progress:  27 / 671
progress:  28 / 671
progress:  29 / 671
progress:  30 / 671
progress:  31 / 671
progress:  32 / 671
progress:  33 / 671
progress:  34 / 671
progress:  35 / 671
progress:  36 / 671
progress:  37 / 671
progress:  38 / 671
progress:  39 / 671
progress:  40 / 671
progress:  41 / 671
progress:  42 / 671
progress:  43 / 671
progress:  44 / 671
progress:  45 / 671
progress:  46 / 671
progress:  47 / 671
progress:  48 / 671
progress:  49 / 671
progress:  50 / 671
progress:

**n=10**

In [77]:
# Test set rmse
predicted_ratings = pd.DataFrame(index=test_ratings.index, columns=train_ratings.columns.union(test_ratings.columns)).fillna(0)
user_num = len(predicted_ratings.index)
index = 1
for user_id in test_ratings.index:
    print("progress: ", index, '/', user_num)
    index = index + 1
    predicted_ratings.loc[user_id] = generate_user_based_predictions_test(user_id=user_id, top_n=10)

rmse = calculate_rmse(predicted_ratings, test_ratings)
print(f"RMSE: {rmse}")

progress:  1 / 671
progress:  2 / 671
progress:  3 / 671
progress:  4 / 671
progress:  5 / 671
progress:  6 / 671
progress:  7 / 671
progress:  8 / 671
progress:  9 / 671
progress:  10 / 671
progress:  11 / 671
progress:  12 / 671
progress:  13 / 671
progress:  14 / 671
progress:  15 / 671
progress:  16 / 671
progress:  17 / 671
progress:  18 / 671
progress:  19 / 671
progress:  20 / 671
progress:  21 / 671
progress:  22 / 671
progress:  23 / 671
progress:  24 / 671
progress:  25 / 671
progress:  26 / 671
progress:  27 / 671
progress:  28 / 671
progress:  29 / 671
progress:  30 / 671
progress:  31 / 671
progress:  32 / 671
progress:  33 / 671
progress:  34 / 671
progress:  35 / 671
progress:  36 / 671
progress:  37 / 671
progress:  38 / 671
progress:  39 / 671
progress:  40 / 671
progress:  41 / 671
progress:  42 / 671
progress:  43 / 671
progress:  44 / 671
progress:  45 / 671
progress:  46 / 671
progress:  47 / 671
progress:  48 / 671
progress:  49 / 671
progress:  50 / 671
progress:

**n=20**

In [78]:
# Test set rmse
predicted_ratings = pd.DataFrame(index=test_ratings.index, columns=train_ratings.columns.union(test_ratings.columns)).fillna(0)
user_num = len(predicted_ratings.index)
index = 1
for user_id in test_ratings.index:
    print("progress: ", index, '/', user_num)
    index = index + 1
    predicted_ratings.loc[user_id] = generate_user_based_predictions_test(user_id=user_id, top_n=20)

rmse = calculate_rmse(predicted_ratings, test_ratings)
print(f"RMSE: {rmse}")

progress:  1 / 671
progress:  2 / 671
progress:  3 / 671
progress:  4 / 671
progress:  5 / 671
progress:  6 / 671
progress:  7 / 671
progress:  8 / 671
progress:  9 / 671
progress:  10 / 671
progress:  11 / 671
progress:  12 / 671
progress:  13 / 671
progress:  14 / 671
progress:  15 / 671
progress:  16 / 671
progress:  17 / 671
progress:  18 / 671
progress:  19 / 671
progress:  20 / 671
progress:  21 / 671
progress:  22 / 671
progress:  23 / 671
progress:  24 / 671
progress:  25 / 671
progress:  26 / 671
progress:  27 / 671
progress:  28 / 671
progress:  29 / 671
progress:  30 / 671
progress:  31 / 671
progress:  32 / 671
progress:  33 / 671
progress:  34 / 671
progress:  35 / 671
progress:  36 / 671
progress:  37 / 671
progress:  38 / 671
progress:  39 / 671
progress:  40 / 671
progress:  41 / 671
progress:  42 / 671
progress:  43 / 671
progress:  44 / 671
progress:  45 / 671
progress:  46 / 671
progress:  47 / 671
progress:  48 / 671
progress:  49 / 671
progress:  50 / 671
progress: