## Import libraries

In [34]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity

# Loading data

In [35]:
ratings_train = pd.read_csv('ratings_train.csv')
ratings_valid = pd.read_csv('ratings_valid.csv')
movies = pd.read_csv('movies.csv')

# Cosine similarity

## Load data

In [36]:
# Construct the user-item matrix from training data
# Pivot the training dataset to create a matrix with users as rows, movies as columns, and ratings as values
# Fill missing ratings with 0 to indicate unrated movies
utility_matrix = ratings_train.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Display the first few rows of the utility matrix to verify its structure
print(utility_matrix.head())

movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           4.0     0.0     4.0     0.0     0.0     4.0     0.0     0.0   
2           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5           4.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

movieId  9       10      ...  193565  193567  193571  193573  193579  193581  \
userId                   ...                                                   
1           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0

## Create cosine similarity matrix

In [37]:
# Calculate the cosine similarity matrix for users
# Use the utility matrix to compute cosine similarity among users
# The resulting matrix has users as both rows and columns, indicating similarity scores between pairs of users
cosine_similarity_matrix = pd.DataFrame(
    cosine_similarity(utility_matrix),
    index=utility_matrix.index,
    columns=utility_matrix.index
)

# Display the user similarity matrix
print(cosine_similarity_matrix)

userId       1         2         3         4         5         6         7    \
userId                                                                         
1       1.000000  0.027283  0.059720  0.210282  0.129080  0.128152  0.158744   
2       0.027283  1.000000  0.000000  0.004167  0.016614  0.025333  0.027585   
3       0.059720  0.000000  1.000000  0.002518  0.005020  0.003936  0.000000   
4       0.210282  0.004167  0.002518  1.000000  0.107718  0.085415  0.117554   
5       0.129080  0.016614  0.005020  0.107718  1.000000  0.300349  0.108342   
...          ...       ...       ...       ...       ...       ...       ...   
606     0.164191  0.028429  0.012993  0.184199  0.106435  0.102123  0.200035   
607     0.269389  0.012948  0.019247  0.136068  0.152866  0.162182  0.186114   
608     0.291097  0.046211  0.021128  0.163608  0.135535  0.178809  0.323541   
609     0.093572  0.027565  0.000000  0.024007  0.261232  0.214234  0.090840   
610     0.145321  0.102427  0.032119  0.

## Find similar users to our target user

In [38]:
def get_users_who_rated_movie(movie_id, utility_matrix):
    """
    Returns a list of users who have rated a specific movie.

    Parameters:
    - movie_id: the ID of the movie.
    - utility_matrix: a user-item matrix (DataFrame) with users as rows, movie IDs as columns, and ratings as values.

    Returns:
    - A list of user IDs who rated the specified movie.
    """
    # Identify users who have rated the specified movie
    rated_users = utility_matrix.index[utility_matrix[movie_id] != 0].tolist()
    return rated_users

def find_similar_users(target_user_id, rated_users, similarity_matrix, k=3):
    """
    Finds and returns the top k users most similar to the target user based on a similarity matrix.

    Parameters:
    - target_user_id: the ID of the target user.
    - rated_users: a list of users who have rated a specific movie.
    - similarity_matrix: a Cosine similarity matrix (DataFrame) representing the similarity scores between users.
    - k: the number of similar users to find (default is 3).

    Returns:
    - A tuple containing two lists: the IDs of the top k similar users and their corresponding similarity scores.
    """
    # Calculate similarity scores between the target user and users who rated the movie
    similarities = similarity_matrix.loc[target_user_id, rated_users]

    # Identify the top k most similar users
    top_similar_users = similarities.nlargest(k)
    return top_similar_users.index.tolist(), top_similar_users.values.tolist()

## Predict rating

In [39]:
def predict_rating_cosine(df, utility_matrix, similarity_matrix):
    """
    Predicts ratings for a given set of user-movie pairs using cosine similarity.

    Parameters:
    - df: DataFrame with columns 'userId' and 'movieId'.
    - utility_matrix: a user-item matrix (DataFrame) with users as rows, movie IDs as columns, and ratings as values.
    - similarity_matrix: a Cosine similarity matrix (DataFrame) representing cosine similarity between users.

    Returns:
    - DataFrame containing columns 'userId', 'movieId', and 'predicted_rating' with the predicted ratings.
    """

    # Initialize an empty list to store predicted ratings
    predicted_ratings = []

    # Iterate over each user-movie pair in the input DataFrame
    for user_id, movie_id in df.values:
        # Find users who have rated the movie
        rated_users = get_users_who_rated_movie(movie_id, utility_matrix)
        # Find similar users and their similarity scores
        similar_users, similarities = find_similar_users(user_id, rated_users, similarity_matrix)

        # Initialize variables for the weighted sum of ratings and sum of weights
        ratings_sum = 0
        weights_sum = 0

        # Calculate the weighted sum of ratings from similar users
        for similar_user, similarity in zip(similar_users, similarities):
            user_rating = utility_matrix.loc[similar_user, movie_id]
            ratings_sum += user_rating * similarity
            weights_sum += similarity

        # Compute the predicted rating
        if weights_sum != 0:
            predicted_rating = ratings_sum / weights_sum
        else:
            # If no similar users, use the average rating for the movie
            nonzero_ratings = utility_matrix.loc[utility_matrix[movie_id] != 0, movie_id]
            predicted_rating = nonzero_ratings.mean() if len(nonzero_ratings) > 0 else 0

        # Append the predicted rating to the list
        predicted_ratings.append([user_id, movie_id, predicted_rating])

    # Convert the list of predicted ratings into a DataFrame
    predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=['userId', 'movieId', 'predicted_rating'])
    return predicted_ratings_df

## Create prediction

In [40]:
# Prepare dataframe for prediction
selected_columns = ratings_valid[['userId', 'movieId']]

# Predict ratings using Cosine similarity
predicted_ratings = predict_rating_cosine(selected_columns, utility_matrix, cosine_similarity_matrix)

## Calculate RMSE

In [41]:
# Calculate and display the RMSE for the predicted ratings

# Prepare data to calculate RMSE by ratings (actual and predicted)
actual_ratings = ratings_valid['rating'].to_numpy()
predicted_ratings = predicted_ratings['predicted_rating'].to_numpy()

# Compute the RMSE
rmse_value = mean_squared_error(actual_ratings, predicted_ratings, squared=False)

# Print the RMSE
print(f"RMSE = {rmse_value:.4f}")

RMSE = 0.9961


# Pearson similarity

## Load data

In [42]:
# Construct the user-item matrix from training data
# Pivot the training dataset to create a matrix with users as rows, movies as columns, and ratings as values
# Fill missing ratings with 0 to indicate unrated movies
utility_matrix = ratings_train.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Display the first few rows of the utility matrix to verify its structure
print(utility_matrix.head())

movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           4.0     0.0     4.0     0.0     0.0     4.0     0.0     0.0   
2           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5           4.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

movieId  9       10      ...  193565  193567  193571  193573  193579  193581  \
userId                   ...                                                   
1           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0

## Create Pearson Similarity Matrix

In [43]:
def pearson_similarity(utility_matrix):
    """
    Computes the Pearson similarity matrix for users based on their ratings.

    Parameters:
    - utility_matrix: a user-item matrix (DataFrame) with users as rows, movie IDs as columns, and ratings as values.

    Returns:
    - A matrix (DataFrame) representing the Pearson similarity between users.
    """

    # Transpose the matrix to have movies as rows and users as columns
    transposed_matrix = utility_matrix.T

    # Center the ratings by subtracting the mean rating for each movie
    centered_matrix = transposed_matrix.sub(transposed_matrix.mean(axis=1), axis=0)

    # Calculate the Pearson correlation coefficient between users
    similarity_matrix = centered_matrix.corr(method='pearson')

    return similarity_matrix

# Calculate and display the Pearson similarity matrix
pearson_similarity_matrix = pearson_similarity(utility_matrix)
print(pearson_similarity_matrix)

userId       1         2         3         4         5         6         7    \
userId                                                                         
1       1.000000 -0.140122 -0.083659  0.085724 -0.068594 -0.002084 -0.013237   
2      -0.140122  1.000000  0.241867 -0.052336  0.065753 -0.061165 -0.094267   
3      -0.083659  0.241867  1.000000 -0.016926  0.124382 -0.051919 -0.076738   
4       0.085724 -0.052336 -0.016926  1.000000 -0.000493 -0.022182 -0.022595   
5      -0.068594  0.065753  0.124382 -0.000493  1.000000  0.194005 -0.071550   
...          ...       ...       ...       ...       ...       ...       ...   
606     0.025781 -0.151882 -0.149200  0.054609 -0.088802 -0.043876  0.050126   
607     0.136872 -0.117103 -0.076055  0.007923 -0.016660  0.040365  0.021911   
608     0.134672 -0.244170 -0.247130 -0.024897 -0.169082  0.005041  0.131082   
609    -0.120499  0.152168  0.208574 -0.075440  0.249909  0.093092 -0.085606   
610    -0.004954 -0.101139 -0.156579 -0.

## Find similar users to our target user

In [44]:
def get_users_who_rated_movie(movie_id, utility_matrix):
    """
    Returns a list of users who have rated a specific movie.

    Parameters:
    - movie_id: the ID of the movie to check.
    - utility_matrix: a user-item matrix (DataFrame) with users as rows, movie IDs as columns, and ratings as values.

    Returns:
    - List of user IDs who rated the specified movie.
    """
    # Filter the utility matrix to find users who have rated the movie
    rated_users = utility_matrix.index[utility_matrix[movie_id] != 0].tolist()
    return rated_users

def find_similar_users(target_user_id, rated_users, similarity_matrix, k=10):
    """
    Finds and returns the top k users most similar to the target user.

    Parameters:
    - target_user_id: the ID of the user for whom to predict ratings.
    - rated_users: list of users who have rated a specific movie.
    - similarity_matrix: a Pearson similarity matrix (DataFrame) containing similarity scores between users.
    - k: number of similar users to find (default is 10).

    Returns:
    - Tuple of two lists: IDs of the top k similar users and their similarity scores.
    """
    # Calculate similarity scores between the target user and users who rated the movie
    similarities = similarity_matrix.loc[target_user_id, rated_users]
    # Select the top k most similar users
    similar_users = similarities.nlargest(k)
    return similar_users.index.tolist(), similar_users.values.tolist()

## Predict rating

In [45]:
def predict_rating_pearson(df, utility_matrix, similarity_matrix):
    """
    Predicts ratings using Pearson correlation for a given set of user-movie pairs.

    Parameters:
    - df: DataFrame with columns 'userId' and 'movieId'.
    - utility_matrix: a user-item matrix (DataFrame) with users as rows, movie IDs as columns, and ratings as values.
    - similarity_matrix: a Pearson similarity matrix (DataFrame) representing Pearson similarity between users.

    Returns:
    - DataFrame containing 'userId', 'movieId', and 'predicted_rating'.
    """

    predicted_ratings = []

    for user_id, movie_id in df.values:
        # Find users who have rated the movie and their similarity scores
        rated_users = get_users_who_rated_movie(movie_id, utility_matrix)
        similar_users, similarities = find_similar_users(user_id, rated_users, similarity_matrix)

        # Calculate the average rating of the target user for rated movies
        target_user_avg_rating = utility_matrix.loc[user_id][utility_matrix.loc[user_id] != 0].mean()

        ratings_sum = 0
        weights_sum = 0

        for similar_user, similarity in zip(similar_users, similarities):
            # Calculate the average rating of the similar user for rated movies
            similar_user_avg_rating = utility_matrix.loc[similar_user][utility_matrix.loc[similar_user] != 0].mean()

            # Adjust the rating for the target movie by the similar user's average rating
            user_rating_adjusted = utility_matrix.at[similar_user, movie_id] - similar_user_avg_rating

            ratings_sum += user_rating_adjusted * similarity
            weights_sum += abs(similarity)

        # Calculate the predicted rating
        if weights_sum != 0:
            predicted_rating = target_user_avg_rating + (ratings_sum / weights_sum)
        else:
            # Use the target user's average rating if no similar users found
            predicted_rating = target_user_avg_rating

        predicted_ratings.append([user_id, movie_id, predicted_rating])

    return pd.DataFrame(predicted_ratings, columns=['userId', 'movieId', 'predicted_rating'])

## Create prediction

In [46]:
# Prepare dataframe for prediction
selected_columns = ratings_valid[['userId', 'movieId']]

# Predict ratings using Pearson correlation
predicted_ratings = predict_rating_pearson(selected_columns, utility_matrix, pearson_similarity_matrix)

## Calculate RMSE

In [47]:
# Calculate and display the RMSE for the predicted ratings

# Prepare data to calculate RMSE by ratings (actual and predicted)
actual_ratings = ratings_valid['rating'].to_numpy()
predicted_ratings = predicted_ratings['predicted_rating'].to_numpy()

# Compute the RMSE
rmse_value = mean_squared_error(actual_ratings, predicted_ratings, squared=False)

# Print the RMSE
print(f"RMSE = {rmse_value:.4f}")

RMSE = 0.8901


# Latent factor model with biases

## Constants

In [53]:
# Number of latent factors to use in matrix factorization
num_latent_factors = 20

# Learning rate: Determines the step size at each iteration while moving toward a minimum of the loss function.
learning_rate = 0.005

# Epochs: Number of complete passes through the training dataset.
epochs = 30

# Regularization parameter: Controls the magnitude of the regularization term added to the loss function to prevent overfitting.
reg_param = 0.03

## Loading Data

In [54]:
# Count unique users and movies
num_users = ratings_train['userId'].nunique()
num_movies = ratings_train['movieId'].nunique()

# Create user-item matrix (users as rows, movies as columns, ratings as values)
user_item_matrix = ratings_train.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Perform Singular Value Decomposition (SVD) on the user-item matrix
U, sigma, Vt = np.linalg.svd(user_item_matrix)

# Initialize user and movie factor matrices using the first 'num_latent_factors' components of SVD
user_factor_matrix = U[:, :num_latent_factors]
movie_factor_matrix = Vt.T[:, :num_latent_factors]

## Defining the MSE for training losses

In [55]:
def mse_loss(actual: np.ndarray, predicted: np.ndarray):
    """
    Calculates the mean squared error between actual and predicted ratings.

    Args:
        actual: array of actual ratings.
        predicted: array of predicted ratings.

    Returns:
        The mean squared error.
    """
    return np.mean(np.square(actual - predicted))

## Mapping of data and defining biases

In [56]:
# Create mappings for userId and movieId to indices
userId_to_index = {userId: idx for idx, userId in enumerate(ratings_train['userId'].unique())}
movieId_to_index = {movieId: idx for idx, movieId in enumerate(ratings_train['movieId'].unique())}

# Calculate global bias (average of all ratings)
global_avg = ratings_train['rating'].mean()

# Calculate user biases (average rating given by each user)
user_avg = ratings_train.groupby('userId')['rating'].mean().to_dict()

# Calculate item biases (average rating received by each movie)
item_avg = ratings_train.groupby('movieId')['rating'].mean().to_dict()

def calculate_user_biases(user_avg, userId_to_index, global_avg, num_users):
    """
    Calculates user biases and stores them in an array.

    Args:
        user_avg: dictionary mapping user IDs to their average ratings.
        userId_to_index: dictionary mapping user IDs to their indices.
        global_bias: the global average rating.
        num_users: the total number of users.

    Returns:
        An array of user biases.
    """
    user_bias = np.zeros(num_users)
    for user_id, rating_mean in user_avg.items():
        user_index = userId_to_index[user_id]
        user_bias[user_index] = rating_mean - global_avg
    return user_bias

def calculate_item_biases(item_avg, movieId_to_index, global_avg, num_movies):
    """
    Calculates item biases and stores them in an array.

    Args:
        item_avg: dictionary mapping movie IDs to their average ratings.
        movieId_to_index: dictionary mapping movie IDs to their indices.
        global_bias: the global average rating.
        num_movies: the total number of movies.

    Returns:
        An array of item biases.
    """
    item_bias = np.zeros(num_movies)
    for movie_id, rating_mean in item_avg.items():
        movie_index = movieId_to_index[movie_id]
        item_bias[movie_index] = rating_mean - global_avg
    return item_bias

# Getting the bias arrays
user_bias_array = calculate_user_biases(user_avg, userId_to_index, global_avg, num_users)
item_bias_array = calculate_item_biases(item_avg, movieId_to_index, global_avg, num_movies)

## Model training

In [66]:
def update_biases_and_factors(
        user_index,
        item_index,
        actual_rating,
        global_bias,
        user_bias_array,
        item_bias_array,
        user_factor_matrix,
        item_factor_matrix,
        learning_rate,
        regularization):
    """
    Updates the biases and factor matrices for a single user-item interaction.

    Args:
    - user_index: index of the user.
    - item_index: index of the item.
    - actual_rating: actual rating given by the user to the item.
    - global_bias: global bias across all ratings.
    - user_biases: array of user biases.
    - item_biases: array of item biases.
    - user_factors: user factor matrix.
    - item_factors: item factor matrix.
    - learning_rate: learning rate for optimization.
    - regularization: regularization parameter.

    Returns:
    Updated user_biases, item_biases, user_factors, item_factors.
    """
    # Calculate the predicted rating
    predicted_rating = global_bias + user_bias_array[user_index] + item_bias_array[item_index] + np.dot(user_factor_matrix[user_index], item_factor_matrix[item_index])

    # Calculate the error between the actual and predicted rating
    error = actual_rating - predicted_rating

    # Update user and item biases
    user_bias_array[user_index] += learning_rate * (error - regularization * user_bias_array[user_index])
    item_bias_array[item_index] += learning_rate * (error - regularization * item_bias_array[item_index])

    # Update user and item factor matrices
    user_factor_matrix[user_index] += learning_rate * (error * item_factor_matrix[item_index] - regularization * user_factor_matrix[user_index])
    item_factor_matrix[item_index] += learning_rate * (error * user_factor_matrix[user_index] - regularization * item_factor_matrix[item_index])

    return user_bias_array, item_bias_array, user_factor_matrix, item_factor_matrix

def train_model(
        ratings_train,
        num_epochs,
        global_bias,
        user_bias_array,
        item_bias_array,
        user_factor_matrix,
        item_factor_matrix,
        learning_rate,
        regularization):
    """
    Trains the model over a specified number of epochs.

    Args:
    - ratings: training dataset containing user IDs, item IDs, and ratings.
    - num_epochs: number of epochs to train for.
    - global_bias: global bias across all ratings.
    - user_bias_array: initial array of user biases.
    - item_bias_array: initial array of item biases.
    - user_factors: initial user factor matrix.
    - item_factors: initial item factor matrix.
    - learning_rate: learning rate for optimization.
    - regularization: regularization parameter.

    Returns:
    The updated user_biases, item_biases, user_factors, item_factors after training.
    """
    for epoch in range(num_epochs):
        for _, row in ratings_train.iterrows():
            user_id, item_id, rating = row['userId'], row['movieId'], row['rating']
            user_index, item_index = userId_to_index[user_id], movieId_to_index[item_id]

            # Update biases and factors for each rating
            updated_user_biases, updated_item_biases, updated_user_factors, updated_item_factors = update_biases_and_factors(
                user_index,
                item_index,
                rating,
                global_bias,
                user_bias_array,
                item_bias_array,
                user_factor_matrix,
                item_factor_matrix,
                learning_rate,
                regularization)

        # Calculate predicted ratings using the updated biases and factors on each epoch
        predicted_ratings = global_avg + updated_user_biases[:, np.newaxis] + updated_item_biases[np.newaxis, :] + np.dot(P, Q.T)

        # Calculate the training loss (MSE) on each epoch
        train_loss = mse_loss(ratings_train['rating'], predicted_ratings[ratings_train['userId'].map(userId_to_index), ratings_train['movieId'].map(movieId_to_index)])

        print(f"Epoch {epoch+1}/{epochs}, Training Loss: {train_loss}")

    return updated_user_biases, updated_item_biases, updated_user_factors, updated_item_factors

# Call the train_model function to train the model and get the updated biases and factors
updated_user_bias_array, updated_item_bias_array, updated_user_factors, updated_item_factors = train_model(
    ratings_train,
    epochs,
    global_avg,
    user_bias_array,
    item_bias_array,
    user_factor_matrix,
    movie_factor_matrix,
    learning_rate,
    reg_param)

Epoch 1/30, Training Loss: 0.6301272789352836
Epoch 2/30, Training Loss: 0.6302706114172105
Epoch 3/30, Training Loss: 0.6304131844276031
Epoch 4/30, Training Loss: 0.6305549222351047
Epoch 5/30, Training Loss: 0.6306957583999488
Epoch 6/30, Training Loss: 0.6308356339485063
Epoch 7/30, Training Loss: 0.6309744959566218
Epoch 8/30, Training Loss: 0.6311122964763516
Epoch 9/30, Training Loss: 0.6312489917427326
Epoch 10/30, Training Loss: 0.6313845416019068
Epoch 11/30, Training Loss: 0.6315189091084851
Epoch 12/30, Training Loss: 0.6316520602477182
Epoch 13/30, Training Loss: 0.6317839637460658
Epoch 14/30, Training Loss: 0.6319145909415255
Epoch 15/30, Training Loss: 0.632043915692145
Epoch 16/30, Training Loss: 0.6321719143071703
Epoch 17/30, Training Loss: 0.6322985654901978
Epoch 18/30, Training Loss: 0.6324238502874733
Epoch 19/30, Training Loss: 0.6325477520372275
Epoch 20/30, Training Loss: 0.6326702563178271
Epoch 21/30, Training Loss: 0.6327913508937169
Epoch 22/30, Training L

## Predict Rating

In [64]:
def predict_ratings(
    user_item_pairs,
    user_to_index_map,
    item_to_index_map,
    global_average,
    updated_user_bias_array,
    updated_item_bias_array,
    updated_user_factors,
    updated_item_factors):
    """
    Predicts ratings for given user-item pairs.

    Args:
    - user_item_pairs: DataFrame with columns ['userId', 'movieId'] representing user-item pairs.
    - user_to_index_map: mapping from user IDs to their corresponding index in the bias and factor arrays.
    - item_to_index_map: mapping from item IDs (movie IDs) to their corresponding index in the bias and factor arrays.
    - global_average: global average rating across all users and items.
    - user_bias_array: array of user-specific bias values.
    - item_bias_array: array of item-specific bias values.
    - user_factors: matrix of user-specific factors.
    - item_factors: matrix of item-specific factors.

    Returns:
    DataFrame: A DataFrame with columns ['userId', 'movieId', 'predicted_rating'] containing the predicted ratings.
    """
    predictions = []

    # Iterate over each user-item pair
    for user_id, movie_id in user_item_pairs.values:
        # Map user and item to their respective indices
        user_index = user_to_index_map[user_id]
        item_index = item_to_index_map[movie_id]

        # Calculate the predicted rating
        predicted_rating = (
            global_average +
            updated_user_bias_array[user_index] +
            updated_item_bias_array[item_index] +
            np.dot(updated_user_factors[user_index], updated_item_factors[item_index])
        )

        # Append the prediction to the list
        predictions.append({
            'userId': user_id,
            'movieId': movie_id,
            'predicted_rating': predicted_rating
        })

    # Convert the list of predictions to a DataFrame
    predicted_ratings_df = pd.DataFrame(predictions)

    return predicted_ratings_df

# Prepare DataFrame for prediction
selected_columns = ratings_valid[['userId', 'movieId']]

# Predict ratings
predicted_ratings_df = predict_ratings(
    selected_columns,
    userId_to_index,
    movieId_to_index,
    global_avg,
    updated_user_bias_array,
    updated_item_bias_array,
    user_factor_matrix,
    movie_factor_matrix
)

## Calculate RMSE

In [65]:
# Calculate the Root Mean Square Error (RMSE) between actual and predicted ratings
actual_ratings = ratings_valid['rating'].to_numpy()
predicted_ratings = predicted_ratings_df['predicted_rating'].to_numpy()

# Compute RMSE
rmse = mean_squared_error(actual_ratings, predicted_ratings, squared=False)

# Display the RMSE result
print(f"RMSE = {rmse:.4f}")

RMSE = 0.8341
