In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from pprint import pprint
from fuzzywuzzy import process



In [2]:
# Take in all of our movies and ratings csvs and read it into pandas
movies = "Resources/ml-latest-small/movies.csv"
movies_df = pd.read_csv(movies)
movies_df.head()


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Extract the release year from the title column
movies_df["release_year"] = movies_df["title"].str.extract(r'\((\d{4})\)')

# Display the updated DataFrame
movies_df.head()

Unnamed: 0,movieId,title,genres,release_year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [4]:
#cleaning up the genres column to a list instead of a string
movies_df['genres'] = movies_df['genres'].apply(lambda x: x.split('|') )
movies_df.head()

Unnamed: 0,movieId,title,genres,release_year
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",1995
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II (1995),[Comedy],1995



**Addressing the Cold-Start Problem**

Collaborative filtering relies entirely on user-item interactions within the utility matrix. However, this approach faces a challenge when dealing with new users or items that have no interactions, resulting in their exclusion from the recommendation system. This is known as the cold-start problem. One way to address this issue is by using content-based filtering, which generates recommendations based on user and item features.

To implement this, we first need to convert the genres column into binary features. Each genre will have its own column in the dataframe, with values of 0 or 1 indicating the presence or absence of that genre

In [5]:
n_movies = movies_df['movieId'].nunique()
print(f"There are {n_movies} unique movies in our movies dataset.")

There are 9742 unique movies in our movies dataset.


In [6]:
genres = set(g for G in movies_df['genres'] for g in G)
for g in genres:
    movies_df[g] = movies_df.genres.transform(lambda x: int(g in x))
    
movies_genres = movies_df.drop(columns=['movieId', 'title','genres','release_year'])

In [7]:
movies_genres

Unnamed: 0,(no genres listed),Musical,Drama,Comedy,Animation,Adventure,Film-Noir,Romance,Fantasy,Children,Crime,Thriller,Mystery,Western,IMAX,Sci-Fi,War,Documentary,Horror,Action
0,0,0,0,1,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
9738,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
9739,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9740,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [8]:
cosine_sim = cosine_similarity(movies_genres, movies_genres)
print(f"Dimensions of our genres cosine similarity matrix: {cosine_sim.shape}")

Dimensions of our genres cosine similarity matrix: (9742, 9742)


In [9]:
cosine_sim[0]

array([1.        , 0.77459667, 0.31622777, ..., 0.        , 0.31622777,
       0.4472136 ])

To receive recommendations for movies similar to Shawshank Redemption, it's essential to use the exact title as listed in our dataset. For example, in our dataset, Shawshank Redemption is recorded as Shawshank Redemption, The (1994).

If the title is misspelled or the release year is omitted, the recommender won't correctly identify the movie.

To make the process more user-friendly, we can utilize the Python package fuzzywuzzy. This package uses string matching algorithms to find the closest title match to a user-provided input. We'll create a function, movie_finder(), to leverage fuzzywuzzy and return the most similar movie title based on the user's input.

In [10]:
def movie_finder(title, threshold=80):
    all_titles = movies_df['title'].tolist()
    matches = process.extract(title, all_titles, limit=None)
    
    # Filter titles based on the threshold
    similar_titles = [match[0] for match in matches if match[1] >= threshold]
    
    # Get movie IDs for all matched titles
    results = []
    for matched_title in similar_titles:
        movie_id = movies_df[movies_df['title'] == matched_title].index[0]
        results.append((matched_title, movie_id))
    
    return results

Lets test it out with your favorite movie example

In [11]:
# PRACTICE
result = movie_finder('Twilight')
pprint(result)

print("chosen title:", result[0][0])
title = result[0][0]

[('Twilight (1998)', 1324),
 ('Twilight Zone: The Movie (1983)', 5514),
 ('Twilight Samurai, The (Tasogare Seibei) (2002)', 5687),
 ('Twilight (2008)', 6905),
 ('Twilight Saga: New Moon, The (2009)', 7188),
 ('Twilight Saga: Eclipse, The (2010)', 7363),
 ('Twilight Saga: Breaking Dawn - Part 1, The (2011)', 7749),
 ('Twilight Saga: Breaking Dawn - Part 2, The (2012)', 8036)]
chosen title: Twilight (1998)


To get relevant recommendations for Shawshank Redemption , we need to find its index in the cosine simialrity matrix. To identify which row we should be looking at, we can create a movie index mapper which maps a movie title to the index that it represents in our matrix.

Let's create a movie index dictionary called movie_idx where the keys are movie titles and values are movie indices:


In [12]:
# Get user input for title selection
user_input_title = input("Enter a movie title to search for: ")
user_result = movie_finder(user_input_title)

# Check if there are any results
if user_result:
    # Display results with numbering
    for idx, (title, movieId) in enumerate(user_result):
        print(f"{idx}: {title}")
    
    # # Get user input for index selection
    # user_idx = int(input("Choose what movie from search list (rder number) you want to select: "))
    # user_title = user_result[user_idx][0]
    # chosen_index = user_result[user_idx][1]
    # print("chosen title:", user_title, chosen_index)
    
    while True:
        try:
            user_idx = int(input("Choose what movie from search list (order number) you want to select: "))
            if 0 <= user_idx < len(user_result):  # Validate the index
                user_title = user_result[user_idx][0]
                chosen_index = user_result[user_idx][1]
                print("chosen title:", user_title, chosen_index)
                break  # Exit the loop if the selection is valid
            else:
                print(f"Please enter a number between 0 and {len(user_result) - 1}.")
        except ValueError:
            print("Invalid input. Please enter a valid integer.")
else:
    print("No results found.")

Enter a movie title to search for:  Twilight


0: Twilight (1998)
1: Twilight Zone: The Movie (1983)
2: Twilight Samurai, The (Tasogare Seibei) (2002)
3: Twilight (2008)
4: Twilight Saga: New Moon, The (2009)
5: Twilight Saga: Eclipse, The (2010)
6: Twilight Saga: Breaking Dawn - Part 1, The (2011)
7: Twilight Saga: Breaking Dawn - Part 2, The (2012)


Choose what movie from search list (order number) you want to select:  3


chosen title: Twilight (2008) 6905


## We now know that the movie index for Shawshank Redemption is 277 in ourr set, we need to get to top 10 recommended movies to this movie.

In [15]:
# n_recommendations=10
n_recommendations = int(input("Enter the number of recommendations you want: "))
sim_scores = [(i, float(score)) for i, score in enumerate(cosine_sim[chosen_index])]
# Exclude the target index
sim_scores = [score for score in sim_scores if score[0] != chosen_index]
# Sort scores by similarity in descending order
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores = sim_scores[1:(n_recommendations+1)]
sim_scores

Enter the number of recommendations you want:  5


[(3713, 0.8944271909999159),
 (7188, 0.8944271909999159),
 (228, 0.8660254037844388),
 (441, 0.8660254037844388),
 (723, 0.8660254037844388)]

In [16]:
similar_movies = [i[0] for i in sim_scores]
similar_movies

[3713, 7188, 228, 441, 723]

In [17]:
print(f"Because you watched {title}:")
movies_df['title'].iloc[similar_movies]

Because you watched Twilight Saga: Breaking Dawn - Part 2, The (2012):


3713                                     Dragonfly (2002)
7188                  Twilight Saga: New Moon, The (2009)
228     Like Water for Chocolate (Como agua para choco...
441                                        Orlando (1992)
723                       Ghost and Mrs. Muir, The (1947)
Name: title, dtype: object

In [18]:
# # try to predict ratings of recommended movies based on user's past ratings and/or ratings of others

In [19]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

In [20]:
# Load the data
ratings = "Resources/ml-latest-small/ratings.csv"
ratings_df = pd.read_csv(ratings)
ratings_df.drop(['timestamp'], axis =1, inplace=True)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [21]:
# check for nulls
null_counts = ratings_df.isnull().sum()
print(null_counts)

userId     0
movieId    0
rating     0
dtype: int64


In [22]:
# Create a user-item matrix
# user_item_matrix = ratings_df.pivot(index='userId', columns='movieId', values='rating').fillna(0)
user_item_matrix = ratings_df.pivot(index='userId', columns='movieId', values='rating')
user_item_matrix
# print(user_item_matrix.isnull().sum())

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [23]:
# Calculate the average rating for each movie (column)
average_ratings = user_item_matrix.mean()

# Fill missing values with the average rating for the respective movie
user_item_matrix_filled = user_item_matrix.fillna(average_ratings)
user_item_matrix_filled

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.00000,3.431818,4.000000,2.357143,3.071429,4.000000,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
2,3.92093,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
3,3.92093,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
4,3.92093,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
5,4.00000,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.50000,3.431818,3.259615,2.357143,3.071429,3.946078,2.500000,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
607,4.00000,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
608,2.50000,2.000000,2.000000,2.357143,3.071429,3.946078,3.185185,2.875,3.125,4.000000,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
609,3.00000,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,4.000000,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0


In [24]:
# Split the data into training and testing sets
# train_data, test_data = train_test_split(ratings_df, test_size=0.2, random_state=42)

train_data, test_data = train_test_split(user_item_matrix_filled, test_size=0.25, random_state=42) #25% use for testing

In [25]:
train_data[1:]

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
512,3.92093,3.000000,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
118,3.92093,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
394,3.92093,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
418,3.92093,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
34,3.92093,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,5.000000,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,3.92093,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
107,4.00000,5.000000,3.259615,2.357143,4.000000,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
271,3.92093,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
436,4.00000,4.000000,3.259615,2.357143,3.071429,3.946078,3.185185,3.000,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0


In [28]:
# # Create training and testing user-item matrices
# # train_user_item_matrix = train_data.pivot(index='userId', columns='movieId', values='rating').fillna(0)
# train_user_item_matrix = train_data.pivot(index='userId', columns='movieId', values='rating')
# # test_user_item_matrix = test_data.pivot(index='userId', columns='movieId', values='rating').fillna(0)
# test_user_item_matrix = test_data.pivot(index='userId', columns='movieId', values='rating')

# why even do this... seems to complicate everything including indeces, which then makes what I try to do later much harder/impossible, seemingly

# Apply Truncated SVD
svd = TruncatedSVD(n_components=10)  # Number of latent factors 

# fit SVD
latent_matrix = svd.fit_transform(train_data)

# Transform the training and test data
train_svd = svd.transform(train_data)
test_svd = svd.transform(test_data)

# Reconstruct the user-item matrix
train_reconstructed = np.dot(train_svd, svd.components_)
test_reconstructed = np.dot(test_svd, svd.components_)

# Calculate RMSE and MAE
# Flatten the matrices and remove NaN values for comparison
train_true = train_data.values.flatten()
train_pred = train_reconstructed.flatten()

test_true = test_data.values.flatten()
test_pred = test_reconstructed.flatten()

# Remove NaN values from the comparison
mask_train = ~np.isnan(train_true)
mask_test = ~np.isnan(test_true)

# Calculate RMSE
train_rmse = np.sqrt(mean_squared_error(train_true[mask_train], train_pred[mask_train]))
test_rmse = np.sqrt(mean_squared_error(test_true[mask_test], test_pred[mask_test]))

# Calculate MAE
train_mae = mean_absolute_error(train_true[mask_train], train_pred[mask_train])
test_mae = mean_absolute_error(test_true[mask_test], test_pred[mask_test])

print(f'test RMSE: {test_rmse}')
print(f'train RMSE: {train_rmse}')
print(f'test MAE: {test_mae}')
print(f'train MAE: {train_mae}')

test RMSE: 0.11915186073432261
train RMSE: 0.1000556892806417
test MAE: 0.02119381127391397
train MAE: 0.016650054770201355


In [31]:
# train_reconstructed[1:]
train_reconstructed[:1]

array([[3.44935072, 3.00839564, 2.89513907, ..., 3.4998206 , 3.4998206 ,
        3.99979497]])

In [None]:
# test_user_item_matrix[:1]

In [80]:
# # Apply Truncated SVD
# svd = TruncatedSVD(n_components=10)  # Number of latent factors 
# # latent factors tried: 50, 100, 200, 500, 25, 10, 8, 5, 3 - less is better but not past 10
# # latent_matrix = svd.fit_transform(train_user_item_matrix)

# latent_matrix = svd.fit_transform(train_data)

# # Convert to DataFrame for easier handling
# # latent_df = pd.DataFrame(latent_matrix, index=user_item_matrix.index)

In [81]:
# latent_matrix[:1]
# # latent_df

array([[ 3.31278963e+02,  4.18460709e+00, -1.18453397e+00,
        -1.55651388e+00,  1.41399421e+00,  6.76600696e-01,
        -1.27226699e+00,  3.13884589e-01, -6.43001517e-01,
        -1.79893051e+00]])

In [82]:
# # Reconstruct the user-item matrix
# reconstructed_matrix = np.dot(latent_matrix, svd.components_)


In [85]:
# # Make predictions for the test set
# predictions = []
# actuals = []

# # Get the mapping of userId and movieId to the indices in the user-item matrix
# user_index_mapping = {user_id: index for index, user_id in enumerate(train_data.index)}
# movie_index_mapping = {movie_id: index for index, movie_id in enumerate(train_data.columns)}

# for _, row in test_data.iterrows():
#     user_id = row['userId']
#     movie_id = row['movieId']
#     actual_rating = row['rating']
    
#     # Check if user and movie are in the mapping
#     if user_id in user_index_mapping and movie_id in movie_index_mapping:
#         user_index = user_index_mapping[user_id]
#         movie_index = movie_index_mapping[movie_id]
#         predicted_rating = reconstructed_matrix[user_index, movie_index]
#         predictions.append(predicted_rating)
#         actuals.append(actual_rating)

KeyError: 'userId'

In [None]:
# # actuals[1:]
# predictions[1:]

In [None]:
# # Calculate RMSE
# rmse = np.sqrt(mean_squared_error(actuals, predictions))
# print(f'RMSE: {rmse}')

# # from 0 to 5, being off by 3 score points is pretty high!

In [84]:
# # Calculate MAE
# mae = mean_absolute_error(actuals, predictions)
# print(f'MAE: {mae}')
# # from 0 to 5, being off by 3 score points is pretty high!

ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.

In [46]:
# function to find id based on title
# need title locator to get id from movies csv
def original_id_finder(title, threshold=80): #threshold refers to similarity threshold
    all_titles = movies_df['title'].tolist()
    matches = process.extract(title, all_titles, limit=None)
    
    # Filter titles based on the threshold
    similar_titles = [match[0] for match in matches if match[1] >= threshold]
    
    # Get movie IDs for all matched titles
    myids = []
    for matched_title in similar_titles:
        # Access the movieId
        movie_id = movies_df[movies_df['title'] == matched_title]['movieId'].values[0]
        myids.append((matched_title, movie_id))
    
    return myids

In [47]:
# find id(s) for reccommended movie(s)
original_id_finder('Twilight Saga')

[('Twilight Saga: New Moon, The (2009)', 72407),
 ('Twilight Saga: Eclipse, The (2010)', 78772),
 ('Twilight Saga: Breaking Dawn - Part 1, The (2011)', 91104),
 ('Twilight Saga: Breaking Dawn - Part 2, The (2012)', 98203),
 ('Twilight Zone: The Movie (1983)', 26492),
 ('Twilight Samurai, The (Tasogare Seibei) (2002)', 27741),
 ('Northmen - A Viking Saga (2014)', 129229)]

In [54]:
reconstructed_matrix.shape

(610, 8983)

In [49]:
# function to match original id to resconstructed_matrix index for the movie/user?
# Assuming ratings_df has columns 'userId' and 'movieId'
def my_rating(user_id, movie_id):
# predicted_ratings = []

# for index, row in ratings_df.iterrows():
#     user_id = row['userId']
#     movie_id = row['movieId']
    
#     # Access the predicted rating from the reconstructed_matrix
#     predicted_rating = reconstructed_matrix[user_id, movie_id]
#     predicted_ratings.append(predicted_rating)

# # Add the predicted ratings to the original ratings_df
# ratings_df['predicted_rating'] = predicted_ratings

rating_predict = []
actuals = []

# Get the mapping of userId and movieId to the indices in the user-item matrix
user_index_mapping = {user_id: index for index, user_id in enumerate(train_user_item_matrix.index)}
movie_index_mapping = {movie_id: index for index, movie_id in enumerate(train_user_item_matrix.columns)}

for _, row in test_data.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    actual_rating = row['rating']
    
    # Check if user and movie are in the mapping
    if user_id in user_index_mapping and movie_id in movie_index_mapping:
        user_index = user_index_mapping[user_id]
        movie_index = movie_index_mapping[movie_id]
        predicted_rating = reconstructed_matrix[user_index, movie_index]
        predictions.append(predicted_rating)
        actuals.append(actual_rating)


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [39]:
# # Predict a specific user's rating for a specific movie
# user_id = 10 # change to reflect a user of interest (ex.user 10 has rated Twilight Saga: Breaking Dawn - Part 2)
# movie_id = 98203 #change to reflect a reccommended movie
# predicted_rating = reconstructed_matrix[user_id, movie_id]

# print(f'Predicted rating for user {user_id} for movie {movie_id}: {predicted_rating}')

IndexError: index 98203 is out of bounds for axis 1 with size 8983

In [None]:
# Assuming the predicted_ratings list is indexed in the same order as ratings_df
user_id = 10  # change to reflect a user of interest
movie_id = 98203  # change to reflect a recommended movie

# Find the index of the rating in ratings_df
index = ratings_df[(ratings_df['userId'] == user_id) & (ratings_df['movieId'] == movie_id)].index[0]

# Get the predicted rating from the predicted_ratings list
predicted_rating = predicted_ratings[index]

print(f'Predicted rating for user {user_id} for movie {movie_id}: {predicted_rating}')