In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from pprint import pprint
from fuzzywuzzy import process
from pymongo import MongoClient



In [2]:
# https://docs.mongodb.com/manual/reference/default-mongodb-port/
strmongo = "mongodb+srv://dataquesters:project3@cluster0.dy07n.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
mongo = MongoClient(strmongo)
# confirm that our new database was created
print(mongo.list_database_names())

['movie_recommendations', 'sample_mflix', 'admin', 'local']


In [3]:
# Access the database
db = mongo['movie_recommendations']

# Access the collection
movie_db = db['movies']
rating_db = db['ratings']

In [4]:
# Find and print documents in the movies collection
documents_movies = movie_db.find()
for document in documents_movies[:5]:
    print(document)

{'_id': ObjectId('675640fbcaf6abb67dda95dd'), 'movieId': 1, 'title': 'Toy Story (1995)', 'genres': 'Adventure|Animation|Children|Comedy|Fantasy'}
{'_id': ObjectId('675640fbcaf6abb67dda95de'), 'movieId': 2, 'title': 'Jumanji (1995)', 'genres': 'Adventure|Children|Fantasy'}
{'_id': ObjectId('675640fbcaf6abb67dda95df'), 'movieId': 3, 'title': 'Grumpier Old Men (1995)', 'genres': 'Comedy|Romance'}
{'_id': ObjectId('675640fbcaf6abb67dda95e0'), 'movieId': 4, 'title': 'Waiting to Exhale (1995)', 'genres': 'Comedy|Drama|Romance'}
{'_id': ObjectId('675640fbcaf6abb67dda95e1'), 'movieId': 5, 'title': 'Father of the Bride Part II (1995)', 'genres': 'Comedy'}


In [5]:
# Find and print documents in the ratings collection
documents_ratings = rating_db.find()
for document in documents_ratings[:5]:
    print(document)

{'_id': ObjectId('67564f35caf6abb67d653dc7'), 'userId': 1, 'movieId': 17, 'rating': 4.0, 'timestamp': 944249077}
{'_id': ObjectId('67564f35caf6abb67d653dc8'), 'userId': 1, 'movieId': 25, 'rating': 1.0, 'timestamp': 944250228}
{'_id': ObjectId('67564f35caf6abb67d653dc9'), 'userId': 1, 'movieId': 29, 'rating': 2.0, 'timestamp': 943230976}
{'_id': ObjectId('67564f35caf6abb67d653dca'), 'userId': 1, 'movieId': 30, 'rating': 5.0, 'timestamp': 944249077}
{'_id': ObjectId('67564f35caf6abb67d653dcb'), 'userId': 1, 'movieId': 32, 'rating': 5.0, 'timestamp': 943228858}


In [6]:
# Retrieve data from the collection
# Convert cursor to list
documents_movies = list(movie_db.find())  
documents_ratings = list(rating_db.find())  

# Convert the documents to a Pandas DataFrame
movies_df = pd.DataFrame(documents_movies)

In [7]:
movies_df.head()

Unnamed: 0,_id,movieId,title,genres
0,675640fbcaf6abb67dda95dd,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,675640fbcaf6abb67dda95de,2,Jumanji (1995),Adventure|Children|Fantasy
2,675640fbcaf6abb67dda95df,3,Grumpier Old Men (1995),Comedy|Romance
3,675640fbcaf6abb67dda95e0,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,675640fbcaf6abb67dda95e1,5,Father of the Bride Part II (1995),Comedy


In [8]:
#cleaning up the genres column to a list instead of a string
movies_df['genres'] = movies_df['genres'].apply(lambda x: x.split('|') )
movies_df.head()

Unnamed: 0,_id,movieId,title,genres
0,675640fbcaf6abb67dda95dd,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,675640fbcaf6abb67dda95de,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,675640fbcaf6abb67dda95df,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,675640fbcaf6abb67dda95e0,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,675640fbcaf6abb67dda95e1,5,Father of the Bride Part II (1995),[Comedy]



**Addressing the Cold-Start Problem**

Collaborative filtering relies entirely on user-item interactions within the utility matrix. However, this approach faces a challenge when dealing with new users or items that have no interactions, resulting in their exclusion from the recommendation system. This is known as the cold-start problem. One way to address this issue is by using content-based filtering, which generates recommendations based on user and item features.

To implement this, we first need to convert the genres column into binary features. Each genre will have its own column in the dataframe, with values of 0 or 1 indicating the presence or absence of that genre

In [11]:
n_movies = movies_df['movieId'].nunique()
print(f"There are {n_movies} unique movies in our movies dataset.")

There are 87585 unique movies in our movies dataset.


In [13]:
genres = set(g for G in movies_df['genres'] for g in G)
for g in genres:
    movies_df[g] = movies_df.genres.transform(lambda x: int(g in x))
    
movies_genres = movies_df.drop(columns=['movieId', '_id', 'title','genres'])

In [14]:
movies_genres

Unnamed: 0,Thriller,Fantasy,IMAX,Romance,War,Action,Western,Children,Comedy,Drama,Documentary,Animation,Mystery,Musical,Film-Noir,Adventure,(no genres listed),Sci-Fi,Horror,Crime
0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,0,0
1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87580,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
87581,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0
87582,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
87583,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [15]:
cosine_sim = cosine_similarity(movies_genres, movies_genres)
print(f"Dimensions of our genres cosine similarity matrix: {cosine_sim.shape}")

MemoryError: Unable to allocate 57.2 GiB for an array with shape (87585, 87585) and data type float64

In [9]:
cosine_sim[0]

array([1.        , 0.77459667, 0.31622777, ..., 0.        , 0.31622777,
       0.4472136 ])

To receive recommendations for movies similar to a chosen movie, it's essential to use the exact title as listed in our dataset. For example, in our dataset, Shawshank Redemption is recorded as Shawshank Redemption, The (1994).

If the title is misspelled or the release year is omitted, the recommender won't correctly identify the movie.

To make the process more user-friendly, we can utilize the Python package fuzzywuzzy. This package uses string matching algorithms to find the closest title match to a user-provided input. We'll create a function, movie_finder(), to leverage fuzzywuzzy and return the most similar movie title based on the user's input.

In [10]:
def movie_finder(title, threshold=80):
    all_titles = movies_df['title'].tolist()
    matches = process.extract(title, all_titles, limit=None)
    
    # Filter titles based on the threshold
    similar_titles = [match[0] for match in matches if match[1] >= threshold]
    
    # Get movie IDs for all matched titles
    results = []
    for matched_title in similar_titles:
        movie_id = movies_df[movies_df['title'] == matched_title].index[0]
        results.append((matched_title, movie_id))
    
    return results

**Lets test it out with your favorite movie example**

In [11]:
# PRACTICE
result = movie_finder('Twilight')
pprint(result)

print("chosen title:", result[0][0])
title = result[0][0]

[('Twilight (1998)', 1324),
 ('Twilight Zone: The Movie (1983)', 5514),
 ('Twilight Samurai, The (Tasogare Seibei) (2002)', 5687),
 ('Twilight (2008)', 6905),
 ('Twilight Saga: New Moon, The (2009)', 7188),
 ('Twilight Saga: Eclipse, The (2010)', 7363),
 ('Twilight Saga: Breaking Dawn - Part 1, The (2011)', 7749),
 ('Twilight Saga: Breaking Dawn - Part 2, The (2012)', 8036)]
chosen title: Twilight (1998)


To get relevant recommendations for your chosen movie, we need to find its index in the cosine simialrity matrix. To identify which row we should be looking at, we can create a movie index mapper which maps a movie title to the index that it represents in our matrix. First we will find the original movieId for our movie of choice, then use the mapper to get the index we need for our model.


In [12]:
# Get user input for title selection
user_input_title = input("Enter a movie title to search for: ")
user_result = movie_finder(user_input_title)

# Check if there are any results
if user_result:
    # Display results with numbering
    for idx, (title, movieId) in enumerate(user_result):
        print(f"{idx}: {title}")
    
    while True:
        try:
            user_idx = int(input("Choose what movie from search list (order number) you want to select: "))
            if 0 <= user_idx < len(user_result):  # Validate the index
                user_title = user_result[user_idx][0]
                chosen_index = user_result[user_idx][1]
                print("chosen title:", user_title, chosen_index)
                break  # Exit the loop if the selection is valid
            else:
                print(f"Please enter a number between 0 and {len(user_result) - 1}.")
        except ValueError:
            print("Invalid input. Please enter a valid integer.")
else:
    print("No results found.")

Enter a movie title to search for:  Clueless


0: Clueless (1995)


Choose what movie from search list (order number) you want to select:  1


Please enter a number between 0 and 0.


Choose what movie from search list (order number) you want to select:  0


chosen title: Clueless (1995) 35


We now know that the movie index in our set for the chosen movie, we need to get to top recommended movies to this movie (**you can choose how many are recommended to you**).

In [13]:
n_recommendations = int(input("Enter the number of recommendations you want: "))
sim_scores = [(i, float(score)) for i, score in enumerate(cosine_sim[chosen_index])]

# Exclude the target index
sim_scores = [score for score in sim_scores if score[0] != chosen_index]

# Sort scores by similarity in descending order
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores = sim_scores[1:(n_recommendations+1)]
sim_scores

Enter the number of recommendations you want:  5


[(6, 0.9999999999999998),
 (57, 0.9999999999999998),
 (60, 0.9999999999999998),
 (103, 0.9999999999999998),
 (106, 0.9999999999999998)]

In [14]:
similar_movies = [i[0] for i in sim_scores]
similar_movies

[6, 57, 60, 103, 106]

In [15]:
print(f"Because you watched {title}:")
movies_df['title'].iloc[similar_movies]

Because you watched Clueless (1995):


6                          Sabrina (1995)
57                   Two if by Sea (1996)
60     French Twist (Gazon maudit) (1995)
103                   If Lucy Fell (1996)
106                      Boomerang (1992)
Name: title, dtype: object

**Collaborative Filtering System**

Now that we have content-based (genre) predictions using cosine similarity scores, we can use an SVD model (TruncatedSVD from sklearn) to predict a user's rating of a recommended movie of choice based on other users' and the individual user's past ratings. This is known as a collaborative reccomendation model.

Truncated Singular Value Decomposition (Truncated SVD) is a dimensionality reduction technique that is particularly useful for large, sparse datasets, such as those commonly encountered in natural language processing and recommendation systems. It is a variant of Singular Value Decomposition (SVD) that reduces the number of dimensions in the data while preserving as much information as possible.

In [16]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

In [17]:
# Load the data
# Convert the documents to a Pandas DataFrame
ratings_df = pd.DataFrame(list(documents_ratings))
ratings_df.drop(['timestamp'], axis =1, inplace=True)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [18]:
# Check for nulls
null_counts = ratings_df.isnull().sum()
print(null_counts)

userId     0
movieId    0
rating     0
dtype: int64


In [19]:
# Create a user-item matrix
user_item_matrix = ratings_df.pivot(index='userId', columns='movieId', values='rating')
user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [20]:
# Calculate the average rating for each movie (column)
average_ratings = user_item_matrix.mean()

# Fill missing values with the average rating for the respective movie
user_item_matrix_filled = user_item_matrix.fillna(average_ratings)
user_item_matrix_filled

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.00000,3.431818,4.000000,2.357143,3.071429,4.000000,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
2,3.92093,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
3,3.92093,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
4,3.92093,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
5,4.00000,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.50000,3.431818,3.259615,2.357143,3.071429,3.946078,2.500000,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
607,4.00000,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
608,2.50000,2.000000,2.000000,2.357143,3.071429,3.946078,3.185185,2.875,3.125,4.000000,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
609,3.00000,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,4.000000,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0


In [23]:
# Split the data into training and testing sets
train_data, test_data = train_test_split(user_item_matrix_filled, test_size=0.25, random_state=42) #25% use for testing

In [24]:
train_data[1:]

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
512,3.92093,3.000000,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
118,3.92093,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
394,3.92093,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
418,3.92093,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
34,3.92093,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,5.000000,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,3.92093,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
107,4.00000,5.000000,3.259615,2.357143,4.000000,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
271,3.92093,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
436,4.00000,4.000000,3.259615,2.357143,3.071429,3.946078,3.185185,3.000,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0


In [25]:
# Apply Truncated SVD
svd = TruncatedSVD(n_components=10)  # Number of latent factors 

# Fit SVD
latent_matrix = svd.fit_transform(train_data)

# Transform the training and test data
train_svd = svd.transform(train_data)
test_svd = svd.transform(test_data)

# Reconstruct the user-item matrix
train_reconstructed = np.dot(train_svd, svd.components_)
test_reconstructed = np.dot(test_svd, svd.components_)

# Calculate RMSE and MAE
# Flatten the matrices and remove NaN values for comparison
train_true = train_data.values.flatten()
train_pred = train_reconstructed.flatten()

test_true = test_data.values.flatten()
test_pred = test_reconstructed.flatten()

# Remove NaN values from the comparison (if they still are present after replacing with averages above)
mask_train = ~np.isnan(train_true)
mask_test = ~np.isnan(test_true)

# Calculate RMSE
train_rmse = np.sqrt(mean_squared_error(train_true[mask_train], train_pred[mask_train]))
test_rmse = np.sqrt(mean_squared_error(test_true[mask_test], test_pred[mask_test]))

# Calculate MAE
train_mae = mean_absolute_error(train_true[mask_train], train_pred[mask_train])
test_mae = mean_absolute_error(test_true[mask_test], test_pred[mask_test])

print(f'test RMSE: {test_rmse}')
print(f'train RMSE: {train_rmse}')
print(f'test MAE: {test_mae}')
print(f'train MAE: {train_mae}')

test RMSE: 0.11915462657603451
train RMSE: 0.10005248848741277
test MAE: 0.021197437070116457
train MAE: 0.01664315555575453


In [26]:
# Check a reconstructed array
# train_reconstructed[1:]
train_reconstructed[:1]

array([[3.3879447 , 2.91560851, 2.8675316 , ..., 3.50044203, 3.50044203,
        4.00050518]])

In [27]:
# Create function to find movieId based on title
def original_id_finder(title, threshold=80): #threshold refers to similarity threshold
    all_titles = movies_df['title'].tolist()
    matches = process.extract(title, all_titles, limit=None)
    
    # Filter titles based on the threshold
    similar_titles = [match[0] for match in matches if match[1] >= threshold]
    
    # Get movie IDs for all matched titles
    myids = []
    for matched_title in similar_titles:
        # Access the movieId
        movie_id = movies_df[movies_df['title'] == matched_title]['movieId'].values[0]
        myids.append((matched_title, movie_id))
    
    return myids

In [28]:
# Find id(s) for reccommended movie(s). 
original_id_finder('Sabrina')

[('Sabrina (1995)', 7), ('Sabrina (1954)', 915)]

In [29]:
# Get index of the selected reccommended movie based on movieId
# Create mappings
user_id_to_index = {user_id: index for index, user_id in enumerate(user_item_matrix_filled.index)}
movie_id_to_index = {movie_id: index for index, movie_id in enumerate(user_item_matrix_filled.columns)}

# Example user and movie IDs
chosen_user_id = 10  # Replace with the actual user ID you want to use
chosen_movie_id = 98203  # Replace with the actual movie ID you want to use

try:
    user_index = user_id_to_index[chosen_user_id]  # accessing user index
    movie_index = movie_id_to_index[chosen_movie_id]  # accessing movie index (noticed index was off by 1 for movieId's new index)
    adjusted_movie_index = movie_index + 1
    
    print(f"User index for user ID {chosen_user_id}: {user_index}")
    print(f"Movie index for movie ID {chosen_movie_id}: {adjusted_movie_index}")
except KeyError as e:
    print(f"KeyError: {e} - This ID does not exist in the mapping.")

User index for user ID 10: 9
Movie index for movie ID 98203: 8019


In [30]:
test_column_index = movie_id_to_index.get(98203)
print(test_column_index)

8018


In [31]:
# Get the latent factors for the user and movie
user_latent = train_svd[user_index]  # Latent factors for the user
movie_latent = svd.components_[:, adjusted_movie_index]  # Latent factors for the movie

# Calculate the predicted rating
predicted_rating = np.dot(user_latent, movie_latent)

print(f"The predicted rating for user (userId {chosen_user_id}) at index {user_index} and movie \
(movieId {chosen_movie_id}) at index {adjusted_movie_index} is: {predicted_rating}")


The predicted rating for user (userId 10) at index 9 and movie (movieId 98203) at index 8019 is: 3.4999431602062074


## Now let's find the predicted rating for a recommended  movie of your choice (user input).

In [33]:
# Function ver to get index of the selected reccommended movie based on movieId
def my_rating(my_user_id, my_movie_id):
    # Comment out mapping creation since done above in test
    # # Create mappings
    # user_id_to_index = {user_id: index for index, user_id in enumerate(user_item_matrix_filled.index)}
    # movie_id_to_index = {movie_id: index for index, movie_id in enumerate(user_item_matrix_filled.columns)}
    try:
        my_user_index = user_id_to_index[my_user_id]  # accessing user index
        my_movie_index = movie_id_to_index[my_movie_id]
        my_adjusted_movie_index = my_movie_index + 1 #index is off by one
        # Get the latent factors for the user and movie
        my_user_latent = train_svd[my_user_index]  # Latent factors for the user
        my_movie_latent = svd.components_[:, my_adjusted_movie_index]  # Latent factors for the movie
        
        # Calculate the predicted rating
        my_predicted_rating = np.dot(my_user_latent, my_movie_latent)
        
        print(f"User index for user ID {my_user_id}: {my_user_index}")
        print(f"Movie index for movie ID {my_movie_id}: {my_adjusted_movie_index}")
        print(f"The predicted rating for user (userId {my_user_id}) at index {my_user_index} and movie (movieId {my_movie_id}) at index {my_adjusted_movie_index} is: {my_predicted_rating}")
    except KeyError as e:
        print(f"KeyError: {e} - This ID does not exist in the mapping.")

In [34]:
# Scroll up or uncomment the two lines of code below this to refresh your memory on the reccommended movies.
print(f"Because you watched {title}:")
movies_df['title'].iloc[similar_movies]

Because you watched Clueless (1995):


6                          Sabrina (1995)
57                   Two if by Sea (1996)
60     French Twist (Gazon maudit) (1995)
103                   If Lucy Fell (1996)
106                      Boomerang (1992)
Name: title, dtype: object

In [35]:
# Scroll up or uncomment code below to check movieId for one of your reccommended movies.
# Find id(s) for reccommended movie(s). 
original_id_finder(input("Enter the title of a reccommended movie to get it's movieId: "))

Enter the title of a reccommended movie to get it's movieId:  Sabrina


[('Sabrina (1995)', 7), ('Sabrina (1954)', 915)]

In [40]:
# User input for user and movie IDs
my_user_id = int(input("Enter the userId of interest: "))
my_movie_id = int(input("Enter the movieId of interest: "))

Enter the userId of interest:  10
Enter the movieId of interest:  7


In [41]:
my_rating(my_user_id, my_movie_id)

User index for user ID 10: 9
Movie index for movie ID 7: 7
The predicted rating for user (userId 10) at index 9 and movie (movieId 7) at index 7 is: 2.8761380007116997


In [None]:
# Close the MongoDB connection
mongo.close()