In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from pprint import pprint
from fuzzywuzzy import process



In [2]:
# Take in all of our movies and ratings csvs and read it into pandas
movies = "Resources/ml-latest-small/movies.csv"
movies_df = pd.read_csv(movies)
movies_df.head()


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Extract the release year from the title column
movies_df["release_year"] = movies_df["title"].str.extract(r'\((\d{4})\)')

# Display the updated DataFrame
movies_df.head()

Unnamed: 0,movieId,title,genres,release_year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [4]:
#cleaning up the genres column to a list instead of a string
movies_df['genres'] = movies_df['genres'].apply(lambda x: x.split('|') )
movies_df.head()

Unnamed: 0,movieId,title,genres,release_year
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",1995
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II (1995),[Comedy],1995



**Addressing the Cold-Start Problem**

Collaborative filtering relies entirely on user-item interactions within the utility matrix. However, this approach faces a challenge when dealing with new users or items that have no interactions, resulting in their exclusion from the recommendation system. This is known as the cold-start problem. One way to address this issue is by using content-based filtering, which generates recommendations based on user and item features.

To implement this, we first need to convert the genres column into binary features. Each genre will have its own column in the dataframe, with values of 0 or 1 indicating the presence or absence of that genre

In [5]:
n_movies = movies_df['movieId'].nunique()
print(f"There are {n_movies} unique movies in our movies dataset.")

There are 9742 unique movies in our movies dataset.


In [6]:
genres = set(g for G in movies_df['genres'] for g in G)
for g in genres:
    movies_df[g] = movies_df.genres.transform(lambda x: int(g in x))
    
movies_genres = movies_df.drop(columns=['movieId', 'title','genres','release_year'])

In [7]:
movies_genres

Unnamed: 0,Adventure,Crime,Thriller,Animation,Film-Noir,Documentary,Musical,Western,Mystery,Romance,Sci-Fi,Fantasy,Horror,War,Comedy,Children,Drama,Action,IMAX,(no genres listed)
0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0
9738,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
9739,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
9740,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [8]:
cosine_sim = cosine_similarity(movies_genres, movies_genres)
print(f"Dimensions of our genres cosine similarity matrix: {cosine_sim.shape}")

Dimensions of our genres cosine similarity matrix: (9742, 9742)


In [9]:
cosine_sim[0]

array([1.        , 0.77459667, 0.31622777, ..., 0.        , 0.31622777,
       0.4472136 ])

To receive recommendations for movies similar to Shawshank Redemption, it's essential to use the exact title as listed in our dataset. For example, in our dataset, Shawshank Redemption is recorded as Shawshank Redemption, The (1994).

If the title is misspelled or the release year is omitted, the recommender won't correctly identify the movie.

To make the process more user-friendly, we can utilize the Python package fuzzywuzzy. This package uses string matching algorithms to find the closest title match to a user-provided input. We'll create a function, movie_finder(), to leverage fuzzywuzzy and return the most similar movie title based on the user's input.

In [10]:
def movie_finder(title, threshold=80):
    all_titles = movies_df['title'].tolist()
    matches = process.extract(title, all_titles, limit=None)
    
    # Filter titles based on the threshold
    similar_titles = [match[0] for match in matches if match[1] >= threshold]
    
    # Get movie IDs for all matched titles
    results = []
    for matched_title in similar_titles:
        movie_id = movies_df[movies_df['title'] == matched_title].index[0]
        results.append((matched_title, movie_id))
    
    return results

Lets test it out with your favorite movie example

In [11]:
# PRACTICE
result = movie_finder('Twilight')
pprint(result)

print("chosen title:", result[0][0])
title = result[0][0]

[('Twilight (1998)', 1324),
 ('Twilight Zone: The Movie (1983)', 5514),
 ('Twilight Samurai, The (Tasogare Seibei) (2002)', 5687),
 ('Twilight (2008)', 6905),
 ('Twilight Saga: New Moon, The (2009)', 7188),
 ('Twilight Saga: Eclipse, The (2010)', 7363),
 ('Twilight Saga: Breaking Dawn - Part 1, The (2011)', 7749),
 ('Twilight Saga: Breaking Dawn - Part 2, The (2012)', 8036)]
chosen title: Twilight (1998)


To get relevant recommendations for Shawshank Redemption , we need to find its index in the cosine simialrity matrix. To identify which row we should be looking at, we can create a movie index mapper which maps a movie title to the index that it represents in our matrix.

Let's create a movie index dictionary called movie_idx where the keys are movie titles and values are movie indices:


In [12]:
# Get user input for title selection
user_input_title = input("Enter a movie title to search for: ")
user_result = movie_finder(user_input_title)

# Check if there are any results
if user_result:
    # Display results with numbering
    for idx, (title, movieId) in enumerate(user_result):
        print(f"{idx}: {title}")
    
    # # Get user input for index selection
    # user_idx = int(input("Choose what movie from search list (rder number) you want to select: "))
    # user_title = user_result[user_idx][0]
    # chosen_index = user_result[user_idx][1]
    # print("chosen title:", user_title, chosen_index)
    
    while True:
        try:
            user_idx = int(input("Choose what movie from search list (order number) you want to select: "))
            if 0 <= user_idx < len(user_result):  # Validate the index
                user_title = user_result[user_idx][0]
                chosen_index = user_result[user_idx][1]
                print("chosen title:", user_title, chosen_index)
                break  # Exit the loop if the selection is valid
            else:
                print(f"Please enter a number between 0 and {len(user_result) - 1}.")
        except ValueError:
            print("Invalid input. Please enter a valid integer.")
else:
    print("No results found.")

Enter a movie title to search for:  Twilight


0: Twilight (1998)
1: Twilight Zone: The Movie (1983)
2: Twilight Samurai, The (Tasogare Seibei) (2002)
3: Twilight (2008)
4: Twilight Saga: New Moon, The (2009)
5: Twilight Saga: Eclipse, The (2010)
6: Twilight Saga: Breaking Dawn - Part 1, The (2011)
7: Twilight Saga: Breaking Dawn - Part 2, The (2012)


Choose what movie from search list (order number) you want to select:  3


chosen title: Twilight (2008) 6905


## We now know that the movie index for Shawshank Redemption is 277 in ourr set, we need to get to top 10 recommended movies to this movie.

In [25]:
# n_recommendations=10
n_recommendations = int(input("Enter the number of recommendations you want: "))
sim_scores = [(i, float(score)) for i, score in enumerate(cosine_sim[chosen_index])]
# Exclude the target index
sim_scores = [score for score in sim_scores if score[0] != chosen_index]
# Sort scores by similarity in descending order
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores = sim_scores[1:(n_recommendations+1)]
sim_scores

Enter the number of recommendations you want:  5


[(3713, 0.8944271909999159),
 (7188, 0.8944271909999159),
 (228, 0.8660254037844388),
 (441, 0.8660254037844388),
 (723, 0.8660254037844388)]

In [14]:
similar_movies = [i[0] for i in sim_scores]
similar_movies

[3713, 7188, 228, 441, 723]

In [15]:
print(f"Because you watched {title}:")
movies_df['title'].iloc[similar_movies]

Because you watched Twilight Saga: Breaking Dawn - Part 2, The (2012):


3713                                     Dragonfly (2002)
7188                  Twilight Saga: New Moon, The (2009)
228     Like Water for Chocolate (Como agua para choco...
441                                        Orlando (1992)
723                       Ghost and Mrs. Muir, The (1947)
Name: title, dtype: object

In [16]:
# # try to predict ratings of recommended movies based on user's past ratings and/or ratings of others

In [26]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

In [27]:
# Load the data
ratings = "Resources/ml-latest-small/ratings.csv"
ratings_df = pd.read_csv(ratings)
ratings_df.drop(['timestamp'], axis =1, inplace=True)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [124]:
# check for nulls
null_counts = ratings_df.isnull().sum()
print(null_counts)

userId     0
movieId    0
rating     0
dtype: int64


In [28]:
# Create a user-item matrix
user_item_matrix = ratings_df.pivot(index='userId', columns='movieId', values='rating').fillna(0)
user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Split the data into training and testing sets
train_data, test_data = train_test_split(ratings_df, test_size=0.2, random_state=42)

In [30]:
train_data[:1]

Unnamed: 0,userId,movieId,rating
80568,509,7347,3.0


In [31]:
# Create training and testing user-item matrices
train_user_item_matrix = train_data.pivot(index='userId', columns='movieId', values='rating').fillna(0)
test_user_item_matrix = test_data.pivot(index='userId', columns='movieId', values='rating').fillna(0)

In [119]:
# Apply Truncated SVD
svd = TruncatedSVD(n_components=10)  # Number of latent factors 
# latent factors tried: 50, 100, 200, 500, 25, 10, 8, 5, 3 - less is better but not past 10
latent_matrix = svd.fit_transform(train_user_item_matrix)

In [120]:
# Reconstruct the user-item matrix
reconstructed_matrix = np.dot(latent_matrix, svd.components_)

In [121]:
# Make predictions for the test set
predictions = []
actuals = []

# Get the mapping of userId and movieId to the indices in the user-item matrix
user_index_mapping = {user_id: index for index, user_id in enumerate(train_user_item_matrix.index)}
movie_index_mapping = {movie_id: index for index, movie_id in enumerate(train_user_item_matrix.columns)}

for _, row in test_data.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    actual_rating = row['rating']
    
    # Check if user and movie are in the mapping
    if user_id in user_index_mapping and movie_id in movie_index_mapping:
        user_index = user_index_mapping[user_id]
        movie_index = movie_index_mapping[movie_id]
        predicted_rating = reconstructed_matrix[user_index, movie_index]
        predictions.append(predicted_rating)
        actuals.append(actual_rating)

In [122]:
# Calculate RMSE
rmse = np.sqrt(mean_squared_error(actuals, predictions))
print(f'RMSE: {rmse}')

# from 0 to 5, being off by 3 score points is pretty high!

RMSE: 2.934356044422654


In [123]:
# Calculate MAE
mae = mean_absolute_error(actuals, predictions)
print(f'MAE: {mae}')
# from 0 to 5, being off by 3 score points is pretty high!

MAE: 2.698266106472718


In [37]:
# function to find id based on title
# need title locator to get id from movies csv
def original_id_finder(title, threshold=80): #threshold refers to similarity threshold
    all_titles = movies_df['title'].tolist()
    matches = process.extract(title, all_titles, limit=None)
    
    # Filter titles based on the threshold
    similar_titles = [match[0] for match in matches if match[1] >= threshold]
    
    # Get movie IDs for all matched titles
    myids = []
    for matched_title in similar_titles:
        # Access the movieId
        movie_id = movies_df[movies_df['title'] == matched_title]['movieId'].values[0]
        myids.append((matched_title, movie_id))
    
    return myids

In [38]:
# find id(s) for reccommended movie(s)
original_id_finder('Twilight Saga: Breaking Dawn - Part 2, The (2012)')

[('Twilight Saga: Breaking Dawn - Part 2, The (2012)', 98203),
 ('Twilight Saga: Breaking Dawn - Part 1, The (2011)', 91104),
 ('American President, The (1995)', 11),
 ('Cry, the Beloved Country (1995)', 40),
 ('Usual Suspects, The (1995)', 50),
 ('Big Green, The (1995)', 54),
 ('Home for the Holidays (1995)', 57),
 ('Postman, The (Postino, Il) (1994)', 58),
 ("Don't Be a Menace to South Central While Drinking Your Juice in the Hood (1996)",
  63),
 ('From Dusk Till Dawn (1996)', 70),
 ('Crossing Guard, The (1995)', 78),
 ('Juror, The (1996)', 79),
 ('In the Bleak Midwinter (1995)', 96),
 ('Boys of St. Vincent, The (1992)', 121),
 ('NeverEnding Story III, The (1994)', 126),
 ('Pie in the Sky (1996)', 129),
 ('Man of the Year (1995)', 137),
 ('Birdcage, The (1996)', 141),
 ('Brothers McMullen, The (1995)', 144),
 ('Basketball Diaries, The (1995)', 147),
 ('Addiction, The (1995)', 152),
 ('Blue in the Face (1995)', 156),
 ('Doom Generation, The (1995)', 166),
 ('Net, The (1995)', 185),
 

In [None]:
# function to match original id to resconstructed_matrix index for the movie/user?


In [39]:
# Predict a specific user's rating for a specific movie
user_id = 10 # change to reflect a user of interest (ex.user 10 has rated Twilight Saga: Breaking Dawn - Part 2)
movie_id = 98203 #change to reflect a reccommended movie
predicted_rating = reconstructed_matrix[user_id, movie_id]

print(f'Predicted rating for user {user_id} for movie {movie_id}: {predicted_rating}')

IndexError: index 98203 is out of bounds for axis 1 with size 8983