In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from pprint import pprint
from fuzzywuzzy import process



In [2]:
# Take in all of our movies and ratings csvs and read it into pandas
movies = "Resources/ml-latest-small/movies.csv"
movies_df = pd.read_csv(movies)
movies_df.head()


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Extract the release year from the title column
movies_df["release_year"] = movies_df["title"].str.extract(r'\((\d{4})\)')

# Display the updated DataFrame
movies_df.head()

Unnamed: 0,movieId,title,genres,release_year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [4]:
#cleaning up the genres column to a list instead of a string
movies_df['genres'] = movies_df['genres'].apply(lambda x: x.split('|') )
movies_df.head()

Unnamed: 0,movieId,title,genres,release_year
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",1995
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II (1995),[Comedy],1995



**Addressing the Cold-Start Problem**

Collaborative filtering relies entirely on user-item interactions within the utility matrix. However, this approach faces a challenge when dealing with new users or items that have no interactions, resulting in their exclusion from the recommendation system. This is known as the cold-start problem. One way to address this issue is by using content-based filtering, which generates recommendations based on user and item features.

To implement this, we first need to convert the genres column into binary features. Each genre will have its own column in the dataframe, with values of 0 or 1 indicating the presence or absence of that genre

In [5]:
n_movies = movies_df['movieId'].nunique()
print(f"There are {n_movies} unique movies in our movies dataset.")

There are 9742 unique movies in our movies dataset.


In [6]:
genres = set(g for G in movies_df['genres'] for g in G)
for g in genres:
    movies_df[g] = movies_df.genres.transform(lambda x: int(g in x))
    
movies_genres = movies_df.drop(columns=['movieId', 'title','genres','release_year'])

In [7]:
movies_genres

Unnamed: 0,Horror,Animation,Film-Noir,Mystery,Children,Sci-Fi,Drama,(no genres listed),Romance,IMAX,Fantasy,Action,Western,Adventure,Documentary,Comedy,War,Thriller,Musical,Crime
0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0
9738,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
9739,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
9740,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [8]:
cosine_sim = cosine_similarity(movies_genres, movies_genres)
print(f"Dimensions of our genres cosine similarity matrix: {cosine_sim.shape}")

Dimensions of our genres cosine similarity matrix: (9742, 9742)


In [9]:
cosine_sim[0]

array([1.        , 0.77459667, 0.31622777, ..., 0.        , 0.31622777,
       0.4472136 ])

To receive recommendations for movies similar to Shawshank Redemption, it's essential to use the exact title as listed in our dataset. For example, in our dataset, Shawshank Redemption is recorded as Shawshank Redemption, The (1994).

If the title is misspelled or the release year is omitted, the recommender won't correctly identify the movie.

To make the process more user-friendly, we can utilize the Python package fuzzywuzzy. This package uses string matching algorithms to find the closest title match to a user-provided input. We'll create a function, movie_finder(), to leverage fuzzywuzzy and return the most similar movie title based on the user's input.

In [10]:
def movie_finder(title, threshold=80):
    all_titles = movies_df['title'].tolist()
    matches = process.extract(title, all_titles, limit=None)
    
    # Filter titles based on the threshold
    similar_titles = [match[0] for match in matches if match[1] >= threshold]
    
    # Get movie IDs for all matched titles
    results = []
    for matched_title in similar_titles:
        movie_id = movies_df[movies_df['title'] == matched_title].index[0]
        results.append((matched_title, movie_id))
    
    return results

Lets test it out with your favorite movie example

In [11]:
# PRACTICE
result = movie_finder('Twilight')
pprint(result)

print("chosen title:", result[0][0])
title = result[0][0]

[('Twilight (1998)', 1324),
 ('Twilight Zone: The Movie (1983)', 5514),
 ('Twilight Samurai, The (Tasogare Seibei) (2002)', 5687),
 ('Twilight (2008)', 6905),
 ('Twilight Saga: New Moon, The (2009)', 7188),
 ('Twilight Saga: Eclipse, The (2010)', 7363),
 ('Twilight Saga: Breaking Dawn - Part 1, The (2011)', 7749),
 ('Twilight Saga: Breaking Dawn - Part 2, The (2012)', 8036)]
chosen title: Twilight (1998)


To get relevant recommendations for Shawshank Redemption , we need to find its index in the cosine simialrity matrix. To identify which row we should be looking at, we can create a movie index mapper which maps a movie title to the index that it represents in our matrix.

Let's create a movie index dictionary called movie_idx where the keys are movie titles and values are movie indices:


In [12]:
# Get user input for title selection
user_input_title = input("Enter a movie title to search for: ")
user_result = movie_finder(user_input_title)

# Check if there are any results
if user_result:
    # Display results with numbering
    for idx, (title, movieId) in enumerate(user_result):
        print(f"{idx}: {title}")
    
    # # Get user input for index selection
    # user_idx = int(input("Choose what movie from search list (rder number) you want to select: "))
    # user_title = user_result[user_idx][0]
    # chosen_index = user_result[user_idx][1]
    # print("chosen title:", user_title, chosen_index)
    
    while True:
        try:
            user_idx = int(input("Choose what movie from search list (order number) you want to select: "))
            if 0 <= user_idx < len(user_result):  # Validate the index
                user_title = user_result[user_idx][0]
                chosen_index = user_result[user_idx][1]
                print("chosen title:", user_title, chosen_index)
                break  # Exit the loop if the selection is valid
            else:
                print(f"Please enter a number between 0 and {len(user_result) - 1}.")
        except ValueError:
            print("Invalid input. Please enter a valid integer.")
else:
    print("No results found.")

Enter a movie title to search for:  Clueless


0: Clueless (1995)


Choose what movie from search list (order number) you want to select:  10


Please enter a number between 0 and 0.


Choose what movie from search list (order number) you want to select:  0


chosen title: Clueless (1995) 35


## We now know that the movie index for Shawshank Redemption is 277 in ourr set, we need to get to top 10 recommended movies to this movie.

In [13]:
# n_recommendations=10
n_recommendations = int(input("Enter the number of recommendations you want: "))
sim_scores = [(i, float(score)) for i, score in enumerate(cosine_sim[chosen_index])]
# Exclude the target index
sim_scores = [score for score in sim_scores if score[0] != chosen_index]
# Sort scores by similarity in descending order
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores = sim_scores[1:(n_recommendations+1)]
sim_scores

Enter the number of recommendations you want:  10


[(6, 0.9999999999999998),
 (57, 0.9999999999999998),
 (60, 0.9999999999999998),
 (103, 0.9999999999999998),
 (106, 0.9999999999999998),
 (111, 0.9999999999999998),
 (152, 0.9999999999999998),
 (157, 0.9999999999999998),
 (203, 0.9999999999999998),
 (216, 0.9999999999999998)]

In [14]:
similar_movies = [i[0] for i in sim_scores]
similar_movies

[6, 57, 60, 103, 106, 111, 152, 157, 203, 216]

In [15]:
print(f"Because you watched {title}:")
movies_df['title'].iloc[similar_movies]

Because you watched Clueless (1995):


6                          Sabrina (1995)
57                   Two if by Sea (1996)
60     French Twist (Gazon maudit) (1995)
103                   If Lucy Fell (1996)
106                      Boomerang (1992)
111                 Pie in the Sky (1996)
152                       Mallrats (1995)
157                    Nine Months (1995)
203                   Forget Paris (1995)
216                           I.Q. (1994)
Name: title, dtype: object