In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
# Load the movie metadata table. low_memory=False makes pandas parse the file in
# one pass instead of in chunks, which avoids mixed-dtype inference across chunks.
metadata = pd.read_csv('movies_metadata.csv', low_memory=False)

# Preview the first five plot overviews (the text column vectorized further down)
metadata['overview'].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [3]:
# Missing overviews (NaN) become empty strings so the vectorizer only ever sees text
metadata['overview'] = metadata['overview'].fillna('')

# TF-IDF vectorizer that discards scikit-learn's built-in English stop-word list
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and build the document-term matrix in a single call
tfidf_matrix = tfidf.fit_transform(metadata['overview'])

# Displayed as (number of overviews, number of vocabulary terms)
tfidf_matrix.shape

(45466, 75827)

In [4]:
# Peek at ten consecutive entries of the learned vocabulary (one term per TF-IDF column)
tfidf.get_feature_names_out()[3000:3010]

array(['anabolic', 'anachronistic', 'anacleto', 'anaconda', 'anacondas',
       'anadolu', 'anaheim', 'anahí', 'anais', 'anakata'], dtype=object)

In [5]:
# Compute the cosine similarity matrix.
# TfidfVectorizer L2-normalizes each row by default, so the plain dot product
# computed by linear_kernel equals the cosine similarity between overviews.
# NOTE(review): the result is a dense (n_movies, n_movies) array; at the size shown
# below that is roughly 16 GB if the dtype is float64 -- confirm this fits in memory.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

cosine_sim.shape

(45466, 45466)

In [6]:
# Similarity of the movie at row 1 to every movie; entry 1 is its similarity to itself
cosine_sim[1]

array([0.01504121, 1.        , 0.04681953, ..., 0.        , 0.02198641,
       0.00929411])

In [7]:
# Construct a reverse map from movie title to row index
indices = pd.Series(metadata.index, index=metadata['title'])

# Series.drop_duplicates() compares the *values* (row numbers, which are already
# unique), so it never removes repeated titles. De-duplicate on the index (the
# title) instead, keeping the first occurrence, so indices[title] is always a
# single row number rather than a Series.
indices = indices[~indices.index.duplicated(keep='first')]

indices[:10]

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64

In [8]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies whose overviews are most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level ``indices`` map.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the similarity of movie ``i``
        to every movie. Defaults to the matrix bound to ``cosine_sim`` when this
        cell was executed.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Entries of ``metadata['title']`` for the recommended movies, most similar
        first. The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Get the index of the movie that matches the title
    idx = indices[title]
    # A title that occurs more than once in the lookup yields a Series of row
    # numbers; fall back to the first occurrence so the row lookup below works.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Rank every movie by its pairwise similarity to the queried movie. sorted()
    # is stable, so tied scores keep their original row order.
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Drop the queried movie explicitly instead of assuming it is ranked first:
    # a movie with an empty overview has zero self-similarity, and movies with
    # identical overviews tie at 1.0, so position 0 is not always the query.
    movie_indices = [i for i, _ in ranked if i != idx][:top_n]

    # Return the titles of the most similar movies
    return metadata['title'].iloc[movie_indices]

In [9]:
# Example query: titles whose overviews are most similar to that of 'The Godfather'
get_recommendations('The Godfather')

1178               The Godfather: Part II
44030    The Godfather Trilogy: 1972-1990
1914              The Godfather: Part III
23126                          Blood Ties
11297                    Household Saints
34717                   Start Liquidation
10821                            Election
38030            A Mother Should Be Loved
17729                   Short Sharp Shock
26293                  Beck 28 - Familjen
Name: title, dtype: object

In [10]:
# Example query: titles whose overviews are most similar to that of 'Pulp Fiction'
get_recommendations('Pulp Fiction')

34007                            From Mexico With Love
45423    The Fortunes and Misfortunes of Moll Flanders
14803           The First Day of the Rest of Your Life
1190                                         The Sting
640                                      Moll Flanders
32614                                Kill Your Friends
30706                                 Baby Face Nelson
19015                           Ladies They Talk About
3563                                    Prizzi's Honor
40700                                  Watch Your Left
Name: title, dtype: object