In [1]:
# import
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [29]:
# load data
# Parse only what the notebook uses: the 'title' and 'overview' columns of the
# first 20,000 rows. The remaining columns and rows are never read into memory,
# so pandas does not have to infer dtypes for them.
df = pd.read_csv(
    '../data/movies_metadata.csv',
    usecols=['title', 'overview'],
    nrows=20000,
)
# usecols keeps the column order of the file, so select explicitly to get
# 'title' first and 'overview' second.
df = df[['title', 'overview']]
df.head(5)

  df = pd.read_csv('../data/movies_metadata.csv')


Unnamed: 0,title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...


In [30]:
# check null
# Missing overviews become empty strings so every row can be vectorised later.
df['overview'] = df['overview'].fillna(value='')

# Remaining null counts, one per line: 'title' first, then 'overview'.
print(
    df['title'].isnull().sum(),
    df['overview'].isnull().sum(),
    sep='\n',
)

2
0


In [31]:
# apply tf-idf
# English stop words are removed before the overviews are weighted.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(raw_documents=df['overview'])
# One row per overview, one column per vocabulary term.
print(tfidf_matrix.shape)

(20000, 47487)


In [35]:
# calculate cosine similarity
# With a single argument the matrix is compared against itself, giving one
# similarity score for every pair of rows.
cosine_sim = cosine_similarity(tfidf_matrix)
# First the matrix shape, then the similarity scores of the #'1' movie.
print(cosine_sim.shape, cosine_sim[1], sep='\n')

(20000, 20000)
[0.01575748 1.         0.04907345 ... 0.         0.         0.        ]


In [36]:
# functions for processing
# key: title, value: row position.
# Positions (not df.index labels) are stored because get_recommendations uses
# the value to pick a row of cosine_sim and to select with .iloc.
# Rows whose title is missing are skipped so that NaN never becomes a key.
# If a title occurs more than once, the last occurrence wins.
title_index_map = {
    title: position
    for position, title in enumerate(df['title'])
    if pd.notna(title)
}

def get_recommendations(title, top_n=10):
    """Return the titles of the movies most similar to `title`.

    Args:
        title: Movie title to look up in `title_index_map`.
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        The `df['title']` entries for the `top_n` best-scoring rows of
        `cosine_sim[idx]`, ordered from most to least similar. The queried
        movie itself is never included.

    Raises:
        KeyError: If `title` is not a key of `title_index_map`.
        ValueError: If `top_n` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    idx = title_index_map[title]
    sim_scores = list(enumerate(cosine_sim[idx]))  # (row position, similarity) pairs
    # Remove the queried movie by position instead of assuming it sorts first.
    # Another row can tie with it at the top score (e.g. an identical overview),
    # and a movie with an empty overview scores 0 against itself; in both cases
    # slicing off element 0 would drop the wrong row and return the movie itself.
    sim_scores = [pair for pair in sim_scores if pair[0] != idx]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) # sort by similarity scores
    sim_scores = sim_scores[:top_n]

    movie_indices = [i[0] for i in sim_scores] # get indices of the tuples
    return df['title'].iloc[movie_indices] # get the titles of the indices

# enumerate(): The built-in Python enumerate() function returns an iterator that produces pairs (tuples) containing indices (starting from 0) and values from the iterable passed to it. 

# list(...): This wraps around the enumerate() function to convert the iterator into an actual list of tuples.

# key=lambda x: x[1]: The key function is used to extract a comparison key from each element in sim_scores. In this case, the lambda function takes in a tuple x and returns its second element (x[1]). This means that the sorting will be done based on the second elements of the tuples.

In [37]:
# run
# Show the recommendations for one example title.
get_recommendations(title='The Dark Knight Rises')

12481                            The Dark Knight
150                               Batman Forever
1328                              Batman Returns
15511                 Batman: Under the Red Hood
585                                       Batman
9230          Batman Beyond: Return of the Joker
18035                           Batman: Year One
19792    Batman: The Dark Knight Returns, Part 1
3095                Batman: Mask of the Phantasm
10122                              Batman Begins
Name: title, dtype: object