In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
# Load the movie metadata CSV into a DataFrame.
# low_memory=False makes pandas parse the file in one pass instead of in chunks,
# which avoids mixed-dtype inference within a column.
md = pd.read_csv('Dataset/movies_metadata.csv',low_memory=False)

In [37]:
# Keep only the first 20,000 rows (head(), so this is not a random sample):
# the local machine cannot allocate memory for the full dataset.
# .copy() detaches the subset from the full frame, so the later column
# assignment on `md` modifies an independent DataFrame rather than a slice
# of the original (avoids pandas' SettingWithCopyWarning).
md = md.head(20000).copy()

In [38]:
# Show 10 randomly chosen titles as a quick look at the column
md['title'].sample(10)

12860                            A Moment to Remember
4932                                          Seconds
16146                                      Feet First
1830                      The Best Years of Our Lives
8374                                         Dead End
1734                                          Illtown
11563                                       Red Angel
3481                                The Gay Deceivers
13463    I Manegen med Glenn Killing: Live från Berns
1868               Friday the 13th: The Final Chapter
Name: title, dtype: object

In [39]:
# Preview the first plot overviews; this is the text the TF-IDF matrix is built from
md['overview'].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [40]:
# Define the TF-IDF vectorizer. stop_words='english' removes common English
# words such as 'the' and 'a' before the vocabulary is built.
tfidf = TfidfVectorizer(stop_words='english')

In [41]:
# Replace missing overviews (NaN) with an empty string so every row is a
# string before it is passed to the vectorizer
md['overview'] = md['overview'].fillna('')

In [42]:
# Fit the vectorizer on the overviews (learns the vocabulary and IDF weights)
# and transform them into the TF-IDF matrix in a single step
tfidf_matrix = tfidf.fit_transform(md['overview'])

In [43]:
# Shape is (number of movies, number of vocabulary terms)
tfidf_matrix.shape

(20000, 47487)

Because TfidfVectorizer L2-normalises each row by default (norm='l2'), every non-empty TF-IDF vector has unit length, so the dot product between two vectors is already their cosine similarity. You can therefore use sklearn's linear_kernel() instead of cosine_similarity(): it returns the same scores but is faster because it skips the redundant normalisation step.

In [44]:
# linear_kernel computes plain pairwise dot products between rows
from sklearn.metrics.pairwise import linear_kernel

# Compute the movie-by-movie similarity matrix. Passing the TF-IDF matrix as
# both arguments compares every overview with every other overview; the
# result is square, with one row and one column per movie.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [45]:
# Should be square: (number of movies, number of movies)
cosine_sim.shape

(20000, 20000)

In [46]:
# Similarity of the movie in row 1 to every movie (entry 1 is its score with itself)
cosine_sim[1]

array([0.01575748, 1.        , 0.04907345, ..., 0.        , 0.        ,
       0.        ])

In [47]:
# Construct a reverse map from movie title to row index.
# Series.drop_duplicates() compares the *values* (here the row indices, which
# are already unique), so it never removes repeated titles. Deduplicate on the
# title index instead, keeping the first occurrence, so that indices[title]
# always yields a single row index rather than a Series.
indices = pd.Series(md.index, index=md['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [48]:
# Check the first ten title -> row-index pairs
indices[:10]

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64

In [49]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level ``indices`` mapping.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix. Row/column ``i`` is assumed to correspond
        to positional row ``i`` of ``md`` (true when ``md`` has its default
        0..n-1 index). Defaults to the notebook's ``cosine_sim``.
    top_n : int, optional
        Maximum number of recommendations to return. Defaults to 10.

    Returns
    -------
    pandas.Series
        Titles of the most similar movies, best match first. The query movie
        itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]

    # A title shared by several movies yields a Series of row indices;
    # use the first occurrence so the lookup below stays one-dimensional.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Get the pairwise similarity scores of all *other* movies with that movie.
    # The query movie is excluded explicitly instead of dropping the top-sorted
    # entry: a movie with an empty overview has self-similarity 0, and movies
    # with identical overviews tie, so the first entry is not always the query.
    sim_scores = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]

    # Sort the movies based on the similarity scores (stable, highest first)
    sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)

    # Get the positional indices of the top_n most similar movies
    movie_indices = [i for i, _ in sim_scores[:top_n]]

    # Return the titles of the most similar movies
    return md['title'].iloc[movie_indices]

In [71]:
# Draw one title at random to use as the query movie, then display it
film = md['title'].sample(n=1).iloc[0]
film

'Curse of the Pink Panther'

In [72]:
# Movies whose overviews are most similar to the sampled title
get_recommendations(film)

7869        Revenge of the Pink Panther
10860                Inspector Clouseau
6514     The Pink Panther Strikes Again
6005          Trail of the Pink Panther
10793                  The Pink Panther
13460                The Pink Panther 2
10861           Son of the Pink Panther
1090     The Return of the Pink Panther
12408                 Definitely, Maybe
4810               The First Deadly Sin
Name: title, dtype: object