In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the movies file, keeping only the two columns this notebook works
# with: the movie title and its pipe-delimited genre string.
# (No ratings file is read in this cell.)
df = pd.read_csv(
    'movie_metadata.csv',
    usecols=['movie_title', 'genres'],
)

In [5]:
# Turn each pipe-delimited genre string into a list of genre names, then
# store the string form of that list; rows with no genres become "".
df['genres'] = (
    df['genres']
    .str.split('|')
    .fillna("")
    .astype('str')
)
df

Unnamed: 0,genres,movie_title
0,"['Action', 'Adventure', 'Fantasy', 'Sci-Fi']",Avatar
1,"['Action', 'Adventure', 'Fantasy']",Pirates of the Caribbean: At World's End
2,"['Action', 'Adventure', 'Thriller']",Spectre
3,"['Action', 'Thriller']",The Dark Knight Rises
4,['Documentary'],Star Wars: Episode VII - The Force Awakens ...
5,"['Action', 'Adventure', 'Sci-Fi']",John Carter
6,"['Action', 'Adventure', 'Romance']",Spider-Man 3
7,"['Adventure', 'Animation', 'Comedy', 'Family',...",Tangled
8,"['Action', 'Adventure', 'Sci-Fi']",Avengers: Age of Ultron
9,"['Adventure', 'Family', 'Fantasy', 'Mystery']",Harry Potter and the Half-Blood Prince


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorise the genre strings into TF-IDF features over word unigrams and
# bigrams, ignoring English stop words.
# min_df is an absolute document count when given as an int and must then be
# >= 1; recent scikit-learn releases reject the integer 0. Every vocabulary
# term occurs in at least one document, so min_df=1 keeps the same vocabulary.
tf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=1,
    stop_words='english',
)
tfidf_matrix = tf.fit_transform(df['genres'])
# (number of movies, number of distinct unigram/bigram features)
tfidf_matrix.shape

(5043, 206)

In [7]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products between all rows of the TF-IDF matrix. Passing a
# single matrix compares it against itself. TfidfVectorizer L2-normalises rows
# with its default settings, so these dot products are cosine similarities.
cosine_sim = linear_kernel(tfidf_matrix)
# Peek at the top-left 4x4 corner only.
cosine_sim[:4, :4]

array([[1.        , 0.71772763, 0.29121259, 0.0816324 ],
       [0.71772763, 1.        , 0.40574248, 0.11373729],
       [0.29121259, 0.40574248, 1.        , 0.20751026],
       [0.0816324 , 0.11373729, 0.20751026, 1.        ]])

In [12]:
# Series of movie titles (same row labels as df) ...
titles = df['movie_title']
# ... and the reverse lookup: title -> row label in df.
indices = pd.Series(df.index, index=titles)

# Function that gets movie recommendations based on the cosine similarity score of movie genres
def genre_recommendations(title, top_n=20):
    r"""Return the titles whose genres are most similar to those of ``title``.

    Relies on three notebook-level objects: ``indices`` (title -> row label),
    ``cosine_sim`` (pairwise similarity matrix) and ``titles`` (Series of
    movie titles).

    Parameters
    ----------
    title : str
        Movie title to look up. The lookup keys are expected to end with a
        non-breaking space ('\xa0'), so ``title + '\xa0'`` is tried first and
        ``title`` exactly as given is used as a fallback.
    top_n : int, default 20
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``titles``, most similar first. The queried
        row itself is never included.

    Raises
    ------
    KeyError
        If neither form of the title is present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    padded = title + '\xa0'
    if padded in indices.index:
        key = padded
    elif title in indices.index:
        key = title
    else:
        raise KeyError(padded)

    # A title that occurs more than once makes ``indices[key]`` a Series
    # instead of a scalar; use the first occurrence in that case.
    idx = int(np.atleast_1d(indices[key])[0])

    scores = np.asarray(cosine_sim[idx])
    # Stable descending sort: rows with tied scores keep ascending row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the queried row explicitly. With tied scores (e.g. another movie
    # with an identical genre set at a lower row) it does not rank first, so
    # skipping position 0 would remove the wrong row.
    ranked = ranked[ranked != idx][:top_n]
    return titles.iloc[ranked]

In [13]:
# Show the genre-based recommendations for 'Avatar'; .head(20) caps the display at 20 rows.
genre_recommendations('Avatar').head(20)

15                                        Man of Steel 
39                            The Amazing Spider-Man 2 
236       Star Wars: Episode III - Revenge of the Sith 
237       Star Wars: Episode II - Attack of the Clones 
240          Star Wars: Episode I - The Phantom Menace 
520              The League of Extraordinary Gentlemen 
1536        Star Wars: Episode VI - Return of the Jedi 
2051    Star Wars: Episode V - The Empire Strikes Back 
2687                               Highlander: Endgame 
3024                Star Wars: Episode IV - A New Hope 
3634         Beastmaster 2: Through the Portal of Time 
4690                                           Destiny 
34                               X-Men: The Last Stand 
47                          X-Men: Days of Future Past 
123                           X-Men Origins: Wolverine 
210                                            X-Men 2 
769                                      Reign of Fire 
1145                             Underworld: Evo