In [1]:
# Dependencies
import numpy as np
import pandas as pd

In [2]:
# Load the movie catalogue. The preview below shows a movieId, a title and a
# pipe-separated genres string for each row.
# NOTE(review): the 'Unnamed: 0' column in the preview looks like a row index
# that was written into the CSV when it was saved - confirm before dropping it.
movies = pd.read_csv('resources/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Turn every movie's genre string into a TF-IDF feature vector with
# scikit-learn's TfidfVectorizer, using single words and two-word sequences
# (ngram_range=(1, 2)) as terms.
from sklearn.feature_extraction.text import TfidfVectorizer
# min_df=1 keeps every term that occurs in at least one document, i.e. all of
# them, which is what the previous min_df=0 meant. Recent scikit-learn releases
# validate constructor parameters and reject an integer min_df below 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])

In [4]:
# Pairwise genre similarity between all movies: the dot product of every pair
# of rows of the TF-IDF matrix.
# The dot product equals the cosine similarity provided each row has unit
# length (presumably TfidfVectorizer's default L2 normalisation - confirm
# against the installed scikit-learn version).
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


RangeIndex(start=0, stop=9742, step=1)

In [19]:
# Keep the title column on its own, then build the reverse lookup: a Series
# keyed by movie title whose values are the corresponding row labels of `movies`.
titles = movies['title']
indices = pd.Series(movies.index, index=titles)
indices

title
Toy Story (1995)                                                      0
Jumanji (1995)                                                        1
Grumpier Old Men (1995)                                               2
Waiting to Exhale (1995)                                              3
Father of the Bride Part II (1995)                                    4
Heat (1995)                                                           5
Sabrina (1995)                                                        6
Tom and Huck (1995)                                                   7
Sudden Death (1995)                                                   8
GoldenEye (1995)                                                      9
American President, The (1995)                                       10
Dracula: Dead and Loving It (1995)                                   11
Balto (1995)                                                         12
Nixon (1995)                                              

In [6]:
# Recommend movies whose genres are most similar to those of a given movie,
# ranked by the precomputed cosine similarity scores.
def genre_recommendations(title, top_n=20):
    """Return the titles of the movies most similar in genre to `title`.

    Relies on the notebook-level `indices` (title -> row position),
    `cosine_sim` (pairwise similarity matrix) and `titles` (Series of titles).

    Parameters
    ----------
    title : str
        Exact movie title as it appears in the index of `indices`.
    top_n : int, default 20
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to `top_n` entries of `titles`, ordered from most to least similar.
        The queried movie itself is never included.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    ValueError
        If `top_n` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series of
    # positions instead of a scalar; use the first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable sort on the negated scores: descending similarity, with ties kept
    # in ascending row order (the same tie order a stable reverse sort of
    # (position, score) pairs produces).
    ranked = np.argsort(-scores, kind='stable')
    # Drop the queried movie explicitly. It is not guaranteed to sort first:
    # any earlier row with an equal score would precede it.
    ranked = ranked[ranked != idx][:top_n]
    return titles.iloc[ranked]

In [7]:
# Example query: show up to 20 genre-based recommendations for Jumanji.
genre_recommendations('Jumanji (1995)').head(20)

53                     Indian in the Cupboard, The (1995)
109                     NeverEnding Story III, The (1994)
767                       Escape to Witch Mountain (1975)
1514            Darby O'Gill and the Little People (1959)
1556                                  Return to Oz (1985)
1617                        NeverEnding Story, The (1984)
1618    NeverEnding Story II: The Next Chapter, The (1...
1799                        Santa Claus: The Movie (1985)
3574    Harry Potter and the Sorcerer's Stone (a.k.a. ...
6075    Chronicles of Narnia: The Lion, the Witch and ...
6389                          Bridge to Terabithia (2007)
6629                           Golden Compass, The (2007)
6655          Water Horse: Legend of the Deep, The (2007)
6751     Chronicles of Narnia: Prince Caspian, The (2008)
7426                           Alice in Wonderland (1933)
7478    Chronicles of Narnia: The Voyage of the Dawn T...
8230                Percy Jackson: Sea of Monsters (2013)
8641          