In [1]:
# Dependencies
import numpy as np
import pandas as pd

In [25]:
# Read the raw movie metadata table and preview its first rows.
metadata_path = 'resources/movie_metadata.csv'
movies = pd.read_csv(metadata_path)
movies.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [26]:
# Only the title and the genre string are used from here on.
kept_columns = ['movie_title', 'genres']
movies = movies[kept_columns]
movies.head()

Unnamed: 0,movie_title,genres
0,Avatar,Action|Adventure|Fantasy|Sci-Fi
1,Pirates of the Caribbean: At World's End,Action|Adventure|Fantasy
2,Spectre,Action|Adventure|Thriller
3,The Dark Knight Rises,Action|Thriller
4,Star Wars: Episode VII - The Force Awakens ...,Documentary


In [27]:
# Use the shorter column name 'title' in place of 'movie_title'.
column_renames = {'movie_title': 'title'}
movies = movies.rename(columns=column_renames)
movies.head()

Unnamed: 0,title,genres
0,Avatar,Action|Adventure|Fantasy|Sci-Fi
1,Pirates of the Caribbean: At World's End,Action|Adventure|Fantasy
2,Spectre,Action|Adventure|Thriller
3,The Dark Knight Rises,Action|Thriller
4,Star Wars: Episode VII - The Force Awakens ...,Documentary


In [28]:
# Split the pipe-delimited genre string into a list of genre names, replace
# missing values with "", and store each list's string form so the column
# holds plain text.
genres_as_text = movies['genres'].str.split('|').fillna("").astype('str')
movies['genres'] = genres_as_text
movies.head()

Unnamed: 0,title,genres
0,Avatar,"['Action', 'Adventure', 'Fantasy', 'Sci-Fi']"
1,Pirates of the Caribbean: At World's End,"['Action', 'Adventure', 'Fantasy']"
2,Spectre,"['Action', 'Adventure', 'Thriller']"
3,The Dark Knight Rises,"['Action', 'Thriller']"
4,Star Wars: Episode VII - The Force Awakens ...,['Documentary']


In [29]:
# Use scikit-learn's TfidfVectorizer to turn each movie's genre text into a
# TF-IDF feature vector built from single words and adjacent word pairs.
from sklearn.feature_extraction.text import TfidfVectorizer
# min_df must be an int >= 1 or a float in [0.0, 1.0]; the previous integer 0
# is rejected by parameter validation in recent scikit-learn releases.
# min_df=1 keeps every term that occurs in at least one document, which is the
# same vocabulary the old setting produced.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])

print(tfidf_matrix)

  (0, 192)	0.28638162337694284
  (0, 125)	0.4886959715869942
  (0, 25)	0.4705300067159289
  (0, 1)	0.3113168159451126
  (0, 130)	0.28638162337694284
  (0, 191)	0.28638162337694284
  (0, 118)	0.2872840620894273
  (0, 17)	0.2490871917524086
  (0, 0)	0.2285601369325064
  (1, 25)	0.6555829674799378
  (1, 1)	0.43375342509647713
  (1, 118)	0.40026892067689596
  (1, 17)	0.34704974815532885
  (1, 0)	0.3184496858418125
  (2, 31)	0.7174143014523583
  (2, 198)	0.2894536295096709
  (2, 1)	0.42925367826849575
  (2, 17)	0.3434494630323553
  (2, 0)	0.3151460970265319
  (3, 14)	0.8745431852007914
  (3, 198)	0.3280417562073654
  (3, 0)	0.35715938095371597
  (4, 80)	1.0
  (5, 30)	0.5617660735567105
  (5, 192)	0.35037358435208504
  :	:
  (5036, 89)	0.364277001968279
  (5037, 50)	0.7705794255925253
  (5037, 89)	0.40884783457632434
  (5037, 47)	0.488928212538144
  (5038, 50)	0.7705794255925253
  (5038, 89)	0.40884783457632434
  (5038, 47)	0.488928212538144
  (5039, 97)	0.5046003623540747
  (5039, 175)	0.46

In [7]:
# Because the vectors come from the TF-IDF vectorizer, their pairwise dot
# product is used directly as the cosine similarity score between movies.
from sklearn.metrics.pairwise import linear_kernel
# With no second argument, linear_kernel compares the matrix against itself.
cosine_sim = linear_kernel(tfidf_matrix)

In [8]:
# Inspect the raw first title; its repr exposes any hidden trailing characters.
movies.loc[0, 'title']

'Avatar\xa0'

In [12]:
# Title column, used to map row positions back to movie names.
titles = movies['title']
# Reverse lookup: movie title -> row label in `movies`.
indices = pd.Series(movies.index, index=titles)
# Titles with one extra non-breaking space appended.
newtitle = titles + '\xa0'
indices

title
Avatar                                                        0
Pirates of the Caribbean: At World's End                      1
Spectre                                                       2
The Dark Knight Rises                                         3
Star Wars: Episode VII - The Force Awakens                    4
John Carter                                                   5
Spider-Man 3                                                  6
Tangled                                                       7
Avengers: Age of Ultron                                       8
Harry Potter and the Half-Blood Prince                        9
Batman v Superman: Dawn of Justice                           10
Superman Returns                                             11
Quantum of Solace                                            12
Pirates of the Caribbean: Dead Man's Chest                   13
The Lone Ranger                                              14
Man of Steel                      

In [10]:
# Function that gets movie recommendations based on the cosine similarity score of movie genres
def genre_recommendations(title, top_n=20):
    """Return the titles of the movies whose genres are most similar to ``title``.

    Similarity scores are read from the module-level ``cosine_sim`` matrix.
    The row for ``title`` is resolved through ``indices`` and the resulting
    positions are mapped back to names through ``titles``.

    Parameters
    ----------
    title : str
        Movie title, with or without the trailing non-breaking space (U+00A0)
        that the titles in the dataset carry.
    top_n : int, default 20
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles ordered from most to least similar. The queried
        movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Dataset titles end with a non-breaking space; append it only when the
    # caller has not already supplied it.
    lookup = title if title.endswith('\xa0') else title + '\xa0'
    idx = indices[lookup]
    # A title that occurs more than once makes the lookup return a Series;
    # use the first matching row so the similarity row below stays 1-D.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    row = cosine_sim[idx]
    # Exclude the queried movie by position instead of assuming it sorts first:
    # another movie with an identical genre profile ties with it, and the
    # stable sort would then keep the queried movie in the results.
    candidates = [pos for pos in range(len(row)) if pos != idx]
    candidates.sort(key=lambda pos: row[pos], reverse=True)
    return titles.iloc[candidates[:top_n]]

In [11]:
# Show the top 20 genre-based recommendations for "The Polar Express".
polar_express_recs = genre_recommendations('The Polar Express')
polar_express_recs.head(20)

93                           How to Train Your Dragon 
156                             Rise of the Guardians 
304                                              Epic 
503                         Arthur and the Invisibles 
1347                      Dragon Nest: Warriors' Dawn 
1517                                            Ponyo 
2047                             Howl's Moving Castle 
2183                             Return to Never Land 
2373                                    Spirited Away 
2528                                   Dragon Hunters 
3408                              The Secret of Kells 
312     Legend of the Guardians: The Owls of Ga'Hoole 
2241              Yu-Gi-Oh! Duel Monsters             
494                         Walking with Dinosaurs 3D 
2006              A Turtle's Tale: Sammy's Adventures 
2749                                           Khumba 
2913                             The Land Before Time 
3538                                       Snow Queen 
744       