In [1]:
# Dependencies
import numpy as np
import pandas as pd

In [2]:
movies = pd.read_csv('resources/movie_metadata.csv')
movies.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [11]:
new_movies = pd.read_csv('resources/movies_2019.csv')
new_movies.head()

Unnamed: 0.1,Unnamed: 0,genre,image,name,release_date
0,0,"Drama, Sci-Fi, Thriller",https://m.media-amazon.com/images/M/MV5BMTY1OT...,Glass,Release date - January 18th
1,1,"Action, Adventure, Family",https://m.media-amazon.com/images/M/MV5BZjk1ZD...,The Kid Who Would Be King,Release date - January 25th
2,2,"Action, Crime, Drama",https://m.media-amazon.com/images/M/MV5BMTcxNz...,Miss Bala,Release date - February 1st
3,3,"Animation, Action, Adventure",https://m.media-amazon.com/images/M/MV5BMTkyOT...,The Lego Movie 2: The Second Part,Release date - February 8th
4,4,"Comedy, Fantasy, Romance",https://m.media-amazon.com/images/M/MV5BMTYxNj...,What Men Want,Release date - February 8th


Clean up the old movies dataset

In [4]:
# only keep necessary columns and rename
movies = movies[['movie_title', 'genres']]
movies = movies.rename(columns={'movie_title':'title'})
movies.head()

Unnamed: 0,title,genres
0,Avatar,Action|Adventure|Fantasy|Sci-Fi
1,Pirates of the Caribbean: At World's End,Action|Adventure|Fantasy
2,Spectre,Action|Adventure|Thriller
3,The Dark Knight Rises,Action|Thriller
4,Star Wars: Episode VII - The Force Awakens ...,Documentary


In [5]:
# Break up the big genre string into a string array
movies['genres'] = movies['genres'].str.split('|')
# Convert genres to string value
movies['genres'] = movies['genres'].fillna("").astype('str')

Clean up the new dataset

In [12]:
new_movies = new_movies[['name', 'genre']]
new_movies = new_movies.rename(columns = {'name':'title', 'genre':'genres'})
new_movies.head()

Unnamed: 0,title,genres
0,Glass,"Drama, Sci-Fi, Thriller"
1,The Kid Who Would Be King,"Action, Adventure, Family"
2,Miss Bala,"Action, Crime, Drama"
3,The Lego Movie 2: The Second Part,"Animation, Action, Adventure"
4,What Men Want,"Comedy, Fantasy, Romance"


In [13]:
# Break up the big genre string into a string array
new_movies['genres'] = new_movies['genres'].str.split(',')
# Convert genres to string value
new_movies['genres'] = new_movies['genres'].fillna("").astype('str')

transfer text into vector

In [21]:
# use TfidfVectorizer function from scikit-learn, which transforms text to 
# feature vectors that can be used as input to estimator.
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])

<5043x206 sparse matrix of type '<class 'numpy.float64'>'
	with 25213 stored elements in Compressed Sparse Row format>

In [15]:
# transfer the new dataset
new_tfidf_matrix = tf.fit_transform(new_movies['genres'])

In [17]:
# used the TF-IDF Vectorizer, calculating the Dot 
# Product will directly give me the Cosine Similarity Score.
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [18]:
new_movies['title'][0]

'Glass'

In [25]:
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])

In [36]:
# Function that get movie recommendations based on the cosine similarity score of movie genres
def genre_recommendations(title):
    newtitle = title + '\xa0'
    idx = indices[newtitle]
    print(idx)
    sim_scores = list(enumerate(cosine_sim[idx]))
    print(cosine_sim[idx])
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # print(sim_scores)
    sim_scores = sim_scores[1:21]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [37]:
genre_recommendations('Avatar').head(20)

0
[1.         0.71772763 0.29121259 ... 0.         0.         0.        ]
[(0, 1.0), (1, 0.7177276257262251), (2, 0.29121258569700703), (3, 0.08163239701751054), (4, 0.0), (5, 0.5594171429502688), (6, 0.2625343721697032), (7, 0.10734222622867012), (8, 0.5594171429502688), (9, 0.14140926732799525), (10, 0.5594171429502688), (11, 0.5594171429502688), (12, 0.4595675414777576), (13, 0.7177276257262251), (14, 0.221470981128068), (15, 1.0), (16, 0.332346823136316), (17, 0.5594171429502688), (18, 0.7177276257262251), (19, 0.6756444867170261), (20, 0.6049587164805506), (21, 0.7177276257262251), (22, 0.2504590332566898), (23, 0.6049587164805506), (24, 0.18186403787402555), (25, 0.2851406237118978), (26, 0.0), (27, 0.5594171429502688), (28, 0.496588137294155), (29, 0.496588137294155), (30, 0.29121258569700703), (31, 0.5790041294934107), (32, 0.5594171429502688), (33, 0.18186403787402555), (34, 0.920673399163741), (35, 0.14429151364526338), (36, 0.5594171429502688), (37, 0.5594171429502688), (38,

15                                        Man of Steel 
39                            The Amazing Spider-Man 2 
236       Star Wars: Episode III - Revenge of the Sith 
237       Star Wars: Episode II - Attack of the Clones 
240          Star Wars: Episode I - The Phantom Menace 
520              The League of Extraordinary Gentlemen 
1536        Star Wars: Episode VI - Return of the Jedi 
2051    Star Wars: Episode V - The Empire Strikes Back 
2687                               Highlander: Endgame 
3024                Star Wars: Episode IV - A New Hope 
3634         Beastmaster 2: Through the Portal of Time 
4690                                           Destiny 
34                               X-Men: The Last Stand 
47                          X-Men: Days of Future Past 
123                           X-Men Origins: Wolverine 
210                                            X-Men 2 
769                                      Reign of Fire 
1145                             Underworld: Evo