In [36]:
# Dependencies
import numpy as np
import pandas as pd

In [37]:
# Read the upcoming-2019-movies CSV and preview its first five rows.
new_movies = pd.read_csv(filepath_or_buffer='resources/upcoming_movies_2019.csv')
new_movies.head(n=5)

Unnamed: 0.1,Unnamed: 0,genre,image,name,release_date
0,49,"Action, Comedy",https://m.media-amazon.com/images/M/MV5BOGE1Zj...,Stuber,Release date - July 12th 2019
1,50,"Animation, Adventure, Drama",https://m.media-amazon.com/images/M/MV5BMjIwMj...,The Lion King,Release date - July 19th
2,51,"Comedy, Drama",https://m.media-amazon.com/images/M/MV5BOTg4ZT...,Once Upon a Time ... in Hollywood,Release date - July 26th
3,52,"Action, Adventure, Comedy",https://m.media-amazon.com/images/M/MV5BMTc4Nz...,Fast & Furious Presents: Hobbs & Shaw,Release date - August 2nd
4,53,Horror,https://m.media-amazon.com/images/M/MV5BMjA0MD...,Scary Stories to Tell in the Dark,Release date - August 9th


Clean up the new dataset

In [38]:
# Keep only the name and genre columns, renamed to the 'title' / 'genres'
# labels that the following cells refer to.
new_movies = (
    new_movies[['name', 'genre']]
    .rename(columns={'name': 'title', 'genre': 'genres'})
)
new_movies.head()

Unnamed: 0,title,genres
0,Stuber,"Action, Comedy"
1,The Lion King,"Animation, Adventure, Drama"
2,Once Upon a Time ... in Hollywood,"Comedy, Drama"
3,Fast & Furious Presents: Hobbs & Shaw,"Action, Adventure, Comedy"
4,Scary Stories to Tell in the Dark,Horror


In [39]:
# Split each comma-separated genre string into a list, replace missing
# entries with "", then cast every value back to text (a list is stored
# as its str() form).
new_movies['genres'] = (
    new_movies['genres']
    .str.split(',')
    .fillna("")
    .astype('str')
)

Transform the genre text into TF-IDF vectors

In [40]:
# TfidfVectorizer (scikit-learn) turns each movie's genre text into a
# TF-IDF feature vector that can be used as input to an estimator.
from sklearn.feature_extraction.text import TfidfVectorizer
# min_df=1 keeps every term that occurs in at least one document. The
# integer 0 used before selects the same vocabulary, but recent
# scikit-learn releases reject it during parameter validation
# (an integer min_df must be >= 1).
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
# Learn the vocabulary from the new dataset and vectorize its genres.
tfidf_matrix = tf.fit_transform(new_movies['genres'])

In [41]:
# TfidfVectorizer L2-normalizes its rows by default, so the plain dot
# product between two rows is already their cosine similarity.
# With the second argument omitted, linear_kernel compares the matrix
# against itself.
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix)

In [42]:
# Title stored at row label 0.
new_movies.loc[0, 'title']

'Stuber'

In [43]:
# `titles` is the title column; `indices` maps each title to its row label
# so a movie can be looked up by name.
titles = new_movies['title']
indices = pd.Series(data=new_movies.index, index=titles)
indices

title
Stuber                                             0
The Lion King                                      1
Once Upon a Time ... in Hollywood                  2
Fast & Furious Presents: Hobbs & Shaw              3
Scary Stories to Tell in the Dark                  4
Dora and the Lost City of Gold                     5
The Kitchen                                        6
The Art of Racing in the Rain                      7
The Nightingale                                    8
The Angry Birds Movie 2                            9
Where'd You Go, Bernadette                        10
Ready or Not                                      11
Playmobil: The Movie                              12
It Chapter Two                                    13
Ad Astra                                          14
Rambo: Last Blood                                 15
Downton Abbey                                     16
Abominable                                        17
The Hunt                                

In [45]:
# Function that gets movie recommendations based on the cosine similarity score of movie genres
def genre_recommendations(title, top_n=20):
    """Return the titles whose genres are most similar to those of ``title``.

    Reads the notebook-level ``indices`` (title -> row), ``cosine_sim``
    (pairwise similarity matrix) and ``titles`` (Series of titles).

    Parameters
    ----------
    title : str
        Movie title to look up in ``indices``.
    top_n : int, default 20
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``titles``, most similar first. Equally
        similar movies keep their row order. The queried movie itself is
        never part of the result.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    if not np.isscalar(idx):
        # A title that occurs more than once yields several rows; use the first.
        idx = idx.iloc[0]
    # Drop the queried movie by position instead of assuming it sorts first:
    # an earlier row with identical genres ties with it at the top score, and
    # a movie without any genre terms scores 0.0 against itself.
    candidates = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[idx])
        if pos != idx
    ]
    # list.sort is stable, so ties stay in row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [pos for pos, _ in candidates[:top_n]]
    return titles.iloc[movie_indices]

In [47]:
# Show the two closest matches for 'Stuber'.
genre_recommendations('Stuber').iloc[:2]

46         Superintelligence
23    Zombieland: Double Tap
Name: title, dtype: object