In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the movie metadata, keeping only the two columns used by the
# genre-based recommender: the title and the genre string.
df = pd.read_csv(
    '../Resources/movie_metadata.csv',
    usecols=['movie_title', 'genres'],
)

In [2]:
# Split the '|'-delimited genre string into a list, replace missing values
# with an empty string, then store the string form of each entry so the
# column holds plain text.
df['genres'] = (
    df['genres']
    .str.split('|')
    .fillna("")
    .astype('str')
)
df.head()

Unnamed: 0,genres,movie_title
0,"['Action', 'Adventure', 'Fantasy', 'Sci-Fi']",Avatar
1,"['Action', 'Adventure', 'Fantasy']",Pirates of the Caribbean: At World's End
2,"['Action', 'Adventure', 'Thriller']",Spectre
3,"['Action', 'Thriller']",The Dark Knight Rises
4,['Documentary'],Star Wars: Episode VII - The Force Awakens ...


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorise the genre text into TF-IDF features over unigrams and bigrams.
# min_df=1 keeps every term, exactly as the previous min_df=0 did, but is
# also accepted by recent scikit-learn releases, which reject an integer
# min_df below 1 with InvalidParameterError.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df['genres'])
tfidf_matrix.shape

(5043, 206)

In [4]:
from sklearn.metrics.pairwise import linear_kernel
# TfidfVectorizer L2-normalises each row by default, so the plain dot
# product computed by linear_kernel is the cosine similarity between movies.
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)
# Peek at the top-left corner of the similarity matrix.
cosine_sim[:4, :4]

array([[1.        , 0.71772763, 0.29121259, 0.0816324 ],
       [0.71772763, 1.        , 0.40574248, 0.11373729],
       [0.29121259, 0.40574248, 1.        , 0.20751026],
       [0.0816324 , 0.11373729, 0.20751026, 1.        ]])

In [5]:
# Title column, plus a reverse lookup that maps each movie title to its
# row label in `df`.
titles = df['movie_title']
indices = pd.Series(df.index, index=titles)

# Genre-based recommender driven by the cosine-similarity scores of movie genres
def genre_recommendations(title, top_n=20):
    """Return the titles of the movies whose genres are most similar to `title`.

    Reads the notebook-level objects `indices` (title -> row), `cosine_sim`
    (pairwise genre similarity) and `titles`.

    Parameters
    ----------
    title : str
        Movie title. Titles loaded from the CSV appear to end with a
        non-breaking space (U+00A0), so the title is looked up both as given
        and with that character appended.
    top_n : int, default 20
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to `top_n` titles, most similar first; equal scores keep row order.

    Raises
    ------
    KeyError
        If no row carries the requested title.
    """
    accepted = [title, title + '\xa0']
    matches = indices[indices.index.isin(accepted)]
    if matches.empty:
        raise KeyError(title)

    # A title can sit on several rows, in which case a plain `indices[title]`
    # returns a Series and breaks the ranking. Score against the first row
    # and keep every row carrying that title out of the results.
    idx = int(matches.iloc[0])
    own_rows = set(matches.tolist())

    scores = cosine_sim[idx]
    # Drop the query rows explicitly instead of assuming the query sorts
    # first: other movies with identical genres tie at the top score.
    # sorted() is stable, so ties stay in row order.
    ranked = sorted(
        (row for row in range(len(scores)) if row not in own_rows),
        key=lambda row: scores[row],
        reverse=True,
    )
    return titles.iloc[ranked[:top_n]]

In [6]:
# Show up to 20 titles whose genre profile is closest to Avatar's
genre_recommendations('Avatar').head(20)

15                                        Man of Steel 
39                            The Amazing Spider-Man 2 
236       Star Wars: Episode III - Revenge of the Sith 
237       Star Wars: Episode II - Attack of the Clones 
240          Star Wars: Episode I - The Phantom Menace 
520              The League of Extraordinary Gentlemen 
1536        Star Wars: Episode VI - Return of the Jedi 
2051    Star Wars: Episode V - The Empire Strikes Back 
2687                               Highlander: Endgame 
3024                Star Wars: Episode IV - A New Hope 
3634         Beastmaster 2: Through the Portal of Time 
4690                                           Destiny 
34                               X-Men: The Last Stand 
47                          X-Men: Days of Future Past 
123                           X-Men Origins: Wolverine 
210                                            X-Men 2 
769                                      Reign of Fire 
1145                             Underworld: Evo

In [10]:
import sqlite3
import pandas as pd

# Load the movie and image tables, releasing the SQLite connection even if
# one of the queries fails (the connection was previously left open).
cnx = sqlite3.connect('../db/allmoviedata.sqlite')
try:
    df_movie = pd.read_sql_query("SELECT * FROM new_data", cnx)
    df_img = pd.read_sql_query("SELECT * FROM new_images", cnx)
finally:
    cnx.close()

# Join the two tables on the movie name.
# NOTE(review): joining on 'name' repeats rows when a name occurs more than
# once in either table. De-duplicating here would also need
# reset_index(drop=True), because later cells use df2.index as row positions
# into the similarity matrix.
df2 = pd.merge(df_movie, df_img, on='name')
df2.head()

Unnamed: 0,movieID_x,name,total_votes,rating,duration,gross_earnings,genre,movieID_y,image
0,1,Avatar,886204,7.9,178,760505847,Action|Adventure|Fantasy|Sci-Fi,1,https://m.media-amazon.com/images/M/MV5BMTYwOT...
1,2,Pirates of the Caribbean: At World's End,471220,7.1,169,309404152,Action|Adventure|Fantasy,2,https://m.media-amazon.com/images/M/MV5BMjIyNj...
2,3,Spectre,275868,6.8,148,200074175,Action|Adventure|Thriller,3,https://m.media-amazon.com/images/M/MV5BOWQ1MD...
3,4,The Dark Knight Rises,1144337,8.5,164,448130642,Action|Thriller,4,https://m.media-amazon.com/images/M/MV5BMTk4OD...
4,5,John Carter,212204,6.6,132,73058679,Action|Adventure|Sci-Fi,6,https://m.media-amazon.com/images/M/MV5BMDEwZm...


In [11]:
# Split the '|'-delimited genre string into a list, replace missing values
# with an empty string, then store the string form of each entry so the
# column holds plain text.
df2['genre'] = (
    df2['genre']
    .str.split('|')
    .fillna("")
    .astype('str')
)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
# Vectorise the genre text into TF-IDF features over unigrams and bigrams.
# min_df=1 keeps every term, exactly as the previous min_df=0 did, but is
# also accepted by recent scikit-learn releases, which reject an integer
# min_df below 1 with InvalidParameterError.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df2['genre'])
# Rows are L2-normalised by default, so the dot product is the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Name column, plus a reverse lookup that maps each movie name to its
# row label in `df2`.
titles = df2['name']
indices = pd.Series(df2.index, index=titles)

# Genre-based recommender driven by the cosine-similarity scores of movie genres
def genre_recommendations(title, top_n=20):
    """Return the names of the movies whose genres are most similar to `title`.

    Reads the notebook-level objects `indices` (name -> row), `cosine_sim`
    (pairwise genre similarity) and `titles`.

    Parameters
    ----------
    title : str
        Movie name, matched exactly against the lookup index.
    top_n : int, default 20
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to `top_n` names, most similar first; equal scores keep row order.

    Raises
    ------
    KeyError
        If no row carries the requested name.
    """
    matches = indices[indices.index == title]
    if matches.empty:
        raise KeyError(title)

    # A name can sit on several rows (the lookup lists 'Spider-Man 3' four
    # times), in which case a plain `indices[title]` returns a Series and
    # breaks the ranking. Score against the first row and keep every row
    # carrying that name out of the results.
    idx = int(matches.iloc[0])
    own_rows = set(matches.tolist())

    scores = cosine_sim[idx]
    # Drop the query rows explicitly instead of assuming the query sorts
    # first: other movies with identical genres tie at the top score.
    # sorted() is stable, so ties stay in row order.
    ranked = sorted(
        (row for row in range(len(scores)) if row not in own_rows),
        key=lambda row: scores[row],
        reverse=True,
    )
    return titles.iloc[ranked[:top_n]]

# Inspect the name -> row lookup built above
indices

name
Avatar                                            0
Pirates of the Caribbean: At World's End          1
Spectre                                           2
The Dark Knight Rises                             3
John Carter                                       4
Spider-Man 3                                      5
Spider-Man 3                                      6
Spider-Man 3                                      7
Spider-Man 3                                      8
Tangled                                           9
Avengers: Age of Ultron                          10
Harry Potter and the Half-Blood Prince           11
Batman v Superman: Dawn of Justice               12
Superman Returns                                 13
Quantum of Solace                                14
Pirates of the Caribbean: Dead Man's Chest       15
The Lone Ranger                                  16
Man of Steel                                     17
The Chronicles of Narnia: Prince Caspian         18
The Ave

In [4]:
# Collect the first 18 recommendations for 'The Polar Express' as a plain list.
# NOTE(review): this cell's execution count (4) is lower than that of the
# cells above it (10-12), so the stored output may come from an earlier
# kernel state. Confirm it with Restart Kernel -> Run All.
x = genre_recommendations('The Polar Express').head(18).tolist()
x

['Avengers: Age of Ultron',
 'Batman v Superman: Dawn of Justice',
 'Superman Returns',
 'The Avengers',
 'Captain America: Civil War',
 'Iron Man 3',
 'Transformers: Revenge of the Fallen',
 'Transformers: Age of Extinction',
 'TRON: Legacy',
 'Green Lantern',
 'Terminator Salvation',
 'Star Trek Into Darkness',
 'Pacific Rim',
 'Transformers: Dark of the Moon',
 '2012',
 'Jupiter Ascending',
 'X-Men: Apocalypse',
 'Iron Man']