In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the movie metadata, keeping only the two columns the genre-based
# recommender needs: the title and the pipe-delimited genre string.
df = pd.read_csv(
    '../Resources/movie_metadata.csv',
    usecols=['movie_title', 'genres'],
)

In [2]:
# Split the pipe-delimited genre field into a list, replace missing entries
# with an empty string, and store the string form of the result so the text
# vectorizer can tokenize it.
df['genres'] = (
    df['genres']
    .str.split('|')
    .fillna("")
    .astype('str')
)
df.head()

Unnamed: 0,genres,movie_title
0,"['Action', 'Adventure', 'Fantasy', 'Sci-Fi']",Avatar
1,"['Action', 'Adventure', 'Fantasy']",Pirates of the Caribbean: At World's End
2,"['Action', 'Adventure', 'Thriller']",Spectre
3,"['Action', 'Thriller']",The Dark Knight Rises
4,['Documentary'],Star Wars: Episode VII - The Force Awakens ...


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Build TF-IDF features over single genre words and adjacent word pairs.
# min_df=1 keeps every term that occurs in at least one document, which is the
# same vocabulary the previous min_df=0 produced; an integer 0 is rejected by
# the parameter validation in current scikit-learn releases.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df['genres'])
tfidf_matrix.shape

(5043, 206)

In [4]:
from sklearn.metrics.pairwise import linear_kernel
# Pairwise dot products of the TF-IDF rows. The vectorizer is left at its
# default L2 row normalisation, so these dot products are cosine similarities.
# With a single argument, linear_kernel compares the matrix against itself.
cosine_sim = linear_kernel(tfidf_matrix)
cosine_sim[:4, :4]

array([[1.        , 0.71772763, 0.29121259, 0.0816324 ],
       [0.71772763, 1.        , 0.40574248, 0.11373729],
       [0.29121259, 0.40574248, 1.        , 0.20751026],
       [0.0816324 , 0.11373729, 0.20751026, 1.        ]])

In [5]:
# `titles` is the title column used to report results; `indices` maps each
# title back to its DataFrame index label.
titles = df['movie_title']
indices = pd.Series(df.index, index=titles)

# Function that get movie recommendations based on the cosine similarity score of movie genres
def genre_recommendations(title, top_n=20):
    """Return the titles of the movies whose genres are most similar to `title`.

    Reads the module-level `indices` (title -> row lookup), `cosine_sim`
    (pairwise similarity matrix) and `titles` (title column).

    Parameters
    ----------
    title : str
        Movie title to look up. Stored titles end with a non-breaking space
        ('\\xa0'); surrounding whitespace is ignored on both sides of the
        comparison, so the title may be passed with or without it.
    top_n : int, default 20
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to `top_n` entries of `titles`, ordered from most to least similar.
        The queried movie itself is never included.

    Raises
    ------
    KeyError
        If no stored title matches `title`.
    """
    # Compare on whitespace-stripped titles (str.strip also removes '\xa0').
    wanted = title.strip()
    hits = indices[indices.index.str.strip() == wanted]
    if hits.empty:
        raise KeyError('No movie titled {!r}'.format(title))
    # Several rows can share a title; use the first so `idx` is a single row
    # number rather than a Series.
    idx = int(hits.iloc[0])

    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Drop the queried movie explicitly: on a tie with a movie that has
    # identical genres it is not guaranteed to be the first ranked entry.
    movie_indices = [row for row, _ in ranked if row != idx][:top_n]
    return titles.iloc[movie_indices]

In [6]:
# Show the top 20 genre-based matches for "Avatar".
genre_recommendations('Avatar').iloc[:20]

15                                        Man of Steel 
39                            The Amazing Spider-Man 2 
236       Star Wars: Episode III - Revenge of the Sith 
237       Star Wars: Episode II - Attack of the Clones 
240          Star Wars: Episode I - The Phantom Menace 
520              The League of Extraordinary Gentlemen 
1536        Star Wars: Episode VI - Return of the Jedi 
2051    Star Wars: Episode V - The Empire Strikes Back 
2687                               Highlander: Endgame 
3024                Star Wars: Episode IV - A New Hope 
3634         Beastmaster 2: Through the Portal of Time 
4690                                           Destiny 
34                               X-Men: The Last Stand 
47                          X-Men: Days of Future Past 
123                           X-Men Origins: Wolverine 
210                                            X-Men 2 
769                                      Reign of Fire 
1145                             Underworld: Evo

In [1]:
import sqlite3
import pandas as pd

# Open the SQLite database, load the whole `movies` table into a DataFrame,
# and always release the connection afterwards -- even if the query fails.
cnx = sqlite3.connect('../db/movie_data.sqlite')
try:
    df2 = pd.read_sql_query("SELECT * FROM movies", cnx)
finally:
    cnx.close()
df2.head()

Unnamed: 0,movieID,name,total_votes,rating,duration,gross_earnings,genre
0,1,Pirates of the Caribbean: At World's End,471220,7.1,169,309404152.0,Action|Adventure|Fantasy
1,2,Spectre,275868,6.8,148,200074175.0,Action|Adventure|Thriller
2,3,The Dark Knight Rises,1144337,8.5,164,448130642.0,Action|Thriller
3,4,John Carter,212204,6.6,132,73058679.0,Action|Adventure|Sci-Fi
4,5,Spider-Man 3,383056,6.2,156,336530303.0,Action|Adventure|Romance


In [2]:
# Split the pipe-delimited genre field into a list, replace missing entries
# with an empty string, and store the string form of the result so the text
# vectorizer can tokenize it.
df2['genre'] = (
    df2['genre']
    .str.split('|')
    .fillna("")
    .astype('str')
)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
# Build TF-IDF features over single genre words and adjacent word pairs.
# min_df=1 keeps every term that occurs in at least one document, which is the
# same vocabulary the previous min_df=0 produced; an integer 0 is rejected by
# the parameter validation in current scikit-learn releases.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df2['genre'])
# Pairwise dot products of the TF-IDF rows, used as the similarity scores.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# `titles` is the title column used to report results; `indices` maps each
# title back to its DataFrame index label.
titles = df2['name']
indices = pd.Series(df2.index, index=df2['name'])

# Function that get movie recommendations based on the cosine similarity score of movie genres
def genre_recommendations(title, top_n=20):
    """Return the titles of the movies whose genres are most similar to `title`.

    Reads the module-level `indices` (title -> row lookup), `cosine_sim`
    (pairwise similarity matrix) and `titles` (title column).

    Parameters
    ----------
    title : str
        Movie title to look up. Stored titles end with a non-breaking space
        ('\\xa0'); surrounding whitespace is ignored on both sides of the
        comparison, so the title may be passed with or without it.
    top_n : int, default 20
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to `top_n` entries of `titles`, ordered from most to least similar.
        The queried movie itself is never included.

    Raises
    ------
    KeyError
        If no stored title matches `title`.
    """
    # Compare on whitespace-stripped titles (str.strip also removes '\xa0').
    wanted = title.strip()
    hits = indices[indices.index.str.strip() == wanted]
    if hits.empty:
        raise KeyError('No movie titled {!r}'.format(title))
    # Several rows can share a title; use the first so `idx` is a single row
    # number rather than a Series.
    idx = int(hits.iloc[0])

    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Drop the queried movie explicitly: on a tie with a movie that has
    # identical genres it is not guaranteed to be the first ranked entry.
    movie_indices = [row for row, _ in ranked if row != idx][:top_n]
    return titles.iloc[movie_indices]

In [6]:
# Collect the top 20 genre-based matches for "Spectre" as a plain list.
x = genre_recommendations('Spectre').iloc[:20].tolist()
x

['Skyfall\xa0',
 'Mission: Impossible - Rogue Nation\xa0',
 'Mission: Impossible III\xa0',
 'Die Another Day\xa0',
 'Mission: Impossible - Ghost Protocol\xa0',
 'The World Is Not Enough\xa0',
 'The Bourne Legacy\xa0',
 'Mission: Impossible II\xa0',
 "Dante's Peak\xa0",
 'Tomorrow Never Dies\xa0',
 'Live Free or Die Hard\xa0',
 'Casino Royale\xa0',
 'The Expendables 2\xa0',
 'The Expendables 3\xa0',
 'Die Hard with a Vengeance\xa0',
 'The Expendables\xa0',
 'Mission: Impossible\xa0',
 'The Rock\xa0',
 'xXx\xa0',
 'Space Cowboys\xa0']

In [5]:
# Stored titles end with a non-breaking space; show the first one without it.
df2['name'].iat[0].replace('\xa0', '')

"Pirates of the Caribbean: At World's End"