Import the libraries

In [0]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Load the data

In [3]:
from google.colab import files
uploaded = files.upload()

Saving movie_dataset.csv to movie_dataset.csv


Store the data

In [0]:
df = pd.read_csv('movie_dataset.csv')

Print the first 3 rows of data

In [6]:
df.head(3)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes


Get a count of the # of rows and columns

In [7]:
df.shape # 4803 movies and 24 data points about each movie

(4803, 24)

Create a list of important columns to keep

In [8]:
features = ['keywords', 'cast', 'genres', 'director']
df[features].head(3)

Unnamed: 0,keywords,cast,genres,director
0,culture clash future space war space colony so...,Sam Worthington Zoe Saldana Sigourney Weaver S...,Action Adventure Fantasy Science Fiction,James Cameron
1,ocean drug abuse exotic island east india trad...,Johnny Depp Orlando Bloom Keira Knightley Stel...,Adventure Fantasy Action,Gore Verbinski
2,spy based on novel secret agent sequel mi6,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,Action Adventure Crime,Sam Mendes


Clean and process the data

In [0]:
for feature in features:
  df[feature] = df[feature].fillna('') # Filling any missing values with an empty string

Create a function to combine the values of the important columns into a single string

In [0]:
def combine_features(row):
  return row['keywords'] + '' + row['cast'] + '' + row['genres'] + '' + row['director']

Apply the function to each row in the data set to store the combine strings into a new column called 'combined_features'

In [0]:
df['combine_features'] = df.apply(combine_features, axis=1)

Print the dataframe

In [13]:
df.head(3)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director,combine_features
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron,culture clash future space war space colony so...
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski,ocean drug abuse exotic island east india trad...
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes,spy based on novel secret agent sequel mi6Dani...


Convert a collection of text to a matrix of token counts

In [0]:
count_matrix = CountVectorizer().fit_transform(df['combine_features'])

Get the cosine similarity matrix from the count matrix

In [15]:
cosine_sim = cosine_similarity(count_matrix)
print(cosine_sim) # Shows similarity scores for each moive

[[1.         0.03928371 0.04564355 ... 0.         0.         0.        ]
 [0.03928371 1.         0.         ... 0.0410305  0.         0.        ]
 [0.04564355 0.         1.         ... 0.         0.         0.        ]
 ...
 [0.         0.0410305  0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


Get the # of rows and columns in cosine_sim

In [16]:
cosine_sim.shape

(4803, 4803)

Create a helper function to get the title from the index

In [0]:
def get_title_from_index(index):
  return df[df.index == index]['title'].values[0]

# Helper function to get the index from the title
def get_index_from_title(title):
  return df[df.title == title]['index'].values[0] 

Get the title of the movie that the user likes

In [19]:
movie_user_likes = 'The Amazing Spider-Man'

#Find that movies index
movie_index = get_index_from_title(movie_user_likes)
movie_index

20

-Enumerate through all the similarity scores of 'The Amazing Spider-Man'

-Make a tuple of movie index and similarity scores

-NOTE: We will return a list of tuples in the form (movie index, similarity score)

In [22]:
similar_movies = list(enumerate(cosine_sim[movie_index]))
similar_movies

[(0, 0.044543540318737404),
 (1, 0.04199605255658081),
 (2, 0.04879500364742666),
 (3, 0.0),
 (4, 0.04652421051992354),
 (5, 0.04652421051992354),
 (6, 0.0),
 (7, 0.16798421022632323),
 (8, 0.10012523486435176),
 (9, 0.14285714285714288),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 0.04652421051992354),
 (14, 0.08728715609439697),
 (15, 0.0),
 (16, 0.12371791482634838),
 (17, 0.0),
 (18, 0.04199605255658081),
 (19, 0.04879500364742666),
 (20, 1.0000000000000004),
 (21, 0.0),
 (22, 0.0),
 (23, 0.0),
 (24, 0.0),
 (25, 0.0),
 (26, 0.09100315103865803),
 (27, 0.04761904761904762),
 (28, 0.04761904761904762),
 (29, 0.05006261743217588),
 (30, 0.18200630207731605),
 (31, 0.09304842103984708),
 (32, 0.0),
 (33, 0.13093073414159545),
 (34, 0.0),
 (35, 0.0),
 (36, 0.0),
 (37, 0.05292561240249632),
 (38, 0.5005173307126191),
 (39, 0.0),
 (40, 0.04652421051992354),
 (41, 0.04550157551932901),
 (42, 0.0),
 (43, 0.0),
 (44, 0.0),
 (45, 0.0),
 (46, 0.17118419700436519),
 (47, 0.051434449987363975),
 (

Sort the list of similar movies accorsing to the similarity scores in descending order

In [0]:
sorted_similar_movies = sorted(similar_movies, key=lambda x:x[1], reverse=True)[1:]

Print sorted_similar_movies

In [24]:
sorted_similar_movies

[(38, 0.5005173307126191),
 (448, 0.26622333025588873),
 (1365, 0.22750787759664506),
 (2989, 0.22750787759664506),
 (3390, 0.22454435656953592),
 (2476, 0.222717701593687),
 (2886, 0.19518001458970663),
 (2592, 0.1928791874526149),
 (30, 0.18200630207731605),
 (234, 0.18200630207731605),
 (631, 0.18200630207731605),
 (2136, 0.18200630207731605),
 (512, 0.17817416127494962),
 (3688, 0.17817416127494962),
 (126, 0.17457431218879393),
 (788, 0.17457431218879393),
 (1719, 0.17457431218879393),
 (46, 0.17118419700436519),
 (1028, 0.17118419700436519),
 (2521, 0.17118419700436519),
 (2790, 0.17118419700436519),
 (3881, 0.17118419700436519),
 (7, 0.16798421022632323),
 (3728, 0.16798421022632323),
 (2561, 0.1649572197684645),
 (3029, 0.1649572197684645),
 (602, 0.15430334996209194),
 (1760, 0.15430334996209194),
 (1812, 0.15430334996209194),
 (2491, 0.15430334996209194),
 (3704, 0.15430334996209194),
 (85, 0.15018785229652765),
 (825, 0.15018785229652765),
 (1320, 0.15018785229652765),
 (237

Create a loop to print the first 5 entires from the sorted_similar_movies list

In [26]:
i=0
print(f'The Top 5 Similar Movies To {movie_user_likes} Are:')
for element in sorted_similar_movies:
  print(get_title_from_index(element[0]))
  i += 1
  if i >= 5:
    break

The Top 5 Similar Movies To The Amazing Spider-Man Are:
The Amazing Spider-Man 2
Cold Mountain
True Grit
Happy Gilmore
Hesher
