In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Data Pre-Processing

In [2]:
# Load the data
movie_data = pd.read_csv('movies.csv')
movie_data.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [4]:
movie_data.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

In [5]:
# Shape of the data
movie_data.shape

(4803, 24)

In [6]:
# Select relevant features for recommendation
selected_features = ['genres', 'keywords', 'title', 'tagline', 'cast', 'director']

In [7]:
# Replace null values with null string
for feature in selected_features:
    movie_data[feature] = movie_data[feature].fillna('')

In [9]:
# Combining all the 5 selected features
combined_features = movie_data['genres']+' '+movie_data['keywords']+' '+movie_data['title']+' '+movie_data['tagline']+' '+movie_data['cast']+' '+movie_data['director']
combined_features.sample(5)

1176    Drama Music black people soul country music lo...
3716    Crime Drama suicide sex sweden underground nig...
1982    Drama Music musical perfectionist rowboat cour...
3117    Comedy Romance independent film The Good Guy  ...
989     Action Adventure Comedy Family baby hoodlum lo...
dtype: object

## Feature Extraction

In [10]:
# Convert text data to feature vectors. You cannot find cosine similarity using text data. It needs to be numerical.
vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(combined_features)

In [11]:
print(feature_vectors)

  (0, 2720)	0.16779665077750838
  (0, 8651)	0.10950771233518675
  (0, 14400)	0.18869504935935055
  (0, 11288)	0.15600564366704217
  (0, 9709)	0.22061174669983705
  (0, 16170)	0.1471845509560594
  (0, 18477)	0.1927717674394528
  (0, 15542)	0.20008526614580363
  (0, 14718)	0.21153518149440187
  (0, 19148)	0.19502634639381394
  (0, 18845)	0.22968831190527225
  (0, 14749)	0.14568185359096344
  (0, 12700)	0.2552737122112953
  (0, 12356)	0.07571236182305312
  (0, 18835)	0.12179929157015999
  (0, 16904)	0.05365726945306952
  (0, 5519)	0.22061174669983705
  (0, 1183)	0.2771429775697421
  (0, 15785)	0.20449485056097091
  (0, 3587)	0.2424810120582838
  (0, 18385)	0.1206723617514615
  (0, 15901)	0.3255851082321633
  (0, 6519)	0.15884357175977007
  (0, 3408)	0.21574818782392277
  (0, 4103)	0.2078190465468255
  :	:
  (4801, 7731)	0.25682086501772416
  (4801, 12869)	0.1918304797254379
  (4801, 1880)	0.1383742924803071
  (4801, 12062)	0.1190062927991386
  (4801, 8323)	0.094776382167548
  (4801, 4234)

## Cosine Similarity

In [12]:
# Getting similarity score using cosine similarity
similarity = cosine_similarity(feature_vectors)
print(similarity)

[[1.         0.07294698 0.03533251 ... 0.         0.         0.        ]
 [0.07294698 1.         0.02792771 ... 0.04419983 0.         0.        ]
 [0.03533251 0.02792771 1.         ... 0.         0.04636139 0.        ]
 ...
 [0.         0.04419983 0.         ... 1.         0.         0.05551043]
 [0.         0.         0.04636139 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.05551043 0.         1.        ]]


In [13]:
similarity.shape

(4803, 4803)

In [14]:
# Get movie name from user
movie_name = input('Enter your favorite movie: ')

Enter your favorite movie: super man


In [15]:
# Create a list with all the movie names in the dataset

list_of_all_titles = movie_data['title'].tolist()
print(list_of_all_titles)

['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre', 'The Dark Knight Rises', 'John Carter', 'Spider-Man 3', 'Tangled', 'Avengers: Age of Ultron', 'Harry Potter and the Half-Blood Prince', 'Batman v Superman: Dawn of Justice', 'Superman Returns', 'Quantum of Solace', "Pirates of the Caribbean: Dead Man's Chest", 'The Lone Ranger', 'Man of Steel', 'The Chronicles of Narnia: Prince Caspian', 'The Avengers', 'Pirates of the Caribbean: On Stranger Tides', 'Men in Black 3', 'The Hobbit: The Battle of the Five Armies', 'The Amazing Spider-Man', 'Robin Hood', 'The Hobbit: The Desolation of Smaug', 'The Golden Compass', 'King Kong', 'Titanic', 'Captain America: Civil War', 'Battleship', 'Jurassic World', 'Skyfall', 'Spider-Man 2', 'Iron Man 3', 'Alice in Wonderland', 'X-Men: The Last Stand', 'Monsters University', 'Transformers: Revenge of the Fallen', 'Transformers: Age of Extinction', 'Oz: The Great and Powerful', 'The Amazing Spider-Man 2', 'TRON: Legacy', 'Cars 2', 'Green Lant

In [16]:
# Find close match for the movie name entered by the user
find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)
print(find_close_match)

['Superman', 'Superman II', 'Superman III']


In [17]:
close_match = find_close_match[0]
close_match

'Superman'

In [18]:
# Find the index of the movie with title
movie_index = movie_data[movie_data.title == close_match]['index'].values[0]
print(movie_index)

813


In [19]:
# Get a list of similarity with other movies
similarity_score = list(enumerate(similarity[movie_index]))
print(similarity_score)

[(0, 0.060441819634290496), (1, 0.05669981611859783), (2, 0.021278790879104493), (3, 0.16459567586204848), (4, 0.046325213302610774), (5, 0.04323643596368685), (6, 0.0073813468155804135), (7, 0.02999416491858132), (8, 0.02053678717828277), (9, 0.13955987895192726), (10, 0.23895404391561495), (11, 0.018152077852427916), (12, 0.0391597906516315), (13, 0.01602448881509599), (14, 0.30270722551728496), (15, 0.018904522952480605), (16, 0.03343648072738728), (17, 0.024315293048254995), (18, 0.026497370051818264), (19, 0.043357481499554304), (20, 0.043216465914501304), (21, 0.010586789756561077), (22, 0.03288473120153997), (23, 0.020715042988756833), (24, 0.030808306151389842), (25, 0.0), (26, 0.030431413430579046), (27, 0.03232542178290488), (28, 0.049187533220854195), (29, 0.013384776792300601), (30, 0.039517528039169844), (31, 0.05211938770842943), (32, 0.03333495071135013), (33, 0.03369854838450429), (34, 0.0), (35, 0.034224855159124205), (36, 0.02672736127812642), (37, 0.02046870161027083

In [20]:
len(similarity_score)

4803

In [21]:
# Sorting the movies based on their similarity score
sorted_similar_movies = sorted(similarity_score, key = lambda x: x[1], reverse = True)
print(sorted_similar_movies)

[(813, 1.0), (870, 0.48162813751699074), (2433, 0.4191936078241661), (1296, 0.31479309911227676), (14, 0.30270722551728496), (10, 0.23895404391561495), (823, 0.19758085411761356), (72, 0.17472382916672577), (2793, 0.16843946018851474), (428, 0.16760208907677923), (3, 0.16459567586204848), (119, 0.15933820410425395), (65, 0.15585013073673037), (3337, 0.147131467834266), (1420, 0.14562494375771348), (4267, 0.14033186582859658), (9, 0.13955987895192726), (1359, 0.13670426148197926), (210, 0.13604512258279622), (2492, 0.12046791369509309), (1282, 0.11935380709111135), (1024, 0.11859218958408402), (1238, 0.11707220055489738), (587, 0.11464049671616472), (164, 0.11360817270045326), (1510, 0.11355111784694479), (4432, 0.11307067108423653), (955, 0.11034741958488571), (613, 0.11033072353352182), (1183, 0.10932478411474335), (3552, 0.10435375441406358), (1890, 0.1039081716070553), (41, 0.10352990194936233), (1740, 0.10348775542181109), (1477, 0.10289077221455084), (634, 0.10263152511271866), (1

In [24]:
# Print name of the similar movies based on the index
print('Movies suggested for you:\n')

i = 1

for movie in sorted_similar_movies:
    index = movie[0]
    title_from_index = movie_data[movie_data.index == index]['title'].values[0]
    if i <= 30:
        print(i, '.', title_from_index)
        i += 1

Movies suggested for you:

1 . Superman
2 . Superman II
3 . Superman IV: The Quest for Peace
4 . Superman III
5 . Man of Steel
6 . Superman Returns
7 . Crimson Tide
8 . Suicide Squad
9 . The Killer Inside Me
10 . Batman Returns
11 . The Dark Knight Rises
12 . Batman Begins
13 . The Dark Knight
14 . The Godfather
15 . Nanny McPhee and the Big Bang
16 . Batman
17 . Batman v Superman: Dawn of Justice
18 . Batman
19 . Batman & Robin
20 . Steel
21 . The Hunting Party
22 . Dick Tracy
23 . The Island of Dr. Moreau
24 . The Abyss
25 . Lethal Weapon 4
26 . 1941
27 . On the Waterfront
28 . The Peacemaker
29 . The Score
30 . The Mexican


## Movie Recommendation System

In [26]:
# Get movie name from user
movie_name = input('Enter your favorite movie: ')

list_of_all_titles = movie_data['title'].tolist()

# Find close match for the movie name entered by the user
find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

close_match = find_close_match[0]

movie_index = movie_data[movie_data.title == close_match]['index'].values[0]

# Get a list of similarity with other movies
similarity_score = list(enumerate(similarity[movie_index]))

# Sorting the movies based on their similarity score
sorted_similar_movies = sorted(similarity_score, key = lambda x: x[1], reverse = True)

# Print name of the similar movies based on the index
print('Movies suggested for you:\n')

i = 1

for movie in sorted_similar_movies:
    index = movie[0]
    title_from_index = movie_data[movie_data.index == index]['title'].values[0]
    if i <= 30:
        print(i, '.', title_from_index)
        i += 1

Enter your favorite movie: expandables
Movies suggested for you:

1 . The Expendables
2 . The Expendables 2
3 . The Expendables 3
4 . Rocky Balboa
5 . Nighthawks
6 . F.I.S.T.
7 . Grudge Match
8 . Bullet to the Head
9 . The Specialist
10 . Cliffhanger
11 . Cop Land
12 . Driven
13 . Get Carter
14 . Daylight
15 . Escape Plan
16 . Assassins
17 . D-Tox
18 . Men of War
19 . First Blood
20 . Skin Trade
21 . Zookeeper
22 . Death Race 2000
23 . Creed
24 . Rambo: First Blood Part II
25 . Rocky
26 . Rambo III
27 . Spy Kids 3-D: Game Over
28 . An Alan Smithee Film: Burn, Hollywood, Burn
29 . The Mechanic
30 . Tango & Cash
