In [1]:
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 

In [2]:
# Load the movies dataset from its CSV export into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the notebook so the cell runs on other machines.
csv_path = r'C:\Users\Vishal\Desktop\csv files\movies.csv'
data = pd.read_csv(csv_path)

In [3]:
# Dimensions of the loaded frame as (rows, columns)
data.shape

(4803, 24)

In [4]:
# Preview the first rows to see which columns are available
data.head()


Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [5]:
# Text columns used to describe each movie for the similarity model.
# NOTE(review): production_companies appears to hold raw list text (the
# combined strings show '[]' for some rows), so its brackets/keys would be
# tokenized along with real words - confirm whether it should be parsed first.

sel_features = ['genres', 'keywords', 'title', 'production_companies', 'cast', 'director']

In [6]:
# Count missing values in each selected feature column
data[sel_features].isna().sum()

genres                   28
keywords                412
title                     0
production_companies      0
cast                     43
director                 30
dtype: int64

In [7]:
# Replace missing values in every selected feature column with an empty
# string so the columns can be concatenated as text.
data[sel_features] = data[sel_features].fillna('')

In [8]:
# Re-check after filling: every selected column should now report zero missing values
data[sel_features].isna().sum()

genres                  0
keywords                0
title                   0
production_companies    0
cast                    0
director                0
dtype: int64

In [9]:
# Concatenate all selected features into one space-separated text per movie,
# in the order the columns are listed in sel_features.
com_features = data[sel_features[0]]
for feature_name in sel_features[1:]:
    com_features = com_features + ' ' + data[feature_name]

In [24]:
# Inspect the combined text built for each movie
com_features

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  Newlyweds [] Edward Burns Kerr...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      Shanghai Calling [] Daniel Henney Eliza Coup...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object

In [10]:
# Converting the combined feature text into TF-IDF vectors.
# fit_transform learns the vocabulary from com_features and encodes it in one call.

vector = TfidfVectorizer()

vec_features = vector.fit_transform(com_features)

In [13]:
# Finding the similarity between the movie vectors using cosine similarity.
# Called with a single argument, so the rows of vec_features are compared
# against each other.

similarity = cosine_similarity(vec_features)

In [14]:
# Expect a square matrix: one row and one column per movie
similarity.shape

(4803, 4803)

In [15]:
# Getting the name of the user's favourite movie (interactive prompt)

movie_name = input('Type your favourite movie : ')

Type your favourite movie : batman


In [18]:
# Build a plain Python list of all movie titles, in dataset row order
title_column = data['title']
movie_list = list(title_column)

In [19]:
# Find the dataset title closest to what the user typed.
# get_close_matches returns an empty list when no title clears its similarity
# cutoff, so check before indexing rather than failing with a bare IndexError.
matches = difflib.get_close_matches(movie_name, movie_list)
if not matches:
    raise ValueError(
        'No movie title similar to {!r} was found in the dataset'.format(movie_name)
    )

# Matches are ordered most-similar first; keep the best one.
close_match = matches[0]
print(close_match)


Batman


In [20]:
# Find the row position of the matched movie in the dataset.
# The similarity matrix is built from the rows of `data` in order, so the
# position is derived from the title column itself instead of trusting the
# CSV's separate 'index' data column to mirror row positions.
# list.index returns the first matching position and raises ValueError if the
# title is absent.
index_user = data['title'].tolist().index(close_match)
print(index_user)

1359


In [23]:
# Pair every movie's row position with its similarity to the user's movie,
# giving a list of (position, score) tuples.
user_row = similarity[index_user]
similarity_score = [(position, score) for position, score in enumerate(user_row)]


In [24]:
# Order the (position, score) pairs from most to least similar.
# Work on a copy so similarity_score keeps its original order.
sort_movies = list(similarity_score)
sort_movies.sort(key=lambda pair: pair[1], reverse=True)


In [25]:
# Print the 20 movies recommended for the user's favourite movie.

print('Movies suggestion for you : \n')

# Slice first so that only the rows actually printed are looked up; scanning
# the whole frame once per movie for every row is unnecessary work.
# sort_movies holds (row position, score) pairs, so the title is fetched
# positionally with .iloc to match how the similarity matrix is indexed.
for k, (ind, _score) in enumerate(sort_movies[:20], start=1):
    title_from_index = data['title'].iloc[ind]
    print(k, '.', title_from_index)

Movies suggestion for you : 

1 . Batman
2 . Batman Returns
3 . Batman & Robin
4 . Batman Begins
5 . The Dark Knight
6 . Batman Forever
7 . The Dark Knight Rises
8 . Batman: The Dark Knight Returns, Part 2
9 . Batman v Superman: Dawn of Justice
10 . The Color Purple
11 . The Hudsucker Proxy
12 . Suicide Squad
13 . A History of Violence
14 . The Clan of the Cave Bear
15 . Watchmen
16 . Superman Returns
17 . Barney's Great Adventure
18 . Beetlejuice
19 . Corpse Bride
20 . Man of Steel


# Movie Recommendation System

In [26]:
# End-to-end recommendation: ask for a favourite movie, resolve it to the
# closest dataset title, and print the 20 most similar movies.
# Relies on `data` and `similarity` computed by the earlier cells.

movie_name = input('Type your favourite movie : ')

# Titles in dataset row order; positions in this list line up with the rows
# (and columns) of the similarity matrix.
movie_list = data['title'].tolist()

# get_close_matches returns an empty list when no title clears its similarity
# cutoff, so check before indexing rather than failing with a bare IndexError.
matches = difflib.get_close_matches(movie_name, movie_list)
if not matches:
    raise ValueError(
        'No movie title similar to {!r} was found in the dataset'.format(movie_name)
    )
close_match = matches[0]

# Row position of the matched title, taken from the title list itself rather
# than from the CSV's separate 'index' data column.
index_user = movie_list.index(close_match)

# (row position, similarity score) pairs, most similar first.
similarity_score = list(enumerate(similarity[index_user]))

sort_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)

print('Movies suggestion for you : \n')

# Only the top 20 pairs are visited; titles are looked up by position.
for k, (ind, _score) in enumerate(sort_movies[:20], start=1):
    title_from_index = movie_list[ind]
    print(k, '.', title_from_index)

Type your favourite movie : Iron Man 2
Movies suggestion for you : 

1 . Iron Man 2
2 . Iron Man 3
3 . Iron Man
4 . The Avengers
5 . Avengers: Age of Ultron
6 . Ant-Man
7 . Captain America: The Winter Soldier
8 . Thor: The Dark World
9 . Captain America: Civil War
10 . The Incredible Hulk
11 . X-Men
12 . X-Men: Apocalypse
13 . X2
14 . Deadpool
15 . The Amazing Spider-Man 2
16 . X-Men: Days of Future Past
17 . Captain America: The First Avenger
18 . Thor
19 . Guardians of the Galaxy
20 . Sin City
