In [1]:
#importing all required libraries
import numpy as np
import pandas as pd
#for converting a string to a list
import ast
#for calculating the number of times a word from a specific tag of a movie occurs
from sklearn.feature_extraction.text import CountVectorizer
#for removing similar words, such as 'loved' and 'loving' from the dataset
import nltk
from nltk.stem.porter import PorterStemmer
#finding similarity for each movie with respect to each of the other movies
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
#importing dataset
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [5]:
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [6]:
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [7]:
#joining the above two dataframes on the column title
movies = movies.merge(credits, on = "title")

In [11]:
#keeping only the following columns as only they are relevant to the problem statement
movies = movies[["movie_id", "title", "overview", "genres", "keywords", "cast", "crew"]]

In [12]:
#checking null values
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [13]:
#since there are only 3 nulls in the overview column, it wouldn't harm to get rid of them
movies.dropna(inplace = True)

In [14]:
#checking duplicates
movies.duplicated().sum()

0

In [15]:
#now, we need to concatenate all the relevant information in all columns to create a column called "tags". 
#Here, we define helper functions to do the same for all the relevant columns
#for the genres column - 
import ast
def dict_to_list(a):
    l = []
    for i in ast.literal_eval(a):
        l.append(i["name"])
    return l
#to get the name of the top 3 cast members of the movie, define this function
def convert_three(a):
    a = ast.literal_eval(a)
    l = []
    for i in a[:3]:
        l.append(i["name"])
    return l
#to extract the name of the director from the 'crew' column, we use the function below
def extract_director(a):
    a = ast.literal_eval(a)
    l = []
    for i in a:
        if i["job"] == "Director":
            l.append(i["name"])
            return l

In [16]:
#now applying those functions to the columns-
movies['genres'] = movies['genres'].apply(dict_to_list)
movies['keywords'] = movies['keywords'].apply(dict_to_list)
movies ["cast"] = movies["cast"].apply(convert_three)
movies["crew"] = movies["crew"].apply(extract_director)
#finally, converting the overview column to a list
movies["overview"] = movies["overview"].apply(lambda x: x.split())

In [17]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [27]:
#removing nulls and removing spaces for each of the columns that we transformed above
movies.dropna(inplace = True)
movies["keywords"] = movies["keywords"].apply(lambda x:[i.replace(" ", "") for i in x])
movies["cast"] = movies["cast"].apply(lambda x:[i.replace(" ", "") for i in x])
movies["crew"] = movies["crew"].apply(lambda x:[i.replace(" ", "") for i in x])

In [28]:
#now, we create the column that will help make recommendations
movies["tags"] = movies["overview"] + movies["genres"] + movies["keywords"] + movies["cast"] + movies["crew"]
#creating a new dataframe with only the necessary columns
new_df = movies[["movie_id", "title", "tags"]]

In [30]:
#converting the tags list to a string object
new_df["tags"] = new_df["tags"].apply(lambda x: " ".join(x))
new_df["tags"] = new_df["tags"].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["tags"] = new_df["tags"].apply(lambda x: " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["tags"] = new_df["tags"].str.lower()


In [33]:
#removing same words with difference tenses using PorterStemmer
ps = PorterStemmer()
def stem(a):
    y = []
    for i in a.split():
        y.append(ps.stem(i))
    return " ".join(y)
#vectorizing the tags column and removing identical words using above helper functions
cv = CountVectorizer(max_features = 5000, stop_words = 'english')
vectors = cv.fit_transform(new_df["tags"]).toarray()
new_df["tags"] = new_df["tags"].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["tags"] = new_df["tags"].apply(stem)


In [35]:
#calculating cosine similarity for each movie tag with each of the other movies
similarity = cosine_similarity(vectors)
#function that will make recommendations to user
def recommend(movie):
    movie_index = new_df[new_df["title"] == movie].index[0]
    similarities = similarity[movie_index]
    movies_list = sorted(list(enumerate(similarities)), reverse = True, key = lambda x:x[1])[1:6]
    for i in movies_list:
        print(new_df["title"][i[0]])

In [40]:
#some trial runs of the model:
print(recommend("The Dark Knight"))

The Dark Knight Rises
Batman Begins
Batman Returns
Batman Forever
Batman & Robin
None


In [41]:
print(recommend("Avatar"))

Small Soldiers
Titan A.E.
Independence Day
Lifeforce
Aliens vs Predator: Requiem
None


In [42]:
print(recommend("Interstellar"))

The Martian
Space Pirate Captain Harlock
Bleeding Hearts
Moonraker
Gattaca
None


In [43]:
print(recommend("The Wolf of Wall Street"))

Wall Street
Machete
They Came Together
The Big Short
The Infiltrator
None
