# Movie Recommender System using content based method and NLP

In [1]:
# ignore unnecessary warnings by libraries
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd

# 

# DataFrame Format :-
{'Movie_Title': [list_of_movies], 'Genres': [list_of_genres], 'Director': 'director_name', 'Cast': [top_3_cast], 'tagline': 'tagline_description'}

# Preprocessing tmdb dataframe

In [3]:
tmdbMovies_df = pd.read_csv('current_datasets/tmdb_5000_movies.csv')
tmdbCredits_df = pd.read_csv('current_datasets/tmdb_5000_credits.csv')

In [4]:
tmdbMovies_df.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [5]:
tmdbCredits_df.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [6]:
# the director in tmdbCredits_crew is given at crew column
# tmdbCredits_df['crew'][0]

In [7]:
final_df1 = tmdbMovies_df.merge(tmdbCredits_df, on='title')

In [8]:
# we need title, genres, director, cast, tagline

final_df1 = final_df1[['title', 'genres', 'overview', 'tagline', 'keywords', 'cast', 'crew']]

In [9]:
#now check if there is any sort of missing data

final_df1.isnull().sum()

title         0
genres        0
overview      3
tagline     844
keywords      0
cast          0
crew          0
dtype: int64

In [10]:
final_df1=final_df1[final_df1['overview'].notna()]

In [11]:
final_df1.isnull().sum()

title         0
genres        0
overview      0
tagline     841
keywords      0
cast          0
crew          0
dtype: int64

In [12]:
# fill all nan values in 'tagline' with empty string ""

final_df1 = final_df1.replace(np.nan, '', regex=True)

In [13]:
final_df1.isnull().sum()

title       0
genres      0
overview    0
tagline     0
keywords    0
cast        0
crew        0
dtype: int64

In [14]:
final_df1.head(1)

Unnamed: 0,title,genres,overview,tagline,keywords,cast,crew
0,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","In the 22nd century, a paraplegic Marine is di...",Enter the World of Pandora.,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


# 

# Preprocessing the-movies-dataframe

In [15]:
movies_df = pd.read_csv('current_datasets/movies_metadata.csv', low_memory=False)
credits_df = pd.read_csv('current_datasets/credits.csv', low_memory=False)

In [16]:
credits_df.head(1)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862


In [17]:
movies_df.head(1)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0


In [18]:
final_df2 = pd.concat([movies_df, credits_df], axis=1)

In [19]:
final_df2

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,id.1
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45471,,,,,,,,,,,...,,,,,,,,"[{'cast_id': 0, 'character': '', 'credit_id': ...","[{'credit_id': '5894a97d925141426c00818c', 'de...",439050
45472,,,,,,,,,,,...,,,,,,,,"[{'cast_id': 1002, 'character': 'Sister Angela...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de...",111109
45473,,,,,,,,,,,...,,,,,,,,"[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...","[{'credit_id': '52fe4776c3a368484e0c8387', 'de...",67758
45474,,,,,,,,,,,...,,,,,,,,"[{'cast_id': 2, 'character': '', 'credit_id': ...","[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",227506


In [20]:

final_df2 = final_df2[['title', 'genres', 'overview', 'tagline', 'belongs_to_collection', 'cast', 'crew']]
final_df2 = final_df2.rename(columns={'belongs_to_collection': 'keywords'})

In [21]:
final_df2=final_df2[final_df2['title'].notna()]

In [22]:
final_df2

Unnamed: 0,title,genres,overview,tagline,keywords,cast,crew
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...",,"{'id': 10194, 'name': 'Toy Story Collection', ...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,"{'id': 119050, 'name': 'Grumpy Old Men Collect...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,"{'id': 96871, 'name': 'Father of the Bride Col...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."
...,...,...,...,...,...,...,...
45461,Subdue,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",Rising and falling between a man and woman.,Rising and falling between a man and woman,,"[{'cast_id': 2, 'character': '', 'credit_id': ...","[{'credit_id': '52fe4a71c3a36847f81ce35b', 'de..."
45462,Century of Birthing,"[{'id': 18, 'name': 'Drama'}]",An artist struggles to finish his work while a...,,,[],"[{'credit_id': '539ef1090e0a263dd00000d7', 'de..."
45463,Betrayal,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...","When one of her hits goes wrong, a professiona...",A deadly game of wits.,,"[{'cast_id': 1, 'character': 'Devki Sabarwal',...","[{'credit_id': '58ee55bbc3a3683df500bd0f', 'de..."
45464,Satan Triumphant,[],"In a small town live two brothers, one a minis...",,,"[{'cast_id': 0, 'character': 'Lauren', 'credit...","[{'credit_id': '587626f4c3a3682b33008299', 'de..."


In [23]:
final_df2.shape

(45460, 7)

In [24]:
# double check and drop all nan values
final_df2 = final_df2.dropna()
final_df2

Unnamed: 0,title,genres,overview,tagline,keywords,cast,crew
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,"{'id': 119050, 'name': 'Grumpy Old Men Collect...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,"{'id': 96871, 'name': 'Father of the Bride Col...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."
9,GoldenEye,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",James Bond must unmask the mysterious head of ...,No limits. No fears. No substitutes.,"{'id': 645, 'name': 'James Bond Collection', '...","[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '52fe426ec3a36847f801e14b', 'de..."
12,Balto,"[{'id': 10751, 'name': 'Family'}, {'id': 16, '...",An outcast half-wolf risks his life to prevent...,Part Dog. Part Wolf. All Hero.,"{'id': 117693, 'name': 'Balto Collection', 'po...","[{'cast_id': 1, 'character': 'Balto (voice)', ...","[{'credit_id': '593f24b9c3a3680369002371', 'de..."
18,Ace Ventura: When Nature Calls,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...","Summoned from an ashram in Tibet, Ace finds hi...",New animals. New adventures. Same hair.,"{'id': 3167, 'name': 'Ace Ventura Collection',...","[{'cast_id': 1, 'character': 'Ace Ventura', 'c...","[{'credit_id': '52fe44dfc3a36847f80af28b', 'de..."
...,...,...,...,...,...,...,...
45240,Schoolgirl Report Part 5: What All Parents Sho...,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",Seven more first-hand accounts of sexual awake...,"Young, Willing and Oh So Eager to Please!","{'id': 304809, 'name': 'Schoolgirl Report Coll...","[{'cast_id': 6, 'character': 'Franck', 'credit...","[{'credit_id': '52fe4a1cc3a36847f81ba4a5', 'de..."
45241,Schoolgirl Report Part 4: What Drives Parents ...,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",A fake documentary about the sex lives of teen...,Straight A's all the way ... But not in the cl...,"{'id': 304809, 'name': 'Schoolgirl Report Coll...","[{'cast_id': 1, 'character': 'Bruno', 'credit_...","[{'credit_id': '5396c2cc0e0a266dc5008807', 'de..."
45258,Descendants 2,"[{'id': 10770, 'name': 'TV Movie'}, {'id': 107...",When the pressure to be royal becomes too much...,Long live evil.,"{'id': 466463, 'name': 'Descendants Collection...","[{'cast_id': 4, 'character': 'Manikkam / Manic...","[{'credit_id': '58bc1ba0c3a368515d01dc3e', 'de..."
45353,Frankenstein Created Woman,"[{'id': 27, 'name': 'Horror'}, {'id': 878, 'na...",A deformed tormented girl drowns herself after...,Now Frankenstein has created a beautiful woman...,"{'id': 123720, 'name': 'Frankenstein (Hammer S...","[{'cast_id': 2, 'character': 'Le comte Godefro...","[{'credit_id': '52fe4f37c3a36847f82c5efb', 'de..."


# 

# Cancatenate:   final_df1   +   final_df2  =  final_df

In [25]:
final_df1.shape

(4806, 7)

In [26]:
final_df2.shape

(2572, 7)

In [27]:
final_df = pd.concat([final_df2, final_df1], axis=0, ignore_index=True)

In [28]:
final_df.shape

(7378, 7)

In [29]:
final_df

Unnamed: 0,title,genres,overview,tagline,keywords,cast,crew
0,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,"{'id': 119050, 'name': 'Grumpy Old Men Collect...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
1,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,"{'id': 96871, 'name': 'Father of the Bride Col...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."
2,GoldenEye,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",James Bond must unmask the mysterious head of ...,No limits. No fears. No substitutes.,"{'id': 645, 'name': 'James Bond Collection', '...","[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '52fe426ec3a36847f801e14b', 'de..."
3,Balto,"[{'id': 10751, 'name': 'Family'}, {'id': 16, '...",An outcast half-wolf risks his life to prevent...,Part Dog. Part Wolf. All Hero.,"{'id': 117693, 'name': 'Balto Collection', 'po...","[{'cast_id': 1, 'character': 'Balto (voice)', ...","[{'credit_id': '593f24b9c3a3680369002371', 'de..."
4,Ace Ventura: When Nature Calls,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...","Summoned from an ashram in Tibet, Ace finds hi...",New animals. New adventures. Same hair.,"{'id': 3167, 'name': 'Ace Ventura Collection',...","[{'cast_id': 1, 'character': 'Ace Ventura', 'c...","[{'credit_id': '52fe44dfc3a36847f80af28b', 'de..."
...,...,...,...,...,...,...,...
7373,El Mariachi,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",El Mariachi just wants to play his guitar and ...,"He didn't come looking for trouble, but troubl...","[{""id"": 5616, ""name"": ""united states\u2013mexi...","[{""cast_id"": 1, ""character"": ""El Mariachi"", ""c...","[{""credit_id"": ""52fe44eec3a36847f80b280b"", ""de..."
7374,Newlyweds,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 10749, ""...",A newlywed couple's honeymoon is upended by th...,A newlywed couple's honeymoon is upended by th...,[],"[{""cast_id"": 1, ""character"": ""Buzzy"", ""credit_...","[{""credit_id"": ""52fe487dc3a368484e0fb013"", ""de..."
7375,"Signed, Sealed, Delivered","[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...","""Signed, Sealed, Delivered"" introduces a dedic...",,"[{""id"": 248, ""name"": ""date""}, {""id"": 699, ""nam...","[{""cast_id"": 8, ""character"": ""Oliver O\u2019To...","[{""credit_id"": ""52fe4df3c3a36847f8275ecf"", ""de..."
7376,Shanghai Calling,[],When ambitious New York attorney Sam is sent t...,A New Yorker in Shanghai,[],"[{""cast_id"": 3, ""character"": ""Sam"", ""credit_id...","[{""credit_id"": ""52fe4ad9c3a368484e16a36b"", ""de..."


In [30]:
final_df = final_df.dropna()

In [31]:
final_df.shape

(7378, 7)

In [32]:
# drop first 12867 rows
# N = 39312
# final_df = final_df.iloc[:-N]
# final_df

# 

# Editing genres

In [33]:
# hence there are no 'nan' values in final_df
final_df.isnull().sum()

title       0
genres      0
overview    0
tagline     0
keywords    0
cast        0
crew        0
dtype: int64

In [34]:
# remove all empty genre columns
final_df.iloc[0].genres

"[{'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}]"

In [35]:
# it is string so convert it into integer using --> 'ast' module

import ast

#helper function to return list of genres, keywords etc.
def convert(obj):
    List = []
    for i in ast.literal_eval(obj):
        List.append(i['name'])
    return List

In [36]:
final_df['genres'] = final_df['genres'].apply(convert)

In [37]:
final_df.head(3)

Unnamed: 0,title,genres,overview,tagline,keywords,cast,crew
0,Grumpier Old Men,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,"{'id': 119050, 'name': 'Grumpy Old Men Collect...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
1,Father of the Bride Part II,[Comedy],Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,"{'id': 96871, 'name': 'Father of the Bride Col...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."
2,GoldenEye,"[Adventure, Action, Thriller]",James Bond must unmask the mysterious head of ...,No limits. No fears. No substitutes.,"{'id': 645, 'name': 'James Bond Collection', '...","[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '52fe426ec3a36847f801e14b', 'de..."


# 

# Editing cast

In [38]:
#helper function to return first 3 cast names of each movie

def convertCast(obj):
    List = []
    counter = 0;
    for i in ast.literal_eval(obj):
        if counter < 3:
            List.append(i['name'])
            counter += 1
        else:
            break
    return List

In [39]:
#store result in movies['cast']
final_df['cast'] = final_df['cast'].apply(convertCast)

In [40]:
final_df.head(3)

Unnamed: 0,title,genres,overview,tagline,keywords,cast,crew
0,Grumpier Old Men,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,"{'id': 119050, 'name': 'Grumpy Old Men Collect...","[Walter Matthau, Jack Lemmon, Ann-Margret]","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
1,Father of the Bride Part II,[Comedy],Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,"{'id': 96871, 'name': 'Father of the Bride Col...","[Steve Martin, Diane Keaton, Martin Short]","[{'credit_id': '52fe44959251416c75039ed7', 'de..."
2,GoldenEye,"[Adventure, Action, Thriller]",James Bond must unmask the mysterious head of ...,No limits. No fears. No substitutes.,"{'id': 645, 'name': 'James Bond Collection', '...","[Pierce Brosnan, Sean Bean, Izabella Scorupco]","[{'credit_id': '52fe426ec3a36847f801e14b', 'de..."


# 

# Editing crew

In [41]:
#we only need director for our purpose

#helper function to fetch director
def fetch_director(obj):
    List = []
    for i in ast.literal_eval(obj):
        if i['job'] == "Director":
            List.append(i['name'])
            break
    return List

In [42]:
final_df['crew'] = final_df['crew'].apply(fetch_director)

In [43]:
final_df.head(3)

Unnamed: 0,title,genres,overview,tagline,keywords,cast,crew
0,Grumpier Old Men,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,"{'id': 119050, 'name': 'Grumpy Old Men Collect...","[Walter Matthau, Jack Lemmon, Ann-Margret]",[Howard Deutch]
1,Father of the Bride Part II,[Comedy],Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,"{'id': 96871, 'name': 'Father of the Bride Col...","[Steve Martin, Diane Keaton, Martin Short]",[Charles Shyer]
2,GoldenEye,"[Adventure, Action, Thriller]",James Bond must unmask the mysterious head of ...,No limits. No fears. No substitutes.,"{'id': 645, 'name': 'James Bond Collection', '...","[Pierce Brosnan, Sean Bean, Izabella Scorupco]",[Martin Campbell]


# 

# Editing overview

In [44]:
# check overview data format
final_df['overview'][0]

"A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming the locals who worry she'll scare the fish away. But she's less interested in seafood than she is in cooking up a hot time with Max."

In [45]:
# convert this string into list
final_df['overview'] = final_df['overview'].apply(lambda x: x.split())

In [46]:
final_df.head(3)

Unnamed: 0,title,genres,overview,tagline,keywords,cast,crew
0,Grumpier Old Men,"[Romance, Comedy]","[A, family, wedding, reignites, the, ancient, ...",Still Yelling. Still Fighting. Still Ready for...,"{'id': 119050, 'name': 'Grumpy Old Men Collect...","[Walter Matthau, Jack Lemmon, Ann-Margret]",[Howard Deutch]
1,Father of the Bride Part II,[Comedy],"[Just, when, George, Banks, has, recovered, fr...",Just When His World Is Back To Normal... He's ...,"{'id': 96871, 'name': 'Father of the Bride Col...","[Steve Martin, Diane Keaton, Martin Short]",[Charles Shyer]
2,GoldenEye,"[Adventure, Action, Thriller]","[James, Bond, must, unmask, the, mysterious, h...",No limits. No fears. No substitutes.,"{'id': 645, 'name': 'James Bond Collection', '...","[Pierce Brosnan, Sean Bean, Izabella Scorupco]",[Martin Campbell]


# 

# Remove spaces between words of same entity

In [47]:
# final_df['genres'] = final_df['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
# final_df['cast'] = final_df['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
# final_df['crew'] = final_df['crew'].apply(lambda x: [i.replace(" ", "") for i in x])

In [48]:
final_df.head(1)

Unnamed: 0,title,genres,overview,tagline,keywords,cast,crew
0,Grumpier Old Men,"[Romance, Comedy]","[A, family, wedding, reignites, the, ancient, ...",Still Yelling. Still Fighting. Still Ready for...,"{'id': 119050, 'name': 'Grumpy Old Men Collect...","[Walter Matthau, Jack Lemmon, Ann-Margret]",[Howard Deutch]


# 

# Create a final Data Frame Composed only of 3 columns
# 'Movie-Title' & 'Movie-elements'

In [49]:
type(final_df['tagline'][0])

str

In [50]:
final_df['tagline'] = final_df['tagline'].apply(lambda x: x.split())

In [51]:
# final_df.iloc[:6] = final_df['overview'] + final_df['genres']  + final_df['cast'] + final_df['crew']
lst = []
lst = final_df['overview']  + final_df['tagline'] + final_df['genres'] + final_df['cast'] + final_df['crew']

In [52]:
lst

0       [A, family, wedding, reignites, the, ancient, ...
1       [Just, when, George, Banks, has, recovered, fr...
2       [James, Bond, must, unmask, the, mysterious, h...
3       [An, outcast, half-wolf, risks, his, life, to,...
4       [Summoned, from, an, ashram, in, Tibet,, Ace, ...
                              ...                        
7373    [El, Mariachi, just, wants, to, play, his, gui...
7374    [A, newlywed, couple's, honeymoon, is, upended...
7375    ["Signed,, Sealed,, Delivered", introduces, a,...
7376    [When, ambitious, New, York, attorney, Sam, is...
7377    [Ever, since, the, second, grade, when, he, fi...
Length: 7378, dtype: object

In [53]:
final_df['Movie Elements'] = [i for i in lst]
final_df['Movie Elements'] = final_df['Movie Elements'].apply(lambda x: " ".join(x))

In [54]:
final_df.head(1)

Unnamed: 0,title,genres,overview,tagline,keywords,cast,crew,Movie Elements
0,Grumpier Old Men,"[Romance, Comedy]","[A, family, wedding, reignites, the, ancient, ...","[Still, Yelling., Still, Fighting., Still, Rea...","{'id': 119050, 'name': 'Grumpy Old Men Collect...","[Walter Matthau, Jack Lemmon, Ann-Margret]",[Howard Deutch],A family wedding reignites the ancient feud be...


In [55]:
# Now, we have ['overview', 'tagline', 'genres', 'cast', 'crew'] in one seperate column --> 'Movie Elements'!!

final_df = final_df[['title', 'Movie Elements']]

In [56]:
# Demo
final_df['Movie Elements'][0]

"A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming the locals who worry she'll scare the fish away. But she's less interested in seafood than she is in cooking up a hot time with Max. Still Yelling. Still Fighting. Still Ready for Love. Romance Comedy Walter Matthau Jack Lemmon Ann-Margret Howard Deutch"

In [57]:
final_df.head(5)

Unnamed: 0,title,Movie Elements
0,Grumpier Old Men,A family wedding reignites the ancient feud be...
1,Father of the Bride Part II,Just when George Banks has recovered from his ...
2,GoldenEye,James Bond must unmask the mysterious head of ...
3,Balto,An outcast half-wolf risks his life to prevent...
4,Ace Ventura: When Nature Calls,"Summoned from an ashram in Tibet, Ace finds hi..."


In [58]:
# rename 'title' --> 'Movie Title'
final_df = final_df.rename(columns={'title': 'Movie Title'})

In [59]:
final_df = final_df.drop_duplicates(keep='first')

In [60]:
final_df

Unnamed: 0,Movie Title,Movie Elements
0,Grumpier Old Men,A family wedding reignites the ancient feud be...
1,Father of the Bride Part II,Just when George Banks has recovered from his ...
2,GoldenEye,James Bond must unmask the mysterious head of ...
3,Balto,An outcast half-wolf risks his life to prevent...
4,Ace Ventura: When Nature Calls,"Summoned from an ashram in Tibet, Ace finds hi..."
...,...,...
7372,Cavite,"Adam, a security guard, travels from Californi..."
7374,Newlyweds,A newlywed couple's honeymoon is upended by th...
7375,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic..."
7376,Shanghai Calling,When ambitious New York attorney Sam is sent t...


In [61]:
# reset index values for better accuracy by model
final_df = final_df.reset_index(drop=True)

In [62]:
final_df

Unnamed: 0,Movie Title,Movie Elements
0,Grumpier Old Men,A family wedding reignites the ancient feud be...
1,Father of the Bride Part II,Just when George Banks has recovered from his ...
2,GoldenEye,James Bond must unmask the mysterious head of ...
3,Balto,An outcast half-wolf risks his life to prevent...
4,Ace Ventura: When Nature Calls,"Summoned from an ashram in Tibet, Ace finds hi..."
...,...,...
6607,Cavite,"Adam, a security guard, travels from Californi..."
6608,Newlyweds,A newlywed couple's honeymoon is upended by th...
6609,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic..."
6610,Shanghai Calling,When ambitious New York attorney Sam is sent t...


# 

# Step-1: Stemming (text cleaning) using nlp

In [63]:
import nltk
from nltk.stem.snowball import SnowballStemmer

In [64]:
stemmer_tool = SnowballStemmer(language="english")

In [65]:
# Helper function to facilitate stemming

def stemMovieElements(text):
    text = list(text.split(" "))
    stemmed_text = [stemmer_tool.stem(i) for i in text]
    return ' '.join(stemmed_text)

In [66]:
print("Text before stemming:\n",final_df['Movie Elements'][0])

Text before stemming:
 A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming the locals who worry she'll scare the fish away. But she's less interested in seafood than she is in cooking up a hot time with Max. Still Yelling. Still Fighting. Still Ready for Love. Romance Comedy Walter Matthau Jack Lemmon Ann-Margret Howard Deutch


In [67]:
text = stemMovieElements(final_df['Movie Elements'][0])
print("Text after stemming:\n", text)

Text after stemming:
 a famili wed reignit the ancient feud between next-door neighbor and fish buddi john and max. meanwhile, a sultri italian divorcé open a restaur at the local bait shop, alarm the local who worri she'll scare the fish away. but she less interest in seafood than she is in cook up a hot time with max. still yelling. still fighting. still readi for love. romanc comedi walter matthau jack lemmon ann-margret howard deutch


In [68]:
# now apply stemming to all of the 'Movie Elements'
final_df['Movie Elements'] = final_df['Movie Elements'].apply(stemMovieElements)

In [69]:
final_df.head(5)

Unnamed: 0,Movie Title,Movie Elements
0,Grumpier Old Men,a famili wed reignit the ancient feud between ...
1,Father of the Bride Part II,just when georg bank has recov from his daught...
2,GoldenEye,jame bond must unmask the mysteri head of the ...
3,Balto,an outcast half-wolf risk his life to prevent ...
4,Ace Ventura: When Nature Calls,"summon from an ashram in tibet, ace find himse..."


# 

## Introducing Term Frequency-Inverse Document Frequency (Tf-Idf)
### (a text vectorization technique)

### -- TF (term frequency): records frequency of words in a document. It then normalizes that frequency

### normalization formula   : TF(word: w) = (# of times w occurs in a text) / (total # of words in the text)  




### -- IDF (inverse document frequency): computes importance of a word in a document. It is the frequency of docs in the corpus containing the word. This frequency is then inversed. It prioritizes meaningful, less frequently occuring words in the doc and scales down the weight of frequently occuring needless words eg. stop words.

### formula: IDF(word: w) = log[(total # of texts) / (# of texts containing w in corpus C)]



### -- TF-IDF: In this approach weight of a word in a text/document is first calculated by (TF), then (IDF) of the word in the corpus is calculated. product(tf(w), idf(w)) = tf-idf(w)

### formula: TF-IDF(word: w, text: t, corpus: C) = TF(w, t) . IDF(w, t, C)

### meaningful words in a text -> higher tf-idf.  less meaningful words in text -> almost 0 tf-idf

# Step-2: Text Vectorization

In [70]:
# using TF-IDF technique
from sklearn.feature_extraction.text import TfidfVectorizer

In [71]:
# initialize vectorizer
tfidfVectorizer = TfidfVectorizer(max_features=6612, stop_words=None, use_idf=True, sublinear_tf=True)

In [72]:
# create a matrix to store the vectorized data
tfidf_matrix  = tfidfVectorizer.fit_transform(final_df['Movie Elements'])

In [73]:
tfidf_matrix = tfidf_matrix.toarray()

In [74]:
# sparse tfidf matrix. because this matrix stores unique words in the Corpus(C) and counts their frequency in a mathematical
# way. Giving importance to less occuring meaningful words
tfidf_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [75]:
# it is sparse but its most heavy word is ccalculated mathematically using log
print(max(tfidf_matrix[200]))

0.2629258398265039


In [76]:
tfidf_matrix.shape

(6612, 6612)

# 

# Step-3: calculate Cosine Similarity between a movie to every other movie

In [77]:
from sklearn.metrics.pairwise import cosine_similarity

In [78]:
# lets get our cosine similarity matrix
cosine_similarity_matrix = cosine_similarity(tfidf_matrix)

In [79]:
cosine_similarity_matrix.shape

(6612, 6612)

In [80]:
cosine_similarity_matrix[0]

array([1.        , 0.04226986, 0.00816718, ..., 0.03052583, 0.01181607,
       0.02358081])

In [81]:
# create a fn to return list('movie name', similarity score)

In [82]:
from sklearn.neighbors import NearestNeighbors

In [83]:
def recommendMoviesCosineSimilarity(movie_name):
    movie_idx = final_df.index[final_df['Movie Title'] == movie_name][0]
    movie_cosinesimilarity_vector =  cosine_similarity_matrix[movie_idx]
    movie_list = sorted(list(enumerate(movie_cosinesimilarity_vector)), reverse=True, key=lambda x: x[1])
    
    # movielist = list of tuples -> (movie_index in final_df, similarity)
    print(movie_list[0])
    movie_list = movie_list[0: 100]
    
    # create recommended list of (movie_name, similarity score)
    recommended_list = []
    
    for i in movie_list:
        recommended_list.append((final_df.iloc[i[0]][0], i))
        
    return recommended_list

In [84]:
# Function to create k nearest neighbors on cosine_similarity_matrix we created above

def recommendMoviesKnn(movie_name):
    movie_idx = final_df.index[final_df['Movie Title'] == movie_name][0]
    
    model_knn = NearestNeighbors(metric="euclidean", algorithm='brute')
    model_knn.fit(cosine_similarity_matrix)

    distances, indices = model_knn.kneighbors(cosine_similarity_matrix[movie_idx].reshape(1, -1), n_neighbors = 10)

    recommended_list = []
    for i in range(0, len(distances.flatten())):
        recommended_list.append((final_df.iloc[indices.flatten()[i]][0], distances.flatten()[i]))
    
    return recommended_list

In [85]:
# print(final_df.index[final_df['Movie Title'] == 'Iron Man'])
final_df['Movie Title'][4]

'Ace Ventura: When Nature Calls'

In [86]:
final_df.iloc[573][1]

'two mutants, rogu and wolverine, come to a privat academi for their kind whose resid superhero team, the x-men, must oppos a terrorist organ with similar powers. evolut begin adventur action scienc fiction patrick stewart hugh jackman ian mckellen bryan singer'

In [87]:
movie_idx = final_df.index[final_df['Movie Title'] == 'Toy Story']
movie_cosinesimilarity_vector =  tfidf_matrix[movie_idx]

In [88]:
# the cosing similarity vector of a movie is sparse by nature because a lot of movies are unrelated to one movie
movie_cosinesimilarity_vector

array([[0., 0., 0., ..., 0., 0., 0.]])

# 

# judgement time!!

In [89]:
# some sample movies to test on

# ''Aladdin', Avatar', 'Iron Man', 'Predator', 'Batman', 'Toy Story', 'Jurassic Park', 'Spider-Man', 'Thor'
# 'The Avengers', 'Dracula', 'Godzilla', 'Warcraft', 'X-Men', 'The Matrix', 'Dead Man Down', 'Kung Fu Panda 3'
# 'Terminator Genisys', 'Man of Steel', 'Harry Potter and the Half-Blood Prince', 'Spectre'
# 'Insidious', 'Transformers: Age of Extinction', 'Annabelle', 'Cinderella III: A Twist in Time'
# 'Step Up', 'Never Back Down', 'Hulk', 'Mission: Impossible II', 'Cars', 'Street Fighter', 'The Matrix'

In [90]:
movie_idx = final_df.index[final_df['Movie Title'] == 'Anabelle']
final_df['Movie Title']


0                     Grumpier Old Men
1          Father of the Bride Part II
2                            GoldenEye
3                                Balto
4       Ace Ventura: When Nature Calls
                     ...              
6607                            Cavite
6608                         Newlyweds
6609         Signed, Sealed, Delivered
6610                  Shanghai Calling
6611                 My Date with Drew
Name: Movie Title, Length: 6612, dtype: object

In [91]:
movie_name = 'Annabelle'

In [92]:
# movies using cosine similarity

recommended_list_cosine = recommendMoviesCosineSimilarity(movie_name)
recommended_list_cosine[:10]

(1966, 1.0)


[('Annabelle', (1966, 1.0)),
 ('Annabelle', (5354, 0.8946060938984212)),
 ('The Crow: City of Angels', (95, 0.1441904849252451)),
 ('The Texas Chainsaw Massacre: The Beginning', (1239, 0.13931605971233357)),
 ('About a Boy', (3945, 0.13385558887099866)),
 ('The Howling: Reborn', (2250, 0.13289592760003663)),
 ("The Emperor's New Groove", (620, 0.13113420639523748)),
 ('If I Stay', (4919, 0.1306041918204071)),
 ('Wind Walkers', (6101, 0.13042549227369038)),
 ('Fast Five', (1605, 0.13026423518155725))]

In [93]:
# movies using k nearest neighbors

recommended_list = recommendMoviesKnn(movie_name)
print("The Seeded Movie is: ", movie_name, '\n')

cnt = 1

for i in recommended_list:
    print("Recommended Movie {0}: {1}, {2}".format(cnt, i[0], float("{0:.5f}".format(i[1]))))
    cnt += 1

The Seeded Movie is:  Annabelle 

Recommended Movie 1: Annabelle, 0.0
Recommended Movie 2: Annabelle, 0.48389
Recommended Movie 3: The House of Fear, 2.27452
Recommended Movie 4: The Bishop Murder Case, 2.3036
Recommended Movie 5: Zombieland, 2.3043
Recommended Movie 6: The Vampire Lovers, 2.31754
Recommended Movie 7: Nine Dead, 2.32522
Recommended Movie 8: Anacondas: The Hunt for the Blood Orchid, 2.33434
Recommended Movie 9: Tremors, 2.33698
Recommended Movie 10: Tremors 4: The Legend Begins, 2.33887


In [94]:
final_df

Unnamed: 0,Movie Title,Movie Elements
0,Grumpier Old Men,a famili wed reignit the ancient feud between ...
1,Father of the Bride Part II,just when georg bank has recov from his daught...
2,GoldenEye,jame bond must unmask the mysteri head of the ...
3,Balto,an outcast half-wolf risk his life to prevent ...
4,Ace Ventura: When Nature Calls,"summon from an ashram in tibet, ace find himse..."
...,...,...
6607,Cavite,"adam, a secur guard, travel from california to..."
6608,Newlyweds,a newlyw coupl honeymoon is upend by the arriv...
6609,"Signed, Sealed, Delivered","""signed, sealed, delivered"" introduc a dedic q..."
6610,Shanghai Calling,when ambiti new york attorney sam is sent to s...


In [95]:
# to find a particular movie in corpus for further inspection
movie_idx = final_df.index[final_df['Movie Title'] == 'Iron Man 3'][0]
print(movie_idx)

1804


In [96]:
pip install pickle

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement pickle (from versions: none)
ERROR: No matching distribution found for pickle


In [97]:
import pickle

In [98]:
pickle.dump(final_df, open('movies.pkl', 'wb'))

In [99]:
pickle.dump(cosine_similarity_matrix, open('cosine_similarity_matrix.pkl', 'wb'))