# METADATA BASED RECOMMENDER SYSTEM

We'll build a recommender system based on the following metadata: the 3 top actors, the director, related genres, and the movie plot keywords.

In [1]:
import pandas as pd

In [2]:
path = "C:/Users/dmcul/Downloads/recommender_class/"
metadata = pd.read_csv( path + "movies_metadata.csv", low_memory=False)
metadata.head(5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(metadata.index, index=metadata['title']).drop_duplicates()
indices[:15]

title
Toy Story                       0
Jumanji                         1
Grumpier Old Men                2
Waiting to Exhale               3
Father of the Bride Part II     4
Heat                            5
Sabrina                         6
Tom and Huck                    7
Sudden Death                    8
GoldenEye                       9
The American President         10
Dracula: Dead and Loving It    11
Balto                          12
Nixon                          13
Cutthroat Island               14
dtype: int64

In [4]:
# Load keywords and credits
credits = pd.read_csv(path + 'credits.csv', low_memory=False)
keywords = pd.read_csv(path + 'keywords.csv', low_memory=False)

# Remove rows with bad IDs.
metadata = metadata.drop([19730, 29503, 35587])

# Convert IDs to int. Required for merging
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
metadata['id'] = metadata['id'].astype('int')

# Merge keywords and credits into your main metadata dataframe
metadata = metadata.merge(credits, on='id')
metadata = metadata.merge(keywords, on='id')

# Print the first two movies of your newly merged metadata
metadata.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."


From your new features, cast, crew, and keywords, you need to extract the three most important actors, the director and the keywords associated with that movie.

In [5]:
# Parse the stringified features into their corresponding python objects
from ast import literal_eval
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(literal_eval)

Write functions that will extract the required information from each feature

In [6]:
import numpy as np

# Get the director's name from the crew feature. If the director is not listed, return NaN
def get_director(df):
    for i in df:
        if i['job'] == 'Director':
            return i['name']
    return np.nan


# Write a function that will return the top 3 elements or the entire list, whichever is more. 
# Here the list refers to the cast, keywords, and genres.
def get_list(df):
    if isinstance(df, list):
        names = [i['name'] for i in df]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names
#Return empty list in case of missing/malformed data
    return []

# Define new director, cast, genres and keywords features that are in a suitable form.
metadata['director'] = metadata['crew'].apply(get_director)
features = ['cast', 'keywords', 'genres']

for feature in features:
    metadata[feature] = metadata[feature].apply(get_list)
    
# Print the new features of the first 3 films
metadata[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"


Convert the names and keyword instances into lowercase and strip all the spaces between them

In [7]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(df):
    if isinstance(df, list):
        return [str.lower(i.replace(" ", "")) for i in df]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(df, str):
            return str.lower(df.replace(" ", ""))
        else:
            return ''
        
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(clean_data)

We're now in a position to create the "metadata embedding vector" - a vector containing all the metadata we want to feed to the vectorizer (actors, director and keywords).

The embed_vector function will simply join all the required columns by a space. 

This is the final preprocessing step, and the output of this function will be fed into the word vector model.

The DIRECTOR has increased weight

In [8]:
def embed_vector(df):
    return ' '.join(df['keywords']) + ' ' + ' '.join(df['cast']) + ' ' + df['director']*4 + ' ' + ' '.join(df['genres'])
# Create a new embedded feature
metadata['embedded_vector'] = metadata.apply(embed_vector, axis=1)
metadata[['embedded_vector']].head(5)

Unnamed: 0,embedded_vector
0,jealousy toy boy tomhanks timallen donrickles ...
1,boardgame disappearance basedonchildren'sbook ...
2,fishing bestfriend duringcreditsstinger walter...
3,basedonnovel interracialrelationship singlemot...
4,baby midlifecrisis confidence stevemartin dian...


The next steps are the same we did in the plot description based recommender. 

One key difference is that we'll use the CountVectorizer() instead of TF-IDF because we do not want to downweight the actor/director's presence if he or she has acted or directed in relatively more movies. It doesn't make much intuitive sense to downweight them in this context.

The major difference between CountVectorizer() and TF-IDF is the inverse document frequency (IDF) component which is present in later and not in the former.

In [9]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadata['embedded_vector'])
count_matrix.shape

(46628, 77403)

In [12]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

# Reset index of your main DataFrame and construct reverse mapping as before
metadata = metadata.reset_index()
indices = pd.Series(metadata.index, index=metadata['title'])

In [13]:
metadata.head()

Unnamed: 0,index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,tagline,title,video,vote_average,vote_count,cast,crew,keywords,director,embedded_vector
0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[animation, comedy, family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,...,,Toy Story,False,7.7,5415.0,"[tomhanks, timallen, donrickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[jealousy, toy, boy]",johnlasseter,jealousy toy boy tomhanks timallen donrickles ...
1,1,False,,65000000,"[adventure, fantasy, family]",,8844,tt0113497,en,Jumanji,...,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[robinwilliams, jonathanhyde, kirstendunst]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[boardgame, disappearance, basedonchildren'sbook]",joejohnston,boardgame disappearance basedonchildren'sbook ...
2,2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[romance, comedy]",,15602,tt0113228,en,Grumpier Old Men,...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[waltermatthau, jacklemmon, ann-margret]","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[fishing, bestfriend, duringcreditsstinger]",howarddeutch,fishing bestfriend duringcreditsstinger walter...
3,3,False,,16000000,"[comedy, drama, romance]",,31357,tt0114885,en,Waiting to Exhale,...,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"[whitneyhouston, angelabassett, lorettadevine]","[{'credit_id': '52fe44779251416c91011acb', 'de...","[basedonnovel, interracialrelationship, single...",forestwhitaker,basedonnovel interracialrelationship singlemot...
4,4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[comedy],,11862,tt0113041,en,Father of the Bride Part II,...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"[stevemartin, dianekeaton, martinshort]","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[baby, midlifecrisis, confidence]",charlesshyer,baby midlifecrisis confidence stevemartin dian...


#### Get recommendations

In [14]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim2):
    # Get the index of the movie that matches the title
    idx = indices[title]
# Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
# Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
# Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]
# Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
# Return the top 10 most similar movies
    return metadata['title'].iloc[movie_indices]

In [15]:
get_recommendations('The Dark Knight Rises', cosine_sim2)

12589          The Dark Knight
10210            Batman Begins
9311                    Shiner
9874           Amongst Friends
516          Romeo Is Bleeding
11463             The Prestige
24090                Quicksand
25038                 Deadfall
41063                     Sara
43848    Lure: Teen Fight Club
Name: title, dtype: object

In [16]:
get_recommendations('The Godfather', cosine_sim2)

1934            The Godfather: Part III
1199             The Godfather: Part II
15609                   The Rain People
18940                         Last Exit
34488                              Rege
35802            Manuscripts Don't Burn
35803            Manuscripts Don't Burn
8001     The Night of the Following Day
18261                 The Son of No One
28683            In the Name of the Law
Name: title, dtype: object

The recommender has been successful in capturing information based on the metadata embedded information