In [2]:
import pandas as pd
import numpy as np 

# Import data
df = pd.read_csv('data/metadata_clean.csv')

# Print head
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"['Animation', 'Comedy', 'Family']",81.0,7.7,5415.0,1995
1,Jumanji,"['Adventure', 'Fantasy', 'Family']",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"['Romance', 'Comedy']",101.0,6.5,92.0,1995
3,Father of the Bride Part II,['Comedy'],106.0,5.7,173.0,1995
4,Heat,"['Action', 'Crime', 'Drama', 'Thriller']",170.0,7.7,1886.0,1995


# Document vectors 

In order to numerically quantify the similarity between two bodies of text we will use vectorizers such as CountVectorizer and TF-IDFVectorizer. 

CountVectorizer counts the amount of time a word appeared in a description, after discounting stop words like a, the, is, my...

TF-IDF (Term Frequency-Inverse Document Frequency) Vectorizer assigns weights to each word according to the following formula. For every word i in document j, the following applies. 

Wi,j = tfi,j x log(N/dfi) 

- wi,j is the weight of word i in document j 
- dfi is the number of documents that contain the term i 
- N is the total number of documents

The weight of a word in a document is greater if it ocrrus more frequently in that document and is present in fewer documents. The weight wi,j takes vales between 0 and 1: 




# The cosine similarity score 

The cosine can take any value between -1 and 1. The higher the cosine score, the more similar the documents are to each other. 

# Plot description-based recommender 

1. Obtain the data required to build the model
2. Create TF-IDF verctors for the plot description of every movie
3. Compute the pairwise cosine similarity score of every movie
4. Write the recommender function that takes in a movie title as an argument and ouputs movies most similar to it based on the plot 

In [3]:
# Import file
orig_df = pd.read_csv('data/movies_metadata.csv', low_memory=False)

df['overview'], df['id'] = orig_df['overview'], orig_df['id']

df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,overview,id
0,Toy Story,"['Animation', 'Comedy', 'Family']",81.0,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...",862
1,Jumanji,"['Adventure', 'Fantasy', 'Family']",104.0,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,8844
2,Grumpier Old Men,"['Romance', 'Comedy']",101.0,6.5,92.0,1995,A family wedding reignites the ancient feud be...,15602
3,Father of the Bride Part II,['Comedy'],106.0,5.7,173.0,1995,"Cheated on, mistreated and stepped on, the wom...",31357
4,Heat,"['Action', 'Crime', 'Drama', 'Thriller']",170.0,7.7,1886.0,1995,Just when George Banks has recovered from his ...,11862


In [4]:
# Import TfIdVecotirzer from scikit-learn 
from sklearn.feature_extraction.text import TfidfVectorizer

# Define a TF-IDF Vectorizer Object. Remove all english stopwords
tfidf = TfidfVectorizer(stop_words='english')

# Replace NaN with an empty strign 
df['overview'] = df['overview'].fillna('')

# Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
tfidf_matrix = tfidf.fit_transform(df['overview'])

# Output the shape of tfidf_matrix
tfidf_matrix.shape

(8963, 30300)

# Computing cosine similarity score

Create a 8963 x 8963 matrix where the cell in the ith row and jth column represents the similarity score between movies i and j. Since the movie plots are represented as TF-IDF vectors, their magnitude is always 1. Hence, we do not need to calculate the denominator in teh cosine similarity formula as it will alwyays be 1. 

In [15]:
# Import linear_kernel to compute the dot product
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [16]:
# Construct a reverse mapping of indices and movie titles, and drop duplicate titles, if any 
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

In [25]:
# Function that takes in movie title as input and gives recommendations
def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices): 
    
    # Obtain the index of the movie that matches the title
    idx = indices[title]
    
    # Get the pairwise similarity scores of all movies with that movie
    # And convert it into a list of tuples as described above
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on the cosine similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 10 most similar movies. Ignore the first movie. 
    sim_scores = sim_scores[1:11]
    
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

In [26]:
# Get recommendations 
content_recommender('The Lion King')


7369                        Song of the Sea
2476                          Birthday Girl
5399                          Fermat's Room
5174    The Imaginarium of Doctor Parnassus
1625                            Next Friday
1030                     The Little Mermaid
6204                                    Ted
7746                          Danny Collins
5683                       The Green Hornet
6453                                  Sunny
Name: title, dtype: object

# Metadata-based recommender

In [29]:
cred_df = pd.read_csv('data/credits.csv')
key_df = pd.read_csv('data/keywords.csv')

cred_df.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [30]:
# Convert the IDs of df into int 
df['id'] = df['id'].astype('int')

In [22]:
# Function to convert all non-integer IDs to NaN
def clean_ids(x): 
    try: 
        return int(x)
    except: 
        return np.nan
    
# Clean the ids of df
df['id'] = df['id'].apply(clean_ids)

# Filter all rows that have a null ID
df = df[df['id'].notnull()]

# Convert IDs into integer
df['id'] = df['id'].astype('int')
key_df['id'] = key_df['id'].astype('int')

# Merge keywords and credits into your main metadata dataframe
df = df.merge(cred_df, on='id')
df = df.merge(key_df, on='id')

# Display the head of the merged df
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,overview,id,cast,crew,keywords
0,Toy Story,"['Animation', 'Comedy', 'Family']",81.0,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...",862,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,Jumanji,"['Adventure', 'Fantasy', 'Family']",104.0,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,8844,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,Grumpier Old Men,"['Romance', 'Comedy']",101.0,6.5,92.0,1995,A family wedding reignites the ancient feud be...,15602,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,Father of the Bride Part II,['Comedy'],106.0,5.7,173.0,1995,"Cheated on, mistreated and stepped on, the wom...",31357,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,Heat,"['Action', 'Crime', 'Drama', 'Thriller']",170.0,7.7,1886.0,1995,Just when George Banks has recovered from his ...,11862,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [28]:
# Convert the stringified objects into native python objects
from ast import literal_eval 

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features: 
    df[feature] = df[feature].apply(literal_eval)

In [34]:
df.iloc[0]['crew'][0]

{'credit_id': '52fe4284c3a36847f8024f49',
 'department': 'Directing',
 'gender': 2,
 'id': 7879,
 'job': 'Director',
 'name': 'John Lasseter',
 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}

In [35]:
# Extract director's name. If director is not listed, return NaN

def get_director(x): 
    for crew_member in x: 
        if crew_member['job'] == 'Director': 
            return crew_member['name']
        return np.nan

In [36]:
# Define new director feature
df['director'] = df['crew'].apply(get_director)

# Print the directors of the first five movies
df['director'].head()



0      John Lasseter
1                NaN
2      Howard Deutch
3    Forest Whitaker
4                NaN
Name: director, dtype: object

In [37]:
# Returns the list top 3 elements or entire list if less than 3 elements
def generate_list(x): 
    if isinstance(x, list): 
        names = [ele['name'] for ele in x]
        # Check if more than 3 elements exist. If yes, return only first three
        # If no, return entire list
        if len(names) > 3: 
            names = names[:3]
        return names
    # Retrn empty list in case of missing data
    return []


In [38]:
# Apply generate_list function to cast and keywords
df['cast'] = df['cast'].apply(generate_list)
df['keywords'] = df['keywords'].apply(generate_list)

# Only consider a maximum of 4 genres
df['genres'] = df['genres'].apply(lambda x: x[:3])

In [40]:
# Print the new featres of the first 5 movies along with title
df[['title', 'cast', 'director', 'keywords', 'genres']].head()

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"
3,Father of the Bride Part II,"[Whitney Houston, Angela Bassett, Loretta Devine]",Forest Whitaker,"[based on novel, interracial relationship, sin...",[Comedy]
4,Heat,"[Steve Martin, Diane Keaton, Martin Short]",,"[baby, midlife crisis, confidence]","[Action, Crime, Drama]"


In [43]:
# Function to sanitize data to prevent ambiguity. E.g. Ryan Gosling and Ryan Reynolds become ryangosling and ryanreynolds
def sanitize(x): 
    if isinstance(x, list): 
        # Strip spaces and convert to lowercase
        return [str.lower(i.replace(" ", "")) for i in x]
    else: 
        # Check if director exists. If not, return empty string 
        if isinstance(x, str): 
            return str.lower(x.replace(" ", ""))
        else: 
            return ''

In [44]:
# Apply to generenate_list function 
for feature in ['cast', 'director', 'genres', 'keywords']: 
    df[feature] = df[feature].apply(sanitize)

# Creating metadata soup

In [45]:
# Function that creates a soup out of the desired metadata
def create_soup(x): 
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

In [46]:
# Create the new soup feature 
df['soup'] = df.apply(create_soup, axis=1)

# Display the soup of the first movie 
df.iloc[0]['soup']



'jealousy toy boy tomhanks timallen donrickles johnlasseter animation comedy family'

Use CountVectorizer because TF-IDFVectorizer will give less weight to actors and directors who have acted and directed in a relatively larger number of movies

In [48]:
from sklearn.feature_extraction.text import CountVectorizer
# Define a new CountVectorizer object and create vectors for the soup
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])

In [49]:
# Import cosine_similarity function
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity score(equivalent to dot product for tf-idf vectors)
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [50]:
# Reset index of your df and construct reverse mapping again
df = df.reset_index()
indices2 = pd.Series(df.index, index=df['title'])

In [51]:
content_recommender('The Lion King', cosine_sim2, df, indices2)

2383                            Waking Life
5629    A Turtle's Tale: Sammy's Adventures
1468                             Thumbelina
1063                  All Dogs Go to Heaven
778                                Hercules
1073                        Charlotte's Web
1638                       Five Easy Pieces
1780                             Parenthood
3697                   Whisper of the Heart
4844                               Bambi II
Name: title, dtype: object