In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from ast import literal_eval
import matplotlib.pyplot as plt

In [3]:
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Content Recommender

In [4]:
links_small = pd.read_csv('data/input/movies/links_small.csv')
md = pd.read_csv('data/input/movies/movies_metadata.csv')

  md = pd.read_csv('data/input/movies/movies_metadata.csv')


In [5]:
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')
md = md.drop([19730, 29503, 35587])

**Creating a smaller dataset to work with**

In [6]:
md['id'] = md['id'].astype('int')
smd = md[md['id'].isin(links_small)].copy()
smd.shape

(9099, 24)

In [7]:
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

In [8]:
# Analyses 1-2 words in the description of the movies, removing english stop words
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0.0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

In [9]:
tfidf_matrix.shape

(9099, 268124)

**Cosine Similarity**

I will be using the Cosine Similarity to calculate a numeric quantity that denotes the similarity between two movies. Mathematically, it is defined as follows

In [None]:
# Creating a cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [11]:
# Resetting the index of the main DataFrame and constructing reverse mapping
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [14]:
def get_recommendations(title):
    """
    Function that returns the top 30 most similar movies to the input title
    """
    # Get the index of the movie title
    idx = indices[title]
    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the scores of the 30 most similar movies
    sim_scores = sim_scores[1:31]
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [15]:
get_recommendations('The Godfather').head(10)

973      The Godfather: Part II
8387                 The Family
3509                       Made
4196         Johnny Dangerously
29               Shanghai Triad
5667                       Fury
2412             American Movie
1582    The Godfather: Part III
4221                    8 Women
2159              Summer of Sam
Name: title, dtype: object

**We are going to use much more suggestive metadata than Overview and Tagline.**

**Build a more sophisticated recommender that takes genre, keywords, cast and crew into consideration.**

In [19]:
credits = pd.read_csv('data/input/movies/credits.csv')
keywords = pd.read_csv('data/input/movies/keywords.csv')

In [20]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [21]:
md.shape

(45463, 24)

In [22]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [26]:
# Consolidates into a single database, filtering for titles that are present in the small database set
smd = md[md['id'].isin(links_small)].copy()
smd.shape

(9219, 27)

**We now have our cast, crew, genres and credits, all in one dataframe**

In [27]:
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [None]:
def get_director(x):
    """
    Returns the director of the movie
    """

    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [30]:
smd['director'] = smd['crew'].apply(get_director)

In [31]:
# Returns the top 3 actors in the cast
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [32]:
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [34]:
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [35]:
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [36]:
s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

In [37]:
s = s.value_counts()
s[:5]

keyword
independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: count, dtype: int64

In [38]:
s = s[s > 1]