In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm

from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
cred = pd.read_csv('data/tmdb_5000_credits.csv')
movs = pd.read_csv('data/tmdb_5000_movies.csv')

In [7]:
cred.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [8]:
movs.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


Want to merge the two datasets on `id`, must check that these align

In [12]:
(cred.movie_id != movs.id).sum()

0

In [13]:
# rename column name
cred.rename(columns={'movie_id': 'id'}, inplace=True)

# merge two dataframes
df = cred.merge(movs, on='id')

In [14]:
df.head(2)

Unnamed: 0,id,title_x,cast,crew,budget,genres,homepage,keywords,original_language,original_title,...,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title_y,vote_average,vote_count
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    4803 non-null   int64  
 1   title_x               4803 non-null   object 
 2   cast                  4803 non-null   object 
 3   crew                  4803 non-null   object 
 4   budget                4803 non-null   int64  
 5   genres                4803 non-null   object 
 6   homepage              1712 non-null   object 
 7   keywords              4803 non-null   object 
 8   original_language     4803 non-null   object 
 9   original_title        4803 non-null   object 
 10  overview              4800 non-null   object 
 11  popularity            4803 non-null   float64
 12  production_companies  4803 non-null   object 
 13  production_countries  4803 non-null   object 
 14  release_date          4802 non-null   object 
 15  revenue              

In [17]:
# remove null overviews
df.dropna(subset=['overview'], inplace=True)

# get useful columns
df = df[['id', 'title_x', 'cast', 'crew', 'budget', 'genres', 'keywords', 'overview',
         'popularity', 'production_companies', 'production_countries', 'revenue',
         'runtime', 'vote_average', 'vote_count']]

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4800 entries, 0 to 4802
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    4800 non-null   int64  
 1   title_x               4800 non-null   object 
 2   cast                  4800 non-null   object 
 3   crew                  4800 non-null   object 
 4   budget                4800 non-null   int64  
 5   genres                4800 non-null   object 
 6   keywords              4800 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4800 non-null   float64
 9   production_companies  4800 non-null   object 
 10  production_countries  4800 non-null   object 
 11  revenue               4800 non-null   int64  
 12  runtime               4800 non-null   float64
 13  vote_average          4800 non-null   float64
 14  vote_count            4800 non-null   int64  
dtypes: float64(3), int64(4), o

In [19]:
df.head(2)

Unnamed: 0,id,title_x,cast,crew,budget,genres,keywords,overview,popularity,production_companies,production_countries,revenue,runtime,vote_average,vote_count
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2787965087,162.0,7.2,11800
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",961000000,169.0,6.9,4500


In [25]:
' '.join([i['name'] for i in eval(df.cast[0])][:5])

'Sam Worthington Zoe Saldana Sigourney Weaver Stephen Lang Michelle Rodriguez'

In [28]:
jobs_of_interest = [
    'Producer', 'Director', 'Original Music Composer', 'Writer', 'Screenplay'
]

In [29]:
' '.join(list(set([x['name'] for x in eval(df.crew[0]) if x['job'] in jobs_of_interest])))

'James Cameron Jon Landau James Horner'

In [36]:
[x['name'] for x in eval(df.production_countries[0])]

['United States of America', 'United Kingdom']

In [39]:
def generate_corpus(row: pd.core.series.Series):
    corpus = ""
    
    cast = ' '.join([x['name'] for x in eval(row.cast)][:5])
    crew = ' '.join(list(set([x['name'] for x in eval(row.crew) if x['job'] in jobs_of_interest])))
    genre = ' '.join([x['name'] for x in eval(row.genres)])
    keywords = ' '.join([x['name'] for x in eval(row.keywords)])
    prod_companies = ' '.join([x['name'] for x in eval(row.production_companies)])
    prod_countries = ' '.join([x['name'] for x in eval(row.production_countries)])

    corpus = ' '.join([row.overview, cast, crew, genre, keywords, prod_companies, prod_countries])
    return corpus

In [40]:
generate_corpus(df.iloc[0])

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Sam Worthington Zoe Saldana Sigourney Weaver Stephen Lang Michelle Rodriguez James Cameron Jon Landau James Horner Action Adventure Fantasy Science Fiction culture clash future space war space colony society space travel futuristic romance space alien tribe alien planet cgi marine soldier battle love affair anti war power relations mind and soul 3d Ingenious Film Partners Twentieth Century Fox Film Corporation Dune Entertainment Lightstorm Entertainment United States of America United Kingdom'

In [42]:
corpus = []
for i in tqdm(range(len(df))):
    corpus.append(generate_corpus(df.iloc[i]))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4800/4800 [00:08<00:00, 588.18it/s]


In [43]:
len(corpus)

4800

In [46]:
df.rename(columns={'title_x': 'title'}, inplace=True)

# replace columns used for corpus with corpus itself
df.drop(columns=['cast', 'crew', 'genres', 'keywords', 'production_companies', 
                 'production_countries','overview'], inplace=True)
df['corpus'] = corpus

In [48]:
df.head(2)

Unnamed: 0,id,title,budget,popularity,revenue,runtime,vote_average,vote_count,corpus
0,19995,Avatar,237000000,150.437577,2787965087,162.0,7.2,11800,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,300000000,139.082615,961000000,169.0,6.9,4500,"Captain Barbossa, long believed to be dead, ha..."


# Text Representation & Text Similarity

In [55]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['corpus'])

In [56]:
df.shape

(4800, 9)

In [57]:
tfidf_matrix.shape

(4800, 37632)

In [63]:
# calculate similarity of every movie to every other movie (based on tfidf)
cosine_matrix = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_matrix.shape

(4800, 4800)

In [65]:
def get_recommendations(movie: str, df: pd.DataFrame=df, n=10):
    # get index from df
    index = df[df['title'] == movie].index[0]

    # sort top n similar movies
    similar_movies = sorted(list(enumerate(cosine_matrix[index])), 
                            reverse=True, key=lambda x: x[1])

    # extract names from df and return movie names
    recomm = []
    for i in similar_movies[1:n+1]:
        recomm.append(df.iloc[i[0]].title)
    return recomm

In [66]:
get_recommendations('The Dark Knight')

['The Dark Knight Rises',
 'Batman Begins',
 'Batman: The Dark Knight Returns, Part 2',
 'Batman Returns',
 'Batman v Superman: Dawn of Justice',
 'Batman Forever',
 'Man of Steel',
 'Batman',
 'Suicide Squad',
 'Batman & Robin']

In [67]:
get_recommendations('Mission: Impossible')

['Mission: Impossible II',
 'Mission: Impossible III',
 'Vanilla Sky',
 'Mission: Impossible - Ghost Protocol',
 'Bad Company',
 'Mission: Impossible - Rogue Nation',
 'You Only Live Twice',
 'Live and Let Die',
 'Dr. No',
 'The Last Samurai']

In [70]:
def get_keywords_recommendations(keywords: str, n=10):
    keywords = keywords.split()
    keywords = ' '.join(keywords)

    # transform string to vector representation
    key_tfidf = tfidf.transform([keywords])

    # compute cosine similarity
    cos_sim = cosine_similarity(key_tfidf, cosine_matrix)

    # sort top n similar movies
    similar_key_movies = sorted(list(enumerate(cos_sim[0])), reverse=True, key=lambda x: x[1])

    # extract names from df and return movie names
    recomm = []
    for i in similar_key_movies[1: n+1]:
        recomm.append(df.iloc[i[0]].title)
    return recomm

In [73]:
key_tfidf.shape

(1, 37632)

In [74]:
cosine_matrix.shape

(4800, 4800)

In [83]:
cosine_similarity(key_tfidf, cosine_matrix)

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 37632 while Y.shape[1] == 4800

In [82]:
keywords = 'Christopher Nolan'

keywords = keywords.split()
keywords = ' '.join(keywords)

# transform string to vector representation
key_tfidf = tfidf.transform([keywords])

# compute cosine similarity
cos_sim = cosine_similarity(key_tfidf, cosine_matrix)

# sort top n similar movies
similar_key_movies = sorted(list(enumerate(cos_sim[0])), reverse=True, key=lambda x: x[1])

# extract names from df and return movie names
recomm = []
for i in similar_key_movies[1: n+1]:
    recomm.append(df.iloc[i[0]].title)
recomm

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 37632 while Y.shape[1] == 4800

In [81]:
get_keywords_recommendations('Christopher Nolan')

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 37632 while Y.shape[1] == 4800