In [118]:
# import dependencies
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# load datasets
df_credits = pd.read_csv(".././data/tmdb_5000_credits.csv")
df_movies = pd.read_csv(".././data/tmdb_5000_movies.csv")

# See the size of the data sets
df_credits.shape, df_movies.shape

((4803, 4), (4803, 20))

In [119]:
# Before merging the two dataframes we will check if they have any same key ids
(df_credits.movie_id != df_movies.id).any().sum()

0

In [120]:
# rename column 'movie_id'
df_credits.rename(columns={'movie_id': 'id'}, inplace=True)

# merge the two dataframes & store in dataframe
df = df_credits.merge(df_movies,on='id')


In [121]:
# Looking at the datatypes of the columns 
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4803 entries, 0 to 4802
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    4803 non-null   int64  
 1   title_x               4803 non-null   object 
 2   cast                  4803 non-null   object 
 3   crew                  4803 non-null   object 
 4   budget                4803 non-null   int64  
 5   genres                4803 non-null   object 
 6   homepage              1712 non-null   object 
 7   keywords              4803 non-null   object 
 8   original_language     4803 non-null   object 
 9   original_title        4803 non-null   object 
 10  overview              4800 non-null   object 
 11  popularity            4803 non-null   float64
 12  production_companies  4803 non-null   object 
 13  production_countries  4803 non-null   object 
 14  release_date          4802 non-null   object 
 15  revenue              

In [122]:
# drop null overviews
df.dropna(subset = ['overview'],inplace = True)

# filter out target columns 
df = df[['id','title_x','genres','overview','cast','crew']]

# check new df info
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4800 entries, 0 to 4802
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        4800 non-null   int64 
 1   title_x   4800 non-null   object
 2   genres    4800 non-null   object
 3   overview  4800 non-null   object
 4   cast      4800 non-null   object
 5   crew      4800 non-null   object
dtypes: int64(1), object(5)
memory usage: 262.5+ KB


In [123]:
# view the dataframe
df.head()

Unnamed: 0,id,title_x,genres,overview,cast,crew
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","Captain Barbossa, long believed to be dead, ha...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",A cryptic message from Bond’s past sends him o...,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",Following the death of District Attorney Harve...,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","John Carter is a war-weary, former military ca...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [124]:
# Genres
df.genres[0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [125]:
' '.join([i['name'] for i in eval(df.genres[0])])

'Action Adventure Fantasy Science Fiction'

In [126]:
# Taking top 4 casts
' '.join([i['name'] for i in eval(df.cast[0])[:4]])

'Sam Worthington Zoe Saldana Sigourney Weaver Stephen Lang'

In [127]:
# Taking Crew (Director & Producer)
' '.join(list(set([i['name'] for i in eval(df.crew[0]) if i['job']=='Director' or i['job']=='Producer'])))

'Jon Landau James Cameron'

In [128]:
# Corpus = Title + Genre + Overview + Cast + Crew
# Function To Generate a Corpus 
def generate_corpus(overview, genre, cast, crew):
    
    corpus = ""
    
    genre = ' '.join([i['name'] for i in eval(genre)])
    
    cast = ' '.join([i['name'] for i in eval(cast)[:3]])
       
    crew = ' '.join(list(set([i['name'] for i in eval(crew) if i['job']=='Director' or i['job']=='Producer'])))
    
    corpus+= overview + " " + genre + " " + cast + " " + crew
    
    return corpus

In [129]:
# Generating the corpus for the entire dataframe
corpus = []
for i in range(len(df)):
    corpus.append(generate_corpus(df.iloc[i].overview, df.iloc[i].genres, df.iloc[i].cast, df.iloc[i].crew))


len(corpus)

4800

In [130]:
corpus[0] # Corpus = Title + Genre + Overview + Cast + Crew

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy Science Fiction Sam Worthington Zoe Saldana Sigourney Weaver Jon Landau James Cameron'

In [131]:
# Rename the column 
df.rename(columns = {'title_x':'title'}, inplace = True)

In [132]:
# Drop Old Columns
df.drop(columns=['genres', 'overview', 'cast', 'crew'], inplace=True)


In [133]:
# Add Corpus
df['corpus'] = corpus

In [134]:
df.head()

Unnamed: 0,id,title,corpus
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [135]:
# Example : Cosine Simmilarity
# Define three vectors
A = [1, 2]
B = [2, 3]
C = [3, 1]

# Calculate dot products
ab = np.dot(A,B)
bc = np.dot(B,C)
ca = np.dot(C,A)

# calculate the length of the vector
a = np.linalg.norm(A)
b = np.linalg.norm(B)
c = np.linalg.norm(C)

# calculte cosine similarity for each pair using the above formula
sim_ab = ab/(a*b)
sim_bc = bc/(b*c)
sim_ca = ca/(c*a)

# lets see the similarities
sim_ab, sim_bc, sim_ca

(0.9922778767136677, 0.7893522173763263, 0.7071067811865475)

In [136]:
# import class scikit-learn(sklearn)
# from sklearn.metrics.pairwise import cosine_similarity

# compute cosine similarity
cosine_similarity([A, B, C])

array([[1.        , 0.99227788, 0.70710678],
       [0.99227788, 1.        , 0.78935222],
       [0.70710678, 0.78935222, 1.        ]])

In [137]:
pd.DataFrame(cosine_similarity([A, B, C]), 
             columns=['A', 'B', 'C'], 
             index=['A', 'B', 'C'])

Unnamed: 0,A,B,C
A,1.0,0.992278,0.707107
B,0.992278,1.0,0.789352
C,0.707107,0.789352,1.0


In [138]:
# import deps
# from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the Object and remove stopwords
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['corpus'])

# compare shapes
df.shape

(4800, 3)

In [139]:
tfidf_matrix.shape

(4800, 29102)

In [140]:
# import deps
# from sklearn.metrics.pairwise import linear_kernel

# Compute the Similarity Matirx
cos_mat = linear_kernel(tfidf_matrix, tfidf_matrix)

cos_mat.shape

(4800, 4800)

In [141]:
# Verifying the diagonal elements of the simmilarity matrix provide the sum to be the total number of movies(4800)
diag = 0
for i in range(len(cos_mat)):
    diag+= cos_mat[i][i]
    
print(diag)

4800.0


In [142]:
# def get_recommendations(movie, n):
    
#     # get index from dataframe
#     index = df[df['title']== movie].index[0]
    
#     # sort top n similar movies     
#     similar_movies = sorted(list(enumerate(cos_mat[index])), reverse=True, key=lambda x: x[1]) 
    
#     # extract names from dataframe and return movie names
#     recommendation = []
#     for i in similar_movies[1:n+1]:
#         recommendation.append(df.iloc[i[0]].title)
        
#     return recommendation

# get_recommendations("The Dark Knight", 3)


['The Dark Knight Rises', 'Batman Begins', 'Batman Returns']

In [143]:
# get_recommendations("Mission: Impossible", 3)

['Mission: Impossible III', 'Mission: Impossible II', 'Vanilla Sky']

In [144]:
# def get_keywords_recommendations(keywords, n):
    
#     keywords = keywords.split()
#     keywords = " ".join(keywords)
    
#     # transform the string to vector representation
#     key_tfidf = tfidf.transform([keywords]) 
    
#     # compute cosine similarity    
#     result = cosine_similarity(key_tfidf, tfidf_matrix)
  
#     # sort top n similar movies   
#     similar_key_movies = sorted(list(enumerate(result[0])), reverse=True, key=lambda x: x[1])
    
#     # extract names from dataframe and return movie names
#     recomm = []
#     for i in similar_key_movies[1:n+1]:
#         recomm.append(df.iloc[i[0]].title)
        
#     return recomm

In [145]:
# get_keywords_recommendations("Christopher Nolan", 4)

['Insomnia', 'Man of Steel', 'Batman Begins', 'Interstellar']

In [146]:
# get_keywords_recommendations("Daniel Craig", 4)

['Harrison Montgomery', 'Scary Movie 4', 'Action Jackson', 'Dream House']

In [147]:
import joblib

In [148]:
df.shape

(4800, 3)

In [149]:
cos_mat.shape

(4800, 4800)

In [150]:
tfidf_matrix.shape

(4800, 29102)

In [157]:
joblib.dump(df, '../models/movie_db.df')
joblib.dump(cos_mat, '../models/cos_mat.mt')
joblib.dump(tfidf, '../models/vectorizer.tf')
joblib.dump(tfidf_matrix, '../models/tfidf_mat.tf')

['../models/tfidf_mat.tf']