In [113]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import os
import json
from pathlib import Path

In [135]:
json_file_path = "../Data/tmdb_resources/tmdb_actors_db.json"

# Read JSON data from the file. The encoding is pinned to UTF-8 so the result
# does not depend on the platform's default locale encoding.
with open(json_file_path, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Convert to DataFrame (one row per entry of json_data['results']).
actors_df = pd.json_normalize(json_data['results'], sep='_')

# Take 'original_language' from the FIRST 'known_for' entry only.
# Rows whose 'known_for' is empty or not a list (e.g. NaN) get None and are
# therefore dropped by the language filter below instead of raising.
actors_df['original_language'] = actors_df['known_for'].apply(
    lambda x: x[0].get('original_language') if isinstance(x, list) and x else None
)

# Keep only people whose first known-for title is in English and whose
# department is "Acting".
is_english = actors_df["original_language"] == "en"
is_actor = actors_df['known_for_department'] == "Acting"
ordered_columns = ["name", "gender", "original_language", "known_for", "id"]
# .copy() detaches the result from the unfiltered frame, so that adding
# columns to actors_df later does not trigger SettingWithCopyWarning.
actors_df = actors_df.loc[is_english & is_actor, ordered_columns].copy()

print(f"There are {actors_df.shape[0]} actors in the dataset.")
display(actors_df)

There are 8441 actors in the dataset.


Unnamed: 0,name,gender,original_language,known_for,id
1,Gary Oldman,2,en,"[{'adult': False, 'backdrop_path': '/nMKdUUepR...",64
3,Florence Pugh,1,en,"[{'adult': False, 'backdrop_path': '/aAM3cQmYG...",1373737
4,Jason Statham,2,en,"[{'adult': False, 'backdrop_path': '/ysKahAEPP...",976
6,Jackie Chan,2,en,"[{'adult': False, 'backdrop_path': '/r4yFYBEcV...",18897
7,Scarlett Johansson,1,en,"[{'adult': False, 'backdrop_path': '/ozVwXlfxq...",1245
...,...,...,...,...,...
9972,Richard Derr,2,en,"[{'adult': False, 'backdrop_path': '/ws8eX1paK...",15772
9973,Michael Maloney,2,en,"[{'adult': False, 'backdrop_path': '/9YoLdWeBS...",17483
9974,Gaia Scodellaro,1,en,"[{'adult': False, 'backdrop_path': '/tC78Pck2Y...",1636737
9976,Peter Cullen,2,en,"[{'adult': False, 'backdrop_path': '/2vFuG6bWG...",19540


In [126]:
data_path = "../Data/tmdb_resources/tmdb_id2detail.json"

# Each value in the file is itself a JSON-encoded string, so it is decoded a
# second time with json.loads. The context manager closes the file handle
# (the previous bare open() never did), and the encoding is pinned to UTF-8.
with open(data_path, 'r', encoding='utf-8') as detail_file:
    tmdb_id2detail = {
        tmdb_id: json.loads(raw_detail)
        for tmdb_id, raw_detail in json.load(detail_file).items()
    }

In [136]:
# Labels of the per-actor feature columns, one per "known for" slot.
embedding_columns = [
    "embedding_movie_1",
    "embedding_movie_2",
    "embedding_movie_3",
]

# Create each column up front with a 0 placeholder in every row.
for col_name in embedding_columns:
    actors_df[col_name] = 0

In [137]:
# Render the current state of actors_df for visual inspection.
display(actors_df)

Unnamed: 0,name,gender,original_language,known_for,id,embedding_movie_1,embedding_movie_2,embedding_movie_3
1,Gary Oldman,2,en,"[{'adult': False, 'backdrop_path': '/nMKdUUepR...",64,0,0,0
3,Florence Pugh,1,en,"[{'adult': False, 'backdrop_path': '/aAM3cQmYG...",1373737,0,0,0
4,Jason Statham,2,en,"[{'adult': False, 'backdrop_path': '/ysKahAEPP...",976,0,0,0
6,Jackie Chan,2,en,"[{'adult': False, 'backdrop_path': '/r4yFYBEcV...",18897,0,0,0
7,Scarlett Johansson,1,en,"[{'adult': False, 'backdrop_path': '/ozVwXlfxq...",1245,0,0,0
...,...,...,...,...,...,...,...,...
9972,Richard Derr,2,en,"[{'adult': False, 'backdrop_path': '/ws8eX1paK...",15772,0,0,0
9973,Michael Maloney,2,en,"[{'adult': False, 'backdrop_path': '/9YoLdWeBS...",17483,0,0,0
9974,Gaia Scodellaro,1,en,"[{'adult': False, 'backdrop_path': '/tC78Pck2Y...",1636737,0,0,0
9976,Peter Cullen,2,en,"[{'adult': False, 'backdrop_path': '/2vFuG6bWG...",19540,0,0,0


In [163]:
def date_to_year(date):
    """Return the year of a dash-separated date string such as "2008-07-16".

    Args:
        date: Date string whose first dash-separated component is the year.
            The literal string "None" is the placeholder for an unknown date.

    Returns:
        The year as an int, or 0 when the date is unknown: the "None"
        placeholder, None, an empty string, or a value whose leading
        component is not an integer.
    """
    # Treat every "no date" spelling the same way instead of crashing on
    # None (AttributeError) or "" (ValueError from int("")).
    if not date or date == "None":
        return 0
    try:
        return int(str(date).split("-")[0])
    except ValueError:
        # Malformed year component: fall back to the same "unknown" value.
        return 0

def _get_detail_value(movie_df, field):
    """Look up `field` in the tmdb_id2detail entry for this movie, or return 0.

    The entry is found under the key str(movie_df["id"]). 0 is returned when
    there is no entry for that id, when the entry is not a dict, or when the
    field is absent or null in the entry.
    """
    detail = tmdb_id2detail.get(str(movie_df["id"]))
    if not isinstance(detail, dict):
        return 0
    value = detail.get(field)
    return 0 if value is None else value


def get_budget(movie_df):
    """Return the "budget" recorded in tmdb_id2detail for this movie.

    Args:
        movie_df: Movie record that supports movie_df["id"].

    Returns:
        The stored budget, or 0 when no budget is available for that id.
    """
    return _get_detail_value(movie_df, "budget")


def get_revenue(movie_df):
    """Return the "revenue" recorded in tmdb_id2detail for this movie.

    Args:
        movie_df: Movie record that supports movie_df["id"].

    Returns:
        The stored revenue, or 0 when no revenue is available for that id.
    """
    return _get_detail_value(movie_df, "revenue")

def movie_to_embedding(movie_df):
    """Build the five-element feature vector for one movie record.

    Args:
        movie_df: Movie record that supports lookups by the keys
            "popularity", "vote_average", "release_date" and "id".

    Returns:
        A list in the order
        [popularity, vote_average, release year, budget, revenue],
        where the first two entries are converted to float.
    """
    popularity = float(movie_df["popularity"])
    vote_average = float(movie_df["vote_average"])
    release_year = date_to_year(movie_df["release_date"])
    budget = get_budget(movie_df)
    revenue = get_revenue(movie_df)
    return [popularity, vote_average, release_year, budget, revenue]



def get_embeddings(actor_name, movie_idx=0):
    """Return the feature vector of one of an actor's 'known_for' movies.

    The actor's 'known_for' entries are ordered by their 'release_date'
    string; entries without a usable release date are given the placeholder
    'None', which sorts after digit-leading date strings.

    Args:
        actor_name: Value to match against the 'name' column of actors_df.
            If several rows share the name, the first matching row is used.
        movie_idx: Position of the wanted movie in the date-ordered list.

    Returns:
        The list produced by movie_to_embedding for the selected movie, or
        an empty list when the actor has movie_idx or fewer entries.

    Raises:
        IndexError: If no row of actors_df has the given name.
    """
    known_for = actors_df.loc[actors_df['name'] == actor_name, 'known_for'].values[0]
    # Work on shallow copies so the dicts stored inside actors_df are not
    # modified as a side effect. A missing, None or empty release date all
    # become the 'None' placeholder, which keeps the sort keys comparable.
    dated_movies = [
        {**movie, "release_date": movie.get("release_date") or 'None'}
        for movie in known_for
    ]
    dated_movies.sort(key=lambda movie: movie['release_date'])
    if len(dated_movies) <= movie_idx:
        return []
    return movie_to_embedding(dated_movies[movie_idx])

In [164]:
# Fill each embedding column: column number i receives, for every actor,
# the feature vector of that actor's i-th movie as chosen by get_embeddings.
for i, embedding_column in enumerate(embedding_columns):
    actor_names = actors_df["name"]
    actors_df[embedding_column] = actor_names.map(
        lambda actor_name: get_embeddings(actor_name, movie_idx=i)
    )

In [165]:
# Preview the first rows of actors_df.
actors_df.head()

Unnamed: 0,name,gender,original_language,known_for,id,embedding_movie_1,embedding_movie_2,embedding_movie_3
1,Gary Oldman,2,en,"[{'adult': False, 'backdrop_path': '/nMKdUUepR...",64,"[127.121, 8.513, 2008, 185000000, 1004558444]","[78.615, 7.777, 2012, 250000000, 1081041287]","[32.352, 7.35, 2017, 30000000, 150847207]"
3,Florence Pugh,1,en,"[{'adult': False, 'backdrop_path': '/aAM3cQmYG...",1373737,"[53.79, 7.162, 2019, 9000000, 47969371]","[36.364, 7.893, 2019, 40000000, 216601214]","[89.461, 7.293, 2021, 200000000, 379751131]"
4,Jason Statham,2,en,"[{'adult': False, 'backdrop_path': '/ysKahAEPP...",976,"[36.288, 7.804, 2000, 10000000, 83557872]","[34.472, 6.698, 2002, 21000000, 43928932]","[101.3, 6.253, 2018, 150000000, 530517320]"
6,Jackie Chan,2,en,"[{'adult': False, 'backdrop_path': '/r4yFYBEcV...",18897,"[57.09, 7.019, 1998, 33000000, 244721064]","[50.15, 6.717, 2001, 90000000, 347325802]","[64.894, 6.436, 2007, 140000000, 258097122]"
7,Scarlett Johansson,1,en,"[{'adult': False, 'backdrop_path': '/ozVwXlfxq...",1245,"[41.798, 7.669, 2014, 170000000, 714766572]","[52.474, 6.44, 2014, 40000000, 458863600]","[89.461, 7.293, 2021, 200000000, 379751131]"


In [166]:
# Narrow actors_df to the name, gender and the three embedding columns.
kept_columns = [
    "name",
    "gender",
    "embedding_movie_1",
    "embedding_movie_2",
    "embedding_movie_3",
]
actors_df = actors_df.loc[:, kept_columns]

In [167]:
# Preview the first rows of the narrowed actors_df.
actors_df.head()

Unnamed: 0,name,gender,embedding_movie_1,embedding_movie_2,embedding_movie_3
1,Gary Oldman,2,"[127.121, 8.513, 2008, 185000000, 1004558444]","[78.615, 7.777, 2012, 250000000, 1081041287]","[32.352, 7.35, 2017, 30000000, 150847207]"
3,Florence Pugh,1,"[53.79, 7.162, 2019, 9000000, 47969371]","[36.364, 7.893, 2019, 40000000, 216601214]","[89.461, 7.293, 2021, 200000000, 379751131]"
4,Jason Statham,2,"[36.288, 7.804, 2000, 10000000, 83557872]","[34.472, 6.698, 2002, 21000000, 43928932]","[101.3, 6.253, 2018, 150000000, 530517320]"
6,Jackie Chan,2,"[57.09, 7.019, 1998, 33000000, 244721064]","[50.15, 6.717, 2001, 90000000, 347325802]","[64.894, 6.436, 2007, 140000000, 258097122]"
7,Scarlett Johansson,1,"[41.798, 7.669, 2014, 170000000, 714766572]","[52.474, 6.44, 2014, 40000000, 458863600]","[89.461, 7.293, 2021, 200000000, 379751131]"


In [168]:
# Write actors_df to CSV without the index column.
# The output directory is created first, so that the write does not fail
# with OSError when the directory is not there yet.
output_csv_path = Path('../Data/preprocessed_data/embedding_db.csv')
output_csv_path.parent.mkdir(parents=True, exist_ok=True)
actors_df.to_csv(output_csv_path, index=False)