In [204]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import os
import json
from pathlib import Path

In [205]:
json_file_path = "../Data/tmdb_resources/tmdb_actors_db.json"

# Read the raw actor records. The encoding is given explicitly so that
# non-ASCII names decode the same way on every platform.
with open(json_file_path, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Flatten the records found under the 'results' key into a DataFrame.
actors_df = pd.json_normalize(json_data['results'], sep='_')

# Language of the FIRST 'known_for' entry of each actor. None is used when the
# list is empty, is not a list (e.g. NaN for a record without that key), or the
# entry carries no 'original_language'.
actors_df['original_language'] = actors_df['known_for'].apply(
    lambda x: x[0].get('original_language') if isinstance(x, list) and x else None
)

# Keep only people whose first known-for title is in English and whose
# department is "Acting". Both conditions are combined into a single mask and
# the result is copied, so that later cells can add columns to an independent
# frame instead of to a slice of the normalized data.
is_english = actors_df["original_language"] == "en"
is_acting = actors_df['known_for_department'] == "Acting"
ordered_columns = ["name", "gender", "original_language", "known_for", "id"]
actors_df = actors_df.loc[is_english & is_acting, ordered_columns].copy()

print(f"There are {actors_df.shape[0]} actors in the dataset.")
display(actors_df)

There are 8441 actors in the dataset.


Unnamed: 0,name,gender,original_language,known_for,id
1,Gary Oldman,2,en,"[{'adult': False, 'backdrop_path': '/nMKdUUepR...",64
3,Florence Pugh,1,en,"[{'adult': False, 'backdrop_path': '/aAM3cQmYG...",1373737
4,Jason Statham,2,en,"[{'adult': False, 'backdrop_path': '/ysKahAEPP...",976
6,Jackie Chan,2,en,"[{'adult': False, 'backdrop_path': '/r4yFYBEcV...",18897
7,Scarlett Johansson,1,en,"[{'adult': False, 'backdrop_path': '/ozVwXlfxq...",1245
...,...,...,...,...,...
9972,Richard Derr,2,en,"[{'adult': False, 'backdrop_path': '/ws8eX1paK...",15772
9973,Michael Maloney,2,en,"[{'adult': False, 'backdrop_path': '/9YoLdWeBS...",17483
9974,Gaia Scodellaro,1,en,"[{'adult': False, 'backdrop_path': '/tC78Pck2Y...",1636737
9976,Peter Cullen,2,en,"[{'adult': False, 'backdrop_path': '/2vFuG6bWG...",19540


In [206]:
data_path = "../Data/tmdb_resources/tmdb_id2detail.json"
# Every value in the file is itself a JSON-encoded string, so it is decoded a
# second time with json.loads. The context manager closes the file handle,
# which a bare open(...) call would leave open.
with open(data_path, 'r', encoding='utf-8') as detail_file:
    tmdb_id2detail = {k: json.loads(v) for k, v in json.load(detail_file).items()}

In [208]:
# Short labels for the five per-movie features. They are suffixed with a movie
# index ("_0", "_1", ...) to name the embedding columns, and are listed in the
# same order in which movie_to_embedding emits its values.
feature_names = ["popularity", "vote_avg", "year_of_release", "budget", "revenue"]

In [209]:
# Length of an actor's flattened feature vector; get_embeddings zero-pads
# shorter vectors up to this length.
# 15 = 3 movies x 5 features per movie (see number_of_movies and feature_names).
EMBEDDING_DIM = 15

def date_to_year(date):
    """Extract the year from a date string whose first '-'-separated part is the year.

    Args:
        date: A string such as "2008-07-16", or a marker for a missing date:
            the literal string "None", an empty string, or None.

    Returns:
        The year as an int, or 0 when the date is missing.

    Raises:
        ValueError: If the part before the first "-" is not an integer.
    """
    # Treat every "no date" spelling the same way. Previously an empty string
    # raised ValueError and None raised AttributeError.
    if not date or date == "None":
        return 0
    return int(date.split("-")[0])

def _lookup_detail_field(movie_df, field):
    """Return `field` from the tmdb_id2detail record matching the movie's id, or 0.

    0 is returned when the id has no detail record, when the record does not
    contain `field`, or when the stored value is null.
    """
    # tmdb_id2detail is keyed by the id rendered as a string.
    detail = tmdb_id2detail.get(str(movie_df["id"]))
    if not detail:
        return 0
    value = detail.get(field)
    return 0 if value is None else value


def get_budget(movie_df):
    """Return the budget recorded for a movie, or 0 when it is unknown.

    Args:
        movie_df: Mapping describing one movie; its "id" entry is used for the lookup.
    """
    return _lookup_detail_field(movie_df, "budget")


def get_revenue(movie_df):
    """Return the revenue recorded for a movie, or 0 when it is unknown.

    Args:
        movie_df: Mapping describing one movie; its "id" entry is used for the lookup.
    """
    return _lookup_detail_field(movie_df, "revenue")

def movie_to_embedding(movie_df):
    """Build the five-value feature vector of a single movie.

    The returned list is laid out as
    [popularity, vote_average, release year, budget, revenue].

    Args:
        movie_df: Mapping describing one movie. Must provide "popularity",
            "vote_average" and "release_date"; the budget and revenue helpers
            additionally read its "id".

    Returns:
        A list with five entries in the order given above.
    """
    # The two directly stored numeric fields are coerced to float first,
    # followed by the values that need a helper to derive them.
    return [
        *(float(movie_df[key]) for key in ("popularity", "vote_average")),
        date_to_year(movie_df["release_date"]),
        get_budget(movie_df),
        get_revenue(movie_df),
    ]

def get_embeddings(info_per_actor):
    """Flatten an actor's known-for movies into one fixed-length feature vector.

    The movies are ordered by their release-date string and the per-movie
    vectors produced by movie_to_embedding are concatenated. The result is then
    zero-padded or truncated to exactly EMBEDDING_DIM values.

    Args:
        info_per_actor: Row-like mapping whose "known_for" entry is a list of
            movie mappings.

    Returns:
        A list of exactly EMBEDDING_DIM values.
    """
    # Work on shallow copies so the caller's movie dicts are not modified.
    # A missing, None or empty release date is replaced by the "None" marker
    # that date_to_year understands.
    movies = [
        {**movie, "release_date": movie.get("release_date") or "None"}
        for movie in (info_per_actor['known_for'] or [])
    ]
    # Plain string ordering: "None" compares greater than any string that
    # starts with a digit, so undated entries end up last.
    movies.sort(key=lambda movie: movie["release_date"])

    embeddings = []
    for movie in movies:
        embeddings.extend(movie_to_embedding(movie))

    # Enforce the fixed width in both directions: drop any surplus values and
    # zero-pad short vectors, so every row expands to the same number of columns.
    embeddings = embeddings[:EMBEDDING_DIM]
    embeddings.extend([0] * (EMBEDDING_DIM - len(embeddings)))
    return embeddings

In [211]:
number_of_movies = 3

# One label per (movie slot, feature) pair, e.g. "popularity_0" ... "revenue_2",
# in the same order in which get_embeddings lays out its values.
emb_columns = [
    name + "_" + str(i)
    for i in range(number_of_movies)
    for name in feature_names
]

# The vector length and the number of labels are configured in different
# cells; fail early with a readable message if they ever drift apart.
assert len(emb_columns) == EMBEDDING_DIM, (
    f"EMBEDDING_DIM ({EMBEDDING_DIM}) must equal number_of_movies * len(feature_names) "
    f"({len(emb_columns)})"
)

# result_type="expand" turns each returned list into one column per value.
actors_df[emb_columns] = actors_df.apply(get_embeddings, axis=1, result_type="expand")

In [214]:
# Keep only the identifying columns followed by the flattened per-movie features.
actors_df = actors_df.loc[:, ["name", "gender"] + emb_columns]
actors_df.head()

Unnamed: 0,name,gender,popularity_0,vote_avg_0,year_of_release_0,budget_0,revenue_0,popularity_1,vote_avg_1,year_of_release_1,budget_1,revenue_1,popularity_2,vote_avg_2,year_of_release_2,budget_2,revenue_2
1,Gary Oldman,2,127.121,8.513,2008.0,185000000.0,1004558000.0,78.615,7.777,2012.0,250000000.0,1081041000.0,32.352,7.35,2017.0,30000000.0,150847207.0
3,Florence Pugh,1,53.79,7.162,2019.0,9000000.0,47969370.0,36.364,7.893,2019.0,40000000.0,216601200.0,89.461,7.293,2021.0,200000000.0,379751131.0
4,Jason Statham,2,36.288,7.804,2000.0,10000000.0,83557870.0,34.472,6.698,2002.0,21000000.0,43928930.0,101.3,6.253,2018.0,150000000.0,530517320.0
6,Jackie Chan,2,57.09,7.019,1998.0,33000000.0,244721100.0,50.15,6.717,2001.0,90000000.0,347325800.0,64.894,6.436,2007.0,140000000.0,258097122.0
7,Scarlett Johansson,1,41.798,7.669,2014.0,170000000.0,714766600.0,52.474,6.44,2014.0,40000000.0,458863600.0,89.461,7.293,2021.0,200000000.0,379751131.0


In [215]:
# Create the output folder first: to_csv does not create missing directories.
embedding_csv_path = Path('../Data/preprocessed_data/embedding_db.csv')
embedding_csv_path.parent.mkdir(parents=True, exist_ok=True)
# index=False keeps the row labels out of the exported file.
actors_df.to_csv(embedding_csv_path, index=False)