In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import os
import json
from pathlib import Path

**Embeddings: 1st version**

Each actor is represented by his gender and career trajectory: top-3 most popular movies from TMDB database in chronological order

In [205]:
json_file_path = "../Data/tmdb_resources/tmdb_actors_db.json"

# Read JSON data from the file
with open(json_file_path, 'r') as file:
    json_data = json.load(file)

# Convert to DataFrame
actors_df = pd.json_normalize(json_data['results'], sep='_')
# Extracting 'original_language' from 'known_for' and adding it to the DataFrame
actors_df['original_language'] = actors_df['known_for'].apply(lambda x: x[0]['original_language'] if x else None)
# Filtering non-Hollywood/British actors
actors_df = actors_df[actors_df["original_language"] == "en"]
actors_df = actors_df[actors_df['known_for_department'] == "Acting"]
ordered_columns = ["name", "gender", "original_language", "known_for", "id"]
actors_df = actors_df[ordered_columns]

print(f"There are {actors_df.shape[0]} actors in the dataset.")
display(actors_df)

There are 8441 actors in the dataset.


Unnamed: 0,name,gender,original_language,known_for,id
1,Gary Oldman,2,en,"[{'adult': False, 'backdrop_path': '/nMKdUUepR...",64
3,Florence Pugh,1,en,"[{'adult': False, 'backdrop_path': '/aAM3cQmYG...",1373737
4,Jason Statham,2,en,"[{'adult': False, 'backdrop_path': '/ysKahAEPP...",976
6,Jackie Chan,2,en,"[{'adult': False, 'backdrop_path': '/r4yFYBEcV...",18897
7,Scarlett Johansson,1,en,"[{'adult': False, 'backdrop_path': '/ozVwXlfxq...",1245
...,...,...,...,...,...
9972,Richard Derr,2,en,"[{'adult': False, 'backdrop_path': '/ws8eX1paK...",15772
9973,Michael Maloney,2,en,"[{'adult': False, 'backdrop_path': '/9YoLdWeBS...",17483
9974,Gaia Scodellaro,1,en,"[{'adult': False, 'backdrop_path': '/tC78Pck2Y...",1636737
9976,Peter Cullen,2,en,"[{'adult': False, 'backdrop_path': '/2vFuG6bWG...",19540


In [206]:
data_path = "../Data/tmdb_resources/tmdb_id2detail.json"
tmdb_id2detail = { k:json.loads(v) for k,v in json.load(open(data_path,'r')).items() }

**Feature extraction**

In [208]:
feature_names = ["popularity", "vote_avg", "year_of_release", "budget", "revenue"]

In [209]:
EMBEDDING_DIM = 15

def date_to_year(date):
    if date == "None":
        return 0
    return int(date.split("-")[0])

def get_budget(movie_df):
    tmdb_idx = str(movie_df["id"])
    if tmdb_idx in tmdb_id2detail:
        return tmdb_id2detail[tmdb_idx]["budget"]
    return 0

def get_revenue(movie_df):
    tmdb_idx = str(movie_df["id"])
    if tmdb_idx in tmdb_id2detail:
        return tmdb_id2detail[tmdb_idx]["revenue"]
    return 0

def movie_to_embedding(movie_df):
    # feature vector for each movie has format (popularity, vote_average, release_date, production_budget, revenue)
    features = []
    feature_names = ["popularity", "vote_average"]
    for feature_name in feature_names:
        features.append(float(movie_df[feature_name]))
    features.append(date_to_year(movie_df["release_date"]))
    features.append(get_budget(movie_df))
    features.append(get_revenue(movie_df))
    return features

def get_embeddings(info_per_actor):
    embeddings = []
    popular_movies = info_per_actor['known_for']
    for movie in popular_movies:
        if "release_date" not in movie:
            movie["release_date"] = 'None'
    popular_movies = sorted(popular_movies, key=lambda x: x['release_date'])
    for movie in popular_movies:
        embeddings.extend(movie_to_embedding(movie))
    if len(embeddings) < EMBEDDING_DIM:
        embeddings.extend([0]*(EMBEDDING_DIM - len(embeddings)))
    return embeddings

In [211]:
number_of_movies = 3
emb_columns = []
for i in range(number_of_movies):
    emb_columns.extend([name + "_" + str(i) for name in feature_names])
actors_df[emb_columns] = actors_df.apply(get_embeddings, axis=1, result_type="expand")

In [214]:
actors_df = actors_df[["name", "gender", *emb_columns]]
actors_df.head()

Unnamed: 0,name,gender,popularity_0,vote_avg_0,year_of_release_0,budget_0,revenue_0,popularity_1,vote_avg_1,year_of_release_1,budget_1,revenue_1,popularity_2,vote_avg_2,year_of_release_2,budget_2,revenue_2
1,Gary Oldman,2,127.121,8.513,2008.0,185000000.0,1004558000.0,78.615,7.777,2012.0,250000000.0,1081041000.0,32.352,7.35,2017.0,30000000.0,150847207.0
3,Florence Pugh,1,53.79,7.162,2019.0,9000000.0,47969370.0,36.364,7.893,2019.0,40000000.0,216601200.0,89.461,7.293,2021.0,200000000.0,379751131.0
4,Jason Statham,2,36.288,7.804,2000.0,10000000.0,83557870.0,34.472,6.698,2002.0,21000000.0,43928930.0,101.3,6.253,2018.0,150000000.0,530517320.0
6,Jackie Chan,2,57.09,7.019,1998.0,33000000.0,244721100.0,50.15,6.717,2001.0,90000000.0,347325800.0,64.894,6.436,2007.0,140000000.0,258097122.0
7,Scarlett Johansson,1,41.798,7.669,2014.0,170000000.0,714766600.0,52.474,6.44,2014.0,40000000.0,458863600.0,89.461,7.293,2021.0,200000000.0,379751131.0


In [215]:
actors_df.to_csv('../Data/preprocessed_data/embedding_db.csv', index=False)

**Data scaling**

In [220]:
# one-hot-encoding for gender 
actors_df["female"] = actors_df["gender"].apply(lambda x: int(x == 1))
actors_df["male"] = actors_df["gender"].apply(lambda x: int(x == 2))
actors_df.drop(columns =["gender"], inplace=True)

In [263]:
actors_df.head()

Unnamed: 0,name,popularity_0,vote_avg_0,year_of_release_0,budget_0,revenue_0,popularity_1,vote_avg_1,year_of_release_1,budget_1,revenue_1,popularity_2,vote_avg_2,year_of_release_2,budget_2,revenue_2,female,male
1,Gary Oldman,87069.15625,8.513,2008.0,119417400000.0,765977000000.0,33907.689655,7.777,2012.0,156594800000.0,683398000000.0,-12084.82,7.35,2017.0,-46639360000.0,-182249600000.0,0,1
3,Florence Pugh,18104.353333,7.162,2019.0,-62627840000.0,-277761900000.0,-2344.364706,7.893,2019.0,-34690290000.0,-132291100000.0,33197.570312,7.293,2021.0,103615100000.0,146148700000.0,1,0
4,Jason Statham,-2281.215311,7.804,2000.0,-42408020000.0,-101079900000.0,-9947.127273,6.698,2002.0,-40629500000.0,-262495700000.0,56108.307263,6.253,2018.0,71098520000.0,177829600000.0,0,1
6,Jackie Chan,22682.105556,7.019,1998.0,-16255530000.0,64707790000.0,10825.383234,6.717,2001.0,33458390000.0,134278400000.0,24975.111111,6.436,2007.0,61173440000.0,-4452244000.0,0,1
7,Scarlett Johansson,1410.468864,7.669,2014.0,105610000000.0,468272700000.0,8805.530435,6.44,2014.0,-32397350000.0,177895100000.0,33197.570312,7.293,2021.0,103615100000.0,146148700000.0,1,0


In [264]:
# time-scaling for budget
def find_mean_std(column_to_scale, year_column):
    actors_df_filtered = actors_df[actors_df[column_to_scale] > 0]
    mean_db = actors_df_filtered.groupby(year_column)[column_to_scale].mean()
    std_db = actors_df_filtered.groupby(year_column)[column_to_scale].std()
    return mean_db, std_db

def normalize(x, column_to_scale, year_column, mean_db, std_db):
    EPS = 0.001
    year = x[year_column]
    if year in mean_db:
        return (x[column_to_scale] - mean_db[year]) / max(std_db[year], EPS)
    return 0.

In [265]:
year_columns = ["year_of_release_0", "year_of_release_1", "year_of_release_2"]
numerical_columns = ["popularity", "budget", "revenue"]

for i, year_column in enumerate(year_columns):
    columns_to_scale = [x + "_" + str(i) for x in numerical_columns]
    for column_to_scale in columns_to_scale:
        mean_db, std_db = find_mean_std(column_to_scale, year_column)
        actors_df[column_to_scale] = actors_df.apply(normalize,
                                                    column_to_scale=column_to_scale,
                                                    year_column=year_column, 
                                                    mean_db=mean_db,
                                                    std_db=std_db,
                                                    axis=1)

In [266]:
actors_df.head()

Unnamed: 0,name,popularity_0,vote_avg_0,year_of_release_0,budget_0,revenue_0,popularity_1,vote_avg_1,year_of_release_1,budget_1,revenue_1,popularity_2,vote_avg_2,year_of_release_2,budget_2,revenue_2,female,male
1,Gary Oldman,1.507633,8.513,2008.0,1.316324,2.198007,0.010764,7.777,2012.0,1.386235,0.812204,-0.896142,7.35,2017.0,-2.458227,-1.993847,0,1
3,Florence Pugh,-0.489096,7.162,2019.0,-2.091787,-1.439428,-1.109771,7.893,2019.0,-1.920269,-1.290402,-0.362123,7.293,2021.0,0.580893,-0.647513,1,0
4,Jason Statham,-1.079677,7.804,2000.0,-3.515421,-2.560657,-1.665977,6.698,2002.0,-3.355364,-2.319752,0.230618,6.253,2018.0,-0.186803,-0.503029,0,1
6,Jackie Chan,-0.352283,7.019,1998.0,-1.917068,-0.79804,-0.658973,6.717,2001.0,-0.069952,-0.507973,-0.478246,6.436,2007.0,-0.362471,-1.495821,0,1
7,Scarlett Johansson,-0.884983,7.669,2014.0,0.774197,0.936368,-0.96942,6.44,2014.0,-2.784103,-0.594139,-0.362123,7.293,2021.0,0.580893,-0.647513,1,0


In [267]:
# vote average doesn't require time scaling
for column_to_scale in ["vote_avg_0", "vote_avg_1", "vote_avg_2"]:
    actors_df[column_to_scale] -= actors_df[column_to_scale].mean()
    actors_df[column_to_scale] /= actors_df[column_to_scale].std()

In [268]:
actors_df.head()

Unnamed: 0,name,popularity_0,vote_avg_0,year_of_release_0,budget_0,revenue_0,popularity_1,vote_avg_1,year_of_release_1,budget_1,revenue_1,popularity_2,vote_avg_2,year_of_release_2,budget_2,revenue_2,female,male
1,Gary Oldman,1.507633,2.019536,2008.0,1.316324,2.198007,0.010764,1.134591,2012.0,1.386235,0.812204,-0.896142,0.362527,2017.0,-2.458227,-1.993847,0,1
3,Florence Pugh,-0.489096,0.49286,2019.0,-2.091787,-1.439428,-1.109771,1.257971,2019.0,-1.920269,-1.290402,-0.362123,0.305373,2021.0,0.580893,-0.647513,1,0
4,Jason Statham,-1.079677,1.218342,2000.0,-3.515421,-2.560657,-1.665977,-0.01305,2002.0,-3.355364,-2.319752,0.230618,-0.737429,2018.0,-0.186803,-0.503029,0,1
6,Jackie Chan,-0.352283,0.331265,1998.0,-1.917068,-0.79804,-0.658973,0.007158,2001.0,-0.069952,-0.507973,-0.478246,-0.553936,2007.0,-0.362471,-1.495821,0,1
7,Scarlett Johansson,-0.884983,1.065787,2014.0,0.774197,0.936368,-0.96942,-0.287463,2014.0,-2.784103,-0.594139,-0.362123,0.305373,2021.0,0.580893,-0.647513,1,0


In [269]:
actors_df.to_csv('../Data/preprocessed_data/embedding_db_normalized.csv', index=False)

**Embeddings: 2nd version after Sunday's discussion**

Second version of embeddings includes avg statistics for beginning, middle and end of an actor's career, instead of only 3 most popular movies. We consider only movies which are in CMU database

In [18]:
tmdb_actors = pd.read_csv("../Data/preprocessed_data/actor_genre.csv")
tmdb_actors = tmdb_actors[["name", "genre_mean_weighted"]]

In [19]:
tmdb_id2detail_path = "../Data/tmdb_resources/tmdb_id2detail.json"
movie_properties = { k:json.loads(v) for k,v in json.load(open(tmdb_id2detail_path,'r')).items() }

key_0 = list(movie_properties.keys())[0]
print(movie_properties[key_0])

{'adult': False, 'backdrop_path': '/anSbunnEMI0TSmizqUSRACoe18l.jpg', 'belongs_to_collection': None, 'budget': 28000000, 'genres': [{'id': 28, 'name': 'Action'}, {'id': 27, 'name': 'Horror'}, {'id': 878, 'name': 'Science Fiction'}], 'homepage': 'http://www.theofficialjohncarpenter.com/ghost-of-mars/', 'id': 10016, 'imdb_id': 'tt0228333', 'original_language': 'en', 'original_title': 'Ghosts of Mars', 'overview': 'In 2176, a Martian police unit is sent to pick up a highly dangerous criminal at a remote mining post. Upon arrival, the cops find the post deserted and something far more dangerous than any criminal — the original inhabitants of Mars, hellbent on getting their planet back.', 'popularity': 19.189, 'poster_path': '/i2zztssCIbahGES1fdfWFmDXian.jpg', 'production_companies': [{'id': 51312, 'logo_path': None, 'name': 'Animationwerks', 'origin_country': ''}, {'id': 3287, 'logo_path': '/bz6GbCQQXGNE56LTW9dwgksW0Iw.png', 'name': 'Screen Gems', 'origin_country': 'US'}, {'id': 23895, 'lo

In [20]:
tmdb_id2credit = "../Data/tmdb_resources/tmdb_id2credit.json"
movie_to_cast = { k:json.loads(v) for k,v in json.load(open(tmdb_id2credit,'r')).items() }

key_0 = list(movie_to_cast.keys())[0]
print(movie_to_cast[key_0])

{'id': 44565, 'cast': [{'adult': False, 'gender': 2, 'id': 35070, 'known_for_department': 'Acting', 'name': 'Akshay Kumar', 'original_name': 'Akshay Kumar', 'popularity': 20.117, 'profile_path': '/gaINfJtR19rNTIsI2cF3KFjyAB7.jpg', 'cast_id': 4, 'character': 'Sachin Tichkule', 'credit_id': '52fe468fc3a36847f8105a31', 'order': 0}, {'adult': False, 'gender': 1, 'id': 116925, 'known_for_department': 'Acting', 'name': 'Trisha Krishnan', 'original_name': 'Trisha Krishnan', 'popularity': 34.129, 'profile_path': '/jfeAV0VeAQhKONzIv1UEYbklJGn.jpg', 'cast_id': 5, 'character': 'Gehna Ghanphule Madam', 'credit_id': '52fe468fc3a36847f8105a35', 'order': 1}, {'adult': False, 'gender': 2, 'id': 35756, 'known_for_department': 'Acting', 'name': 'Rajpal Yadav', 'original_name': 'Rajpal Yadav', 'popularity': 13.055, 'profile_path': '/zhzfI91YuODuC3ZY2ABytw68C37.jpg', 'cast_id': 6, 'character': 'Rangeela', 'credit_id': '52fe468fc3a36847f8105a39', 'order': 2}, {'adult': False, 'gender': 2, 'id': 35819, 'kno

In [23]:
tmdb_id2plot_cmu_only = "../Data/tmdb_resources/tmdb_id2plot_cmu_only.json"
cmu_movies_with_plots = json.load(open(tmdb_id2plot_cmu_only,'r'))

print(len(cmu_movies_with_plots.keys()))
key_0 = list(cmu_movies_with_plots.keys())[0]
print(key_0, cmu_movies_with_plots[key_0])

32280
10016 Set in the second half of the 22nd century, the film depicts Mars as a planet that has been 84% terraformed, allowing humans to walk on the surface without wearing pressure suits. The Martian society has become largely matriarchal, with women in most positions of authority. The story concerns a police officer, Melanie Ballard , second in command of a small team alongside Sergeant Jericho  sent to pick up and transport a prisoner named Desolation Williams . Arriving at the remote mining town where Williams is being held, Ballard finds virtually all of the people missing. She learns that the miners had discovered an underground doorway created by an ancient Martian civilization. When the door was opened it released "ghosts," disembodied spirits which possessed the miners. Violence ensues, as the possessed miners commit horrific acts of death and destruction, as well as self-mutilation. With their team leader Helena Bradock  murdered, Ballard must fight off the attacking miner

In [24]:
actors_to_movies_matching = {}
for id in movie_to_cast:
    cast = movie_to_cast[id]["cast"]
    for actor in cast:
        if actor["name"] in actors_to_movies_matching:
            actors_to_movies_matching[actor["name"]].append(id)
        else:
            actors_to_movies_matching[actor["name"]] = [id]

In [25]:
actors_to_movies_matching = {k:v for k, v in actors_to_movies_matching.items() if k in tmdb_actors["name"].unique()}

In [26]:
len(actors_to_movies_matching)

9284

In [27]:
LAST_RELEASE_THRESHOLD = "2003" # filtering out actors who ended their careers 20 years ago and earlier

def count_movies(info_per_actor):
    name = info_per_actor["name"]
    if name in actors_to_movies_matching:
        n_movies = len(actors_to_movies_matching[name])
        movie_ids = actors_to_movies_matching[name]
        release_dates = sorted([movie_properties[id]["release_date"] for id in movie_ids])

        last_release_date = release_dates[-1]
        if last_release_date >= LAST_RELEASE_THRESHOLD:
            return n_movies
    return

tmdb_actors["number_of_movies"] = tmdb_actors.apply(count_movies, axis=1, result_type="expand")

In [29]:
tmdb_actors = tmdb_actors.dropna(subset=["number_of_movies"])
display(tmdb_actors)

Unnamed: 0,name,genre_mean_weighted,number_of_movies
1,Gary Oldman,['Crime'],57.0
3,Florence Pugh,['Mystery'],10.0
4,Jason Statham,['Science Fiction'],45.0
6,Jackie Chan,['Crime'],62.0
7,Scarlett Johansson,['Science Fiction'],51.0
...,...,...,...
9968,Alice Isaaz,['Romance'],6.0
9969,Peter Cullen,['Science Fiction'],18.0
9970,Mary Crosby,['Family'],3.0
9971,Daisuke Namikawa,['Mystery'],12.0


In [30]:
tmdb_actors.to_csv('../Data/preprocessed_data/tmdb_acting_in_2003_and_later.csv', index=False)

In [105]:
numerical_columns = ["popularity", "vote_average", "budget", "revenue"]

def get_average_for_subset(movie_ids, mode="popularity"):
    stats = np.array([float(movie_properties[id][mode]) for id in movie_ids])
    return stats.mean()

def get_embeddings(info_per_actor):
    embeddings = []
    name = info_per_actor['name']
    step = int(info_per_actor['number_of_movies']) // 3
    movie_ids = actors_to_movies_matching[name]
    release_dates = sorted([(movie_properties[id]["release_date"], id) for id in movie_ids])
    
    for i in range(3):
        start = step * i
        end = step * (i + 1) if i < 2 else len(release_dates)
        movie_ids = [x[1] for x in release_dates[start:end]]
        for col_name in numerical_columns:
            embeddings.append(get_average_for_subset(movie_ids, mode=col_name))
    return embeddings

Index 0 corresponds to average statistics for the **beginning** of career, index 1 corresponds to average statistics for the **middle** of career, index 2 corresponds to average statistics for the **end** of career

In [106]:
emb_columns = []
for i in range(3):
    emb_columns.extend([col_name + "_" + str(i) for col_name in numerical_columns])
actors_cmu[emb_columns] = actors_cmu.apply(get_embeddings, axis=1, result_type="expand")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [108]:
display(actors_cmu)

Unnamed: 0,name,genre_mean_weighted,number_of_movies,popularity_0,vote_average_0,budget_0,revenue_0,popularity_1,vote_average_1,budget_1,revenue_1,popularity_2,vote_average_2,budget_2,revenue_2
1,Gary Oldman,['Crime'],32.0,23.073700,6.912700,1.425001e+07,5.334506e+07,52.007100,6.988900,8.020000e+07,2.846164e+08,51.813833,6.760333,9.841667e+07,4.377206e+08
4,Jason Statham,['Science Fiction'],20.0,26.619167,6.770667,2.822500e+07,6.976885e+07,26.384500,6.305167,3.183333e+07,6.045145e+07,41.027875,6.250000,4.712500e+07,1.357196e+08
6,Jackie Chan,['Crime'],36.0,20.357500,6.566250,5.743600e+06,1.441320e+07,25.055750,6.389833,2.949286e+07,8.079403e+07,26.463083,6.341000,3.908333e+07,7.577516e+07
7,Scarlett Johansson,['Science Fiction'],23.0,16.639000,5.899143,2.064286e+07,3.978111e+07,30.243286,6.814286,3.000000e+07,7.171574e+07,25.635444,6.471000,3.488889e+07,8.342756e+07
8,Josh Hutcherson,['Science Fiction'],13.0,36.714500,6.523250,6.884634e+07,1.082254e+08,36.286500,6.268500,3.875000e+07,8.605043e+07,34.813000,6.293400,4.026000e+07,2.225926e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9964,Lyle Lovett,['War'],6.0,15.375500,7.199000,4.000000e+06,1.390854e+07,11.591000,6.267500,7.500000e+06,8.399899e+06,15.229500,6.221500,2.400000e+07,2.474227e+07
9965,Richard Derr,['War'],7.0,6.894000,6.442500,4.680000e+05,2.128500e+06,7.680000,5.266000,2.000000e+06,3.000002e+06,15.064333,6.173333,8.566667e+06,4.199578e+07
9966,Michael Maloney,['Comedy'],5.0,18.096000,7.209000,9.000000e+06,1.017670e+07,22.254000,7.180000,2.500000e+07,1.353302e+08,15.624667,6.896333,2.100000e+07,6.496717e+07
9969,Peter Cullen,['Science Fiction'],12.0,18.088500,5.947750,9.750000e+06,6.214939e+07,38.546000,7.182000,4.775000e+07,6.481246e+07,16.913750,6.455750,1.477500e+08,6.831695e+08


In [110]:
actors_cmu.to_csv('../Data/preprocessed_data/embedding_db_v2.csv', index=False)