In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import os
import json

In [33]:
json_file_path = "../Data/tmdb_resources/tmdb_actors_db.json"

# Read JSON data from the file. The encoding is given explicitly so the result
# does not depend on the platform's default text encoding.
with open(json_file_path, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Convert to DataFrame (nested keys are joined with '_').
actors_df = pd.json_normalize(json_data['results'], sep='_')
# Take 'original_language' from the FIRST 'known_for' entry; rows whose
# 'known_for' is empty, missing, or not a list get None and are dropped below.
actors_df['original_language'] = actors_df['known_for'].apply(
    lambda works: works[0].get('original_language') if isinstance(works, list) and works else None
)
# Keep only rows whose first known_for title is in English and whose
# department is "Acting", restricted to the columns used downstream.
ordered_columns = ["name", "gender", "original_language", "known_for", "id"]
# .copy() makes the result an independent frame, so columns added to it later
# do not trigger pandas' SettingWithCopyWarning.
actors_df = actors_df.loc[
    (actors_df["original_language"] == "en") & (actors_df['known_for_department'] == "Acting"),
    ordered_columns,
].copy()

print(f"There are {actors_df.shape[0]} actors in the dataset.")
display(actors_df)

There are 8441 actors in the dataset.


Unnamed: 0,name,gender,original_language,known_for,id
1,Gary Oldman,2,en,"[{'adult': False, 'backdrop_path': '/nMKdUUepR...",64
3,Florence Pugh,1,en,"[{'adult': False, 'backdrop_path': '/aAM3cQmYG...",1373737
4,Jason Statham,2,en,"[{'adult': False, 'backdrop_path': '/ysKahAEPP...",976
6,Jackie Chan,2,en,"[{'adult': False, 'backdrop_path': '/r4yFYBEcV...",18897
7,Scarlett Johansson,1,en,"[{'adult': False, 'backdrop_path': '/ozVwXlfxq...",1245
...,...,...,...,...,...
9972,Richard Derr,2,en,"[{'adult': False, 'backdrop_path': '/ws8eX1paK...",15772
9973,Michael Maloney,2,en,"[{'adult': False, 'backdrop_path': '/9YoLdWeBS...",17483
9974,Gaia Scodellaro,1,en,"[{'adult': False, 'backdrop_path': '/tC78Pck2Y...",1636737
9976,Peter Cullen,2,en,"[{'adult': False, 'backdrop_path': '/2vFuG6bWG...",19540


In [30]:
# Add one zero-filled placeholder column per embedding slot.
embedding_columns = ["embedding_movie_1", "embedding_movie_2", "embedding_movie_3"]
for embedding_column in embedding_columns:
    actors_df[embedding_column] = 0

In [31]:
# Render the current contents of actors_df for inspection.
display(actors_df)

Unnamed: 0,name,gender,original_language,known_for,id,embedding_movie_1,embedding_movie_2,embedding_movie_3
1,Gary Oldman,2,en,"[{'adult': False, 'backdrop_path': '/nMKdUUepR...",64,0,0,0
3,Florence Pugh,1,en,"[{'adult': False, 'backdrop_path': '/aAM3cQmYG...",1373737,0,0,0
4,Jason Statham,2,en,"[{'adult': False, 'backdrop_path': '/ysKahAEPP...",976,0,0,0
6,Jackie Chan,2,en,"[{'adult': False, 'backdrop_path': '/r4yFYBEcV...",18897,0,0,0
7,Scarlett Johansson,1,en,"[{'adult': False, 'backdrop_path': '/ozVwXlfxq...",1245,0,0,0
...,...,...,...,...,...,...,...,...
9972,Richard Derr,2,en,"[{'adult': False, 'backdrop_path': '/ws8eX1paK...",15772,0,0,0
9973,Michael Maloney,2,en,"[{'adult': False, 'backdrop_path': '/9YoLdWeBS...",17483,0,0,0
9974,Gaia Scodellaro,1,en,"[{'adult': False, 'backdrop_path': '/tC78Pck2Y...",1636737,0,0,0
9976,Peter Cullen,2,en,"[{'adult': False, 'backdrop_path': '/2vFuG6bWG...",19540,0,0,0


In [28]:
# Flatten the known_for records of the first row of actors_df into a DataFrame.
example_df = pd.json_normalize(actors_df["known_for"].iloc[0])

In [29]:
# Preview the first rows of the flattened example frame.
example_df.head()

Unnamed: 0,adult,backdrop_path,id,title,original_language,original_title,overview,poster_path,media_type,genre_ids,popularity,release_date,video,vote_average,vote_count
0,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,155,The Dark Knight,en,The Dark Knight,Batman raises the stakes in his war on crime. ...,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,movie,"[18, 28, 80, 53]",127.121,2008-07-16,False,8.513,30883
1,False,/c3OHQncTAnKFhdOTX7D3LTW6son.jpg,49026,The Dark Knight Rises,en,The Dark Knight Rises,Following the death of District Attorney Harve...,/hr0L2aueqlP2BYUblTTjmtn0hw4.jpg,movie,"[28, 80, 18, 53]",78.615,2012-07-17,False,7.777,21486
2,False,/zXwFJMwvQcJFitP9GcHZvHAHGe8.jpg,399404,Darkest Hour,en,Darkest Hour,"In May 1940, the fate of World War II hangs on...",/xa6G3aKlysQeVg9wOb0dRcIGlWu.jpg,movie,"[18, 36]",32.352,2017-11-22,False,7.35,4789


In [25]:
# Example: list the titles the actor Jackie Chan is known for, ordered by release date
def get_popular_movies(actor_name, actors=None):
    """Return an actor's ``known_for`` titles ordered by date, oldest first.

    Args:
        actor_name: Value matched exactly against the ``name`` column.
        actors: Optional DataFrame with ``name`` and ``known_for`` columns.
            When omitted, the notebook-level ``actors_df`` is used.

    Returns:
        A new list holding the ``known_for`` dicts of the first matching row,
        sorted ascending by their date string.

    Raises:
        IndexError: If no row has the given name.
    """
    if actors is None:
        actors = actors_df

    matches = actors.loc[actors["name"] == actor_name, "known_for"]
    if matches.empty:
        raise IndexError(f"No actor named {actor_name!r} in the actors table.")

    def _date_key(movie):
        # Assumes entries without 'release_date' (presumably TV shows) carry
        # 'first_air_date' instead -- TODO confirm against the TMDB payload.
        # Entries with neither sort first.
        return movie.get("release_date") or movie.get("first_air_date") or ""

    # If several rows share the name, only the first one is used.
    return sorted(matches.iloc[0], key=_date_key)

# Flatten the records returned for Jackie Chan and show a selection of fields.
example_df = pd.json_normalize(get_popular_movies("Jackie Chan"))
ordered_columns = [
    "title",
    "popularity",
    "original_language",
    "genre_ids",
    "release_date",
    "vote_average",
    "vote_count",
    "id",
    "overview",
]
example_df[ordered_columns]

Unnamed: 0,title,popularity,original_language,genre_ids,release_date,vote_average,vote_count,id,overview
0,Rush Hour,57.09,en,"[28, 35, 80]",1998-09-18,7.019,4436,2109,When Hong Kong Inspector Lee is summoned to Lo...
1,Rush Hour 2,50.15,en,"[28, 35, 80]",2001-08-03,6.717,3729,5175,It's vacation time for Carter as he finds hims...
2,Rush Hour 3,64.894,en,"[28, 35, 80]",2007-08-08,6.436,2948,5174,"After a botched assassination attempt, the mis..."


In [34]:
# Maps movie column names to integer indices.
column_to_idx = dict(
    popularity=0,
    genre_ids=1,
    release_date=2,
    vote_average=3,
)

In [35]:
def movie_to_embedding(title, movie_df):
    """Collect selected features of one movie into a flat list.

    Args:
        title: Unused; kept so existing calls keep working.
        movie_df: DataFrame with ``popularity``, ``vote_average`` and
            ``release_date`` columns; only its first row is read.

    Returns:
        ``[popularity, vote_average, release_date]`` taken from the first row.

    Raises:
        IndexError: If ``movie_df`` has no rows.
    """
    # An ordered tuple rather than a set: iteration order of a set of strings
    # can change between interpreter runs (hash randomization), which would
    # shuffle the feature positions from one kernel session to the next.
    # NOTE(review): column_to_idx lists these names in a different relative
    # order and also includes genre_ids -- confirm which layout is intended.
    feature_names = ("popularity", "vote_average", "release_date")
    return [movie_df[feature_name].values[0] for feature_name in feature_names]

In [None]:
def get_embeddings(actor_name, actors=None):
    """Return an actor's ``known_for`` titles ordered by date, oldest first.

    The embedding step is not implemented yet: the function currently returns
    the sorted movie records themselves, not feature vectors.

    Args:
        actor_name: Value matched exactly against the ``name`` column.
        actors: Optional DataFrame with ``name`` and ``known_for`` columns.
            When omitted, the notebook-level ``actors_df`` is used.

    Returns:
        A new list holding the ``known_for`` dicts of the first matching row,
        sorted ascending by their date string.

    Raises:
        IndexError: If no row has the given name.
    """
    if actors is None:
        actors = actors_df

    known_for = actors.loc[actors["name"] == actor_name, "known_for"]
    if known_for.empty:
        raise IndexError(f"No actor named {actor_name!r} in the actors table.")

    # Assumes entries without 'release_date' (presumably TV shows) carry
    # 'first_air_date' instead -- TODO confirm against the TMDB payload.
    popular_movies = sorted(
        known_for.iloc[0],
        key=lambda movie: movie.get("release_date") or movie.get("first_air_date") or "",
    )

    # TODO: turn each movie into a feature vector and store it in the
    # embedding columns; the original cell left this step unfinished.
    return popular_movies