In [46]:
import pandas as pd

from src.data.dataloader import DataLoader

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [47]:
dataloader = DataLoader()

In [48]:
characters = dataloader.load_characters()
print(len(characters))
characters.head()

450787


Unnamed: 0,wikipedia_movie_id,wikidata_movie_id,character_name,actor_date_of_birth,actor_gender,actor_height_meters,actor_name,actor_age_at_release,ethnicity,wikidata_character_id,wikidata_actor_id
0,975900,Q261700,Akooshay,1958-08-26,F,1.62,Wanda De Jesus,42.0,,,Q1873468
1,975900,Q261700,Lieutenant Melanie Ballard,1974-08-15,F,1.78,Natasha Henstridge,27.0,,,Q230527
2,975900,Q261700,Desolation Williams,1969-06-15,M,1.727,Ice Cube,32.0,African Americans,,Q173637
3,975900,Q261700,Sgt Jericho Butler,1967-09-12,M,1.75,Jason Statham,33.0,,,Q169963
4,975900,Q261700,Bashira Kincaid,1977-09-25,F,1.65,Clea DuVall,23.0,,,Q233347


In [49]:
# Merge characters with movies to get release dates
characters_with_movies = pd.merge(
    characters,
    dataloader.load_movies()[["wikipedia_movie_id", "Movie release date"]],
    on="wikipedia_movie_id",
)


# Extract first 4 characters (year) from birth date and convert to numeric
characters_with_movies["actor_birth_year"] = pd.to_numeric(
    characters_with_movies["actor_date_of_birth"].str[:4], errors="coerce"
)

# Extract movie year from release date
characters_with_movies["movie_release_year"] = pd.to_numeric(
    characters_with_movies["Movie release date"].str[:4], errors="coerce"
)

characters_with_movies.drop(columns=["Movie release date"], inplace=True)

print(len(characters_with_movies))


450787


In [50]:
characters_with_movies[characters_with_movies["actor_birth_year"].isna()].head()


Unnamed: 0,wikipedia_movie_id,wikidata_movie_id,character_name,actor_date_of_birth,actor_gender,actor_height_meters,actor_name,actor_age_at_release,ethnicity,wikidata_character_id,wikidata_actor_id,actor_birth_year,movie_release_year
7,975900,Q261700,Big Daddy Mars,,M,,Richard Cetrone,,,,,,2001.0
9,975900,Q261700,Uno,,M,,Duane Davis,,,,,,2001.0
10,975900,Q261700,Dos,,M,,Lobo Sebastian,,,,,,2001.0
13,975900,Q261700,Zimmerman,,M,,Rick Edelstein,,,,,,2001.0
17,3196793,,Police Officer,,M,,Allen Cutler,,,,,,2000.0


In [51]:
# Filter out invalid years (before 1800 and after 2014)
characters_with_movies = characters_with_movies[
    (characters_with_movies["actor_birth_year"].isna())
    | (
        (characters_with_movies["actor_birth_year"] > 1800)
        & (characters_with_movies["actor_birth_year"] < 2014)
    )
]


In [52]:
characters_with_movies[characters_with_movies["actor_birth_year"].isna()].head()


Unnamed: 0,wikipedia_movie_id,wikidata_movie_id,character_name,actor_date_of_birth,actor_gender,actor_height_meters,actor_name,actor_age_at_release,ethnicity,wikidata_character_id,wikidata_actor_id,actor_birth_year,movie_release_year
7,975900,Q261700,Big Daddy Mars,,M,,Richard Cetrone,,,,,,2001.0
9,975900,Q261700,Uno,,M,,Duane Davis,,,,,,2001.0
10,975900,Q261700,Dos,,M,,Lobo Sebastian,,,,,,2001.0
13,975900,Q261700,Zimmerman,,M,,Rick Edelstein,,,,,,2001.0
17,3196793,,Police Officer,,M,,Allen Cutler,,,,,,2000.0


In [53]:
print("Characters with movies after filtering:", len(characters_with_movies))
characters_with_movies.head()

Characters with movies after filtering: 450762


Unnamed: 0,wikipedia_movie_id,wikidata_movie_id,character_name,actor_date_of_birth,actor_gender,actor_height_meters,actor_name,actor_age_at_release,ethnicity,wikidata_character_id,wikidata_actor_id,actor_birth_year,movie_release_year
0,975900,Q261700,Akooshay,1958-08-26,F,1.62,Wanda De Jesus,42.0,,,Q1873468,1958.0,2001.0
1,975900,Q261700,Lieutenant Melanie Ballard,1974-08-15,F,1.78,Natasha Henstridge,27.0,,,Q230527,1974.0,2001.0
2,975900,Q261700,Desolation Williams,1969-06-15,M,1.727,Ice Cube,32.0,African Americans,,Q173637,1969.0,2001.0
3,975900,Q261700,Sgt Jericho Butler,1967-09-12,M,1.75,Jason Statham,33.0,,,Q169963,1967.0,2001.0
4,975900,Q261700,Bashira Kincaid,1977-09-25,F,1.65,Clea DuVall,23.0,,,Q233347,1977.0,2001.0


In [54]:
# Calculate age at release
characters_with_movies["actor_age_at_release"] = (
    characters_with_movies["movie_release_year"]
    - characters_with_movies["actor_birth_year"]
)

# Update the characters DataFrame
# characters = characters_with_movies.drop(["Movie release date", "birth_year"], axis=1)
print(len(characters_with_movies))
characters_with_movies.sort_values("actor_age_at_release").head()

450762


Unnamed: 0,wikipedia_movie_id,wikidata_movie_id,character_name,actor_date_of_birth,actor_gender,actor_height_meters,actor_name,actor_age_at_release,ethnicity,wikidata_character_id,wikidata_actor_id,actor_birth_year,movie_release_year
383908,4210812,Q3794530,Glinda,1992-02-17,M,,Olive Cox,-82.0,,Q2292767,,1992.0,1910.0
298017,6149842,Q2456767,Jack's pal,1988-05-23,M,,William R. Dunn,-76.0,,,,1988.0,1912.0
30168,33625136,,,1981-10-04,F,,Muriel Martin-Harvey,-65.0,,,,1981.0,1916.0
92533,17455134,Q3840588,Sua moglie,1978-04-14,F,,Tilde Teldi,-64.0,,,,1978.0,1914.0
387597,20859086,,,1973-08-24,F,,Grey DeLisle,-61.0,,,Q13938,1973.0,1912.0


In [55]:
# Keep only actors with age at release > 0
characters_with_movies = characters_with_movies[
    (characters_with_movies["actor_age_at_release"].isna())
    | (characters_with_movies["actor_age_at_release"] > 0)
]

print("Characters with movies after filtering:", len(characters_with_movies))
characters_with_movies.head()

Characters with movies after filtering: 450329


Unnamed: 0,wikipedia_movie_id,wikidata_movie_id,character_name,actor_date_of_birth,actor_gender,actor_height_meters,actor_name,actor_age_at_release,ethnicity,wikidata_character_id,wikidata_actor_id,actor_birth_year,movie_release_year
0,975900,Q261700,Akooshay,1958-08-26,F,1.62,Wanda De Jesus,43.0,,,Q1873468,1958.0,2001.0
1,975900,Q261700,Lieutenant Melanie Ballard,1974-08-15,F,1.78,Natasha Henstridge,27.0,,,Q230527,1974.0,2001.0
2,975900,Q261700,Desolation Williams,1969-06-15,M,1.727,Ice Cube,32.0,African Americans,,Q173637,1969.0,2001.0
3,975900,Q261700,Sgt Jericho Butler,1967-09-12,M,1.75,Jason Statham,34.0,,,Q169963,1967.0,2001.0
4,975900,Q261700,Bashira Kincaid,1977-09-25,F,1.65,Clea DuVall,24.0,,,Q233347,1977.0,2001.0


In [58]:
test = characters_with_movies[characters_with_movies["actor_age_at_release"].isna()]

test.head()


Unnamed: 0,wikipedia_movie_id,wikidata_movie_id,character_name,actor_date_of_birth,actor_gender,actor_height_meters,actor_name,actor_age_at_release,ethnicity,wikidata_character_id,wikidata_actor_id,actor_birth_year,movie_release_year
7,975900,Q261700,Big Daddy Mars,,M,,Richard Cetrone,,,,,,2001.0
9,975900,Q261700,Uno,,M,,Duane Davis,,,,,,2001.0
10,975900,Q261700,Dos,,M,,Lobo Sebastian,,,,,,2001.0
13,975900,Q261700,Zimmerman,,M,,Rick Edelstein,,,,,,2001.0
17,3196793,,Police Officer,,M,,Allen Cutler,,,,,,2000.0


In [57]:
len(characters_with_movies)

450329