# Exploration of the distribution of actors by gender

In [33]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

In [34]:
# Data source: http://www.cs.cmu.edu/~ark/personas/

# Directory holding the cleaned CSV files (the trailing slash is required:
# file names are appended to this string directly)
CLEAN_DATA_PATH = "clean_data/"

In [35]:
# Function: filter NaN of a given column (or several columns)
def filter_nan(dataframe, column_to_filter):
    '''
    Create a new dataframe without the rows holding NaN in the given column(s).

    The given dataframe is left untouched. The number of remaining rows
    is printed as a side effect.

    :param dataframe: pandas.DataFrame
    :param column_to_filter: str (a single column label) or list-like of
        column labels; with several labels, a row is dropped as soon as
        one of these columns is NaN
    :return: new dataframe
    :raises KeyError: if a requested column is not in the dataframe
    '''
    # Always hand a list to `subset`: a bare label is only accepted by
    # recent pandas releases, whereas a list of labels works everywhere.
    if pd.api.types.is_list_like(column_to_filter):
        subset = list(column_to_filter)
    else:
        subset = [column_to_filter]

    new_df = dataframe.dropna(axis=0, subset=subset)
    print(f"Cleaning of {column_to_filter} : {new_df.shape[0]} entries left")
    return new_df

In [41]:
# Load the character table and show it before any cleaning
characters_df = pd.read_csv(CLEAN_DATA_PATH + "characters.csv")
display(characters_df)

initial_entries_number = len(characters_df)
print(f"Initial entries: {initial_entries_number}")

# Cleaning steps, applied in this order (each filter_nan call prints the
# number of rows left):
#   1. drop rows with NaN in column actor_gender
#   2. drop rows with NaN in actor_age, then keep strictly positive ages only
#   3. drop rows with NaN in movie_release_date
# Rows with NaN in name are NOT removed here.
filt_characters_df = (
    characters_df
    .pipe(filter_nan, "actor_gender")
    .pipe(filter_nan, "actor_age")
    .loc[lambda frame: frame["actor_age"] > 0]
    .pipe(filter_nan, "movie_release_date")
)

display(filt_characters_df)

Unnamed: 0,movie_wiki_id,movie_id,movie_release_date,name,actor_birth_date,actor_gender,actor_height,actor_ethinicity_id,actor_name,actor_age,actor_map_id,id,actor_id
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.620,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.780,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.750,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.650,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg
...,...,...,...,...,...,...,...,...,...,...,...,...,...
450664,913762,/m/03pcrp,1992-05-21,Elensh,1970-05,F,,,Dorothy Elias-Fahn,,/m/0kr406c,/m/0kr406h,/m/0b_vcv
450665,913762,/m/03pcrp,1992-05-21,Hibiki,1965-04-12,M,,,Jonathan Fahn,27.0,/m/0kr405_,/m/0kr4090,/m/0bx7_j
450666,28308153,/m/0cp05t9,1957,,1941-11-18,M,1.730,/m/02w7gg,David Hemmings,15.0,/m/0g8ngmc,,/m/022g44
450667,28308153,/m/0cp05t9,1957,,,,,,Roberta Paterson,,/m/0g8ngmj,,/m/0g8ngmm


Initial entries: 450669
Cleaning of actor_gender : 405060 entries left
Cleaning of actor_age : 290770 entries left
Cleaning of movie_release_date : 290349 entries left


Unnamed: 0,movie_wiki_id,movie_id,movie_release_date,name,actor_birth_date,actor_gender,actor_height,actor_ethinicity_id,actor_name,actor_age,actor_map_id,id,actor_id
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.620,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.780,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.750,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.650,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg
...,...,...,...,...,...,...,...,...,...,...,...,...,...
450660,913762,/m/03pcrp,1992-05-21,Additional Voices,1954,M,,,Sonny Byrkett,38.0,/m/0kr405f,/m/0kr405k,/m/0gn4bz
450661,913762,/m/03pcrp,1992-05-21,UN Spacy Commander,1954,M,,,Sonny Byrkett,38.0,/m/0kr407w,/m/0kr407_,/m/0gn4bz
450662,913762,/m/03pcrp,1992-05-21,Silvie Gena,1958,F,,,Susan Byrkett,34.0,/m/0kr40b9,/m/0kr40bf,/m/0gn4nd
450665,913762,/m/03pcrp,1992-05-21,Hibiki,1965-04-12,M,,,Jonathan Fahn,27.0,/m/0kr405_,/m/0kr4090,/m/0bx7_j


Note that the number of entries we obtain at the end is independent of the order in which the columns are filtered for NaN.

In [42]:
# Boolean mask: True for rows whose actor_gender is "F"
IS_FEMALE = filt_characters_df["actor_gender"].eq("F")

# Split the cleaned table in two: rows matching the mask (actresses)
# and every other row (actors)
actresses_df = filt_characters_df.loc[IS_FEMALE]
actors_df = filt_characters_df.loc[~IS_FEMALE]

display(actresses_df)
display(actors_df)
print(f"There is {len(actresses_df)} entries in actresses_df and {len(actors_df)} entries in actors_df")

Unnamed: 0,movie_wiki_id,movie_id,movie_release_date,name,actor_birth_date,actor_gender,actor_height,actor_ethinicity_id,actor_name,actor_age,actor_map_id,id,actor_id
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.620,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.780,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.650,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg
5,975900,/m/03vyhn,2001-08-24,Commander Helena Braddock,1949-05-26,F,1.727,/m/0x67,Pam Grier,52.0,/m/02vdcfp,/m/0bgchnd,/m/0418ft
6,975900,/m/03vyhn,2001-08-24,Whitlock,1945-08-02,F,1.753,,Joanna Cassidy,56.0,/m/02vd6kw,/m/0bgchmx,/m/06lj1m
...,...,...,...,...,...,...,...,...,...,...,...,...,...
450643,12476867,/m/02w7zz8,2002,,1980-06-24,F,1.720,/m/041rx,Liane Balaban,21.0,/m/03jpb_5,,/m/02pn4z4
450645,12476867,/m/02w7zz8,2002,,1978,F,1.650,,Siri Baruc,24.0,/m/0gc8cd7,,/m/0gbwvjl
450653,913762,/m/03pcrp,1992-05-21,Ishtar,1970-02-19,F,,,笠原弘子,22.0,/m/0kr40cw,/m/0kr40cz,/m/01qwg7h
450654,913762,/m/03pcrp,1992-05-21,Sylvie,1966-12-20,F,,,Yumi Tōma,25.0,/m/0kr40cd,/m/0kr40ch,/m/08g3fb


Unnamed: 0,movie_wiki_id,movie_id,movie_release_date,name,actor_birth_date,actor_gender,actor_height,actor_ethinicity_id,actor_name,actor_age,actor_map_id,id,actor_id
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.750,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
8,975900,/m/03vyhn,2001-08-24,Michael Descanso,1971-03-20,M,1.892,,Liam Waite,30.0,/m/03jqhb0,/m/0bgchs4,/m/0ks8b0
11,975900,/m/03vyhn,2001-08-24,Tres,1959-03-09,M,,/m/064b9n,Rodney A. Grant,42.0,/m/0bgchrs,/m/0bgchrw,/m/03ydsb
12,975900,/m/03vyhn,2001-08-24,McSimms,1944-07-22,M,1.800,,Peter Jason,57.0,/m/0bgchxd,/m/0bgchxh,/m/03d663h
...,...,...,...,...,...,...,...,...,...,...,...,...,...
450658,913762,/m/03pcrp,1992-05-21,Lord Feff,1960-04-28,M,,,Steven Blum,32.0,/m/0kr408g,/m/0kr408l,/m/044_7j
450660,913762,/m/03pcrp,1992-05-21,Additional Voices,1954,M,,,Sonny Byrkett,38.0,/m/0kr405f,/m/0kr405k,/m/0gn4bz
450661,913762,/m/03pcrp,1992-05-21,UN Spacy Commander,1954,M,,,Sonny Byrkett,38.0,/m/0kr407w,/m/0kr407_,/m/0gn4bz
450665,913762,/m/03pcrp,1992-05-21,Hibiki,1965-04-12,M,,,Jonathan Fahn,27.0,/m/0kr405_,/m/0kr4090,/m/0bx7_j


There is 99100 entries in actresses_df and 191249 entries in actors_df


Note that there are about half as many entries in actresses_df as in actors_df: after data cleaning, we have approximately half as much data for female characters as for male characters. 
It could be interesting to analyse whether one gender has more incomplete data (containing NaN values) in movie_release_date and actor_name than the other. 

In [48]:
# Disabled: per-age count of the non-NaN values of every column of actresses_df.
# The code is commented out, so the table displayed as this cell's output
# comes from an earlier run in which these lines were active.
# actresses_age_categories_df = actresses_df.groupby("actor_age").count()

# display(actresses_age_categories_df)

Unnamed: 0_level_0,movie_wiki_id,movie_id,movie_release_date,name,actor_birth_date,actor_gender,actor_height,actor_ethinicity_id,actor_name,actor_map_id,id,actor_id
actor_age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1.0,15,15,15,6,15,15,3,4,15,15,6,15
2.0,33,33,33,15,33,33,6,8,33,33,15,33
3.0,35,35,35,18,35,35,8,14,35,35,18,35
4.0,58,58,58,23,58,58,20,15,58,58,23,58
5.0,90,90,90,53,90,90,40,30,90,90,53,90
...,...,...,...,...,...,...,...,...,...,...,...,...
93.0,5,5,5,2,5,5,2,1,5,5,2,5
95.0,8,8,8,6,8,8,0,3,8,8,6,8
96.0,5,5,5,2,5,5,2,1,5,5,2,5
97.0,1,1,1,0,1,1,0,0,1,1,0,1


In [None]:
# Disabled: plot of the actor_age distribution for actresses and actors.
# NOTE(review): the first histplot call would fail if re-enabled as is:
# after groupby("actor_age").count(), actor_age is the index of
# actresses_age_categories_df, not a column. Plotting actresses_df["actor_age"]
# (mirroring the actors_df line) is presumably what was intended - TODO confirm.
# ax = sns.histplot(actresses_age_categories_df["actor_age"], kde=True, stat="density", color="green", label="actresses")
# sns.histplot(actors_df["actor_age"], kde=True, stat="density", color="blue", label="actors")



#### Comparison of the number of actresses and actors over the years 

In [58]:
# FIXME: disabled - the loop runs for more than 2 minutes, which makes it
# impossible to check the code for errors.
# NOTE(review): why it is slow, and why it would have no effect as written:
#   - get_year does not convert the single value `x`; on every row it calls
#     pd.to_datetime on the WHOLE movie_release_date column of the dataframe;
#   - `.year()` is then called on that converted column; the year of a
#     datetime Series is reached through `.dt.year`, so this call presumably
#     raises, and the bare `except` hides the error and returns `x` unchanged.
# def get_year(dataframe, x):
#     try:
#         return pd.to_datetime(dataframe["movie_release_date"]).year()
#     except:
#         return x

# # Uniformize movie_release_date to string containing only the year
# filt_characters_df["movie_release_date"] = filt_characters_df["movie_release_date"].apply(lambda x: get_year(filt_characters_df, x))
#
# A vectorised alternative (no Python-level loop) could be:
#     filt_characters_df["movie_release_date"].astype(str).str[:4]
# This assumes every date string starts with a 4-digit year, as in the rows
# displayed earlier ("2001-08-24", "1957") - TODO confirm on the full column.

### Age distribution of the roles played, separated by gender