In [1]:
import pandas as pd
import numpy as np
import helpers
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from IPython.display import display

# Load Data

In [2]:
# Load data
char_df = helpers.get_characters()
char_df = char_df.where(char_df["ActorAge"] > 0, np.nan)
movie_df = helpers.get_movies()
display(char_df.head(5))
display(movie_df.head(5))

Unnamed: 0,WikiID,MovieID,ReleaseDate,CharacterName,ActorDOB,ActorGender,ActorHeight,ActorEthnicity,ActorName,ActorAge,FreebaseCharacterActorMapId,FreebaseCharacterID,FreebaseActorID
0,975900.0,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900.0,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900.0,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900.0,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900.0,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg


Unnamed: 0,WikiID,FreebaseID,Name,ReleaseDate,BoxOfficeRevenue,Runtime,Language,Country,Genres,LanguageID,CountryID,Year
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,English,United States of America,"[(Thriller, /m/01jfsb), (Science Fiction, /m/0...",/m/02h40lc,/m/09c7w0,2001.0
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,English,United States of America,"[(Mystery, /m/02n4kr), (Biographical film, /m/...",/m/02h40lc,/m/09c7w0,2000.0
2,28463795,/m/0crgdbh,Brun bitter,1988-01-01,,83.0,Norwegian,Norway,"[(Crime Fiction, /m/0lsxr), (Drama, /m/07s9rl0)]",/m/05f_3,/m/05b4w,1988.0
3,9363483,/m/0285_cd,White Of The Eye,1987-01-01,,110.0,English,United Kingdom,"[(Thriller, /m/01jfsb), (Erotic thriller, /m/0...",/m/02h40lc,/m/07ssc,1987.0
4,261236,/m/01mrr1,A Woman in Flames,1983-01-01,,106.0,German,Germany,"[(Drama, /m/07s9rl0)]",/m/04306rv,/m/0345h,1983.0


In [3]:
# Get genres as a separate dataframe
genre_df = movie_df[["WikiID", "FreebaseID", "Genres"]].explode("Genres")
genre_df["GenreName"] = genre_df["Genres"].apply(lambda x: x.name)
genre_df["GenreID"] = genre_df["Genres"].apply(lambda x: x.id)
display(genre_df.head(5))


Unnamed: 0,WikiID,FreebaseID,Genres,GenreName,GenreID
0,975900,/m/03vyhn,"(Thriller, /m/01jfsb)",Thriller,/m/01jfsb
0,975900,/m/03vyhn,"(Science Fiction, /m/06n90)",Science Fiction,/m/06n90
0,975900,/m/03vyhn,"(Horror, /m/03npn)",Horror,/m/03npn
0,975900,/m/03vyhn,"(Adventure, /m/03k9fj)",Adventure,/m/03k9fj
0,975900,/m/03vyhn,"(Supernatural, /m/0fdjb)",Supernatural,/m/0fdjb


# Detect actors that played father role

In [4]:
# Rules to detect a father role
def is_father_role(rolename):
    # filter nan values
    if pd.isna(rolename):
        return False

    rolename = rolename.lower()
    if "'s father" in rolename:
        return True
    
    elif "father of" in rolename:
        return True

    elif "the father" in rolename:
        return True
    
    else:
        return False

In [5]:
char_df["is_father_role"] = char_df.CharacterName.apply(is_father_role)

merge_df = char_df.merge(genre_df, on="WikiID", how="inner")

In [6]:
char_df.head()

Unnamed: 0,WikiID,MovieID,ReleaseDate,CharacterName,ActorDOB,ActorGender,ActorHeight,ActorEthnicity,ActorName,ActorAge,FreebaseCharacterActorMapId,FreebaseCharacterID,FreebaseActorID,is_father_role
0,975900.0,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7,False
1,975900.0,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4,False
2,975900.0,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l,False
3,975900.0,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc,False
4,975900.0,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg,False


In [7]:
# Number of father roles
nb_father_roles = char_df.is_father_role.sum()
print(f"There are {nb_father_roles} father roles in the dataset")
print(f"Which is {nb_father_roles / char_df.shape[0] * 100:.2f}% of the total number of roles")


There are 288 father roles in the dataset
Which is 0.06% of the total number of roles


# Genres of movies that have father role

In [8]:
# Count the genres they have played in as father
counts = merge_df[merge_df["is_father_role"]].groupby("GenreName").nunique()[["WikiID"]].sort_values("WikiID", ascending=False)
counts["ratio"] = counts["WikiID"] / nb_father_roles
counts.rename(columns={"WikiID": "count"}, inplace=True)
print(counts.head(25))

                  count     ratio
GenreName                        
Drama               179  0.621528
Romance Film         86  0.298611
World cinema         68  0.236111
Comedy               61  0.211806
Action               42  0.145833
Romantic drama       39  0.135417
Musical              35  0.121528
Bollywood            33  0.114583
Thriller             32  0.111111
Indie                27  0.093750
Family Film          25  0.086806
Adventure            20  0.069444
Romantic comedy      20  0.069444
Crime Fiction        19  0.065972
Fantasy              18  0.062500
Action/Adventure     15  0.052083
Science Fiction      14  0.048611
Comedy-drama         13  0.045139
Animation            13  0.045139
Coming of age        12  0.041667
Film adaptation      12  0.041667
Japanese Movies      10  0.034722
Period piece         10  0.034722
Mystery               9  0.031250
Horror                9  0.031250


# Study of actors that played father role once or more in their career

In [9]:
# Detect is an actor has played a father role once or more in his career
set_of_father_actors_id = set(merge_df[merge_df["is_father_role"]].FreebaseActorID)
merge_df["has_played_father_role"] = merge_df.apply(lambda x: x["FreebaseActorID"] in set_of_father_actors_id, axis=1)
merge_df.head(5)

Unnamed: 0,WikiID,MovieID,ReleaseDate,CharacterName,ActorDOB,ActorGender,ActorHeight,ActorEthnicity,ActorName,ActorAge,FreebaseCharacterActorMapId,FreebaseCharacterID,FreebaseActorID,is_father_role,FreebaseID,Genres,GenreName,GenreID,has_played_father_role
0,975900.0,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7,False,/m/03vyhn,"(Thriller, /m/01jfsb)",Thriller,/m/01jfsb,False
1,975900.0,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7,False,/m/03vyhn,"(Science Fiction, /m/06n90)",Science Fiction,/m/06n90,False
2,975900.0,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7,False,/m/03vyhn,"(Horror, /m/03npn)",Horror,/m/03npn,False
3,975900.0,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7,False,/m/03vyhn,"(Adventure, /m/03k9fj)",Adventure,/m/03k9fj,False
4,975900.0,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7,False,/m/03vyhn,"(Supernatural, /m/0fdjb)",Supernatural,/m/0fdjb,False


In [10]:
print(merge_df[merge_df["has_played_father_role"]].__len__())
merge_df[merge_df["is_father_role"]].__len__()

22072


1110

In [11]:
nb_actors = len(merge_df.FreebaseActorID.unique())
nb_male_actors = len(merge_df[merge_df["ActorGender"] == "M"].FreebaseActorID.unique())
nb_actors_with_father_role = len(merge_df[merge_df["has_played_father_role"]].FreebaseActorID.unique())
print(f"{nb_actors_with_father_role/nb_actors *100:.1f} % of the actors have played a father role at least once in their career")
print(f"{nb_actors_with_father_role/nb_male_actors *100:.1f} % of the male actors have played a father role at least once in their career")

0.5 % of the actors have played a father role at least once in their career
0.8 % of the male actors have played a father role at least once in their career


In [12]:
# Count the genres they have played in in their life
counts = merge_df[merge_df["has_played_father_role"]].groupby("GenreName").nunique()[["WikiID"]].sort_values("WikiID", ascending=False)
counts["ratio"] = counts["WikiID"] / merge_df.has_played_father_role.sum()
counts.rename(columns={"WikiID": "count"}, inplace=True)
print(counts.head(25))

                  count     ratio
GenreName                        
Drama              3073  0.139226
World cinema       1126  0.051015
Romance Film       1122  0.050834
Action             1071  0.048523
Comedy             1029  0.046620
Thriller            813  0.036834
Musical             591  0.026776
Bollywood           574  0.026006
Crime Fiction       560  0.025372
Action/Adventure    482  0.021838
Romantic drama      481  0.021792
Family Film         374  0.016945
Adventure           353  0.015993
Indie               269  0.012187
Period piece        221  0.010013
Romantic comedy     220  0.009967
Mystery             219  0.009922
Fantasy             208  0.009424
Crime Thriller      196  0.008880
Film adaptation     187  0.008472
Comedy film         186  0.008427
Science Fiction     182  0.008246
Horror              180  0.008155
                    173  0.007838
Comedy-drama        163  0.007385


In [17]:
# Number of movies they have played in during their life
counts1 = merge_df[merge_df["has_played_father_role"]].groupby("FreebaseActorID").nunique()[["WikiID"]].sort_values("WikiID", ascending=False)
print(f"On average, actors that have played a father once in their life have played in {counts1.mean().values[0]:.1f} movies")

counts2 = merge_df[~merge_df["has_played_father_role"]].groupby("FreebaseActorID").nunique()[["WikiID"]].sort_values("WikiID", ascending=False)
print(f"On average, actors that have NOT played a father once in their life have played in {counts2.mean().values[0]:.1f} movies")

# ttest to check if the difference is significant
print(f"Independend ttest has pvalue: {stats.ttest_ind(counts1.values, counts2.values).pvalue[0]}")

print("\n")
print("Same thing considering only male actors")

counts2 = merge_df[(~merge_df["has_played_father_role"]) & (merge_df["ActorGender"] == "M")].groupby("FreebaseActorID").nunique()[["WikiID"]].sort_values("WikiID", ascending=False)
print(f"On average, MALE actors that have NOT played a father once in their life have played in {counts2.mean().values[0]:.1f} movies")

# ttest to check if the difference is significant
print(f"Independend ttest has pvalue: {stats.ttest_ind(counts1.values, counts2.values).pvalue[0]}")

On average, actors that have played a father once in their life have played in 26.1 movies
On average, actors that have NOT played a father once in their life have played in 5.5 movies
Independend ttest has pvalue: 5.091928267969821e-221


Same thing considering only male actors
On average, MALE actors that have NOT played a father once in their life have played in 5.8 movies
Independend ttest has pvalue: 2.5997093381612975e-171


In [18]:
# When did they begin their career
counts1 = merge_df[merge_df["has_played_father_role"]].groupby("FreebaseActorID")[["ActorAge"]].min().sort_values("ActorAge", ascending=True)
print(f"On average, actors that have played a father once in their life have played the first time at {counts1.mean().values[0]:.1f} years old")

counts2 = merge_df[~merge_df["has_played_father_role"]].groupby("FreebaseActorID")[["ActorAge"]].min().sort_values("ActorAge", ascending=True)
print(f"On average, actors that have NOT played a father once in their life have played the first time at {counts2.mean().values[0]:.1f} years old")

# ttest to check if the difference is significant
print(f"Independend ttest has pvalue: {stats.ttest_ind(counts1.values, counts2.values).pvalue[0]}")

print("\n")
print("Same thing considering only male actors")

counts2 = merge_df[(~merge_df["has_played_father_role"]) & (merge_df["ActorGender"] == "M")].groupby("FreebaseActorID")[["ActorAge"]].min().sort_values("ActorAge", ascending=True)
print(f"On average, MALE actors that have NOT played a father once in their life have played in {counts2.mean().values[0]:.1f} movies")

# ttest to check if the difference is significant
print(f"Independend ttest has pvalue: {stats.ttest_ind(counts1.values, counts2.values).pvalue[0]}")

On average, actors that have played a father once in their life have played the first time at 35.5 years old
On average, actors that have NOT played a father once in their life have played the first time at 30.9 years old
Independend ttest has pvalue: 3.581861630743631e-08


Same thing considering only male actors
On average, MALE actors that have NOT played a father once in their life have played in 33.0 movies
Independend ttest has pvalue: 0.0024550219181628456
