In [1]:
import pandas as pd



In [2]:
# Data source: download the CSV from
# https://www.kaggle.com/datasets/unanimad/the-oscar-award?resource=download
# and save it as ../data/the_oscar_award.csv (the path is relative to the
# directory the notebook kernel runs in).

oscar_award = pd.read_csv('../data/the_oscar_award.csv')

In [3]:
# Number of rows per award category label, most frequent first.
oscar_award['category'].value_counts()

category
DIRECTING                                    469
FILM EDITING                                 450
ACTRESS IN A SUPPORTING ROLE                 440
ACTOR IN A SUPPORTING ROLE                   440
DOCUMENTARY (Short Subject)                  378
                                            ... 
SPECIAL FOREIGN LANGUAGE FILM AWARD            2
GORDON E. SAWYER AWARD                         1
SPECIAL ACHIEVEMENT AWARD (Sound Editing)      1
SPECIAL ACHIEVEMENT AWARD (Sound Effects)      1
AWARD OF COMMENDATION                          1
Name: count, Length: 115, dtype: int64

In [4]:
import json

# Load the gender labels that are joined onto the Oscar rows later on.
# The context manager closes the file handle as soon as parsing is done, and the
# encoding is pinned to UTF-8 (the standard JSON encoding) so accented names are
# decoded the same way on every platform instead of using the locale default.
with open('../src/genders_final.json', encoding='utf-8') as genders_file:
    genders = json.load(genders_file)

gender_df = pd.DataFrame(genders)

In [5]:
import re
def parse_gpt_gender(gender_str):
    """Normalise a free-text gender answer to 'Male', 'Female' or 'unknown'.

    Spaces, periods, commas and digits are removed, surrounding whitespace is
    stripped and the remainder is lower-cased, so answers such as ``"1. Male"``
    or ``"FEMALE."`` are still recognised.

    Parameters
    ----------
    gender_str : str
        Raw answer text. Any non-string value (``None``, NaN, numbers, ...) is
        treated as an unusable answer instead of raising ``AttributeError``.

    Returns
    -------
    str
        ``'Male'``, ``'Female'``, or ``'unknown'`` for everything else.
    """
    # Missing labels (JSON null -> None, or NaN after loading into pandas) have
    # no string methods; report them as unknown rather than crashing .apply().
    if not isinstance(gender_str, str):
        return 'unknown'

    # One pass removes the same characters the separate replace()/sub() calls did.
    clean_str = re.sub(r'[ .,\d]+', '', gender_str).strip().lower()

    return {'male': 'Male', 'female': 'Female'}.get(clean_str, 'unknown')


# Overwrite the raw labels with their normalised form ('Male' / 'Female' / 'unknown').
gender_df['gender'] = gender_df['gender'].map(parse_gpt_gender)

In [6]:
# One row per distinct (name, gender) pair.
unique_people = gender_df.loc[:, ['name', 'gender']].drop_duplicates()

In [7]:
# Discard the pairs whose label could not be resolved to Male/Female.
unique_people = unique_people.loc[unique_people['gender'].ne('unknown')]

In [11]:
import plotly.express as px

# Tally the distinct (name, gender) pairs per gender label and shape the result
# into a two-column frame ('gender', 'count') for plotting.
gender_counts = (
    unique_people['gender']
    .value_counts()
    .reset_index()
    .set_axis(['gender', 'count'], axis='columns')
)

# Pie chart of the gender split: one slice per label, sized by its count.
fig = px.pie(
    gender_counts,
    title='Oscar nominees Gender Imbalance',
    names='gender',   # slice labels
    values='count',   # slice sizes
    color='gender',   # one colour per label
)

fig.show()

# Also export a standalone HTML copy of the figure.
fig.write_html("oscar_gender_imbalance.html")

In [12]:
# Re-display the per-category row counts (same expression as the earlier cell).
oscar_award['category'].value_counts()

category
DIRECTING                                    469
FILM EDITING                                 450
ACTRESS IN A SUPPORTING ROLE                 440
ACTOR IN A SUPPORTING ROLE                   440
DOCUMENTARY (Short Subject)                  378
                                            ... 
SPECIAL FOREIGN LANGUAGE FILM AWARD            2
GORDON E. SAWYER AWARD                         1
SPECIAL ACHIEVEMENT AWARD (Sound Editing)      1
SPECIAL ACHIEVEMENT AWARD (Sound Effects)      1
AWARD OF COMMENDATION                          1
Name: count, Length: 115, dtype: int64

In [13]:
# Every distinct category label, in order of first appearance in the data.
list(oscar_award['category'].unique())

['ACTOR',
 'ACTRESS',
 'ART DIRECTION',
 'CINEMATOGRAPHY',
 'DIRECTING (Comedy Picture)',
 'DIRECTING (Dramatic Picture)',
 'ENGINEERING EFFECTS',
 'OUTSTANDING PICTURE',
 'UNIQUE AND ARTISTIC PICTURE',
 'WRITING (Adaptation)',
 'WRITING (Original Story)',
 'WRITING (Title Writing)',
 'SPECIAL AWARD',
 'DIRECTING',
 'WRITING',
 'OUTSTANDING PRODUCTION',
 'SOUND RECORDING',
 'SHORT SUBJECT (Cartoon)',
 'SHORT SUBJECT (Comedy)',
 'SHORT SUBJECT (Novelty)',
 'ASSISTANT DIRECTOR',
 'FILM EDITING',
 'MUSIC (Scoring)',
 'MUSIC (Song)',
 'DANCE DIRECTION',
 'WRITING (Screenplay)',
 'ACTOR IN A SUPPORTING ROLE',
 'ACTRESS IN A SUPPORTING ROLE',
 'SHORT SUBJECT (Color)',
 'SHORT SUBJECT (One-reel)',
 'SHORT SUBJECT (Two-reel)',
 'IRVING G. THALBERG MEMORIAL AWARD',
 'MUSIC (Original Score)',
 'CINEMATOGRAPHY (Black-and-White)',
 'CINEMATOGRAPHY (Color)',
 'SPECIAL EFFECTS',
 'ART DIRECTION (Black-and-White)',
 'ART DIRECTION (Color)',
 'WRITING (Original Screenplay)',
 'DOCUMENTARY (Short Subje

In [14]:
# Inspect the rows stored under one specific label, 'WRITING (Screenplay)'.
oscar_award[oscar_award['category'] == 'WRITING (Screenplay)']

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
410,1935,1936,8,WRITING (Screenplay),Casey Robinson,Captain Blood,False
411,1935,1936,8,WRITING (Screenplay),Dudley Nichols,The Informer,True
412,1935,1936,8,WRITING (Screenplay),"Screenplay by Waldemar Young, John L. Balderst...",The Lives of a Bengal Lancer,False
413,1935,1936,8,WRITING (Screenplay),"Talbot Jennings, Jules Furthman, Carey Wilson",Mutiny on the Bounty,False
515,1936,1937,9,WRITING (Screenplay),"Frances Goodrich, Albert Hackett",After the Thin Man,False
...,...,...,...,...,...,...,...
3123,1955,1956,28,WRITING (Screenplay),Millard Kaufman,Bad Day at Black Rock,False
3124,1955,1956,28,WRITING (Screenplay),Richard Brooks,Blackboard Jungle,False
3125,1955,1956,28,WRITING (Screenplay),Paul Osborn,East of Eden,False
3126,1955,1956,28,WRITING (Screenplay),"Daniel Fuchs, Isobel Lennart",Love Me or Leave Me,False


In [15]:
# (name, category, film) -> gender, built once so that each Oscar row costs a
# single dictionary lookup instead of a boolean scan over all of gender_df.
# setdefault keeps the first label seen for a key, i.e. the same row that
# "first match in gender_df order" would select.
_gender_lookup = {}
for _key, _gender in zip(
        zip(gender_df['name'], gender_df['category'], gender_df['film']),
        gender_df['gender']):
    _gender_lookup.setdefault(_key, _gender)


def extract_gender(row):
    """Return the gender label recorded for one Oscar nomination row.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide the fields 'name', 'category' and 'film'.

    Returns
    -------
    The 'gender' value of the first gender_df row whose name, category and
    film equal the stringified values taken from ``row``; ``'unknown'`` when no
    such row exists.

    Raises
    ------
    KeyError
        If ``row`` lacks one of the three required fields.
    """
    # Values are stringified before matching, exactly as the comparison against
    # gender_df expects (so a missing film is looked up as the text 'nan').
    key = (str(row['name']), str(row['category']), str(row['film']))
    return _gender_lookup.get(key, 'unknown')


oscar_award.loc[:, 'gender'] = oscar_award.apply(extract_gender, axis=1)

In [16]:
# Maps every raw `category` label kept for the analysis to the broader group it
# is reported under. Key order is significant only in that `human_category`
# mirrors it.
combined_category = {
    "ART DIRECTION": "ART DIRECTION",
    # Acting labels are kept as they are.
    **{
        label: label
        for label in (
            "ACTOR",
            "ACTRESS",
            "ACTOR IN A LEADING ROLE",
            "ACTRESS IN A LEADING ROLE",
            "ACTOR IN A SUPPORTING ROLE",
            "ACTRESS IN A SUPPORTING ROLE",
        )
    },
    **dict.fromkeys(
        (
            "CINEMATOGRAPHY",
            "CINEMATOGRAPHY (Black-and-White)",
            "CINEMATOGRAPHY (Color)",
        ),
        "CINEMATOGRAPHY",
    ),
    "COSTUME DESIGN": "COSTUME DESIGN",
    "MUSIC (Original Score)": "MUSIC",
    **dict.fromkeys(
        (
            "DIRECTING",
            "DIRECTING (Comedy Picture)",
            "DIRECTING (Dramatic Picture)",
        ),
        "DIRECTING",
    ),
    "ASSISTANT DIRECTOR": "ASSISTANT DIRECTOR",
    "DANCE DIRECTION": "DANCE DIRECTION",
    # All historical spellings of the writing awards collapse into one group.
    **dict.fromkeys(
        (
            "WRITING (Adaptation)",
            "WRITING (Original Story)",
            "WRITING (Title Writing)",
            "WRITING",
            "WRITING (Screenplay)",
            "WRITING (Original Screenplay)",
            "WRITING (Motion Picture Story)",
            "WRITING (Story and Screenplay)",
            "WRITING (Screenplay--Adapted)",
            "WRITING (Screenplay--Original)",
            "WRITING (Screenplay--based on material from another medium)",
            "WRITING (Story and Screenplay--written directly for the screen)",
            "WRITING (Screenplay Written Directly for the Screen)",
            "WRITING (Screenplay Based on Material from Another Medium)",
            "WRITING (Screenplay Written Directly for the Screen--based on factual material or on story material not previously published or produced)",
            "WRITING (Adapted Screenplay)",
        ),
        "WRITING",
    ),
    "MAKEUP": "MAKEUP",
    "MAKEUP AND HAIRSTYLING": "MAKEUP",
    "FILM EDITING": "FILM EDITING",
}

# The raw labels covered by the mapping, in mapping order.
human_category = [*combined_category]

In [17]:
# Keep only the rows whose category is listed in `human_category`, then collapse
# each raw label to its group via `combined_category`.
# `.assign` builds a new frame, so the relabelling never writes into a filtered
# slice of `oscar_award` (assigning through `.loc` on such a slice can raise
# pandas' SettingWithCopyWarning).
human_oscar_award = (
    oscar_award[oscar_award['category'].isin(human_category)]
    .assign(category=lambda kept: kept['category'].map(combined_category))
)

In [18]:
# Spot-check the rows grouped under 'ART DIRECTION', including their gender label.
# NOTE(review): in the displayed output some `name` values list several credited
# people in one string yet carry a single gender label (or 'unknown') — confirm
# how such shared credits are meant to be counted.
human_oscar_award[human_oscar_award['category'] == 'ART DIRECTION']

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner,gender
5,1927,1928,1,ART DIRECTION,Rochus Gliese,Sunrise,False,Male
6,1927,1928,1,ART DIRECTION,William Cameron Menzies,The Dove;,True,Male
7,1927,1928,1,ART DIRECTION,Harry Oliver,7th Heaven,False,Male
46,1928,1929,2,ART DIRECTION,Hans Dreier,The Patriot,False,Male
47,1928,1929,2,ART DIRECTION,Cedric Gibbons,The Bridge of San Luis Rey,True,Male
...,...,...,...,...,...,...,...,...
9291,2011,2012,84,ART DIRECTION,Production Design: Laurence Bennett; Set Decor...,The Artist,False,Male
9292,2011,2012,84,ART DIRECTION,Production Design: Stuart Craig; Set Decoratio...,Harry Potter and the Deathly Hallows Part 2,False,Female
9293,2011,2012,84,ART DIRECTION,Production Design: Dante Ferretti; Set Decorat...,Hugo,True,unknown
9294,2011,2012,84,ART DIRECTION,Production Design: Anne Seibel; Set Decoration...,Midnight in Paris,False,unknown


In [19]:
# Row counts: the full dataset vs. the subset kept by the category filter.
len(oscar_award), len(human_oscar_award)

(10889, 5349)

In [20]:
import pandas as pd
import plotly.express as px

# Acting categories are left out of this comparison
# (presumably because those awards are gender-specific by definition — confirm).
acting_categories = [
    'ACTOR', 'ACTOR IN A LEADING ROLE', 'ACTOR IN A SUPPORTING ROLE',
    'ACTRESS', 'ACTRESS IN A LEADING ROLE', 'ACTRESS IN A SUPPORTING ROLE',
]

# Select rows and columns in a single .loc and add the categorical via .assign,
# so nothing is ever written into a slice of `human_oscar_award`.
plot_df = (
    human_oscar_award
    .loc[~human_oscar_award['category'].isin(acting_categories), ['category', 'gender']]
    # Ordered categorical: Male first, Female second. Any other label (e.g.
    # 'unknown') is not a declared category, becomes NaN here and is therefore
    # excluded from the counts below.
    .assign(gender=lambda df: pd.Categorical(df['gender'], categories=["Male", "Female"], ordered=True))
)

# Count rows per (category, gender). observed=False is spelled out so every
# category keeps both a Male and a Female bar (zero counts included) regardless
# of the pandas version's default for categorical groupers.
gender_balance = (
    plot_df
    .groupby(['category', 'gender'], observed=False)
    .size()
    .reset_index(name='count')
)

# Plot using Plotly
fig = px.bar(
    gender_balance,
    x='category',
    y='count',
    color='gender',
    title='Gender Balance by Oscar Category',
    labels={'count': 'Count', 'category': 'Oscar Category'},
    barmode='group',  # Side-by-side bars per gender
    text='count'  # Show counts on the bars
)

# Update layout to make the chart more readable
fig.update_layout(
    xaxis_title="Oscar Category",
    yaxis_title="Number of Nominations",
    legend_title="Gender",
    xaxis=dict(tickangle=-45),  # Rotate x-axis labels for better readability
    margin=dict(pad=10)  # Padding between the plotting area and the axis lines
)

# Show the plot
fig.show()

# Also export a standalone HTML copy of the figure.
fig.write_html("oscar_category_gender.html")





In [21]:
# Directing rows labelled 'Female': the output recorded below shows only
# 7 nominations, 2 of which are wins.

director_oscar = human_oscar_award.loc[human_oscar_award['category'].eq('DIRECTING')]


director_oscar.loc[director_oscar['gender'].eq('Female')]

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner,gender
5444,1976,1977,49,DIRECTING,Lina Wertmüller,Seven Beauties,False,Female
7252,1993,1994,66,DIRECTING,Jane Campion,The Piano,False,Female
8381,2003,2004,76,DIRECTING,Sofia Coppola,Lost in Translation,False,Female
9058,2009,2010,82,DIRECTING,Kathryn Bigelow,The Hurt Locker,True,Female
10052,2017,2018,90,DIRECTING,Greta Gerwig,Lady Bird,False,Female
10553,2021,2022,94,DIRECTING,Jane Campion,The Power of the Dog,True,Female
10800,2023,2024,96,DIRECTING,Justine Triet,Anatomy of a Fall,False,Female


In [22]:
 # Cinematography rows labelled 'Female': only 3 nominations and 0 wins.
 cin_oscar = human_oscar_award.loc[human_oscar_award['category'].eq('CINEMATOGRAPHY')]

 cin_oscar.loc[cin_oscar['gender'].eq('Female')]

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner,gender
10043,2017,2018,90,CINEMATOGRAPHY,Rachel Morrison,Mudbound,False,Female
10542,2021,2022,94,CINEMATOGRAPHY,Ari Wegner,The Power of the Dog,False,Female
10666,2022,2023,95,CINEMATOGRAPHY,Mandy Walker,Elvis,False,Female


In [23]:
# Ten most frequent (name, gender) pairs across the kept categories.
# NOTE: the tally is per exact `name` string, so a shared credit such as
# "A, B" is counted separately from A's solo nominations.
top_nominees = (
    human_oscar_award
    .groupby(['name', 'gender'])
    .size()
    .reset_index(name='nominations')
    .sort_values(by='nominations', ascending=False)
    .head(10)
)

# Bare last expression: the notebook renders the frame as a rich table instead
# of the plain-text dump produced by print().
top_nominees

                name  gender  nominations
1655   John Williams    Male           35
2058    Meryl Streep  Female           19
3195     Woody Allen    Male           18
1838    Leon Shamroy    Male           17
2649    Sandy Powell  Female           14
2971   Thomas Newman    Male           13
2566   Roger Deakins    Male           13
1383  Jack Nicholson    Male           12
712   Colleen Atwood  Female           12
1145   George Folsey    Male           12
