In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from helpers import *
import plotly.express as px # for interactive plots

In [2]:
# Read the cleaned dataset used for task 1 into `df`.
df = pd.read_csv(filepath_or_buffer="data_cleaned.csv")

In [3]:
# Average male / female share of actor rows for each release date, shown as a stacked bar chart.
# NOTE(review): assumes `Actor_gender_male` is a 0/1 indicator, so `1 - male` is the
# female indicator — confirm against the cleaning step.
df['Actor_gender_female'] = 1 - df['Actor_gender_male']

avg_gender_by_year = (
    df.groupby('Movie_release_date')
    .agg(
        male_proportion=('Actor_gender_male', 'mean'),
        female_proportion=('Actor_gender_female', 'mean'),
    )
    .reset_index()
)

# Rescale so the two shares sum to 1 for every release date.
# The denominator is computed once, up front: dividing the columns one after the other
# in place would make the second division use the already-rescaled male share.
proportion_columns = ['male_proportion', 'female_proportion']
proportion_total = avg_gender_by_year[proportion_columns].sum(axis=1, skipna=False)
avg_gender_by_year[proportion_columns] = avg_gender_by_year[proportion_columns].div(
    proportion_total, axis=0
)

# Long format (one row per release date and gender) is what px.bar needs to stack the bars.
fig = px.bar(
    avg_gender_by_year.melt(id_vars='Movie_release_date', var_name='Gender', value_name='Proportion'),
    x='Movie_release_date',
    y='Proportion',
    color='Gender',
    barmode='stack',
    labels={'Movie_release_date': 'Year', 'Proportion': 'Proportion'},
    title='Average proportion of male and female actors per film over time',
    color_discrete_map={'male_proportion': 'blue', 'female_proportion': 'coral'}
)

# Axis / legend titles and the white template.
fig.update_layout(
    xaxis_title='Year',
    yaxis_title='Proportion',
    legend_title='Gender',
    template='plotly_white'
)

fig.show()


In [4]:
# Save the interactive figure as a standalone HTML file under ./plots.
# The path is built with os.path.join instead of a backslash literal: '\p' and '\g' are
# invalid string escape sequences, and a backslash-separated path only resolves to a
# sub-folder on Windows.
plots_dir = 'plots'
os.makedirs(plots_dir, exist_ok=True)  # write_html does not create missing folders
fig.write_html(os.path.join(plots_dir, 'gender_proportion_plot.html'))