In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from helpers import *
import plotly.express as px # for interactive plots

In [3]:
# Read the cleaned dataset that task 1 works on into `df`.
df = pd.read_csv(filepath_or_buffer="data_cleaned.csv")

In [4]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Movie_name,Movie_release_date,Movie_box_office_scaled,Movie_runtime,Actor_gender_male,Actor_height_scaled,Actor_age_at_movie_release,Cluster_Name_B&W - Indie,Cluster_Name_Comedy - Action,...,Cluster_Name_Romance - Comedy,Cluster_Name_Short film - World cinema,Cluster_Name_Thriller,Region_Asia,Region_Dead country,Region_East Europa,Region_North America,Region_Oceania,Region_South America,Region_West Europa
0,0,Ghosts of Mars,2001.0,7912729.0,98.0,0,1.031847,42.0,False,False,...,False,False,False,False,False,False,True,False,False,False
1,1,Ghosts of Mars,2001.0,7912729.0,98.0,0,1.133758,27.0,False,False,...,False,False,False,False,False,False,True,False,False,False
2,2,Ghosts of Mars,2001.0,7912729.0,98.0,1,0.990877,32.0,False,False,...,False,False,False,False,False,False,True,False,False,False
3,3,Ghosts of Mars,2001.0,7912729.0,98.0,1,1.004074,33.0,False,False,...,False,False,False,False,False,False,True,False,False,False
4,4,Ghosts of Mars,2001.0,7912729.0,98.0,0,1.050955,23.0,False,False,...,False,False,False,False,False,False,True,False,False,False


In [18]:
# Yearly gender balance of film casts, drawn as an interactive stacked area chart.
# The preview of `df` shows one row per actor appearance in a film, with
# `Actor_gender_male` as a 0/1 flag.

# Earlier years contain too few films for a stable average, so the x-axis starts here.
FIRST_YEAR_SHOWN = 1920

# Complement of the male flag (stays NaN wherever the male flag is missing).
df['Actor_gender_female'] = 1 - df['Actor_gender_male']

# Stage 1: collapse to one row per film holding the share of each gender in its cast.
# Averaging the actor rows of a year directly would weight every film by its cast
# size; the chart title promises a per-film average, so each film must count once.
# NOTE(review): a film is identified by (release year, title) because no film id
# column is visible in the preview — confirm titles are unique within a year.
# dropna=False keeps rows whose title is missing instead of silently dropping them.
film_gender_share = (
    df.groupby(['Movie_release_date', 'Movie_name'], dropna=False)
    .agg(
        male_proportion=('Actor_gender_male', 'mean'),
        female_proportion=('Actor_gender_female', 'mean'),
    )
    .reset_index()
)

# Stage 2: average the per-film shares over all films released in the same year.
# Rows with a missing release year are dropped here by groupby's default behaviour.
avg_gender_by_year = (
    film_gender_share
    .groupby('Movie_release_date')[['male_proportion', 'female_proportion']]
    .mean()
    .reset_index()
)

# Long format for Plotly: one row per (year, gender) pair.
gender_long_df = avg_gender_by_year.melt(
    id_vars='Movie_release_date',
    value_vars=['female_proportion', 'male_proportion'],
    var_name='gender',
    value_name='proportion'
)
color_map = {'male_proportion': 'cyan', 'female_proportion': 'pink'}

# Interactive stacked area chart: the two bands add up to 1 for every year.
fig = px.area(
    gender_long_df,
    x='Movie_release_date',
    y='proportion',
    color='gender',
    labels={'Movie_release_date': 'Year', 'proportion': 'Proportion'},
    title='Average Proportion of Male and Female Actors Per Film Over Time',
    template='plotly_white',
    color_discrete_map=color_map
)

# Axis titles, visible year range and a horizontal legend above the plot area.
fig.update_layout(
    xaxis_title='Year',
    yaxis_title='Proportion',
    legend_title='Gender',
    xaxis=dict(range=[FIRST_YEAR_SHOWN, gender_long_df['Movie_release_date'].max()]),
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ),
)

fig.show()


In [20]:
import plotly.io as pio

# Export the interactive figure as a standalone HTML page next to the notebook.
pio.write_html(fig, file='gender_proportion_plot.html')