In [102]:
from src.utils.data_utils import load_dataframe_from_csv
from src.data.dataloader import load_initial_dataset
from src.utils.clean_cmu import clean_movies_cmu
from config import *

import plotly.express as px
import pandas as pd
import plotly
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [103]:
# Load every input table of this notebook through the project helper
# `load_dataframe_from_csv` (imported from src.utils.data_utils).
movie_cmu = load_dataframe_from_csv('movie_cmu.csv')
character = load_dataframe_from_csv('character.csv')
plot_summaries = load_dataframe_from_csv('plot_summaries.csv')
tvtropes = load_dataframe_from_csv('tvtropes.csv')
name_clusters = load_dataframe_from_csv('name_clusters.csv')
oscar_winning_films = load_dataframe_from_csv('oscar_winning_films_ids.csv')
oscar_winning_actors = load_dataframe_from_csv('oscar_winning_actors.csv')
oscar_winning_actresses = load_dataframe_from_csv('oscar_winning_actresses.csv')
oscar_supporting_actors = load_dataframe_from_csv('oscar_winning_supporting_actors.csv')
oscar_supporting_actresses = load_dataframe_from_csv('oscar_winning_supporting_actresses.csv')
extended_films = load_dataframe_from_csv('film_2015_2024.csv')
# NOTE(review): unlike the other calls, this file name has no '.csv' extension and
# spells "acedemy" — presumably it matches the file on disk; confirm.
academy_award_winning_films = load_dataframe_from_csv('acedemy_award_winning_films')
winning_actors_info = load_dataframe_from_csv('winning_actors_information.csv')

In [104]:
# Give the extended film table the same column names as the CMU movie table so
# the two sources can be combined later on.
# rename() is used instead of copy-then-drop so that re-running this cell is a
# no-op rather than a KeyError on the already-removed source columns.
extended_column_map = {
    "genres": "Movie genres",
    "release_date": "Movie release date",
    "languages": "Movie languages",
    "countries": "Movie countries",
    "film": "Movie name",
    "box_office": "Movie box office revenue",
    "runtime": "Movie runtime",
    "page_id": "Wikipedia movie ID",
}
extended_films = (
    extended_films
    .rename(columns=extended_column_map)
    # "release date" (with a space) is a separate column that is simply discarded;
    # errors="ignore" keeps the cell re-runnable once it is gone.
    .drop(columns=["release date"], errors="ignore")
)

# Same alignment for the actor information table: "actor" -> "Actor name".
winning_actors_info = winning_actors_info.rename(columns={"actor": "Actor name"})

In [105]:
# Join the rows of oscar_winning_actresses to the CMU movie table on the film's
# Wikipedia id (default inner join), then discard the now-redundant join key.
oscar_actress_movies_cmu = (
    oscar_winning_actresses
    .merge(movie_cmu, left_on='film_id', right_on='Wikipedia movie ID')
    .drop(columns=['film_id'])
)

In [106]:
# Quick check of the CMU-backed table: its size and the range of release dates.
cmu_release_dates = oscar_actress_movies_cmu['Movie release date']
print(oscar_actress_movies_cmu.shape)
print(cmu_release_dates.min())
print(cmu_release_dates.max())

(88, 10)
1928.0
2012.0


In [107]:
# Same join as for the CMU table, this time against the extended film table.
oscar_actress_movies_extendedDS = (
    oscar_winning_actresses
    .merge(extended_films, left_on='film_id', right_on='Wikipedia movie ID')
    .drop(columns=['film_id'])
)

# The release date arrives as an ISO-style timestamp string; parse it and keep
# only the year.
extended_release_timestamps = pd.to_datetime(
    oscar_actress_movies_extendedDS['Movie release date'],
    format='%Y-%m-%dT%H:%M:%SZ',
)
oscar_actress_movies_extendedDS['Movie release date'] = extended_release_timestamps.dt.year

In [108]:
# Quick check of the extended table: its size and the range of release years.
extended_release_years = oscar_actress_movies_extendedDS['Movie release date']
print(oscar_actress_movies_extendedDS.shape)
print(extended_release_years.min())
print(extended_release_years.max())

(8, 14)
2015
2023


In [109]:
# Stack the CMU-backed and the extended rows into one table and drop the
# 'wikidata_id' column from the result.
oscar_actress_movies = pd.concat(
    [oscar_actress_movies_cmu, oscar_actress_movies_extendedDS],
    axis=0,
).drop(columns=['wikidata_id'])
print(oscar_actress_movies.shape)
print(oscar_actress_movies.columns)

(96, 14)
Index(['Actress id', 'Wikipedia movie ID', 'Freebase movie ID', 'Movie name',
       'Movie release date', 'Movie box office revenue', 'Movie runtime',
       'Movie languages', 'Movie countries', 'Movie genres', 'reviewScores',
       'awardsReceived', 'awardsNominated', 'capitalCost'],
      dtype='object')


In [110]:
# Attach the per-person information to each film row (inner join: page_id == Actress id).
oscar_actress_info = pd.merge(winning_actors_info, oscar_actress_movies, left_on='page_id', right_on='Actress id')
print(oscar_actress_info.columns)
# Remove the columns that are in character.csv.
# Reassignment instead of inplace=True; the former mid-cell `head(2)` expression
# was dropped because its result was discarded (only a cell's last expression is shown).
oscar_actress_info = oscar_actress_info.drop(columns=['Freebase movie ID', 'sexLabel'])
print(oscar_actress_info.shape)

Index(['page_id', 'wikidata_id', 'sexLabel', 'nativeLanguageLabel',
       'countryOfCitizenshipLabel', 'ethnicGroupLabel', 'Actor name',
       'Actress id', 'Wikipedia movie ID', 'Freebase movie ID', 'Movie name',
       'Movie release date', 'Movie box office revenue', 'Movie runtime',
       'Movie languages', 'Movie countries', 'Movie genres', 'reviewScores',
       'awardsReceived', 'awardsNominated', 'capitalCost'],
      dtype='object')
(96, 19)


In [111]:
# Sanity check: does any actor name map to more than one page_id?
# The variable name mentions "ages" but the check is on page_id; the name is kept
# unchanged in case other cells refer to it.
duplicates_with_different_ages = oscar_actress_info.groupby('Actor name').filter(
    lambda same_name_rows: same_name_rows['page_id'].nunique() > 1
)

if duplicates_with_different_ages.empty:
    print("No duplicate names with different Actress id found.")
else:
    # The message now names what is actually compared (ids, not ages).
    print("Duplicate names with different Actress id found:")
    print(duplicates_with_different_ages)

No duplicate names with different Actress id found.


In [112]:
# Box office revenue of the films in oscar_actress_info, summed per release year.
# dropna() on the two-column slice already discards every row that lacks either
# the revenue or the release date.
oscar_revenue_data_corrected = (
    oscar_actress_info[['Movie box office revenue', 'Movie release date']]
    .dropna()
    .assign(Year=lambda frame: frame['Movie release date'])
)
revenue_by_year_corrected = (
    oscar_revenue_data_corrected
    .groupby('Year')['Movie box office revenue']
    .sum()
    .reset_index()
)

# Spline-smoothed line chart of the yearly totals
fig = px.line(
    revenue_by_year_corrected,
    x='Year',
    y='Movie box office revenue',
    line_shape='spline',
    title='Evolution of Movie Box Office Revenue Over the Years',
    labels={'Year': 'Year', 'Movie box office revenue': 'Box Office Revenue ($)'},
)

# Axis titles, dollar-formatted ticks and a fixed figure size
fig.update_layout(
    width=800,
    height=600,
    xaxis={'title': 'Year'},
    yaxis={'title': 'Total Box Office Revenue ($)', 'tickformat': '$,.0f'},
)

fig.show()

In [113]:
# Attach actor details from the character table, matching on both the film id
# and the actor name. The default inner join drops rows without a match.
# NOTE(review): this cell rebuilds oscar_actress_info from itself, so running it a
# second time merges the same character columns again — re-run the earlier cells first.
character_actor_details = character[[
    'Wikipedia movie ID',
    'Actor name',
    'Actor age at movie release',
    'Actor date of birth',
    'Actor ethnicity',
    'Freebase actor ID',
]]
oscar_actress_info = oscar_actress_info.merge(
    character_actor_details,
    on=['Wikipedia movie ID', 'Actor name'],
)

oscar_actress_info.head(89)

Unnamed: 0,page_id,wikidata_id,nativeLanguageLabel,countryOfCitizenshipLabel,ethnicGroupLabel,Actor name,Actress id,Wikipedia movie ID,Movie name,Movie release date,...,Movie countries,Movie genres,reviewScores,awardsReceived,awardsNominated,capitalCost,Actor age at movie release,Actor date of birth,Actor ethnicity,Freebase actor ID
0,45794,Q182462,,United States of America,,Janet Gaynor,45794,61049,Seventh Heaven,,...,United States of America,"Silent film, Indie, Black-and-white, Drama, Wa...",,,,,20.0,1906.0,,/m/0cf3c
1,45794,Q182462,,United States of America,,Janet Gaynor,45794,171990,Street Angel,1928.0,...,United States of America,"Silent film, Indie, Black-and-white, Drama, Ro...",,,,,21.0,1906.0,,/m/0cf3c
2,45794,Q182462,,United States of America,,Janet Gaynor,45794,61904,Sunrise: A Song of Two Humans,,...,United States of America,"Crime Fiction, Silent film, Indie, Black-and-w...",,,,,20.0,1906.0,,/m/0cf3c
3,18823,Q104109,English,Canada,,Mary Pickford,18823,172008,Coquette,,...,United States of America,"Drama, Black-and-white",,,,,,1892.0,,/m/04rfq
4,61354,Q95019,English,United States of America,,Norma Shearer,61354,61191,The Divorcee,1930.0,...,United States of America,"Pre-Code, Black-and-white, Drama, Film adaptat...",,,,,27.0,1902.0,Canadian Americans,/m/0gmv3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,2539269,Q8927,French,France,,Marion Cotillard,2539269,10249080,La Vie En Rose,2007.0,...,France,"Musical, Biography, Drama, Biographical film, ...",,,,,31.0,1975.0,French,/m/07lt7b
85,52707,Q202765,,United Kingdom,,Kate Winslet,52707,14149346,The Reader,2008.0,...,United States of America,"Tragedy, Period piece, Drama, Film adaptation,...",,,,,33.0,1975.0,English people,/m/0dvld
86,29455,Q40791,English,Germany,,Sandra Bullock,29455,22729553,The Blind Side,2009.0,...,United States of America,"Biography, Inspirational Drama, Family Drama, ...",,,,,45.0,1964.0,,/m/0794g
87,38459,Q37876,English,United States of America,,Natalie Portman,38459,24480838,Black Swan,2010.0,...,United States of America,"Thriller, Horror, Psychological thriller, Myst...",,,,,29.0,1981.0,Israeli Americans,/m/09l3p


In [114]:
# Summary of the merged table, plus the ids from oscar_winning_actresses that no
# longer appear in it after the merges.
print('Shape:', oscar_actress_info.shape)
print('Columns:', oscar_actress_info.columns)
still_present = oscar_winning_actresses['Actress id'].isin(oscar_actress_info['Actress id'])
missing_values = oscar_winning_actresses.loc[~still_present, 'Actress id']
print('Missing actresses in the cleaned dataset:', missing_values)
oscar_actress_info.head(3)

Shape: (89, 23)
Columns: Index(['page_id', 'wikidata_id', 'nativeLanguageLabel',
       'countryOfCitizenshipLabel', 'ethnicGroupLabel', 'Actor name',
       'Actress id', 'Wikipedia movie ID', 'Movie name', 'Movie release date',
       'Movie box office revenue', 'Movie runtime', 'Movie languages',
       'Movie countries', 'Movie genres', 'reviewScores', 'awardsReceived',
       'awardsNominated', 'capitalCost', 'Actor age at movie release',
       'Actor date of birth', 'Actor ethnicity', 'Freebase actor ID'],
      dtype='object')
Missing actresses in the cleaned dataset: 90      160126
91      243685
92     1339248
93     3741746
94     3741746
95     1856120
96      204352
97    17181314
98      147873
Name: Actress id, dtype: int64


Unnamed: 0,page_id,wikidata_id,nativeLanguageLabel,countryOfCitizenshipLabel,ethnicGroupLabel,Actor name,Actress id,Wikipedia movie ID,Movie name,Movie release date,...,Movie countries,Movie genres,reviewScores,awardsReceived,awardsNominated,capitalCost,Actor age at movie release,Actor date of birth,Actor ethnicity,Freebase actor ID
0,45794,Q182462,,United States of America,,Janet Gaynor,45794,61049,Seventh Heaven,,...,United States of America,"Silent film, Indie, Black-and-white, Drama, Wa...",,,,,20.0,1906.0,,/m/0cf3c
1,45794,Q182462,,United States of America,,Janet Gaynor,45794,171990,Street Angel,1928.0,...,United States of America,"Silent film, Indie, Black-and-white, Drama, Ro...",,,,,21.0,1906.0,,/m/0cf3c
2,45794,Q182462,,United States of America,,Janet Gaynor,45794,61904,Sunrise: A Song of Two Humans,,...,United States of America,"Crime Fiction, Silent film, Indie, Black-and-w...",,,,,20.0,1906.0,,/m/0cf3c


In [115]:
# Bar chart of how many rows of oscar_actress_info fall on each actor age.
age_column = 'Actor age at movie release'

if age_column in oscar_actress_info.columns:
    # Count occurrences of each age
    age_distribution = oscar_actress_info[age_column].value_counts().reset_index()
    age_distribution.columns = [age_column, 'count']

    # Sort by age for better visualization
    age_distribution = age_distribution.sort_values(by=age_column)

    # `labels` is keyed by the data-frame column names; the previous key 'age'
    # matched no column, so the x-axis label was never replaced.
    fig = px.bar(
        age_distribution,
        x=age_column,
        y='count',
        title='Age Distribution',
        labels={age_column: 'Age', 'count': 'Count'},
    )
    fig.show()
else:
    # Report the column that was actually looked up.
    print(f"The column '{age_column}' does not exist in the provided file.")

In [116]:
# List every distinct value (NaN included) found in the 'Actor ethnicity' column.
ethnicity_values = oscar_actress_info['Actor ethnicity']
unique_ethnicities = ethnicity_values.unique()

print(unique_ethnicities)

[nan 'Canadian Americans' 'Jewish people' 'British Americans'
 'Scottish Americans' 'Swedish Americans' 'Irish people'
 'Scotch-Irish Americans' 'Italians' 'Italian Americans' 'English people'
 'British' 'American Jews' 'White British' 'Germans' 'Irish Americans'
 'Czech Americans' 'Armenians' 'English Americans' 'White Americans'
 'Spanish Americans' 'White people' 'White Africans of European ancestry'
 'French' 'Israeli Americans']


In [117]:
# Number of rows of oscar_actress_info per country of citizenship
country_counts = (
    oscar_actress_info['countryOfCitizenshipLabel']
    .value_counts()
    .reset_index()
)
country_counts.columns = ['Country', 'Count']

# World map coloured by that count; countries are matched by their names
fig = px.choropleth(
    country_counts,
    locations="Country",
    locationmode='country names',
    color="Count",
    hover_name="Country",
    title="Heatmap of Oscar Winning Actors by Country",
)
fig.update_geos(showcountries=True)
fig.show()

In [118]:
# Same per-country counts as the world map, recomputed so this cell runs on its own.
country_counts = (
    oscar_actress_info['countryOfCitizenshipLabel']
    .value_counts()
    .reset_index()
)
country_counts.columns = ['Country', 'Count']

# The view below is restricted to a longitude/latitude window, so the title says
# so instead of repeating the world-map title verbatim.
fig = px.choropleth(
    country_counts,
    locations="Country",
    locationmode='country names',
    color="Count",
    hover_name="Country",
    title="Heatmap of Oscar Winning Actors by Country (Europe)",
)
fig.update_geos(
    showcountries=True,
    visible=True,
    lonaxis_range=[-10, 40],
    lataxis_range=[35, 70],
)
fig.show()

In [119]:
# Typical budget, revenue and profit of the films in oscar_actress_movies.
# The statistic is the median, so the labels say "Median" (they used to say "Average").
capital_cost = oscar_actress_movies['capitalCost']
box_office_revenue = oscar_actress_movies['Movie box office revenue']

print('Median Capital Cost of the movies starring the actresses:', capital_cost.median())
print('Median Box Office revenue of the movies starring the actresses:', box_office_revenue.median())
# The difference is NaN wherever either value is missing and median() skips NaN, so
# this figure only covers films with both values and need not equal the
# difference of the two medians above.
print('Median Benefits of the movies starring the actresses:', (box_office_revenue - capital_cost).median())


Average Capital Cost of the movies starring the actresses: 18500000.0
Average Box Office revenue of the movies starring the actresses: 39125712.5
Average Benefits of the movies starring the actresses: 72737200.0


In [120]:
# Every character-table row whose actor id appears in oscar_actress_info, enriched
# with the CMU movie columns (left join, so rows without a CMU match are kept).
# The movie id/date columns of the character table are dropped first so the
# versions from movie_cmu are the ones that remain.
oscar_actress_movies_all = (
    character[character['Freebase actor ID'].isin(oscar_actress_info['Freebase actor ID'])]
    .drop(columns=['Freebase movie ID', 'Movie release date'])
    .merge(movie_cmu, on='Wikipedia movie ID', how='left')
)
release_dates_all = oscar_actress_movies_all['Movie release date']
print('Shape:', oscar_actress_movies_all.shape)
print('Columns:', oscar_actress_movies_all.columns)
print('Date boundary:', release_dates_all.min(), '-', release_dates_all.max())

Shape: (2777, 19)
Columns: Index(['Wikipedia movie ID', 'Character name', 'Actor date of birth',
       ' Actor gender', 'Actor height)', 'Actor ethnicity', 'Actor name',
       'Actor age at movie release', 'Freebase character',
       'Freebase character ID', 'Freebase actor ID', 'Freebase movie ID',
       'Movie name', 'Movie release date', 'Movie box office revenue',
       'Movie runtime', 'Movie languages', 'Movie countries', 'Movie genres'],
      dtype='object')
Date boundary: 1909.0 - 2013.0


In [121]:
# (actor name, film id) pairs present in oscar_actress_info
best_actress_movies = set(
    zip(oscar_actress_info['Actor name'], oscar_actress_info['Wikipedia movie ID'])
)

# Flag each row of the full filmography whose (actor name, film id) pair is in that set
oscar_actress_movies_all['Best Actress Reward'] = [
    name_and_film in best_actress_movies
    for name_and_film in zip(
        oscar_actress_movies_all['Actor name'],
        oscar_actress_movies_all['Wikipedia movie ID'],
    )
]

# Spot check on one actress
oscar_actress_movies_all.loc[oscar_actress_movies_all['Actor name'] == 'Janet Gaynor']

Unnamed: 0,Wikipedia movie ID,Character name,Actor date of birth,Actor gender,Actor height),Actor ethnicity,Actor name,Actor age at movie release,Freebase character,Freebase character ID,Freebase actor ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages,Movie countries,Movie genres,Best Actress Reward
68,61904,,1906.0,F,1.52,,Janet Gaynor,20.0,/m/0k3y6c,,/m/0cf3c,/m/0grk0,Sunrise: A Song of Two Humans,,,173.0,Silent film,United States of America,"Crime Fiction, Silent film, Indie, Black-and-w...",True
140,61049,,1906.0,F,1.52,,Janet Gaynor,20.0,/m/0k32nn,,/m/0cf3c,/m/0gkn7,Seventh Heaven,,1750000.0,118.0,Silent film,United States of America,"Silent film, Indie, Black-and-white, Drama, Wa...",True
147,9245306,Hedda Nilsson aka Helga Brand,1906.0,F,1.52,,Janet Gaynor,27.0,/m/02tbdvl,/m/0cgrmvl,/m/0cf3c,/m/0281fyy,Servants' Entrance,1934.0,,88.0,English Language,United States of America,"Romantic comedy, Comedy-drama, Comedy",False
152,11964834,,1906.0,F,1.52,,Janet Gaynor,19.0,/m/02t9_0v,,/m/0cf3c,/m/02rzy2t,The Johnstown Flood,1926.0,,60.0,English Language,United States of America,"Silent film, Indie, Black-and-white",False
188,9252477,,1906.0,F,1.52,,Janet Gaynor,28.0,/m/02vbfwg,,/m/0cf3c,/m/0281qzm,One More Spring,,,87.0,English Language,United States of America,"Drama, Comedy, Black-and-white",False
212,9015881,,1906.0,F,1.52,,Janet Gaynor,27.0,/m/02vcwxx,,/m/0cf3c,/m/027thqw,Change of Heart,1934.0,,76.0,English Language,United States of America,"Romance Film, Black-and-white",False
247,3450569,Esther Blodgett,1906.0,F,1.52,,Janet Gaynor,30.0,/m/0jsxlp,/m/02nw9mj,/m/0cf3c,/m/09d37z,A Star Is Born,1937.0,,111.0,English Language,United States of America,"Romantic drama, Musical, Drama, Romance Film",False
435,60173,Margy Frake,1906.0,F,1.52,,Janet Gaynor,26.0,/m/02tb9bn,/m/0ch7m_g,/m/0cf3c,/m/0gcr4,State Fair,,1800000.0,97.0,English Language,United States of America,"Musical, Black-and-white",False
464,9253157,,1906.0,F,1.52,,Janet Gaynor,29.0,/m/02vbnjk,,/m/0cf3c,/m/0281rwh,Small Town Girl,1936.0,,106.0,English Language,United States of America,"Romance Film, Comedy, Black-and-white",False
565,9253028,Joanna Tate,1906.0,F,1.52,,Janet Gaynor,27.0,/m/02tbj2d,/m/0bxjwy1,/m/0cf3c,/m/0281rqb,Carolina,1934.0,,85.0,English Language,United States of America,"Romantic comedy, Black-and-white",False


In [122]:
import plotly.express as px

# Working copy with gaps in the genre column filled and the boolean flag turned
# into readable legend entries
oscar_movies_all = oscar_actress_movies_all.assign(**{
    'Movie genres': oscar_actress_movies_all['Movie genres'].fillna('Unknown'),
    'Best Actress Reward': oscar_actress_movies_all['Best Actress Reward'].map(
        {True: 'Oscar-Winning', False: 'Non-Oscar-Winning'}
    ),
})

# One row per (film row, genre): the comma-separated genre string is split and exploded
genres_split = oscar_movies_all.assign(
    Movie_genres_split=oscar_movies_all['Movie genres'].str.split(', ')
).explode('Movie_genres_split')

# Row count per genre and award flag
genre_counts = (
    genres_split
    .groupby(['Movie_genres_split', 'Best Actress Reward'])
    .size()
    .reset_index(name='Count')
)

# Keep the 15 genres with the largest combined count
top_genres = (
    genre_counts
    .groupby('Movie_genres_split')['Count']
    .sum()
    .nlargest(15)
    .index
)
filtered_genre_counts = genre_counts[genre_counts['Movie_genres_split'].isin(top_genres)]

# Grouped bars: one bar per award flag within each genre
fig = px.bar(
    filtered_genre_counts.sort_values('Count', ascending=False),
    x='Movie_genres_split',
    y='Count',
    color='Best Actress Reward',
    barmode='group',
    title='Top 15 Movie Genres (Oscar-Winning vs Non-Oscar-Winning)',
    labels={'Movie_genres_split': 'Movie Genre', 'Count': 'Number of Movies'},
)

# Tilted genre labels and a wider canvas for readability
fig.update_layout(
    width=1000,
    height=500,
    xaxis={'tickangle': 45, 'title': 'Movie Genre'},
    yaxis={'title': 'Number of Movies'},
)

# Uncomment this to visualize
# fig.show()

In [123]:
# Revenue and award flag for every row where both are present
box_office_columns = ['Movie box office revenue', 'Best Actress Reward']
box_office_data = oscar_actress_movies_all.dropna(subset=box_office_columns)[box_office_columns]

# One box per value of the award flag
fig = px.box(
    box_office_data,
    x='Best Actress Reward',
    y='Movie box office revenue',
    color='Best Actress Reward',
    title='Box Office Revenue Distribution (Oscar-Winning vs Non-Oscar-Winning)',
    labels={'Best Actress Reward': 'Movie Type', 'Movie box office revenue': 'Box Office Revenue ($)'},
)

# Axis titles, dollar-formatted ticks and a fixed figure size
fig.update_layout(
    width=800,
    height=600,
    xaxis={'title': 'Best Actress Reward'},
    yaxis={'title': 'Box Office Revenue ($)', 'tickformat': '$,.0f'},
)

fig.show()

In [124]:
# Age at release against box office revenue, for rows where both are known
scatter_columns = ['Actor age at movie release', 'Movie box office revenue']
scatter_data = oscar_actress_info[scatter_columns].dropna()

# Marker size also encodes the revenue
fig = px.scatter(
    scatter_data,
    x='Actor age at movie release',
    y='Movie box office revenue',
    size='Movie box office revenue',
    hover_data=['Actor age at movie release'],
    color_discrete_sequence=['purple'],
    title='Actress Age vs. Movie Box Office Revenue (Oscar-Winning Movies)',
    labels={
        'Actor age at movie release': 'Age at Movie Release',
        'Movie box office revenue': 'Box Office Revenue ($)',
    },
)

# Axis titles, dollar-formatted ticks and a fixed figure size
fig.update_layout(
    width=800,
    height=600,
    xaxis={'title': 'Age at Movie Release'},
    yaxis={'title': 'Box Office Revenue ($)', 'tickformat': '$,.0f'},
)

fig.show()

In [125]:
# One row per (film row, genre) for the rows of oscar_actress_info with a genre string
genres_data = oscar_actress_info[['Movie genres']].dropna()
genres_data = genres_data.assign(
    Movie_genres_split=genres_data['Movie genres'].str.split(', ')
).explode('Movie_genres_split')

# Occurrences per genre; value_counts() orders them from most to least frequent
genre_counts = genres_data['Movie_genres_split'].value_counts().reset_index()
genre_counts.columns = ['Genre', 'Count']

# Bar chart of the ten most frequent genres, with the count printed on each bar
fig = px.bar(
    genre_counts.head(10),
    x='Genre',
    y='Count',
    text='Count',
    color='Genre',
    title='Top 10 Genres in Oscar-Winning Movies',
    labels={'Genre': 'Movie Genre', 'Count': 'Number of Movies'},
)

# Tilted genre labels and a fixed figure size
fig.update_layout(
    width=800,
    height=600,
    xaxis={'title': 'Movie Genre', 'tickangle': 45},
    yaxis={'title': 'Number of Movies'},
)

fig.show()

In [127]:
# 'Movie release date' holds plain years stored as floats (e.g. 1934.0).
# pd.to_datetime() interprets bare numbers as nanoseconds since the Unix epoch, so
# converting them would turn every film into a 1970-01-01 timestamp. The years are
# therefore kept numeric; ordering comparisons work on them directly.
release_year = pd.to_numeric(oscar_actress_movies_all['Movie release date'], errors='coerce')

# Keep only the rows with a usable release year. The shared frame
# oscar_actress_movies_all is left untouched, so this cell can be re-run safely.
oscar_movies_filtered = (
    oscar_actress_movies_all
    .assign(**{'Movie release date': release_year})
    .dropna(subset=['Movie release date'])
)


def count_movies_before_first_oscar(group):
    """Count the rows of ``group`` released strictly before its first award-flagged row.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for one actor. Must contain 'Movie release date' (any orderable
        values) and 'Best Actress Reward' (boolean mask).

    Returns
    -------
    int
        Number of rows whose release date is strictly earlier than the earliest
        release date among the flagged rows. When no row is flagged (or the
        flagged rows have no date), every row is counted.
    """
    release_dates = group['Movie release date']
    first_oscar_date = release_dates[group['Best Actress Reward']].min()

    # No award in this group: the whole filmography counts.
    if pd.isnull(first_oscar_date):
        return len(group)

    # Rows sharing the award film's date are not "before" it (strict comparison).
    return int((release_dates < first_oscar_date).sum())


# Apply the per-actor counter to every actor's rows and tidy the result into two columns
movies_before_first_oscar = (
    oscar_movies_filtered
    .groupby('Actor name')
    .apply(count_movies_before_first_oscar)
    .reset_index()
)
movies_before_first_oscar.columns = ['Actor name', 'Movies Before First Oscar']

# Bars ordered from the largest count down; colour and bar text repeat the count
fig = px.bar(
    movies_before_first_oscar.sort_values('Movies Before First Oscar', ascending=False),
    x='Actor name',
    y='Movies Before First Oscar',
    text='Movies Before First Oscar',
    color='Movies Before First Oscar',
    color_continuous_scale='Blues',
    title='Number of Movies Before First Oscar per Actress',
    labels={'Actor name': 'Actress', 'Movies Before First Oscar': 'Movies Before First Oscar'},
)

# Tilted names and a wider canvas for readability
fig.update_layout(
    width=1000,
    height=600,
    xaxis={'title': 'Actress', 'tickangle': 45},
    yaxis={'title': 'Number of Movies'},
)

fig.show()



