In [18]:
from src.utils.data_utils import load_dataframe_from_csv
from src.data.dataloader import load_initial_dataset
from src.utils.clean_cmu import clean_movies_cmu
from config import *

import pandas as pd
import plotly
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
# Load every table used by this notebook through the project CSV helper.
# Movie, character and plot tables.
movie_cmu = load_dataframe_from_csv('movie_cmu.csv')
character = load_dataframe_from_csv('character.csv')
plot_summaries = load_dataframe_from_csv('plot_summaries.csv')
tvtropes = load_dataframe_from_csv('tvtropes.csv')
name_clusters = load_dataframe_from_csv('name_clusters.csv')
# Oscar winner lookup tables (films, then lead and supporting performers).
oscar_winning_films = load_dataframe_from_csv('oscar_winning_films_ids.csv')
oscar_winning_actors = load_dataframe_from_csv('oscar_winning_actors.csv')
oscar_winning_actresses = load_dataframe_from_csv('oscar_winning_actresses.csv')
oscar_supporting_actors = load_dataframe_from_csv('oscar_winning_supporting_actors.csv')
oscar_supporting_actresses = load_dataframe_from_csv('oscar_winning_supporting_actresses.csv')
# Supplementary films table; the file name suggests it covers 2015-2024 — TODO confirm.
extended_films = load_dataframe_from_csv('film_2015_2024.csv')
# NOTE(review): unlike every other call in this cell, this file name has no
# '.csv' suffix and is spelled "acedemy" — confirm it matches the file on disk
# and what load_dataframe_from_csv expects.
academy_award_winning_films = load_dataframe_from_csv('acedemy_award_winning_films')
# Per-performer attributes; merged with the winners' movies further down.
winning_actors_info = load_dataframe_from_csv('winning_actors_information.csv')

In [20]:
def _rename_and_append(frame: pd.DataFrame, mapping: dict, also_drop: tuple = ()) -> pd.DataFrame:
    """Rename columns per ``mapping`` and place the renamed columns last.

    Labels in ``mapping`` or ``also_drop`` that are absent from ``frame`` are
    skipped, so applying this to an already-converted frame returns the same
    layout instead of raising ``KeyError``. The input frame is not modified.

    Args:
        frame: Table whose columns should be renamed.
        mapping: ``{old_name: new_name}``; the new columns end up at the end of
            the frame in the mapping's order.
        also_drop: Extra column labels to remove if they exist.

    Returns:
        A new DataFrame with the untouched columns first (original order)
        followed by the renamed columns.
    """
    renamed = frame.rename(columns=mapping).drop(columns=list(also_drop), errors="ignore")
    moved = list(mapping.values())
    kept = [col for col in renamed.columns if col not in moved]
    return renamed[kept + moved]


# Give the extended films table the same column names as the `movie_cmu`-based
# frames so the two can be merged on 'Wikipedia movie ID' and concatenated.
EXTENDED_TO_CMU_COLUMNS = {
    "genres": "Movie genres",
    "release_date": "Movie release date",
    "languages": "Movie languages",
    "countries": "Movie countries",
    "film": "Movie name",
    "box_office": "Movie box office revenue",
    "runtime": "Movie runtime",
    "page_id": "Wikipedia movie ID",
}

# "release date" (with a space) is a separate column that is discarded rather
# than renamed; only "release_date" becomes 'Movie release date'.
extended_films = _rename_and_append(
    extended_films, EXTENDED_TO_CMU_COLUMNS, also_drop=("release date",)
)

# Same naming alignment for the performer table, which is later joined to
# character.csv on 'Actor name'.
winning_actors_info = _rename_and_append(winning_actors_info, {"actor": "Actor name"})

In [22]:
# Attach the CMU movie metadata to every row of oscar_winning_actresses.
# Inner join (pandas default): winners whose film is absent from movie_cmu
# do not appear in the result. The join key 'film_id' duplicates
# 'Wikipedia movie ID', so it is discarded afterwards.
oscar_actress_movies_cmu = (
    oscar_winning_actresses
    .merge(movie_cmu, left_on='film_id', right_on='Wikipedia movie ID')
    .drop(columns=['film_id'])
)

In [23]:
# Sanity check on the CMU match: frame size, then earliest and latest release date.
cmu_release_dates = oscar_actress_movies_cmu['Movie release date']
print(
    oscar_actress_movies_cmu.shape,
    cmu_release_dates.min(),
    cmu_release_dates.max(),
    sep='\n',
)

(88, 10)
1928.0
2012.0


In [24]:
# Same join as for the CMU table, but against the extended films table, to
# pick up winners whose film is only described there. Inner join; the
# redundant 'film_id' key is removed once the match is done.
actress_extended_matches = oscar_winning_actresses.merge(
    extended_films,
    left_on='film_id',
    right_on='Wikipedia movie ID',
)
oscar_actress_movies_extendedDS = actress_extended_matches.drop(columns=['film_id'])

In [25]:
# Sanity check on the extended-dataset match: frame size and release-date span.
# NOTE(review): the recorded output shows ISO-8601 strings here, so min/max are
# string comparisons (which happen to sort chronologically for that format).
extended_release_dates = oscar_actress_movies_extendedDS['Movie release date']
print(oscar_actress_movies_extendedDS.shape)
for boundary in (extended_release_dates.min(), extended_release_dates.max()):
    print(boundary)

(8, 14)
2015-09-04T00:00:00Z
2023-09-01T00:00:00Z


In [26]:
# Stack the CMU-based and extended-dataset matches into one table of
# actress/movie rows.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the two
# inputs' own 0-based labels are kept and the combined frame has duplicate
# index labels, which makes label-based selection ambiguous.
# 'wikidata_id' only comes from the extended table and is not needed here.
# NOTE(review): 'Movie release date' is not homogeneous after this step — the
# CMU rows print as float years (e.g. 1928.0) while the extended rows print as
# ISO timestamps (e.g. 2015-09-04T00:00:00Z). Normalise before using it.
oscar_actress_movies = (
    pd.concat(
        [oscar_actress_movies_cmu, oscar_actress_movies_extendedDS],
        axis=0,
        ignore_index=True,
    )
    .drop(columns=['wikidata_id'])
)
print(oscar_actress_movies.shape)
print(oscar_actress_movies.columns)

(96, 14)
Index(['Actress id', 'Wikipedia movie ID', 'Freebase movie ID', 'Movie name',
       'Movie release date', 'Movie box office revenue', 'Movie runtime',
       'Movie languages', 'Movie countries', 'Movie genres', 'reviewScores',
       'awardsReceived', 'awardsNominated', 'capitalCost'],
      dtype='object')


In [None]:
# Join the performer attributes onto each actress/movie row
# (winning_actors_info.page_id == oscar_actress_movies['Actress id']).
oscar_actress_info = pd.merge(winning_actors_info, oscar_actress_movies, left_on='page_id', right_on='Actress id')
print(oscar_actress_info.columns)

# Remove the columns that are in character.csv (per the original author's note).
# Reassigning instead of dropping in place keeps the cell re-runnable.
oscar_actress_info = oscar_actress_info.drop(columns=['Freebase movie ID', 'Movie release date', 'sexLabel'])

# The original cell had a bare `oscar_actress_info.head(2)` here; as a
# non-final expression its value was discarded and never displayed, so it is
# removed.
print(oscar_actress_info.shape)

Index(['page_id', 'wikidata_id', 'sexLabel', 'nativeLanguageLabel',
       'countryOfCitizenshipLabel', 'ethnicGroupLabel', 'Actor name',
       'Actress id', 'Wikipedia movie ID', 'Freebase movie ID', 'Movie name',
       'Movie release date', 'Movie box office revenue', 'Movie runtime',
       'Movie languages', 'Movie countries', 'Movie genres', 'reviewScores',
       'awardsReceived', 'awardsNominated', 'capitalCost'],
      dtype='object')
(96, 18)


In [29]:
# Columns taken from character.csv: the two join keys plus age, birth date and ethnicity.
character_columns = [
    'Wikipedia movie ID',
    'Actor name',
    'Actor age at movie release',
    'Actor date of birth',
    'Actor ethnicity',
]

# Inner join (pandas default) on (movie, actor name): rows without a matching
# character entry are dropped from oscar_actress_info.
# NOTE(review): this reassigns oscar_actress_info from itself, so running the
# cell a second time merges again and yields suffixed duplicate columns —
# re-run the cell that builds oscar_actress_info first.
oscar_actress_info = oscar_actress_info.merge(
    character[character_columns],
    on=['Wikipedia movie ID', 'Actor name'],
)

In [30]:
# Summarise the merged table, then list the winners that did not survive the joins.
print('Shape:', oscar_actress_info.shape)
print('Columns:', oscar_actress_info.columns)

# Ids from the original winners list that no longer appear in the merged table.
winner_ids = oscar_winning_actresses['Actress id']
is_retained = winner_ids.isin(oscar_actress_info['Actress id'])
missing_values = winner_ids[~is_retained]
print('Missing actresses in the cleaned dataset:', missing_values)

Shape: (89, 21)
Columns: Index(['page_id', 'wikidata_id', 'nativeLanguageLabel',
       'countryOfCitizenshipLabel', 'ethnicGroupLabel', 'Actor name',
       'Actress id', 'Wikipedia movie ID', 'Movie name',
       'Movie box office revenue', 'Movie runtime', 'Movie languages',
       'Movie countries', 'Movie genres', 'reviewScores', 'awardsReceived',
       'awardsNominated', 'capitalCost', 'Actor age at movie release',
       'Actor date of birth', 'Actor ethnicity'],
      dtype='object')
Missing actresses in the cleaned dataset: 90      160126
91      243685
92     1339248
93     3741746
94     3741746
95     1856120
96      204352
97    17181314
98      147873
Name: Actress id, dtype: int64


In [33]:
# Ages at release in ascending order; rows with an unknown age (NaN) are listed last.
oscar_actress_info['Actor age at movie release'].sort_values(ascending=True, na_position='last')

0     20.0
2     20.0
1     21.0
63    21.0
88    22.0
      ... 
67    80.0
3      NaN
5      NaN
31     NaN
47     NaN
Name: Actor age at movie release, Length: 89, dtype: float64

In [35]:
# Bar chart of how many winning performances fall on each actress age.
AGE_COLUMN = 'Actor age at movie release'

if AGE_COLUMN in oscar_actress_info.columns:
    # Count occurrences of each age (value_counts leaves out NaN ages).
    age_distribution = oscar_actress_info[AGE_COLUMN].value_counts().reset_index()
    # Set the names explicitly: the default names produced by
    # value_counts().reset_index() differ between pandas versions.
    age_distribution.columns = [AGE_COLUMN, 'count']

    # Sort by age so the bars run from youngest to oldest.
    age_distribution = age_distribution.sort_values(by=AGE_COLUMN)

    # The `labels` keys must be the actual column names; the previous key
    # 'age' matched nothing, so the x axis kept the raw column name.
    fig = px.bar(
        age_distribution,
        x=AGE_COLUMN,
        y='count',
        title='Age Distribution',
        labels={AGE_COLUMN: 'Age', 'count': 'Count'},
    )
    fig.show()
else:
    # Report the column that was actually looked up (the old message said 'age').
    print(f"The column '{AGE_COLUMN}' does not exist in the provided file.")

In [36]:
# Print all unique values in the 'Actor ethnicity' column
# Collect the distinct 'Actor ethnicity' labels in order of first appearance
# (missing values show up as a single nan entry) and print them.
unique_ethnicities = pd.unique(oscar_actress_info['Actor ethnicity'])
print(unique_ethnicities)

[nan 'Canadian Americans' 'Jewish people' 'British Americans'
 'Scottish Americans' 'Swedish Americans' 'Irish people'
 'Scotch-Irish Americans' 'Italians' 'Italian Americans' 'English people'
 'British' 'American Jews' 'White British' 'Germans' 'Irish Americans'
 'Czech Americans' 'Armenians' 'English Americans' 'White Americans'
 'Spanish Americans' 'White people' 'White Africans of European ancestry'
 'French' 'Israeli Americans']


In [37]:
# Number of rows in oscar_actress_info per country of citizenship,
# as a two-column table (Country, Count) for plotting.
country_counts = (
    oscar_actress_info['countryOfCitizenshipLabel']
    .value_counts()
    .rename_axis('Country')
    .reset_index(name='Count')
)

# World map coloured by that count; countries are matched by their name.
choropleth_options = dict(
    locations="Country",
    locationmode='country names',
    color="Count",
    hover_name="Country",
    title="Heatmap of Oscar Winning Actors by Country",
)
fig = px.choropleth(country_counts, **choropleth_options)
fig.update_geos(showcountries=True)
fig.show()

In [38]:
# Same per-country counts as the world map, recomputed so this cell can be
# run on its own.
country_counts = oscar_actress_info['countryOfCitizenshipLabel'].value_counts()
country_counts = country_counts.reset_index()
country_counts.columns = ['Country', 'Count']

# The view below is restricted to longitudes -10..40 and latitudes 35..70,
# i.e. Europe, so the title says so; it was previously identical to the world
# map's title, which made the two figures indistinguishable.
fig = px.choropleth(country_counts,
                    locations="Country",
                    locationmode='country names',
                    color="Count",
                    hover_name="Country",
                    title="Heatmap of Oscar Winning Actors by Country (Europe)")
# One call carries both the styling and the zoom window.
fig.update_geos(
    showcountries=True,
    visible=True,
    lonaxis_range=[-10, 40],
    lataxis_range=[35, 70],
)
fig.show()

In [None]:
# Financial summary of the winners' movies. The statistic computed is the
# median, so the labels now say "Median" (they previously said "Average").
# Each median skips missing values on its own, and the third line only uses
# films that have BOTH a revenue and a capital cost, so it is not simply the
# difference of the first two figures.
# NOTE(review): 'capitalCost' appears to come only from the extended films
# table (the CMU-based frame has no such column), so the cost and benefit
# medians likely rest on a handful of films — confirm before interpreting.
box_office_revenue = oscar_actress_movies['Movie box office revenue']
capital_cost = oscar_actress_movies['capitalCost']

print('Median Capital Cost of the movies starring the actresses:', capital_cost.median())
print('Median Box Office revenue of the movies starring the actresses:', box_office_revenue.median())
print('Median Benefits of the movies starring the actresses:', (box_office_revenue - capital_cost).median())


Average Capital Cost of the movies starring the actresses: 18500000.0
Average Box Office revenue of the movies starring the actresses: 39125712.5
Average Benefits of the movies starring the actresses: 72737200.0
