In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

## Load data

In [31]:
# Read the four pre-cleaned CSV exports in one pass. The file paths are relative,
# so they are resolved against the notebook's working directory.
data_imdb, data_bechdel, data_character, data_original = map(
    pd.read_csv,
    (
        'clean_data_imdb.csv',
        'clean_data_bechdel.csv',
        'clean_data_character.csv',
        'clean_data_original.csv',
    ),
)

# Lookup table of ethnicity identifiers and their labels.
ethnicity_df = pd.read_csv('ethnicity_labels.csv')

In [32]:
# Preview the first five rows to check which columns were loaded.
data_imdb.head(n=5)

Unnamed: 0.1,Unnamed: 0,Normalized_Title,Year,Wiki_ID,Movie_ID,title,numVotes,release_date,Runtime,BoxOfficeRevenue,Languages,Countries,Genres,Rating,BoxOfficeStandardize,RatingStandardize,SuccessMetric
0,0,#1cheerleadercamp,2010,30332673,/m/0crs0hx,#1 Cheerleader Camp,3424,2010,90.0,,[],['United States of America'],"['Sports', 'Sex comedy', 'Comedy film', 'Comed...",3.7,,-2.188888,
1,1,$,1971,4213160,/m/0bq8q8,$,2948,1971-12-17,119.0,,['English Language'],['United States of America'],"['Crime Fiction', 'Heist', 'Action/Adventure',...",6.3,,0.095567,
2,2,$9.99,2008,20624798,/m/05222ld,$9.99,3519,2008,78.0,,['English Language'],"['Australia', 'Israel']","['Stop motion', 'Drama', 'Indie', 'World cinem...",6.7,,0.447022,
3,3,'68,1988,2250713,/m/06z7m4,'68,99,1988,98.0,,['English Language'],"['United States of America', 'Hungary']","['Drama', 'Period piece', 'Family Drama', 'Com...",5.8,,-0.343751,
4,4,'it'salive!',1969,26713091,/m/0bmg74b,'It's Alive!',1003,1969,80.0,,[],['United States of America'],"['Drama', 'Science Fiction', 'Horror']",2.7,,-3.067525,


In [33]:
# NOTE(review): this overwrites the loaded 'Rating' with a hard-coded 0.5 for
# every row whose title is exactly 'Avatar', directly in the shared data_imdb
# frame. Every later cell that reads 'Rating' (rank computation, aggregates,
# correlations) sees this artificial value. It looks like a sanity-check
# experiment for the rank-based metric -- confirm it is intentional, and if so
# run it on a copy rather than on the frame the real analysis uses.
data_imdb.loc[data_imdb['title'] == 'Avatar', 'Rating'] = 0.5


In [34]:
# Rank-based normalisation: replace each raw value by its percentile rank so
# that box-office revenue and IMDb rating live on the same scale.
for rank_column, source_column in (
    ('BoxOfficeRank', 'BoxOfficeRevenue'),
    ('RatingRank', 'Rating'),
):
    data_imdb[rank_column] = data_imdb[source_column].rank(pct=True)

# Equal weighting of the two components.
weight_box_office = 0.5
weight_imdb = 0.5

# Success metric = weighted sum of the two percentile ranks. A missing rank in
# either component makes the metric NaN for that row.
data_imdb['SuccessMetric'] = (
    data_imdb['BoxOfficeRank'].mul(weight_box_office)
    .add(data_imdb['RatingRank'].mul(weight_imdb))
)

# Display the ten highest-grossing rows together with the new rank columns.
data_imdb.sort_values('BoxOfficeRevenue', ascending=False).iloc[:10]

Unnamed: 0.1,Unnamed: 0,Normalized_Title,Year,Wiki_ID,Movie_ID,title,numVotes,release_date,Runtime,BoxOfficeRevenue,Languages,Countries,Genres,Rating,BoxOfficeStandardize,RatingStandardize,SuccessMetric,BoxOfficeRank,RatingRank
3440,3446,avatar,2009,4273140,/m/0bth54,Avatar,1407305,2009-12-10,178.0,2782275000.0,"['English Language', 'Spanish Language']","['United States of America', 'United Kingdom']","['Thriller', 'Science Fiction', 'Adventure', '...",0.5,23.410559,1.501373,0.500016,1.0,3.2e-05
42822,42932,titanic,1997,52371,/m/0dr_4,Titanic,1311045,1997-11-01,194.0,2185372000.0,"['Italian Language', 'English Language', 'Fren...",['United States of America'],"['Tragedy', 'Costume drama', 'Historical ficti...",7.899978,18.291429,1.501366,0.975315,0.999858,0.950772
35041,35135,theavengers,2012,22114132,/m/062zm5h,The Avengers,1482227,2012-04-11,137.0,1511758000.0,"['Russian Language', 'English Language']",['United States of America'],"['Science Fiction', 'Action']",7.999949,12.514409,1.589205,0.98022,0.999715,0.960725
43272,43382,transformers:darkofthemoon,2011,25001260,/m/0872p_c,Transformers: Dark of the Moon,438081,2011-06-23,157.0,1123747000.0,['English Language'],['United States of America'],"['Alien Film', 'Science Fiction', 'Action', 'A...",6.199863,9.186768,0.007584,0.719445,0.999573,0.439318
38779,38883,thelordoftherings:thereturnoftheking,2003,174251,/m/017jd9,The Lord of the Rings: The Return of the King,2028461,2003-12-17,250.0,1119930000.0,"['Old English language', 'English Language']","['United States of America', 'New Zealand']","['Fantasy Adventure', 'Adventure', 'Epic', 'Ac...",8.998895,9.154029,2.466915,0.998621,0.99943,0.997812
36300,36397,thedarkknightrises,2012,29075630,/m/0bpm4yw,The Dark Knight Rises,1868627,2012-07-16,165.0,1078009000.0,['English Language'],"['United States of America', 'United Kingdom']","['Crime Fiction', 'Thriller', 'Action', 'Drama']",8.399788,8.794512,1.940518,0.992555,0.999288,0.985822
43198,43308,toystory3,2010,1213838,/m/04hwbq,Toy Story 3,910035,2010-06-12,102.0,1063172000.0,"['English Language', 'Spanish Language']",['United States of America'],"['Adventure', ""Children's/Family"", 'Computer A...",8.299925,8.667267,1.852775,0.990446,0.999145,0.981746
27649,27726,piratesofthecaribbean:onstrangertides,2011,24314116,/m/09v8clw,Pirates of the Caribbean: On Stranger Tides,574531,2011-05-07,136.0,1043872000.0,['English Language'],['United States of America'],"['Swashbuckler films', 'Adventure', 'Costume A...",6.6,8.501746,0.359158,0.809197,0.999003,0.619391
1577,1582,aliceinwonderland,2010,14482638,/m/04jpg2p,Alice in Wonderland,448528,2010-03-05,108.0,1024300000.0,['English Language'],"['United States of America', 'United Kingdom']","['Computer Animation', 'Family Film', 'Fantasy...",6.399314,8.333895,0.182828,0.759726,0.998861,0.520592
36299,36396,thedarkknight,2008,4276475,/m/0btpm6,The Dark Knight,2940130,2008-07-16,153.0,1004558000.0,"['Standard Mandarin', 'English Language']","['United States of America', 'United Kingdom']","['Crime Fiction', 'Thriller', 'Superhero movie...",8.999981,8.164589,2.46787,0.998276,0.998718,0.997834


In [35]:
# Ten rows with the highest combined success metric.
data_imdb.sort_values('SuccessMetric', ascending=False).iloc[:10]

Unnamed: 0.1,Unnamed: 0,Normalized_Title,Year,Wiki_ID,Movie_ID,title,numVotes,release_date,Runtime,BoxOfficeRevenue,Languages,Countries,Genres,Rating,BoxOfficeStandardize,RatingStandardize,SuccessMetric,BoxOfficeRank,RatingRank
38779,38883,thelordoftherings:thereturnoftheking,2003,174251,/m/017jd9,The Lord of the Rings: The Return of the King,2028461,2003-12-17,250.0,1119930000.0,"['Old English language', 'English Language']","['United States of America', 'New Zealand']","['Fantasy Adventure', 'Adventure', 'Epic', 'Ac...",8.998895,9.154029,2.466915,0.998621,0.99943,0.997812
36299,36396,thedarkknight,2008,4276475,/m/0btpm6,The Dark Knight,2940130,2008-07-16,153.0,1004558000.0,"['Standard Mandarin', 'English Language']","['United States of America', 'United Kingdom']","['Crime Fiction', 'Thriller', 'Superhero movie...",8.999981,8.164589,2.46787,0.998276,0.998718,0.997834
38778,38882,thelordoftherings:thefellowshipofthering,2001,173941,/m/017gl1,The Lord of the Rings: The Fellowship of the Ring,2055132,2001-12-10,178.0,871530300.0,['English Language'],"['United States of America', 'New Zealand']","['Fantasy Adventure', 'Adventure', 'Epic', 'Fa...",8.9,7.023719,2.380023,0.99707,0.996724,0.997415
38780,38884,thelordoftherings:thetwotowers,2002,173944,/m/017gm7,The Lord of the Rings: The Two Towers,1829386,2002-12-05,179.0,926047100.0,"['Old English language', 'English Language']","['United States of America', 'New Zealand']","['Fantasy Adventure', 'Adventure', 'Epic', 'Ac...",8.798827,7.491264,2.291129,0.99698,0.997864,0.996096
17129,17164,inception,2010,23270459,/m/0661ql3,Inception,2608969,2010-07-08,148.0,825532800.0,"['French Language', 'Japanese Language', 'Engl...","['United States of America', 'United Kingdom']","['Thriller', 'Science Fiction', 'Adventure', '...",8.799988,6.629237,2.292149,0.996207,0.996297,0.996118
13160,13188,forrestgump,1994,41528,/m/0bdjd,Forrest Gump,2315021,1994-06-23,136.0,677387700.0,['English Language'],['United States of America'],"['Coming of age', 'Comedy film', 'Drama', 'War...",8.8,5.358723,2.292159,0.995008,0.993448,0.996568
38648,38751,thelionking,1994,88678,/m/0m63c,The Lion King,1171486,1994-06-15,87.0,951583800.0,"['Xhosa Language', 'Zulu Language', 'Swahili L...",['United States of America'],"['Music', 'Adventure', ""Children's/Family"", 'A...",8.498826,7.71027,2.027537,0.993815,0.998291,0.989339
36300,36397,thedarkknightrises,2012,29075630,/m/0bpm4yw,The Dark Knight Rises,1868627,2012-07-16,165.0,1078009000.0,['English Language'],"['United States of America', 'United Kingdom']","['Crime Fiction', 'Thriller', 'Action', 'Drama']",8.399788,8.794512,1.940518,0.992555,0.999288,0.985822
43198,43308,toystory3,2010,1213838,/m/04hwbq,Toy Story 3,910035,2010-06-12,102.0,1063172000.0,"['English Language', 'Spanish Language']",['United States of America'],"['Adventure', ""Children's/Family"", 'Computer A...",8.299925,8.667267,1.852775,0.990446,0.999145,0.981746
34547,34641,terminator2:judgmentday,1991,34344124,/m/07gp9,Terminator 2: Judgment Day,1202785,1991-07-01,154.0,519843300.0,"['English Language', 'Spanish Language']","['United States of America', 'France']","['Thriller', 'Science Fiction', 'Doomsday film...",8.598762,4.007598,2.115343,0.990189,0.988036,0.992342


In [36]:
# Use each ethnicity's index label in ethnicity_df as an integer category code.
ethnicity_df['ethnic_cat'] = ethnicity_df.index

# Attach the code to every character row via its ethnicity_ID (left join, so
# characters without a matching ethnicity keep a NaN code).
# Any 'ethnic_cat' column left over from a previous execution is dropped first:
# without this, re-running the cell would merge the column a second time and
# produce 'ethnic_cat_x' / 'ethnic_cat_y' instead of 'ethnic_cat'.
data_character = pd.merge(
    data_character.drop(columns='ethnic_cat', errors='ignore'),
    ethnicity_df[['ethnicity_ID', 'ethnic_cat']],
    on='ethnicity_ID',
    how='left',
)

ethnicity_df.head()

Unnamed: 0.1,Unnamed: 0,ethnicity_ID,wikidata_id,ethnicity_label,corresponding_ethnicity,count,ethnic_cat
0,0,/m/044038p,,Canadian,,145,0
1,1,/m/0x67,Q49085,African Americans,https://en.wikipedia.org/wiki/African_Americans,1464,1
2,2,/m/064b9n,Q120601,Omaha Tribe of Nebraska,https://en.wikipedia.org/wiki/Omaha_people,1,2
3,3,/m/041rx,Q7325,Jewish people,https://en.wikipedia.org/wiki/Jews,703,3
4,4,/m/033tf_,Q1075293,Irish Americans,https://en.wikipedia.org/wiki/Irish_Americans,196,4


In [37]:
# Distinct actors per movie, over the whole cast and over rows marked female.
actors_per_movie = data_character.groupby('Movie_ID')['Actor_ID'].nunique()
female_rows = data_character[data_character['actor_gender'] == 'F']
women_per_movie = female_rows.groupby('Movie_ID')['Actor_ID'].nunique()

# Look the counts up by Movie_ID; movies missing from a lookup get NaN.
data_imdb['num_actors'] = data_imdb['Movie_ID'].map(actors_per_movie)
data_imdb['num_women'] = data_imdb['Movie_ID'].map(women_per_movie)

# NOTE: despite the column name, this is women / all actors (the share of
# women in the cast), not women / men.
data_imdb['ratio_W/M'] = data_imdb['num_women'].div(data_imdb['num_actors'])

In [38]:
# Split the character table by recorded actor gender. Each subset is copied so
# later edits to it do not touch data_character.
gender_column = data_character['actor_gender']
women_data = data_character.loc[gender_column.eq('F')].copy()
men_data = data_character.loc[gender_column.eq('M')].copy()

In [39]:

# Movies with a defined women ratio (the >= 0 test is False for NaN) and more
# than four distinct actors.
valid_mask = data_imdb['ratio_W/M'].ge(0) & data_imdb['num_actors'].gt(4)
data_imdb_women = data_imdb.loc[valid_mask].copy()

# Female character rows whose actress was older than 18 at release.
# NOTE(review): women_data_crop is built here but the merge below reads the
# unfiltered women_data -- confirm which frame was intended.
valid_mask = women_data['actor_age_movie_released'].gt(18)
women_data_crop = women_data.loc[valid_mask].copy()

# One row per (female character row, movie) for movies that have a SuccessMetric.
women_columns = ['actor_age_movie_released', 'Movie_ID', 'actor_name', 'ethnic_cat', 'actor_height']
movie_columns = ['title', 'SuccessMetric', 'Movie_ID', 'ratio_W/M', 'Rating']
movies_with_metric = data_imdb_women.dropna(subset=['SuccessMetric'])[movie_columns]
analysis = women_data[women_columns].merge(movies_with_metric, on='Movie_ID', how='inner')

# Collapse to one row per movie. Movie-level fields are constant within a
# group, so 'first' simply carries them through.
per_movie_aggregations = {
    'mean_age': ('actor_age_movie_released', 'mean'),
    # Counts rows in the group, not distinct actresses.
    'num_women': ('actor_age_movie_released', 'size'),
    'title': ('title', 'first'),
    'SuccessMetric': ('SuccessMetric', 'first'),
    'ratio_W_M': ('ratio_W/M', 'first'),
    # NOTE(review): ethnic_cat is assigned from ethnicity_df.index in an earlier
    # cell, so this is a mean of category codes -- presumably not meaningful
    # as a number; verify before interpreting.
    'avg_ethnic': ('ethnic_cat', 'mean'),
    'avg_height': ('actor_height', 'mean'),
    'Rating': ('Rating', 'first'),
}
mean_women_data = analysis.groupby('Movie_ID').agg(**per_movie_aggregations).reset_index()

mean_women_data.shape

(6173, 9)

In [40]:
# Correlate, across movies, the mean age of the female cast with the movie's
# rating. Both inputs are columns of the per-movie mean_women_data frame.
ages = mean_women_data['mean_age']
# NOTE: despite the variable name, these are the 'Rating' values, not the
# combined 'SuccessMetric'.
success_scores = mean_women_data['Rating']

# Keep only movies where both values are present; np.corrcoef would otherwise
# return NaN.
valid_mask = ages.notna() & success_scores.notna()

# Pearson correlation coefficient: the off-diagonal entry of the 2x2 matrix.
correlation = np.corrcoef(ages[valid_mask], success_scores[valid_mask])[0, 1]

# The label names what is actually computed (mean age vs. rating).
print(f"Correlation between mean actress age and movie rating: {correlation}")

Correlation between actress age and movie success score: 0.03926152168884791


In [41]:
# Five movies with the highest success metric in the per-movie aggregate.
mean_women_data.sort_values('SuccessMetric', ascending=False).head(5)

Unnamed: 0,Movie_ID,mean_age,num_women,title,SuccessMetric,ratio_W_M,avg_ethnic,avg_height,Rating
130,/m/017jd9,27.0,5,The Lord of the Rings: The Return of the King,0.998621,0.172414,86.0,1.723667,8.998895
4834,/m/0btpm6,34.666667,3,The Dark Knight,0.998276,0.12,43.0,1.725,8.999981
128,/m/017gl1,28.666667,4,The Lord of the Rings: The Fellowship of the Ring,0.99707,0.137931,86.0,1.76,8.9
129,/m/017gm7,28.0,5,The Lord of the Rings: The Two Towers,0.99698,0.131579,86.0,1.723667,8.798827
3291,/m/0661ql3,27.0,3,Inception,0.996207,0.2,23.0,1.663333,8.799988
