In [186]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

In [187]:
data_imdb = pd.read_csv('clean_data_imdb.csv')
data_bechdel = pd.read_csv('clean_data_bechdel.csv')
data_character = pd.read_csv('clean_data_character.csv')
data_original = pd.read_csv('clean_data_original.csv')
ethnicity_df = pd.read_csv('ethnicity_labels.csv')

In [188]:
ethnicity_df['ethnic_cat'] = ethnicity_df.index
data_character = pd.merge(data_character, ethnicity_df[['ethnicity_ID', 'ethnic_cat']], on='ethnicity_ID', how='left')

ethnicity_df.head()

Unnamed: 0.1,Unnamed: 0,ethnicity_ID,wikidata_id,ethnicity_label,corresponding_ethnicity,count,ethnic_cat
0,0,/m/044038p,,Canadian,,145,0
1,1,/m/0x67,Q49085,African Americans,https://en.wikipedia.org/wiki/African_Americans,1464,1
2,2,/m/064b9n,Q120601,Omaha Tribe of Nebraska,https://en.wikipedia.org/wiki/Omaha_people,1,2
3,3,/m/041rx,Q7325,Jewish people,https://en.wikipedia.org/wiki/Jews,703,3
4,4,/m/033tf_,Q1075293,Irish Americans,https://en.wikipedia.org/wiki/Irish_Americans,196,4


In [189]:
data_imdb['num_actors'] = data_imdb['Movie_ID'].map(
                                                    data_character.groupby('Movie_ID')['Actor_ID'].nunique()
                                                    )

data_imdb['num_women'] = data_imdb['Movie_ID'].map(
                                                    data_character[data_character['actor_gender']=='F'].groupby('Movie_ID')['Actor_ID'].nunique()
                                                )

data_imdb['ratio_W/M'] = data_imdb['num_women']/data_imdb['num_actors']

In [190]:
women_data = data_character[data_character['actor_gender']=='F'].copy()
men_data = data_character[data_character['actor_gender']=='M'].copy()

In [191]:
low_count_ethnicities = ethnicity_df.loc[ethnicity_df['count'] < 10, 'ethnic_cat'].tolist()
women_data['ethnic_cat'] = women_data['ethnic_cat'].apply(lambda x: pd.NA if x in low_count_ethnicities else x)

In [192]:
valid_mask = (data_imdb['ratio_W/M'] >= 0.5) & (data_imdb['num_actors'] > 4)
data_imdb_women = data_imdb[valid_mask].copy()



analysis = pd.merge(
                        women_data.dropna(subset=['actor_age_movie_released'])[['actor_age_movie_released', 'Movie_ID', 'actor_name', 'ethnic_cat', 'actor_height']],
                        data_imdb_women.dropna(subset=['SuccessMetric'])[['title', 'SuccessMetric', 'Movie_ID', 'ratio_W/M', 'Rating']],
                        on='Movie_ID',
                        how = 'inner'
                        )

mean_women_data = analysis.groupby('Movie_ID').agg(
                                                    mean_age=('actor_age_movie_released', 'mean'),
                                                    num_women=('actor_age_movie_released', 'size'),
                                                    title=('title', 'first'),
                                                    SuccessMetric=('SuccessMetric', 'first'),
                                                    ratio_W_M = ('ratio_W/M', 'first'),
                                                    ethnicities = ('ethnic_cat', lambda x: [eth for eth in x if pd.notna(eth)]),
                                                    avg_height = ('actor_height', 'mean'),
                                                    Rating = ('Rating', 'first')
                                                ).reset_index()


In [193]:
mean_women_data.head()

Unnamed: 0,Movie_ID,mean_age,num_women,title,SuccessMetric,ratio_W_M,ethnicities,avg_height,Rating
0,/m/014kkm,28.6,5,The Bad and the Beautiful,0.39042,0.5,[3.0],1.6525,7.8
1,/m/015_1c,40.857143,7,Desk Set,0.360304,0.615385,[],1.66375,7.2
2,/m/015qqg,39.166667,6,Julia,0.358721,0.545455,"[76.0, 80.0]",1.736667,7.1
3,/m/016yxn,41.0,5,Reversal of Fortune,0.362774,0.555556,[48.0],1.66,7.2
4,/m/016z5x,28.666667,9,Chaplin,0.376704,0.5,"[4.0, 84.0, 19.0]",1.667143,7.5


In [194]:
#one hot encoding

import pandas as pd
import numpy as np
from scipy.stats import ttest_ind

# Initialize a list to store the results
results_list = []

# Get the list of unique ethnic categories (excluding NaNs)
all_ethnic_cats = pd.Series([eth for sublist in mean_women_data['ethnicities'] for eth in sublist if pd.notna(eth)]).unique()

# Iterate through each ethnic category
for ethnic_cat in all_ethnic_cats:
    # Filter out movies that include this ethnic category
    all_movies = mean_women_data.copy()
    movies_without_ethnic_cat = all_movies[all_movies['ethnicities'].apply(lambda x: isinstance(x, list) and ethnic_cat not in x)]

    
    # Check if there are enough data points for comparison
    if all_movies.empty or movies_without_ethnic_cat.empty:
        continue
    
    # Calculate mean success scores
    mean_with = all_movies['SuccessMetric'].mean()
    mean_without = movies_without_ethnic_cat['SuccessMetric'].mean()
    
    # Check for zero variance
    if all_movies['SuccessMetric'].var() == 0 or movies_without_ethnic_cat['SuccessMetric'].var() == 0:
        t_stat, p_value = np.nan, np.nan
    else:
        # Perform a t-test
        t_stat, p_value = ttest_ind(
            all_movies['SuccessMetric'].dropna(),
            movies_without_ethnic_cat['SuccessMetric'].dropna(),
            equal_var=False
        )
    
    # Store the results
    results_list.append({
        'ethnic_cat': ethnic_cat,
        'mean_with_ethnic_cat': mean_with,
        'mean_without_ethnic_cat': mean_without,
        't_stat': t_stat,
        'p_value': p_value
    })

# Create a DataFrame from the results list
results_df = pd.DataFrame(results_list)

# Sort the results by p-value
results_df = results_df.sort_values(by='p_value')

# Display the results
results_df.head(10)


Unnamed: 0,ethnic_cat,mean_with_ethnic_cat,mean_without_ethnic_cat,t_stat,p_value
7,24.0,0.31955,0.317516,0.834147,0.404322
0,3.0,0.31955,0.321195,-0.63822,0.523433
9,1.0,0.31955,0.320789,-0.50887,0.610916
44,25.0,0.31955,0.318346,0.500226,0.616981
12,43.0,0.31955,0.318787,0.31749,0.750911
3,48.0,0.31955,0.318782,0.312944,0.754364
24,75.0,0.31955,0.318978,0.238303,0.811675
10,22.0,0.31955,0.319002,0.226863,0.820558
61,192.0,0.31955,0.320084,-0.225692,0.821468
75,113.0,0.31955,0.319041,0.212659,0.831618


In [196]:
#one hot encoding

import pandas as pd
import numpy as np
from scipy.stats import ttest_ind

# Initialize a list to store the results
results_list = []

# Get the list of unique ethnic categories (excluding NaNs)
all_ethnic_cats = pd.Series([eth for sublist in mean_women_data['ethnicities'] for eth in sublist if pd.notna(eth)]).unique()

# Iterate through each ethnic category
for ethnic_cat in all_ethnic_cats:
    all_movies = mean_women_data.copy()
    # Filter out movies that include this ethnic category
    movies_with_ethnic_cat = all_movies[all_movies['ethnicities'].apply(lambda x: isinstance(x, list) and ethnic_cat in x)]
    movies_without_ethnic_cat = all_movies[all_movies['ethnicities'].apply(lambda x: isinstance(x, list) and ethnic_cat not in x)]

    
    # Check if there are enough data points for comparison
    if movies_with_ethnic_cat.empty:
        print(f"No movies remaining with only {ethnic_cat} ethnicity")
        continue
    if movies_without_ethnic_cat.empty:
        print(f"No movies remaining without {ethnic_cat} ethnicity")
        continue
    
    # Calculate mean success scores
    mean_with = movies_with_ethnic_cat['SuccessMetric'].mean()
    mean_without = movies_without_ethnic_cat['SuccessMetric'].mean()
    
    # Check for zero variance
    if movies_with_ethnic_cat['SuccessMetric'].var() == 0 or movies_without_ethnic_cat['SuccessMetric'].var() == 0:
        t_stat, p_value = np.nan, np.nan
    else:
        # Perform a t-test
        t_stat, p_value = ttest_ind(
            movies_with_ethnic_cat['SuccessMetric'].dropna(),
            movies_without_ethnic_cat['SuccessMetric'].dropna(),
            equal_var=False
        )
    
    # Store the results
    results_list.append({
        'ethnic_cat': ethnic_cat,
        'mean_with_ethnic_cat': mean_with,
        'mean_without_ethnic_cat': mean_without,
        't_stat': t_stat,
        'p_value': p_value
    })

# Create a DataFrame from the results list
results_df = pd.DataFrame(results_list)

# Sort the results by p-value
results_df = results_df.sort_values(by='p_value')

# Display the results
results_df.head(5)


  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero


Unnamed: 0,ethnic_cat,mean_with_ethnic_cat,mean_without_ethnic_cat,t_stat,p_value
113,204.0,0.305605,0.319582,-8.177953,2.272104e-15
75,113.0,0.350482,0.319041,5.107586,0.0001226577
44,25.0,0.34577,0.318346,4.112173,0.0001759567
107,256.0,0.263932,0.319743,-15.295686,0.0004039768
7,24.0,0.33567,0.317516,3.349117,0.001083927
