In [272]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

# Ethnicity correlation

In [273]:
# Read the five input CSV files (paths are relative to the working directory).
# The files are read in the order listed and bound to the names on the left.
data_imdb, data_bechdel, data_character, data_original, ethnicity_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        'clean_data_imdb.csv',
        'clean_data_bechdel.csv',
        'clean_data_character.csv',
        'clean_data_original.csv',
        'ethnicity_labels.csv',
    )
)

In [274]:
# Use each ethnicity's row index in ethnicity_df as a compact numeric code.
ethnicity_df['ethnic_cat'] = ethnicity_df.index

# Attach that code to every character row (left join: characters whose
# ethnicity_ID has no match keep a missing ethnic_cat).
# Any 'ethnic_cat' column left over from a previous run of this cell is dropped
# first; otherwise re-running the cell would make the merge emit
# 'ethnic_cat_x' / 'ethnic_cat_y' and later lookups of 'ethnic_cat' would fail.
data_character = pd.merge(
    data_character.drop(columns='ethnic_cat', errors='ignore'),
    ethnicity_df[['ethnicity_ID', 'ethnic_cat']],
    on='ethnicity_ID',
    how='left',
)

ethnicity_df.head()

Unnamed: 0.1,Unnamed: 0,ethnicity_ID,wikidata_id,ethnicity_label,corresponding_ethnicity,count,ethnic_cat
0,0,/m/044038p,,Canadian,,145,0
1,1,/m/0x67,Q49085,African Americans,https://en.wikipedia.org/wiki/African_Americans,1464,1
2,2,/m/064b9n,Q120601,Omaha Tribe of Nebraska,https://en.wikipedia.org/wiki/Omaha_people,1,2
3,3,/m/041rx,Q7325,Jewish people,https://en.wikipedia.org/wiki/Jews,703,3
4,4,/m/033tf_,Q1075293,Irish Americans,https://en.wikipedia.org/wiki/Irish_Americans,196,4


In [275]:
# Number of distinct Actor_IDs per movie, indexed by Movie_ID.
actors_per_movie = data_character.groupby('Movie_ID')['Actor_ID'].nunique()

# Same count restricted to rows whose actor_gender is 'F'. Movies with no such
# row are absent from this Series, so the mapping below leaves them as NaN.
is_female = data_character['actor_gender'] == 'F'
women_per_movie = data_character[is_female].groupby('Movie_ID')['Actor_ID'].nunique()

data_imdb['num_actors'] = data_imdb['Movie_ID'].map(actors_per_movie)
data_imdb['num_women'] = data_imdb['Movie_ID'].map(women_per_movie)

# Despite the 'W/M' label, the denominator is the total number of distinct
# actors, so this is the share of women in the cast.
data_imdb['ratio_W/M'] = data_imdb['num_women'] / data_imdb['num_actors']

In [276]:
# Split the character table on actor_gender. Each subset is an independent
# copy, so columns assigned on it later do not write through to data_character.
gender_of_actor = data_character['actor_gender']
women_data = data_character.loc[gender_of_actor == 'F'].copy()
men_data = data_character.loc[gender_of_actor == 'M'].copy()

In [277]:
# Ethnicity codes whose 'count' in ethnicity_df is below 20.
# NOTE(review): 'count' is presumably the number of occurrences of the label in
# the character data — confirm against the file that produced ethnicity_labels.csv.
low_count_ethnicities = ethnicity_df.loc[ethnicity_df['count'] < 20, 'ethnic_cat'].tolist()

# Blank out those rare codes for the actresses. isin() + mask() does the lookup
# in one vectorised pass (instead of a Python-level list scan per row) and
# keeps the column numeric, with NaN marking the masked entries; the pd.notna
# filters used downstream treat NaN and pd.NA the same way.
women_data['ethnic_cat'] = women_data['ethnic_cat'].mask(
    women_data['ethnic_cat'].isin(low_count_ethnicities)
)

In [278]:
# Keep movies with a defined women ratio and more than four distinct actors.
# A NaN ratio fails the `>= 0` comparison, so those movies are excluded too.
has_ratio = data_imdb['ratio_W/M'] >= 0
has_enough_actors = data_imdb['num_actors'] > 4
valid_mask = has_ratio & has_enough_actors
data_imdb_women = data_imdb[valid_mask].copy()

actress_columns = ['actor_age_movie_released', 'Movie_ID', 'actor_name', 'ethnic_cat', 'actor_height']
movie_columns = ['title', 'SuccessMetric', 'Movie_ID', 'ratio_W/M', 'Rating']

# One row per (actress row, movie) pair, restricted to movies that passed the
# filter above and have a SuccessMetric.
analysis = women_data[actress_columns].merge(
    data_imdb_women.dropna(subset=['SuccessMetric'])[movie_columns],
    on='Movie_ID',
    how='inner',
)


def known_ethnicities(codes):
    """Return the non-missing ethnicity codes of one movie's rows as a list."""
    return [code for code in codes if pd.notna(code)]


# Collapse to one row per movie. Movie-level fields are constant within a
# group, so 'first' simply picks that value; 'size' counts the actress rows.
per_movie_aggregations = {
    'mean_age': ('actor_age_movie_released', 'mean'),
    'num_women': ('actor_age_movie_released', 'size'),
    'title': ('title', 'first'),
    'SuccessMetric': ('SuccessMetric', 'first'),
    'ratio_W_M': ('ratio_W/M', 'first'),
    'ethnicities': ('ethnic_cat', known_ethnicities),
    'avg_height': ('actor_height', 'mean'),
    'Rating': ('Rating', 'first'),
}
mean_women_data = analysis.groupby('Movie_ID').agg(**per_movie_aggregations).reset_index()



In [279]:
# Sanity check: (rows, columns) of the per-movie aggregate.
mean_women_data.shape

(6173, 9)

In [280]:
# Preview the first rows of the per-movie aggregate.
mean_women_data.head()

Unnamed: 0,Movie_ID,mean_age,num_women,title,SuccessMetric,ratio_W_M,ethnicities,avg_height,Rating
0,/m/011wtv,41.0,6,Minority Report,1.930151,0.333333,"[1.0, 24.0]",1.669,7.599664
1,/m/011x_4,36.8,5,Groundhog Day,0.87335,0.25,[],1.73,8.0
2,/m/011xg5,29.75,5,A.I. Artificial Intelligence,1.229512,0.294118,"[48.0, 46.0]",1.718,7.2
3,/m/011xrr,33.333333,3,The Adventures of Buckaroo Banzai Across the 8...,-0.194771,0.1875,"[1.0, 21.0]",1.725,6.2
4,/m/011yd2,30.571429,7,Apollo 13,1.960816,0.333333,[],1.69,7.700065


# One hot encoding

1/ Check whether the difference in mean success score is significant when comparing all movies against the movies that do not include a given ethnicity -> No

In [281]:
# Approach 1: for each ethnicity, compare the SuccessMetric of ALL movies with
# that of the movies whose actresses do not include the ethnicity.
#
# NOTE(review): the second sample is a subset of the first, so the two samples
# are not independent, which the t-test assumes. The p-values are also not
# adjusted for the number of ethnicities tested.

# pandas / numpy are already imported at the top of the notebook.
from scipy.stats import ttest_ind

# Fixed column set so the result frame has a 'p_value' column to sort on even
# when no ethnicity yields a row.
result_columns = [
    'ethnic_cat',
    'mean_with_ethnic_cat',
    'mean_without_ethnic_cat',
    't_stat',
    'p_value',
]

# Distinct ethnicity codes that appear in at least one movie's list.
all_ethnic_cats = pd.Series(
    [eth for sublist in mean_women_data['ethnicities'] for eth in sublist if pd.notna(eth)]
).unique()

# The "all movies" side of the comparison is the same for every ethnicity, so
# it is prepared once instead of being re-copied on each loop iteration.
all_movies = mean_women_data.copy()
all_scores = all_movies['SuccessMetric']
all_scores_valid = all_scores.dropna()
mean_all = all_scores.mean()
all_scores_constant = all_scores.var() == 0

results_list = []

for ethnic_cat in all_ethnic_cats:
    # Movies whose ethnicity list does not contain this code.
    lacks_cat = all_movies['ethnicities'].apply(
        lambda cats: isinstance(cats, list) and ethnic_cat not in cats
    )
    movies_without_ethnic_cat = all_movies[lacks_cat]

    # Nothing to compare against if every movie includes this ethnicity.
    if movies_without_ethnic_cat.empty:
        continue

    scores_without = movies_without_ethnic_cat['SuccessMetric']
    mean_without = scores_without.mean()

    if all_scores_constant or scores_without.var() == 0:
        # Zero variance: the t-statistic is undefined.
        t_stat, p_value = np.nan, np.nan
    else:
        # equal_var=False selects Welch's t-test (no equal-variance assumption).
        t_stat, p_value = ttest_ind(
            all_scores_valid,
            scores_without.dropna(),
            equal_var=False
        )

    results_list.append({
        'ethnic_cat': ethnic_cat,
        # NOTE: despite its name, this column holds the mean over ALL movies
        # (identical on every row); the name is kept so the output is unchanged.
        'mean_with_ethnic_cat': mean_all,
        'mean_without_ethnic_cat': mean_without,
        't_stat': t_stat,
        'p_value': p_value
    })

# Most significant first; NaN p-values sort last.
results_df = pd.DataFrame(results_list, columns=result_columns).sort_values(by='p_value')

results_df.head(10)


Unnamed: 0,ethnic_cat,mean_with_ethnic_cat,mean_without_ethnic_cat,t_stat,p_value
1,24.0,0.092941,0.067343,1.939816,0.052426
8,25.0,0.092941,0.085,0.597922,0.549903
6,75.0,0.092941,0.085867,0.534988,0.592668
18,4.0,0.092941,0.087306,0.416569,0.677002
10,113.0,0.092941,0.087568,0.406812,0.684153
15,28.0,0.092941,0.088203,0.357775,0.720518
3,46.0,0.092941,0.088428,0.341346,0.732849
11,96.0,0.092941,0.088687,0.32357,0.746269
2,48.0,0.092941,0.08907,0.286248,0.774693
26,22.0,0.092941,0.089287,0.273481,0.784488


2/ Check whether the difference in mean success between the movies featuring at least one actress of the ethnicity and all the other movies (i.e. excluding those) is significant -> Yes for some ethnicities!!

In [282]:
# Approach 2: for each ethnicity, compare movies featuring at least one actress
# of that ethnicity with all remaining movies, using Welch's t-test.
#
# NOTE(review): p-values are not adjusted for the number of ethnicities tested,
# and some "with" groups are very small; read the ranking with that in mind.

# pandas / numpy are already imported at the top of the notebook.
from scipy.stats import ttest_ind

# Column of mean_women_data whose group means are compared.
metric_col = 'SuccessMetric'

# Fixed column set so the result frame has a 'p_value' column to sort on even
# when every ethnicity is skipped.
result_columns = [
    'ethnic_cat',
    'nb_movies_with',
    'nb_movies_without',
    'mean_diff',
    'mean_with_ethnic_cat',
    'mean_without_ethnic_cat',
    't_stat',
    'p_value',
]

# Distinct ethnicity codes that appear in at least one movie's list.
all_ethnic_cats = pd.Series(
    [eth for sublist in mean_women_data['ethnicities'] for eth in sublist if pd.notna(eth)]
).unique()

results_list = []

for ethnic_cat in all_ethnic_cats:
    # Every entry of 'ethnicities' is a list (built by the groupby aggregation),
    # so one membership pass is enough and the "without" group is its complement.
    has_cat = mean_women_data['ethnicities'].apply(lambda cats: ethnic_cat in cats)
    movies_with_ethnic_cat = mean_women_data[has_cat]
    movies_without_ethnic_cat = mean_women_data[~has_cat]

    # A t-test needs at least two observations on each side.
    if len(movies_with_ethnic_cat) < 2 or len(movies_without_ethnic_cat) < 2:
        print(f"Not enough movies for comparison for ethnicity: {ethnic_cat}")
        continue

    scores_with = movies_with_ethnic_cat[metric_col]
    scores_without = movies_without_ethnic_cat[metric_col]
    mean_with = scores_with.mean()
    mean_without = scores_without.mean()

    if scores_with.nunique() < 2 or scores_without.nunique() < 2:
        # Fewer than two distinct non-missing values on one side: no usable
        # variance, so the test is not run.
        t_stat, p_value = np.nan, np.nan
    else:
        # equal_var=False selects Welch's t-test (no equal-variance assumption).
        t_stat, p_value = ttest_ind(
            scores_with.dropna(),
            scores_without.dropna(),
            equal_var=False
        )

    results_list.append({
        'ethnic_cat': ethnic_cat,
        'nb_movies_with': len(movies_with_ethnic_cat),
        'nb_movies_without': len(movies_without_ethnic_cat),
        'mean_diff': mean_with - mean_without,
        'mean_with_ethnic_cat': mean_with,
        'mean_without_ethnic_cat': mean_without,
        't_stat': t_stat,
        'p_value': p_value
    })

# Most significant first; NaN p-values sort last.
results_df = pd.DataFrame(results_list, columns=result_columns).sort_values(by='p_value')

# Display the top 5 results
results_df.head(5)


Not enough movies for comparison for ethnicity: 72.0


Unnamed: 0,ethnic_cat,nb_movies_with,nb_movies_without,mean_diff,mean_with_ethnic_cat,mean_without_ethnic_cat,t_stat,p_value
1,24.0,485,5688,0.325798,0.393142,0.067343,6.960062,1.019564e-11
8,25.0,176,5997,0.278511,0.363511,0.085,4.160025,4.897703e-05
10,113.0,52,6121,0.637822,0.72539,0.087568,4.008263,0.0001983987
6,75.0,130,6043,0.335906,0.421773,0.085867,3.644866,0.0003838379
11,96.0,9,6164,2.917242,3.00593,0.088687,4.658502,0.001624729


In [283]:
# Show the 15 rows with the smallest p-values (results_df is already sorted by p_value).
results_df.head(15)

Unnamed: 0,ethnic_cat,nb_movies_with,nb_movies_without,mean_diff,mean_with_ethnic_cat,mean_without_ethnic_cat,t_stat,p_value
1,24.0,485,5688,0.325798,0.393142,0.067343,6.960062,1.019564e-11
8,25.0,176,5997,0.278511,0.363511,0.085,4.160025,4.897703e-05
10,113.0,52,6121,0.637822,0.72539,0.087568,4.008263,0.0001983987
6,75.0,130,6043,0.335906,0.421773,0.085867,3.644866,0.0003838379
11,96.0,9,6164,2.917242,3.00593,0.088687,4.658502,0.001624729
3,46.0,78,6095,0.357104,0.445532,0.088428,2.844464,0.005681214
26,22.0,181,5992,0.124595,0.213882,0.089287,2.521305,0.01249607
34,132.0,27,6146,-0.244721,-0.15071,0.094011,-2.649894,0.01339469
33,120.0,29,6144,-0.204445,-0.110544,0.093901,-2.594134,0.01475269
47,68.0,11,6162,0.667497,0.759248,0.091751,2.895216,0.01591246


# Using rating to have more data

In [284]:
# Rebuild the per-movie table for the rating analysis: movies need a women
# ratio of at least 0.2 and more than four distinct actors. Unlike the
# SuccessMetric version, rows with a missing SuccessMetric are kept here.
valid_mask = data_imdb['ratio_W/M'].ge(0.2) & data_imdb['num_actors'].gt(4)
data_imdb_women = data_imdb.loc[valid_mask].copy()

# One row per (actress row, movie) pair for the retained movies.
analysis = women_data[
    ['actor_age_movie_released', 'Movie_ID', 'actor_name', 'ethnic_cat', 'actor_height']
].merge(
    data_imdb_women[['title', 'SuccessMetric', 'Movie_ID', 'ratio_W/M', 'Rating']],
    how='inner',
    on='Movie_ID',
)

# Collapse to one row per movie; 'ethnicities' keeps only non-missing codes.
movie_groups = analysis.groupby('Movie_ID')
mean_women_data = movie_groups.agg(
    mean_age=pd.NamedAgg(column='actor_age_movie_released', aggfunc='mean'),
    num_women=pd.NamedAgg(column='actor_age_movie_released', aggfunc='size'),
    title=pd.NamedAgg(column='title', aggfunc='first'),
    SuccessMetric=pd.NamedAgg(column='SuccessMetric', aggfunc='first'),
    ratio_W_M=pd.NamedAgg(column='ratio_W/M', aggfunc='first'),
    ethnicities=pd.NamedAgg(
        column='ethnic_cat',
        aggfunc=lambda x: [eth for eth in x if pd.notna(eth)],
    ),
    avg_height=pd.NamedAgg(column='actor_height', aggfunc='mean'),
    Rating=pd.NamedAgg(column='Rating', aggfunc='first'),
).reset_index()


In [285]:
# Sanity check: (rows, columns) of the rebuilt per-movie aggregate.
mean_women_data.shape

(18352, 9)

In [286]:
# For each ethnicity, compare the Rating of movies featuring at least one
# actress of that ethnicity with the Rating of all remaining movies, using
# Welch's t-test.
#
# NOTE(review): p-values are not adjusted for the number of ethnicities tested.
# nb_movies_* count every movie in a group, including any whose Rating is
# missing, whereas the means and the t-test skip missing values.

# pandas / numpy are already imported at the top of the notebook.
from scipy.stats import ttest_ind

# Column of mean_women_data whose group means are compared.
metric_col = 'Rating'

# Fixed column set so the result frame has a 'p_value' column to sort on even
# when every ethnicity is skipped.
result_columns = [
    'ethnic_cat',
    'nb_movies_with',
    'nb_movies_without',
    'mean_diff',
    'mean_with_ethnic_cat',
    'mean_without_ethnic_cat',
    't_stat',
    'p_value',
]

# Distinct ethnicity codes that appear in at least one movie's list.
all_ethnic_cats = pd.Series(
    [eth for sublist in mean_women_data['ethnicities'] for eth in sublist if pd.notna(eth)]
).unique()

results_list = []

for ethnic_cat in all_ethnic_cats:
    # Every entry of 'ethnicities' is a list (built by the groupby aggregation),
    # so one membership pass is enough and the "without" group is its complement.
    has_cat = mean_women_data['ethnicities'].apply(lambda cats: ethnic_cat in cats)
    movies_with_ethnic_cat = mean_women_data[has_cat]
    movies_without_ethnic_cat = mean_women_data[~has_cat]

    # A t-test needs at least two observations on each side.
    if len(movies_with_ethnic_cat) < 2 or len(movies_without_ethnic_cat) < 2:
        print(f"Not enough movies for comparison for ethnicity: {ethnic_cat}")
        continue

    ratings_with = movies_with_ethnic_cat[metric_col]
    ratings_without = movies_without_ethnic_cat[metric_col]
    mean_with = ratings_with.mean()
    mean_without = ratings_without.mean()

    if ratings_with.nunique() < 2 or ratings_without.nunique() < 2:
        # Fewer than two distinct non-missing values on one side: no usable
        # variance, so the test is not run.
        t_stat, p_value = np.nan, np.nan
    else:
        # equal_var=False selects Welch's t-test (no equal-variance assumption).
        t_stat, p_value = ttest_ind(
            ratings_with.dropna(),
            ratings_without.dropna(),
            equal_var=False
        )

    results_list.append({
        'ethnic_cat': ethnic_cat,
        'nb_movies_with': len(movies_with_ethnic_cat),
        'nb_movies_without': len(movies_without_ethnic_cat),
        'mean_diff': mean_with - mean_without,
        'mean_with_ethnic_cat': mean_with,
        'mean_without_ethnic_cat': mean_without,
        't_stat': t_stat,
        'p_value': p_value
    })

# Most significant first; NaN p-values sort last.
results_df = pd.DataFrame(results_list, columns=result_columns).sort_values(by='p_value')

# Display the top 5 results
results_df.head(5)


Unnamed: 0,ethnic_cat,nb_movies_with,nb_movies_without,mean_diff,mean_with_ethnic_cat,mean_without_ethnic_cat,t_stat,p_value
1,24.0,947,17405,0.310691,6.410269,6.099577,9.974141,1.7047640000000002e-22
6,75.0,251,18101,0.383878,6.494238,6.110359,7.052554,1.559404e-11
8,113.0,120,18232,0.514695,6.626939,6.112244,7.122709,7.967913e-11
9,25.0,264,18088,0.346725,6.457347,6.110622,6.151815,2.694278e-09
44,72.0,276,18076,-0.438296,5.683906,6.122201,-5.571364,5.912908e-08


In [287]:
# Show the 15 rows with the smallest p-values (results_df is already sorted by p_value).
results_df.head(15)

Unnamed: 0,ethnic_cat,nb_movies_with,nb_movies_without,mean_diff,mean_with_ethnic_cat,mean_without_ethnic_cat,t_stat,p_value
1,24.0,947,17405,0.310691,6.410269,6.099577,9.974141,1.7047640000000002e-22
6,75.0,251,18101,0.383878,6.494238,6.110359,7.052554,1.559404e-11
8,113.0,120,18232,0.514695,6.626939,6.112244,7.122709,7.967913e-11
9,25.0,264,18088,0.346725,6.457347,6.110622,6.151815,2.694278e-09
44,72.0,276,18076,-0.438296,5.683906,6.122201,-5.571364,5.912908e-08
46,15.0,210,18142,-0.458469,5.662387,6.120856,-4.877293,2.107437e-06
3,46.0,202,18150,0.314992,6.427135,6.112143,4.791195,3.153256e-06
12,96.0,14,18338,0.977584,7.092448,6.114864,6.879893,1.079378e-05
30,37.0,35,18317,0.491038,6.605711,6.114673,3.955357,0.0003639496
17,22.0,274,18078,0.209191,6.321677,6.112486,3.56575,0.0004254906
