In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import statsmodels.api as sm
import statsmodels.formula.api as smf
%matplotlib inline
import os
import ast
import warnings
warnings.filterwarnings('ignore')
from helpers import *

### 5.1.2 Matching on by generation
First we are going to count the number of movies per actor per generation which will be our interest variable for representativeness. After the movies are separeted in different dataframes according the generation they belong. Then for each dataset we are going to calculate the propensity score for each line, then we are going to make the mean of the propensity score for each actor, and keep only one feature for each actor in the dataset (so that actor that play multiple times don't have more chance to be choosen than other ones). After that, 350 actors will be match according to their gender so as to delete the biases for each dataset. Finally statistics tests will be made to assess the representativeness of woman and the evolution according to the generations. To know if after taking into account the biases the men are still more represented than women, a t-test is going to be made between the number of movies where women and men play in the matched sample for each generation. Chi2 test are going to be made between generations matched values to know if the distribution of men/women change significantly according to the time. 

In [20]:
df = pd.read_csv("data_cleaned.csv", index_col=0)

### 5.1.2.1 Dataset processing
We replace the characters that are problematic in string for the logistic regression. We delete the NA values. And we apply generation function for each line to treat them next.

In [21]:
df = data_preparation(df)

In [22]:
#apply generation function 
def gen25(year):
    if 1900<year<=1925:
        return "1900-1925"
    if 1925<year<=1950:
        return "1925-1950"
    if 1950<year<=1975:
        return "1950-1975"
    if 1975<year<=2000:
        return "1975-2000"
    if 2000<year<= 2025:
        return "2000-2020"

order = ["2000-2020","1975-2000","1950-1975","1925-1950","1900-1925"]

df['Generation'] = df['Movie_release_date'].apply(gen25)
df['Generation'] = df['Generation'].astype("category")
df['Generation'].cat.reorder_categories(order,ordered=True)
df['count']=df.groupby(['Actor_name','Generation']).Actor_name.transform('count')

### 5.1.2.3 Datasets creation
Four datasets are created for the generations: >2000, 1975-2000, 1950-1975,1925-1950 (1900-1925 not taken into account because of not enough values). Then all number values are normalized except the count and the categorical value for each dataset. The index are reset to allow to find them back after the analysis.

In [23]:
#create a list of dataframe for the last dataset in term of year
listdf = [df[df.Generation== order[i]] for i in range(len(order)-1)]


In [24]:
#standardize the values
cols = df.columns.difference(['Actor_gender_male','Generation','Actor_name','Movie_name','count'])

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
for i in listdf:
    i[cols] = sc.fit_transform(i[cols])
    i.reset_index(drop=True, inplace=True)
    
#listdf[0]

### 5.1.2.4 Regression and matching
Here we do the regression, propensity score and matching for each dataset, then we retrieve de number men and women that were matched for further analysis. Furthermore a t-test is done to know if there men are significantly over represented. 

In [25]:
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
import networkx as nx
from scipy.stats import ttest_ind
#matching
def computediff(prop1,prop2):
    return np.abs(prop1-prop2)
def compute_similarity(prop1,prop2):
    return 1-np.abs(prop1-prop2)
#two list containing number of men and women of each generation for our sample
list_cm=[]
list_cw = []

for i in listdf:
    mod = smf.logit(formula="Actor_gender_male ~  Movie_release_date * Movie_box_office_scaled + Movie_runtime + Actor_height_scaled +\
                    C(Cluster_Name_Comedy_Action) + C(Cluster_Name_Comedy_Short_film) + C(Cluster_Name_Crime_Thriller) +\
                    C(Cluster_Name_Drama) + C(Cluster_Name_Drama_BW) + C(Cluster_Name_Drama_Comedy) + C(Cluster_Name_Drama_Indie) +\
                    C(Cluster_Name_Drama_Romance) + C(Cluster_Name_Drama_Thriller) + C(Cluster_Name_Romance_Comedy) +\
                    C(Cluster_Name_Short_film_World_cinema) + C(Cluster_Name_Thriller)", data=i)
    res = mod.fit()
    #propensity score calculation
    i['Propensity_score'] = res.predict()
    #only keep unique value of actor to do so the mean of propensity score is calculated for each actor then the matching is done and a unique value is kept 
    i['Propensity_score'] = i.groupby('Actor_name')['Propensity_score'].transform('mean')
    i.drop_duplicates(subset=['Actor_name'], inplace=True,ignore_index=True)
    print(i.Propensity_score.count())
    #matching is done on 350 sample because only total of 359 for the first generation taken (in term of years)
    dftest = i.sample(n=350,random_state = 22)
    woman = dftest[dftest.Actor_gender_male == 0]
    men =  dftest[dftest.Actor_gender_male == 1 ]  
    G = nx.Graph()
    for men_id,men_row in men.iterrows():
        for woman_id, woman_row in woman.iterrows():
            diff= computediff(woman_row["Propensity_score"],men_row["Propensity_score"])
            if (diff<0.05):
                similarity =  compute_similarity(woman_row["Propensity_score"],men_row["Propensity_score"])
                G.add_weighted_edges_from([(woman_id, men_id, similarity)])
    matching = nx.max_weight_matching(G)
    print("Number of successful matching:",len(matching))
    #reconstruct men and women feature after match
    matched = [j[0] for j in list(matching)] + [j[1] for j in list(matching)]
    bd = i.iloc[matched]
    men = bd.loc[bd['Actor_gender_male'] == 1]
    woman= bd.loc[bd['Actor_gender_male'] == 0]
    #ttest to know if men are significantly over represented
    print(ttest_ind(men['count'],woman['count']))
    #total value of count is calculated for the generation for men and women
    list_cm.append(men['count'].sum())
    list_cw.append(woman['count'].sum())
    

Optimization terminated successfully.
         Current function value: 0.605946
         Iterations 6
5740
Number of successful matching: 143
TtestResult(statistic=1.440174866401392, pvalue=0.15091937584748102, df=284.0)
Optimization terminated successfully.
         Current function value: 0.580296
         Iterations 6
4400
Number of successful matching: 134
TtestResult(statistic=4.616278051082542, pvalue=6.085286940427739e-06, df=266.0)
Optimization terminated successfully.
         Current function value: 0.610996
         Iterations 6
1102
Number of successful matching: 135
TtestResult(statistic=2.557261403049082, pvalue=0.01110114279631233, df=268.0)
Optimization terminated successfully.
         Current function value: 0.669989
         Iterations 6
359
Number of successful matching: 146
TtestResult(statistic=-0.2172077128982076, pvalue=0.8281992092781166, df=290.0)


### 5.1.2.5 Chi2tests
Here chi2 tests are made to know if there are signficant difference in men and women representation along the time. It is calculated between all datasets and by comparing all datasets one to one.

In [26]:
import scipy
vector=[[list_cm],[list_cw]]

#here we print the relative proportion of film that were played by woman
print(list_cm,list_cw,[(list_cw[i]/(list_cw[i]+list_cm[i])) for i in range(len(list_cm))])

#chi2test is done to see if the representation change over all generations
res = scipy.stats.chi2_contingency(vector)
print("res.statistic:",res.statistic)
print("res.pavalue:",res.pvalue)


[659, 759, 410, 291] [546, 415, 313, 297] [0.45311203319502075, 0.3534923339011925, 0.43291839557399725, 0.5051020408163265]
res.statistic: 44.057503897874845
res.pavalue: 1.467325412216912e-09


In [27]:
#chi2test is done to see if the representation change over generations one to one
for i in range(0,len(list_cm)):
    for j in range(0,len(list_cm)):
        print("Chi2 test between the generation",i+1,"and generation",j+1)
        vector=[[list_cm[i],list_cm[j]],[list_cw[i],list_cw[j]]]
        res = scipy.stats.chi2_contingency(vector)
        print("res.statistic:",res.statistic)
        print("res.pavalue:",res.pvalue)
                

Chi2 test between the generation 1 and generation 1
res.statistic: 0.0
res.pavalue: 1.0
Chi2 test between the generation 1 and generation 2
res.statistic: 24.097836290302936
res.pavalue: 9.156312132531109e-07
Chi2 test between the generation 1 and generation 3
res.statistic: 0.6664124207341842
res.pavalue: 0.41430520363557954
Chi2 test between the generation 1 and generation 4
res.statistic: 4.081623728094725
res.pavalue: 0.043351980234582546
Chi2 test between the generation 2 and generation 1
res.statistic: 24.097836290302936
res.pavalue: 9.156312132531109e-07
Chi2 test between the generation 2 and generation 2
res.statistic: 0.0
res.pavalue: 1.0
Chi2 test between the generation 2 and generation 3
res.statistic: 11.602372442879847
res.pavalue: 0.0006586774007006551
Chi2 test between the generation 2 and generation 4
res.statistic: 36.76998156840669
res.pavalue: 1.329212401381599e-09
Chi2 test between the generation 3 and generation 1
res.statistic: 0.666412420734184
res.pavalue: 0.414

### Special mention, trash result
Here we show that depending on the sample the results are not the same and differ too much, the results are indeed trash. 

# 