In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import statsmodels.api as sm
import statsmodels.formula.api as smf
%matplotlib inline
import os
import ast
from helpers import *

### 5.1.2 Matching on by generation
First we are going to count the number of movies per actor which will be our interest variable for representativeness. After the movies are separeted in different dataframes according the generation they belong. Then for each dataset we are going to calculate the propensity score for each line, then we are going to make the mean of the propensity score for each actor, and keep only one feature for each actor in the dataset (so that actor that play multiple times don't have more chance to be choosen than other ones). After that 350 actors will be match according to their gender so as to delete the biases for each dataset. Finally statistics tests will be made to assess the representativeness of woman and the evolution according to the generations. To know if after taking into account the biases the men are still more represented than women, a t-test is going to be made between the number of movies where women and men play in the matched sample for each generation. Chi2 test are going to be made between generations matched values to know if the distribution of men/women change significantly according to the time. 

In [3]:
df = pd.read_csv("data_cleaned.csv", index_col=0)

### 5.1.2.1 Dataset processing
We replace the characters that are problematic in string for the logistic regression. We delete the NA values, we reset the index to find actor back later and we count the number of movies made by each actor. And we apply generation function for each line to treat them next.

In [5]:
#replace to do regression
df.columns = df.columns.str.replace(' ', '')
df.columns = df.columns.str.replace('-', '')
df.columns = df.columns.str.replace('&', '')

#only keep not na values
df= df[df['Movie_box_office_scaled'].notna()]
df= df[df['Movie_runtime'].notna()]
df= df[df['Actor_height_scaled'].notna()]
df= df[df['Actor_name'].notna()]
#count number of films done per actor
df

Unnamed: 0,Movie_name,Movie_release_date,Movie_box_office_scaled,Movie_runtime,Actor_gender_male,Actor_height_scaled,Actor_name,Actor_age_at_movie_release,Cluster_Name_BWIndie,Cluster_Name_ComedyAction,...,Cluster_Name_RomanceComedy,Cluster_Name_ShortfilmWorldcinema,Cluster_Name_Thriller,Region_Asia,Region_Deadcountry,Region_EastEuropa,Region_NorthAmerica,Region_Oceania,Region_SouthAmerica,Region_WestEuropa
0,Ghosts of Mars,2001.0,7.912729e+06,98.0,0,1.031847,Wanda De Jesus,42.0,False,False,...,False,False,False,False,False,False,True,False,False,False
1,Ghosts of Mars,2001.0,7.912729e+06,98.0,0,1.133758,Natasha Henstridge,27.0,False,False,...,False,False,False,False,False,False,True,False,False,False
2,Ghosts of Mars,2001.0,7.912729e+06,98.0,1,0.990877,Ice Cube,32.0,False,False,...,False,False,False,False,False,False,True,False,False,False
3,Ghosts of Mars,2001.0,7.912729e+06,98.0,1,1.004074,Jason Statham,33.0,False,False,...,False,False,False,False,False,False,True,False,False,False
4,Ghosts of Mars,2001.0,7.912729e+06,98.0,0,1.050955,Clea DuVall,23.0,False,False,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199073,Guilty as Sin,1993.0,1.584282e+07,107.0,1,1.032761,Don Johnson,43.0,False,False,...,False,False,True,False,False,False,True,False,False,False
199074,Guilty as Sin,1993.0,1.584282e+07,107.0,0,1.059873,Rebecca De Mornay,33.0,False,False,...,False,False,True,False,False,False,True,False,False,False
199076,Guilty as Sin,1993.0,1.584282e+07,107.0,1,1.027024,Stephen Lang,40.0,False,False,...,False,False,True,False,False,False,True,False,False,False
199077,Guilty as Sin,1993.0,1.584282e+07,107.0,1,0.981123,Luis Guzmán,36.0,False,False,...,False,False,True,False,False,False,True,False,False,False


In [6]:
#apply generation function 
def gen25(year):
    if 1900<year<=1925:
        return "1900-1925"
    if 1925<year<=1950:
        return "1925-1950"
    if 1950<year<=1975:
        return "1950-1975"
    if 1975<year<=2000:
        return "1975-2000"
    if 2000<year<= 2025:
        return "2000-2025"

order = ["2000-2025","1975-2000","1950-1975","1925-1950","1900-1925"]

df['Generation'] = df['Movie_release_date'].apply(gen25)
df['Generation'] = df['Generation'].astype("category")
df['Generation'].cat.reorder_categories(order,ordered=True)

0         2000-2025
1         2000-2025
2         2000-2025
3         2000-2025
4         2000-2025
            ...    
199073    1975-2000
199074    1975-2000
199076    1975-2000
199077    1975-2000
199078    1975-2000
Name: Generation, Length: 46369, dtype: category
Categories (5, object): ['2000-2025' < '1975-2000' < '1950-1975' < '1925-1950' < '1900-1925']

### 5.1.2.3 Datasets creation
Four dataset are created for the generations: >2000, 1975-2000, 1950-1975,1925-1950 (1900-1925 not taken into account because of not enough values). Then all number values are normalized except the count and the categorical value for each dataset.

In [8]:
#create a list of dataframe for the last dataset in term of year
listdf = [df[df.Generation== order[i]] for i in range(len(order)-1)]


In [9]:
#standardize the values
cols = df.columns.difference(['Actor_gender_male','Generation','Actor_name','Movie_name'])

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
for i in listdf:
    i[cols] = sc.fit_transform(i[cols])
    i['count']=i.groupby(['Actor_name']).Actor_name.transform('count')
    i.reset_index(drop=True, inplace=True)
    
#listdf[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i[cols] = sc.fit_transform(i[cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i['count']=i.groupby(['Actor_name']).Actor_name.transform('count')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i[cols] = sc.fit_transform(i[cols])
A value is trying to be set on a copy of a slice from a DataFrame.

In [44]:
listdf[3]["count"].describe()
listdf[3]["count"].count()

359

In [48]:
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
import networkx as nx
from scipy.stats import ttest_ind
#matching
def computediff(prop1,prop2):
    return np.abs(prop1-prop2)
def compute_similarity(prop1,prop2):
    return 1-np.abs(prop1-prop2)
#two list containing number of men and women of each generation for our sample
list_cm=[]
list_cw = []
a=0

for i in listdf:
    #logstic regression
    mod = smf.logit(formula='Actor_gender_male ~  Movie_release_date * Movie_box_office_scaled +Movie_runtime+Actor_height_scaled \
                    +Actor_age_at_movie_release+ C(Cluster_Name_CrimeThriller)+ C(Cluster_Name_Drama)\
                    + C(Cluster_Name_BWIndie)+C(Cluster_Name_ComedyAction)+ C(Cluster_Name_ComedyShortfilm)+\
                    +C(Cluster_Name_DramaBW)+C(Cluster_Name_DramaComedy)+C(Cluster_Name_DramaIndie)+ C(Cluster_Name_DramaRomance)+C(Cluster_Name_DramaThriller)+C(Cluster_Name_RomanceComedy)\
                    +C(Cluster_Name_ShortfilmWorldcinema)+C(Cluster_Name_Thriller)', data=i)
    res = mod.fit()
    #propensity score calculation
    i['Propensity_score'] = res.predict()
    #only keep unique value of actor to do so the mean of propensity score is calculated for each actor then the matching is done and a unique value is kept 
    i['Propensity_score'] = i.groupby('Actor_name')['Propensity_score'].transform('mean')
    i.drop_duplicates(subset=['Actor_name'], inplace=True,ignore_index=True)
    print(i.Propensity_score.count())
    #matching is done on 350 sample because only total of 359 for the first generation taken (in term of years)
    dftest = i.sample(n=350,random_state = 300)
    woman = dftest[dftest.Actor_gender_male == 0]
    men =  dftest[dftest.Actor_gender_male == 1 ]  
    G = nx.Graph()
    for men_id,men_row in men.iterrows():
        for woman_id, woman_row in woman.iterrows():
            diff= computediff(woman_row["Propensity_score"],men_row["Propensity_score"])
            if (diff<0.05):
                similarity =  compute_similarity(woman_row["Propensity_score"],men_row["Propensity_score"])
                G.add_weighted_edges_from([(woman_id, men_id, similarity)])
    matching = nx.max_weight_matching(G)
    print("Number of successful matching:",len(matching))
    #reconstruct men and women feature after match
    matched = [j[0] for j in list(matching)] + [j[1] for j in list(matching)]
    bd = i.iloc[matched]
    men = bd.loc[bd['Actor_gender_male'] == 1]
    woman= bd.loc[bd['Actor_gender_male'] == 0]
    #ttest to know if men are significantly over represented
    print(ttest_ind(men['count'],woman['count']))
    #total value of count is calculated for the generation for men and women
    list_cm.append(men['count'].sum())
    list_cw.append(woman['count'].sum())
    

Optimization terminated successfully.
         Current function value: 0.605058
         Iterations 6
5740


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i['Propensity_score'] = res.predict()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i['Propensity_score'] = i.groupby('Actor_name')['Propensity_score'].transform('mean')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i.drop_duplicates(subset=['Actor_name'], inplace=True,ignore_index=True)


Number of successful matching: 113
TtestResult(statistic=2.402355298040416, pvalue=0.017105298823742197, df=224.0)
Optimization terminated successfully.
         Current function value: 0.595617
         Iterations 6
4400


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i['Propensity_score'] = res.predict()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i['Propensity_score'] = i.groupby('Actor_name')['Propensity_score'].transform('mean')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i.drop_duplicates(subset=['Actor_name'], inplace=True,ignore_index=True)


Number of successful matching: 117
TtestResult(statistic=0.6449320159637318, pvalue=0.519608470902071, df=232.0)
Optimization terminated successfully.
         Current function value: 0.592454
         Iterations 6
1102


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i['Propensity_score'] = res.predict()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i['Propensity_score'] = i.groupby('Actor_name')['Propensity_score'].transform('mean')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i.drop_duplicates(subset=['Actor_name'], inplace=True,ignore_index=True)


Number of successful matching: 113
TtestResult(statistic=2.030860197069208, pvalue=0.04345060002668793, df=224.0)
         Current function value: 0.555200
         Iterations: 35
359


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i['Propensity_score'] = res.predict()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i['Propensity_score'] = i.groupby('Actor_name')['Propensity_score'].transform('mean')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i.drop_duplicates(subset=['Actor_name'], inplace=True,ignore_index=True)


Number of successful matching: 107
TtestResult(statistic=-0.9037621975135322, pvalue=0.3671473115380707, df=212.0)


In [50]:
import scipy
vector=[[list_cm],[list_cw]]

#here we print the relative proportion of film that were played by woman
print(list_cm,list_cw,[(list_cw[i]/(list_cw[i]+list_cm[i])) for i in range(len(list_cm))])

#chi2test is done to see if the representation change over all generations
res = scipy.stats.chi2_contingency(vector)
print("res.statistic:",res.statistic)
print("res.pavalue:",res.pvalue)


[484, 576, 326, 215] [359, 529, 256, 238] [0.4258600237247924, 0.47873303167420816, 0.43986254295532645, 0.5253863134657837]
res.statistic: 14.119094898648497
res.pavalue: 0.002747445520344401


In [30]:
#chi2test is done to see if the representation change over generations one to one
for i in range(0,len(list_cm)):
    for j in range(0,len(list_cm)):
        print("Chi2 test between the generation",i+1,"and generation",j+1)
        vector=[[list_cm[i],list_cm[j]],[list_cw[i],list_cw[j]]]
        res = scipy.stats.chi2_contingency(vector)
        print("res.statistic:",res.statistic)
        print("res.pavalue:",res.pvalue)
                

Chi2 test between the generation 1 and generation 1
res.statistic: 0.0
res.pavalue: 1.0
Chi2 test between the generation 1 and generation 2
res.statistic: 5.178199871998941
res.pavalue: 0.022872008094389985
Chi2 test between the generation 1 and generation 3
res.statistic: 0.2210614485192221
res.pavalue: 0.6382323353554833
Chi2 test between the generation 1 and generation 4
res.statistic: 11.350583911819905
res.pavalue: 0.0007542425162548356
Chi2 test between the generation 2 and generation 1
res.statistic: 5.178199871998941
res.pavalue: 0.022872008094389985
Chi2 test between the generation 2 and generation 2
res.statistic: 0.0
res.pavalue: 1.0
Chi2 test between the generation 2 and generation 3
res.statistic: 2.1614725753741313
res.pavalue: 0.14150901873522037
Chi2 test between the generation 2 and generation 4
res.statistic: 2.614280402323909
res.pavalue: 0.10590555716618338
Chi2 test between the generation 3 and generation 1
res.statistic: 0.22106144851922213
res.pavalue: 0.63823233