In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

import os
import plotly.graph_objects as go
import plotly.offline as pyo
from scipy import signal
import scipy.cluster.hierarchy as spc
from pandas import read_excel
from ipywidgets import widgets
from ipywidgets import interactive, HBox, VBox
import plotly.io as pio
from sklearn.cluster import KMeans
from sklearn import linear_model
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score
from plotly.graph_objs import *
from plotly.subplots import make_subplots
import plotly.express as px
import json


In [2]:
def plot_distrib(df_weekly_all, dataset, split_week=152):
    """Compare the weekly share of women speakers before and after #MeToo.

    The rows of ``df_weekly_all`` are split positionally at ``split_week``.
    Both ``percent_women`` samples are drawn as overlaid histograms on shared
    bins, with a dashed vertical line at each sample mean. The two means and
    the p-value of a two-sample t-test (NaNs omitted) are printed.

    Parameters
    ----------
    df_weekly_all : pandas.DataFrame
        Frame with a ``percent_women`` column, one row per week.
        NOTE(review): assumes rows are in chronological order starting in
        2015 -- confirm against the caller.
    dataset : str
        Dataset name, used only in the figure title and the printed message.
    split_week : int, default 152
        Positional index of the first row of the "after" period. The default
        is the row the original analysis treats as September 2017.

    Returns
    -------
    None
        The figure is drawn on a new matplotlib figure; results are printed.
    """
    # before MeToo: rows [0, split_week); after MeToo: rows [split_week, end)
    percent_b4_metoo = df_weekly_all.iloc[:split_week].percent_women
    percent_post_metoo = df_weekly_all.iloc[split_week:].percent_women

    mean_b4_metoo = percent_b4_metoo.mean()
    mean_post_metoo = percent_post_metoo.mean()

    plt.figure(figsize=(11.7, 8.27))
    # The second histogram reuses the bin edges of the first one so that the
    # bars of both periods are directly comparable.
    _, bins_, _ = plt.hist(percent_b4_metoo, bins=30, fc=(0, 0, 1, 0.7),
                           label='Before September 2017')
    plt.hist(percent_post_metoo, bins=bins_, fc=(1, 0, 0, 0.7),
             label='After September 2017')
    plt.axvline(x=mean_b4_metoo, ls='--', c=(0, 0, 1, 1),
                label='Before September 2017 (mean)')
    plt.axvline(x=mean_post_metoo, ls='--', c=(0.5, 0, 0, 1),
                label='After September 2017 (mean)')
    # Labels are attached to the artists themselves, so each legend entry is
    # guaranteed to describe the right histogram / mean line.
    plt.legend()
    plt.title(f'Weekly percentage of women speakers on {dataset} dataset')
    plt.ylabel('frequency')
    plt.xlabel('percentage of women speaker')

    print(mean_b4_metoo, mean_post_metoo)
    p_value = stats.ttest_ind(percent_b4_metoo, percent_post_metoo, nan_policy="omit").pvalue
    print(f'The p-value for the {dataset} dataset is {p_value}')

In [5]:
    def plotly_distrib(df, datasets):
        """Show the weekly percentage of women speakers for several datasets.

        For every dataset a filled area of ``percent_women * 100`` over
        ``date`` is drawn, together with a straight line fitted by ordinary
        least squares of ``percent_women`` against the frame's index. The
        figure gets a date range selector and a range slider and is displayed
        with ``fig.show()``.

        Parameters
        ----------
        df : sequence of pandas.DataFrame
            One frame per dataset, in the same order as ``datasets``. Each
            frame must provide ``date`` and ``percent_women`` columns.
            NOTE(review): the fit uses the index as the x variable, so it
            presumably has to be a numeric week counter -- confirm.
        datasets : sequence of str
            Display names of the datasets; used for the legend entries.

        Returns
        -------
        None
        """
        fig = go.Figure()

        for i, dataset_name in enumerate(datasets):
            weekly = df[i]

            # Linear trend of the women share over the (numeric) row index.
            fit = stats.linregress(weekly.index, weekly.percent_women)
            line = fit.slope * weekly.index + fit.intercept

            # Filled area: observed weekly share, in percent.
            fig.add_trace(go.Scatter(
                x=weekly.date,
                y=weekly.percent_women * 100,
                fill='tozeroy',
                mode='none',
                name=dataset_name,
            ))
            # Fitted trend line, in percent. No marker settings are needed
            # for a lines-only trace.
            fig.add_trace(go.Scatter(
                x=weekly.date,
                y=line * 100,
                mode='lines',
                name=f'Fitted line {dataset_name}',
            ))

        # Slightly transparent traces so overlapping areas stay readable.
        fig.update_traces(opacity=0.9)

        # Date axis with quick-range buttons and a range slider.
        fig.update_layout(
            xaxis=go.layout.XAxis(
                rangeselector=dict(
                    buttons=[
                        dict(count=1, label="1m", step="month", stepmode="backward"),
                        dict(count=6, label="6m", step="month", stepmode="backward"),
                        dict(count=1, label="YTD", step="year", stepmode="todate"),
                        dict(count=1, label="1y", step="year", stepmode="backward"),
                        dict(step="all"),
                    ]
                ),
                rangeslider=dict(visible=True),
                type="date",
            ),
            yaxis_title="Percentage of women speaker[%]",
        )

        fig.show()

In [6]:
# Root folder holding the pre-computed weekly speaker counts of every dataset.
PATH_DATA = './data/data_nlp/weelky_count/'

datasets = ['quotebank', 'women', 'metoo']
years = range(2015, 2021)
# One concatenated frame per dataset, in the same order as `datasets`.
DF = []

for dataset in datasets:
    # Yearly frames of the current dataset; rebuilt from scratch each time.
    frames = []
    for year in years:
        if dataset == 'quotebank':
            # quotebank counts are stored as CSV, first column used as index
            QUOTES_FILE = PATH_DATA + dataset + f'/{year}_count.csv'
            df_weekly_count = pd.read_csv(QUOTES_FILE, index_col=0)
        else:
            # the other datasets are stored as bz2-compressed JSON lines
            QUOTES_FILE = PATH_DATA + dataset + f'/quotes-{year}-filtered_weekly_count.json.bz2'
            df_weekly_count = pd.read_json(QUOTES_FILE, lines=True, compression='bz2', typ='frame')

        # Everything except the male / female counts is discarded.
        useless_columns = df_weekly_count.columns.tolist()
        useless_columns.remove('male')
        useless_columns.remove('female')
        df_weekly_count = df_weekly_count.drop(columns=useless_columns)

        # Add the total count and the women share, then drop rows containing
        # NaN in any column.
        df_weekly_count = df_weekly_count.assign(
            male_female=lambda counts: counts.male + counts.female,
            percent_women=lambda counts: counts.female / counts.male_female,
        ).dropna()

        frames.append(df_weekly_count)

    # Stack all years into one frame and give each row a weekly date,
    # generated from 1/1/2015 onwards.
    df_weekly_all = pd.concat(frames).reset_index()
    df_weekly_all['date'] = pd.date_range(start='1/1/2015', periods=len(df_weekly_all), freq='W')
    DF.append(df_weekly_all)

plotly_distrib(DF, datasets=datasets)


The p-value obtained from this t-test is below 0.05, so we can reject the null hypothesis at the 5% significance level. In other words, we reject the hypothesis that the weekly percentage of women speakers has the same mean before and after the MeToo movement started. We can therefore conclude that the difference between the two periods is statistically significant: women were given more voice in the media when talking about subjects linked to MeToo.