# Analysis of gender distribution in UK's leading newspapers
# Gender representation

## Content
1. [Setup](#setup)   
    1.1 [Global](#global_setup)  
    1.2 [Local](#local_setup)   
2. [Time statistic](#time_statistic)   
    2.1 [Functions](#functions)  
    2.2 [Create statistic](#statistic)    
    2.3 [Plots](#plots) 

## 1. Setup
<a id="setup"></a>

### 1.1 Global
<a id="global_setup"></a>

In [1]:
# Set to True to run on Google Colab; also passed on to process_data_in_chunks
use_colab = True

# On Colab: mount the shared Google Drive, move into the project folder and
# pin the pandas version used by this notebook
if use_colab:
    from google.colab import drive
    # NOTE(review): `_mount` is underscore-prefixed, i.e. non-public by Python
    # convention - confirm whether the public `drive.mount` would suffice here
    drive._mount('/content/drive', force_remount=True)
    # IPython magic: change the working directory to the shared project drive
    %cd /content/drive/Shareddrives/ADA-project
    !pip install pandas==1.0.5 # downgrade pandas for chunk processing support

Mounted at /content/drive
/content/drive/Shareddrives/ADA-project


In [2]:
# Defined paths for the data
from scripts.path_defs import *

# Defined newspapers and urls
from scripts.newspapers import *

# Globally used functions
from scripts.utility_functions import load_mini_version_of_data
from scripts.utility_functions import convert_to_1Dseries
from scripts.utility_functions import process_data_in_chunks

### 1.2 Local 
<a id="local_setup"></a>

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
import bz2
import plotly.graph_objects as go
import plotly.express as px

## 2. Time statistic
<a id="time_statistic"></a>

We want to analyse how the presence of women and gender minorities in UK newspapers has evolved over time. As a first step, we look at the distribution of the speakers' genders.

In [None]:
# Load the merged UK quotes of 2020 (bz2-compressed, one JSON record per line)
df_2020 = pd.read_json(
    MERGED_QUOTES_UK_2020_PATH,
    compression='bz2',
    lines=True,
)

# Count how often each gender label occurs
# (convert_to_1Dseries presumably flattens list-valued entries - verify in
# scripts.utility_functions)
gender_labels = convert_to_1Dseries(df_2020["gender"])
gender_labels.value_counts()

male                  119086
female                 38816
non-binary               166
transgender female       141
genderfluid               43
cisgender female          12
transgender male           6
intersex                   5
shemale                    2
genderqueer                2
dtype: int64

As expected, the two most represented genders are male and female. It is interesting to have an overview of all the gender minorities present, but for the rest of the analysis we chose to group them into a single category, "other".


### 2.1 Functions
<a id="functions"></a>

In [None]:
def group_genders(x):
    """Reduce a speaker's gender labels to 'male', 'female' or 'other'.

    A speaker with exactly one label that is 'male' or 'female' keeps that
    label. Every other case (a different single label, several labels, or no
    label at all) is grouped into 'other'.
    """
    binary_genders = ('male', 'female')
    has_single_binary_label = len(x) == 1 and x[0] in binary_genders
    return x[0] if has_single_binary_label else 'other'

def get_gender_counts(chunk, gender_repr_df, year):
    """Add one chunk's daily per-newspaper gender counts to the running totals.

    The newspapers to aggregate are read from the global ``newspapers_df``
    (column ``name``), which must be defined before this function is called.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Chunk of quotes. The columns ``gender``, ``numOccurrences``, ``date``
        and ``newspapers`` are used. The ``gender`` column of the passed
        frame is overwritten in place with the grouped label.
    gender_repr_df : pandas.DataFrame
        Counts accumulated from previously processed chunks (may be empty).
    year : int
        Not used in this function; kept so the signature stays compatible
        with existing callers.

    Returns
    -------
    pandas.DataFrame
        One row per (date, newspaper) holding the summed ``counts_*`` (one
        per quote) and ``counts_occ_*`` (weighted by ``numOccurrences``)
        columns of all chunks processed so far.
    """
    # Apply gender grouping into male, female and other
    chunk['gender'] = chunk.gender.apply(group_genders)

    # Create dummy columns to aggregate counts
    # Per quote:
    dummies_df = pd.get_dummies(chunk['gender'], prefix='counts')
    # Using number of occurrences
    dummies_occ_df = pd.get_dummies(chunk['gender'], prefix='counts_occ').mul(chunk.numOccurrences, 0)
    chunk = pd.concat([chunk, dummies_df, dummies_occ_df], axis=1)

    # Parse the dates so the rows can be resampled per day
    chunk['date'] = pd.to_datetime(chunk['date'], yearfirst=True)

    # Collect the running totals plus one daily frame per newspaper and join
    # them with a single concat. DataFrame.append was removed in pandas 2.0
    # and re-copied the whole accumulator on every call.
    frames = [gender_repr_df]
    for newspaper in newspapers_df['name']:
        # Select quotes from one newspaper
        mask_newspaper = chunk.newspapers.apply(lambda x: newspaper in x)
        newspaper_df = chunk[mask_newspaper]

        # Aggregate quote counts per day and gender
        newspaper_df = newspaper_df.resample('D', on='date').sum()
        newspaper_df = newspaper_df.drop('numOccurrences', axis=1)  # remove now useless column
        newspaper_df.insert(0, 'newspaper', newspaper)  # insert newspaper name

        # Turn the date index back into a column so it survives the concat
        frames.append(newspaper_df.reset_index())

    gender_repr_df = pd.concat(frames, ignore_index=True)

    # Merge rows of the same day and newspaper coming from different chunks
    gender_repr_df = gender_repr_df.groupby(['date', 'newspaper']).sum().reset_index()

    return gender_repr_df

In [None]:
def create_gender_representation(country_string):
    """Build the daily per-newspaper gender counts of one country for 2015-2020.

    For each year the global constant ``MERGED_QUOTES_<country>_<year>_PATH``
    is looked up and handed to ``process_data_in_chunks`` together with
    ``get_gender_counts``, the counts gathered so far, the year and the
    global ``use_colab`` flag. A progress line is printed after every year.

    Parameters
    ----------
    country_string : str
        Country code as used in the path constant names, e.g. 'UK'.

    Returns
    -------
    The object returned by ``process_data_in_chunks`` for the last year.
    """
    gender_repr_df = pd.DataFrame()

    for year in range(2015, 2021):
        # Name of the module-level constant holding this year's file path
        path_constant = '_'.join(['MERGED_QUOTES', country_string, str(year), 'PATH'])
        path_to_file = globals()[path_constant]

        gender_repr_df = process_data_in_chunks(
            path_to_file, get_gender_counts, gender_repr_df, year, use_colab
        )
        print(year, 'finished')

    return gender_repr_df

### 2.2 Create statistic
<a id="statistic"></a>

In [None]:
# Dataframe of the UK newspapers (name and website url).
# It is not passed as an argument: get_gender_counts reads the global
# `newspapers_df`, so it has to be set before the statistics are created!
newspapers_df = pd.DataFrame(newspapers_uk_list, columns=['name', 'website_url'])

# Create the per-day gender statistics for every newspaper
gender_df_repr = create_gender_representation('UK')

# Save results as pickle
gender_df_repr.to_pickle(GENDER_REPRESENTATION_UK_2015_TO_2020_PATH)

2015 finished
2016 finished
2017 finished
2018 finished
2019 finished
2020 finished


For US:

In [None]:
# Rebind the global `newspapers_df` (read by get_gender_counts) to the US newspapers
newspapers_df = pd.DataFrame(newspapers_us_list, columns=['name', 'website_url'])
# Create the per-day gender statistics and save them as pickle
gender_repr_df = create_gender_representation('US')
gender_repr_df.to_pickle(GENDER_REPRESENTATION_US_2015_TO_2020_PATH)

For Australia:

In [None]:
# Rebind the global `newspapers_df` (read by get_gender_counts) to the Australian newspapers
newspapers_df = pd.DataFrame(newspapers_au_list, columns=['name', 'website_url'])
# Create the per-day gender statistics and save them as pickle
gender_repr_df = create_gender_representation('AU')
gender_repr_df.to_pickle(GENDER_REPRESENTATION_AU_2015_TO_2020_PATH)

For India:

In [None]:
# Rebind the global `newspapers_df` (read by get_gender_counts) to the Indian newspapers
newspapers_df = pd.DataFrame(newspapers_in_list, columns=['name', 'website_url'])
# Create the per-day gender statistics and save them as pickle
gender_repr_df = create_gender_representation('IN')
gender_repr_df.to_pickle(GENDER_REPRESENTATION_IN_2015_TO_2020_PATH)

For Nigeria:

In [None]:
# Rebind the global `newspapers_df` (read by get_gender_counts) to the Nigerian newspapers
newspapers_df = pd.DataFrame(newspapers_ng_list, columns=['name', 'website_url'])
# Create the per-day gender statistics and save them as pickle
gender_repr_df = create_gender_representation('NG')
gender_repr_df.to_pickle(GENDER_REPRESENTATION_NG_2015_TO_2020_PATH)

### 2.3 Plots
<a id="plots"></a>

In [4]:
# Load the previously pickled statistics, one pair per country:
# the newspaper table (name, website url) and the gender counts
newspapers_uk_df = pd.DataFrame(newspapers_uk_list, columns=['name', 'website_url'])
gender_repr_uk_df = pd.read_pickle(GENDER_REPRESENTATION_UK_2015_TO_2020_PATH)

newspapers_us_df = pd.DataFrame(newspapers_us_list, columns=['name', 'website_url'])
gender_repr_us_df = pd.read_pickle(GENDER_REPRESENTATION_US_2015_TO_2020_PATH)

# Australia is currently not loaded and therefore missing from the lists below
#newspapers_au_df = pd.DataFrame(newspapers_au_list, columns=['name', 'website_url'])
#gender_repr_au_df = pd.read_pickle(GENDER_REPRESENTATION_AU_2015_TO_2020_PATH)

newspapers_in_df = pd.DataFrame(newspapers_in_list, columns=['name', 'website_url'])
gender_repr_in_df = pd.read_pickle(GENDER_REPRESENTATION_IN_2015_TO_2020_PATH)

newspapers_ng_df = pd.DataFrame(newspapers_ng_list, columns=['name', 'website_url'])
gender_repr_ng_df = pd.read_pickle(GENDER_REPRESENTATION_NG_2015_TO_2020_PATH)

# Parallel lists (same country order: UK, US, IN, NG) that are zipped together
# when plotting
newspapers = [newspapers_uk_df, newspapers_us_df, newspapers_in_df, newspapers_ng_df]
dataframes = [gender_repr_uk_df, gender_repr_us_df, gender_repr_in_df, gender_repr_ng_df]

In [8]:
# Newspaper names define the bar categories and the row order of the counts
x = newspapers_uk_df['name'].to_list()
y = gender_repr_uk_df[gender_repr_uk_df['date'].dt.year == 2019]

# Sum only the per-quote count columns. Summing the whole frame would also try
# to add up the 'date' column, and newer pandas versions no longer silently
# drop non-numeric columns.
count_columns = ['counts_male', 'counts_female', 'counts_other']
y = y.groupby('newspaper')[count_columns].sum()
# Align the rows with x; a newspaper without any 2019 quote gets zeros
# instead of raising a KeyError
y = y.reindex(x, fill_value=0)

y1 = y['counts_male']
y2 = y['counts_female']
y3 = y['counts_other']

# One stacked bar per newspaper, split by gender
fig = go.Figure(go.Bar(x=x, y=y1, name='male'))
fig.add_trace(go.Bar(x=x, y=y2, name='female'))
fig.add_trace(go.Bar(x=x, y=y3, name='other'))

fig.update_layout(barmode='stack',
                  xaxis={'categoryorder':'total descending'},  # largest total first
                  title_text="Counts of quotes per gender for UK (2019)",
)
fig.show()

# Save an html snippet (no <html> wrapper, plotly.js loaded from the CDN) that
# can be included in the docs. The same snippet is available as a string via
# fig.to_html with identical arguments, if it should be copy-pasted instead.
fig.write_html("docs/_includes/fig_counts_uk_2019.html", full_html=False, include_plotlyjs='cdn')

In [None]:
from plotly.subplots import make_subplots

# 2x2 grid: one panel per entry of `dataframes` / `newspapers`
fig = make_subplots(rows=2, cols=2)

for i, (df, news_df) in enumerate(zip(dataframes, newspapers)):
    # Monthly totals per newspaper
    month_df = df.groupby('newspaper').resample('M', on='date').sum().reset_index()
    month_df['date'] = month_df['date'].dt.strftime("%Y-%m")
    # Share of quotes by female speakers among all quotes of the month
    month_df['fraction_female'] = month_df['counts_female'] / (month_df['counts_female'] + month_df['counts_male'] + month_df['counts_other'])
    # Drop months without any female quote
    female_quotes = month_df.loc[month_df['counts_female'] != 0]

    # Fill the grid row by row (1-based subplot coordinates)
    row = i // 2 + 1
    col = (i % 2) + 1

    for news in news_df['name']:
        female_news = female_quotes.loc[female_quotes['newspaper'] == news]
        # add_trace with row/col replaces the deprecated append_trace
        fig.add_trace(
            go.Scatter(x=female_news['date'], y=female_news['fraction_female'], name=news, legendgroup=str(i)),
            row=row,
            col=col,
        )

fig.update_layout(
    legend_tracegroupgap = 20,  # visual gap between the legend groups of the panels
)

fig.show()


In [None]:
# Monthly totals per UK newspaper
month_df = gender_repr_uk_df.groupby('newspaper').resample('M', on='date').sum().reset_index()
month_df['date'] = month_df['date'].dt.strftime("%Y-%m")

# Share of female speakers, weighted by the number of occurrences of each quote
# (the numerator has to be the female count for the column to match its name)
total_occurrences = month_df['counts_occ_female'] + month_df['counts_occ_male'] + month_df['counts_occ_other']
month_df['female_fraction'] = month_df['counts_occ_female'] / total_occurrences
# Drop months without any female quote
female_quotes = month_df.loc[month_df['counts_female'] != 0]

# One line per newspaper showing the monthly female fraction
fig = px.line(female_quotes, x='date', y='female_fraction', color='newspaper')
fig.update_xaxes(
    dtick="M1",
    tickformat="%b\n%Y")
fig.show()