# Analysis of gender distribution in UK's leading newspapers
# Gender topics

## Content
1. [Setup](#setup)   
    1.1 [Global](#global_setup)  
    1.2 [Local](#local_setup)   
2. [Time statistic](#time_statistic)   
    2.1 [Functions](#functions)  
    2.2 [Create statistic](#statistic)    
    2.3 [Plots](#plots) 

## 1. Setup
<a id="setup"></a>

### 1.1 Global
<a id="global_setup"></a>

In [2]:
# Change to true if you want to use google colab
use_colab = True

# Import with EPFL google drive!
if use_colab:
    from google.colab import drive
    drive._mount('/content/drive', force_remount=True)
    %cd /content/drive/Shareddrives/ADA-project
    !pip install pandas==1.0.5 # downgrade pandas for chunk processing support

Mounted at /content/drive
/content/drive/Shareddrives/ADA-project


In [3]:
# Defined paths for the data
from scripts.path_defs import *

# Defined newspapers and urls
from scripts.newspapers import *

# Globally used functions
from scripts.utility_functions import load_mini_version_of_data
from scripts.utility_functions import convert_to_1Dseries
from scripts.utility_functions import process_data_in_chunks

### 1.2 Local 
<a id="local_setup"></a>

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
import bz2

## 2. Time statistic
<a id="time_statistic"></a>

We want to conduct an analysis on the evolution of women and minorities' presence in UK's newspapers. We will have a first look on the repartition of the speaker's genders.

In [83]:
# Get number of each gender
df_2020 = pd.read_json(MERGED_QUOTES_2020_PATH, lines=True, compression='bz2')
convert_to_1Dseries(df_2020["gender"]).value_counts()

male                  119086
female                 38816
non-binary               166
transgender female       141
genderfluid               43
cisgender female          12
transgender male           6
intersex                   5
genderqueer                2
shemale                    2
dtype: int64

As expected, the two gender represented the most are male and female. It is interesting to have an overview of all the gender minorities present but for the rest of the anaylsis we chose to group them in one single category "other".


### 2.1 Functions
<a id="functions"></a>

In [109]:
def group_genders(x):
    if len(x)==1:
        if x[0] in ['male', 'female']:
            return x[0]
        else:
            return 'other'
    else:
        return 'other'

def get_gender_counts(chunk, gender_repr_df, year):
    for newspaper in newspapers_df['name']:
        cols = ['month', 'gender', 'newspaper', 'count']
        mask_newspaper = chunk.newspapers.apply(lambda x: newspaper in x)
        newspaper_df = chunk[mask_newspaper]
        genders_grouped = newspaper_df.gender.apply(group_genders)

        for i in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]:
            genders_grouped['date'] = pd.to_datetime(genders_grouped['date'], yearfirst=True)
            genders_grouped['month'] = genders_grouped.date.dt.month
            genders_grouped = genders_grouped.loc[genders_grouped['month'] == str(i)]

            counts_males = (genders_grouped == 'male').sum()
            counts_females = (genders_grouped == 'female').sum()
            counts_others = (genders_grouped == 'other').sum()

            month_str = str(year) + '-' + str(i)

            gender_repr_df = gender_repr_df.append(pd.DataFrame([[month_str, 'male', newspaper, counts_males]], columns=cols), ignore_index = True)
            gender_repr_df = gender_repr_df.append(pd.DataFrame([[month_str, 'female', newspaper, counts_females]], columns=cols), ignore_index = True)
            gender_repr_df = gender_repr_df.append(pd.DataFrame([[month_str, 'other', newspaper, counts_others]], columns=cols), ignore_index = True)
     
    gender_repr_df = gender_repr_df.groupby(['year','gender', 'newspaper']).sum().reset_index()
    return gender_repr_df

In [110]:
def add_gender_representation(country_string):
    years = [2015]
    gender_repr_df = pd.DataFrame(columns=['month', 'newspaper', 'gender', 'count'])
    
    for year in years:
        path_to_file = globals()['MERGED_QUOTES' + country_string + '_' + str(year) + '_PATH']
        gender_repr_df = process_data_in_chunks(path_to_file, get_gender_counts, gender_repr_df, year, use_colab)
        print(str(year) + ' finished')
    
    # Add a frequency column
    gender_repr_df['sum'] = gender_repr_df.groupby(['month', 'newspaper'])['count'].transform('sum')
    gender_repr_df['fraction'] = gender_repr_df['count'] / gender_repr_df['sum']
    gender_repr_df.drop('sum', axis=1, inplace=True)

    return gender_repr_df

### 2.2 Create statistic
<a id="statistic"></a>

In [103]:
# Dataframe of newspapers
newspapers_df = pd.DataFrame(newspapers_uk_list, columns = ['name', 'website_url'])
#newspapers_df = pd.DataFrame(newspapers_us_list, columns = ['name', 'website_url'])
#newspapers_df = pd.DataFrame(newspapers_au_list, columns = ['name', 'website_url'])
#newspapers_df = pd.DataFrame(newspapers_in_list, columns = ['name', 'website_url'])
#newspapers_df = pd.DataFrame(newspapers_ng_list, columns = ['name', 'website_url'])

newspapers_df.head(14)

Unnamed: 0,name,website_url
0,The Sun,thesun.co.uk
1,The Guardian,theguardian.com
2,The Times,thetimes.co.uk
3,Metro,metro.co.uk
4,Evening Standard,standard.co.uk
5,Daily Mirror,mirror.co.uk
6,The Daily Telegraph,telegraph.co.uk
7,Daily Express,express.co.uk
8,Daily Star,dailystar.co.uk
9,i,inews.co.uk


In [107]:
# Save results as pickle
gender_df_repr = add_gender_representation('')
gender_repr_df.to_pickle(GENDER_REPRESENTATION_2015_TO_2020_UK_PATH)
#gender_repr_df = add_gender_representation('US')
#gender_repr_df.to_pickle(GENDER_REPRESENTATION_2015_TO_2020_US_PATH)
#gender_df_repr = add_gender_representation('AU')
#gender_repr_df.to_pickle(GENDER_REPRESENTATION_2015_TO_2020_AU_PATH)
#gender_df_repr = add_gender_representation('IN')
#gender_repr_df.to_pickle(GENDER_REPRESENTATION_2015_TO_2020_IN_PATH)
#gender_df_repr = add_gender_representation('NG')
#gender_repr_df.to_pickle(GENDER_REPRESENTATION_2015_TO_2020_NG_PATH)

KeyError: ignored

### 2.3 Plots
<a id="plots"></a>

In [98]:
# Load from Results folder
newspapers_uk_df = pd.DataFrame(newspapers_uk_list, columns = ['name', 'website_url'])
gender_repr_uk_df = pd.read_pickle(GENDER_REPRESENTATION_2015_TO_2020_UK_PATH)
gender_repr_uk_df.head()

Unnamed: 0,year,gender,newspaper,count,frequency
0,2015,female,City A.M.,711,0.103705
1,2015,female,Daily Express,9880,0.183585
2,2015,female,Daily Mail,1597,0.155094
3,2015,female,Daily Mirror,14324,0.243668
4,2015,female,Daily Record,6548,0.243104


In [100]:
gender_repr_uk_df['year'] = pd.to_datetime(gender_repr_uk_df['year'], yearfirst=True)
gender_repr_uk_df

Unnamed: 0,year,gender,newspaper,count,frequency
0,1970-01-01 00:00:00.000002015,female,City A.M.,711,0.103705
1,1970-01-01 00:00:00.000002015,female,Daily Express,9880,0.183585
2,1970-01-01 00:00:00.000002015,female,Daily Mail,1597,0.155094
3,1970-01-01 00:00:00.000002015,female,Daily Mirror,14324,0.243668
4,1970-01-01 00:00:00.000002015,female,Daily Record,6548,0.243104
...,...,...,...,...,...
247,1970-01-01 00:00:00.000002020,other,The Daily Telegraph,70,0.001446
248,1970-01-01 00:00:00.000002020,other,The Guardian,44,0.004997
249,1970-01-01 00:00:00.000002020,other,The Sun,60,0.004035
250,1970-01-01 00:00:00.000002020,other,The Times,2,0.000951


In [59]:
# Set index and sort
gender_repr_uk_df.set_index(['year'], inplace=True)
gender_repr_uk_df.sort_index(inplace=True)

In [80]:
x = newspapers_uk_df['name'].to_list()
tmp = gender_repr_uk_df.loc[2019]

y1 = tmp[tmp['gender'] == 'male']['count']
y2 = tmp[tmp['gender'] == 'female']['count']
y3 = tmp[tmp['gender'] == 'other']['count']
#y3 = tmp[tmp['gender'] == 'other']
#y.set_index('newspaper')
#y = y.loc[x] # sort by newspaper list


In [82]:
import plotly.graph_objects as go

fig = go.Figure(go.Bar(x=x, y=y1, name='male'))
#fig.add_trace(go.Bar(x=x, y=y2, name='female'))
#fig.add_trace(go.Bar(x=x, y=y3, name='other'))

#fig.add_trace(go.Bar(x=x, y=y3, name='other'))

fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})
fig.show()


In [113]:
female_quotes = gender_repr_df.loc[gender_repr_df['gender'] == 'female']
female_quotes.head(20)

import plotly.express as px


fig = px.line(female_quotes.reset_index(), x="year", y="fraction", color='newspaper')
fig.show()