**This is the final notebook for Milestone 3, where all the data analysis (besides the pre processing done in Milestone 2) is done.**

In [282]:
!pip install --upgrade plotly



In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import timeit
import bz2
import datetime
import sys
#from src.prep_utilities import * 
#from src.prep_pipeline import *

# Load nltk models
#!python ./src/load_models_data.py

%matplotlib inline
%load_ext autoreload
%autoreload 2
!python ./src/load_models_data.py
data_folder = './data/'

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[K     |████████████████████████████████| 13.9 MB 8.9 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# General Gender Bias Analyses

In this section, we plan to analyse:
 - Evolution of percentage of speakers by gender, over time
 - Evolution of percentage of quotations by gender, over time
 - Most quoted speakers by gender, over time

We'll do a month-by-month analysis. For each month/gender combination, we'll save a Dataframe with the speakers and their total number of quotations that month.


In [2]:
# Time ranges
years = range(2015,2021)
min_month = "2015-01"
max_month = "2020-04"
all_months = pd.period_range(min_month, max_month, freq='M')

genders = ['male', 'female', 'transgender male', 'transgender female', 'non-binary', 'genderfluid']


In [352]:
# Create the dictionaries we'll use to store analysis

speaker_df = {} # Save the dataframes of top speakers by gender/month

for gender in genders:  
    speakers_df_temp = {}
    
    for month in all_months:

        speakers_df_temp[month] = pd.DataFrame([], columns = ['speaker', 'numOccurrences']).set_index('speaker')

    speaker_df[gender] = speakers_df_temp
    
speaker_df['male'][all_months[0]]

Unnamed: 0_level_0,numOccurrences
speaker,Unnamed: 1_level_1


The pre-processed data will be loaded each year, by chunks. In the analyses, we will cycle through each year, and through each chunk, and save the data.

In [353]:
# Loop through years
for year in years:
    start = timeit.default_timer()

    # data location and chunk size
    data_file = 'quotes-'+ str(year)+'-prep.json.bz2'
    data_path = data_folder + data_file
    chunk_size = 1e4

    # Load by chunks
    f = bz2.open(data_path, "rb")
    data=pd.read_json(f, lines=True, chunksize=chunk_size)
    
    print(f"Analysing year {year}...", end=" ")
    

    # Loop through chunks
    for i_chunk, chunk in enumerate(data):
        
        ## Run analysis ##
        
        # Create range of months for this year
        if year != 2020:
            months = pd.period_range(str(year)+'-'+'01', str(year)+'-'+'12', freq='M')
        else:
            months = pd.period_range(str(year)+'-'+'01', str(year)+'-'+'04', freq='M')

        # Loop through months
        for month in months:
            # Mask to select desired month
            month_mask = (chunk['date'].dt.to_period('m') == month)
            
            # Loop through genders
            for gender in genders:
                
                # Mask to select desired gender
                gender_mask = (chunk['gender'] == gender)
                
                # Concatenate the speakers in this chunk with our dictionary
                df = chunk[gender_mask & month_mask].groupby("speaker").sum()
                speaker_df[gender][month] = pd.concat([speaker_df[gender][month],df]).groupby("speaker").sum()

    stop = timeit.default_timer()
    print(f"Done in {stop-start:.2f}s")
        

Analysing year 2015... Done in 1503.41s
Analysing year 2016... Done in 878.96s
Analysing year 2017... Done in 2293.26s
Analysing year 2018... Done in 2267.08s
Analysing year 2019... Done in 1588.40s
Analysing year 2020... Done in 179.75s


In [354]:
speaker_df['female'][all_months[12]].sort_values(by='numOccurrences', ascending = False)
#speaker_df['male'][all_months[0]].sum()

Unnamed: 0_level_0,numOccurrences
speaker,Unnamed: 1_level_1
Natalie Cole,3318
Cheryl Boone Isaacs,435
Clover Moore,325
Jane Mayer,276
Gillian Armstrong,267
...,...
Cecilia Rodriguez,1
Mona Chalabi,1
Mona Charen,1
Catherine Zeta-Jones,1


Since we don't want to run the previous cell everytime we reload the notebook, we'll save each of the gender/month combinations to a json file.

In [355]:
general_analysis_folder = data_folder + '/general_analysis/'

for gender in genders:
    for month in all_months:
        with bz2.open(general_analysis_folder + gender +'-' + str(month.year) + '-' + str(month.month) + '.json.bz2', "w") as f:

            # Write to file, reset index to keep the speaker's names
            write = speaker_df[gender][month].reset_index().to_json(f, lines=True, orient='records') 
### 2015,16,17 done

Now that we've created the files with the analysis data, let's read them again:

In [3]:
speaker_df = {} # Load the dataframes of top speakers by gender/month
general_analysis_folder = data_folder + '/general_analysis/'

for gender in genders:  
    speakers_df_temp = {}
    
    for month in all_months:
        speakers_df_temp[month] = pd.read_json(general_analysis_folder + gender +'-' + str(month.year) + '-' + str(month.month) + '.json.bz2', lines = True,compression = 'bz2')
        
        # Join rows of presidential aliases (President Trump, Donald Trump, etc...)
        speakers_df_temp[month] = speakers_df_temp[month].replace(["President Barack Obama", "President Obama"], "Barack Obama")
        speakers_df_temp[month] = speakers_df_temp[month].replace(["President Donald Trump", "President Trump"], "Donald Trump")
        speakers_df_temp[month] = speakers_df_temp[month].groupby('speaker').sum().reset_index()

    speaker_df[gender] = speakers_df_temp


In [4]:
### TEST
speaker, num = speaker_df['female'][all_months[0]].sort_values(by='numOccurrences', ascending = False).iloc[0]

#speaker_df['male'][all_months[0]][speaker_df['male'][all_months[0]]['speaker'].isin(['President Barack Obama', 'President Obama'])].sum()
#speaker_df['male'][all_months[0]].replace(["President Barack Obama", "President Obama"], "Barack Obama").groupby('speaker').sum().sort_values(by='numOccurrences', ascending = False)


## Plots


### Percentage of quotations and speakers by gender

We will use the package `plotly` to make an interactive plot with our data in the cells below.

To view the plot, double click `plotly/general_quotations_speakers.html`.

First, we create DataFrames to hold the plot data, and then we plot it.

In [5]:
## Prepare data to be plotted ##

# x data
month_str = [str(month.year)+'-'+str(month.month) for month in all_months]
month_dt = [datetime.datetime.strptime(x, '%Y-%m') for x in month_str]

# y data
total_quotations = {} # total quotations by month
total_speakers = {} # total number of speakers by month

for month in all_months:
    total_quotations[month] = 0
    total_speakers[month] = 0

    for gender in genders:
        total_quotations[month] += speaker_df[gender][month]['numOccurrences'].sum()
        total_speakers[month] += len(speaker_df[gender][month])

perc_quotations = pd.DataFrame([], columns = genders) # df with the dates and percentage of quotations by gender
perc_speakers =  pd.DataFrame([], columns = genders) # df with the dates and percentage of speakers by gender

for i,month in enumerate(all_months):
    perc_quotations.loc[i] = [100*speaker_df[gender][month]['numOccurrences'].sum()/total_quotations[month] for gender in genders]
    perc_speakers.loc[i] = [100*len(speaker_df[gender][month])/total_speakers[month] for gender in genders]
    
perc_quotations['date'] = month_dt # Dates for x axis
perc_speakers['date'] = month_dt


In [16]:
import plotly.express as px
import plotly.graph_objects as go

# Color palette
palette = px.colors.qualitative.Plotly

# Create figure
fig = go.Figure()
fig.update_layout(title = 'Percentage of Quotations by Gender')
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Percentage')

# Add plots (they have to be added by order, in order not to mess up the 'visible' lists)
for i,gender in enumerate(genders):
    fig.add_trace(
        go.Scatter(x = perc_quotations['date'], y = perc_quotations[gender], name = gender, mode='lines', line=dict(color=palette[i], width=3))
        )

for i,gender in enumerate(genders):
    fig.add_trace(
        go.Scatter(x = perc_speakers['date'], y = perc_speakers[gender], name = gender, mode='lines', line=dict(color=palette[i], width=3), visible=False)
        )


visible_quotations = [True if i<6 else False for i in range(12)]
visible_speakers = [not x for x in visible_quotations]

# Add dropdown menu
fig.update_layout(
    updatemenus=[go.layout.Updatemenu(
        active=0,
        buttons=list(
            [dict(label = 'Quotations',
                  method = 'update',
                  args = [{'visible': visible_quotations},
                          {'title': 'Percentage of Quotations by Gender',
                           'showlegend':True}]),
             dict(label = 'Speakers',
                  method = 'update',
                  args = [{'visible': visible_speakers}, # the index of True aligns with the indices of plot traces
                          {'title': 'Percentage of Speakers by Gender',
                           'showlegend':True}]),
            ])
        )
    ])

# Add x range slider
fig.update_layout(
    xaxis=dict(
        rangeslider=dict(
            visible=True
        ),
        type="date"
    ),
    autosize=False,
    width=1200,
    height=700
)

fig.write_html("plotly/general_quotations_speakers.html")


### Top speakers by gender/month plot

Once again, we'll use `plotly` for the interactive plot for the highest quoted speaker by gender, in each month.

For this, we'll create a new DataFrame with columns `gender`, `speaker`, `date` and `quotations` (which is the same as `numOccurrences`).

In [6]:
top_speakers_df = pd.DataFrame([], columns = ['gender', 'speaker', 'date', 'quotations'])

i = 0
for gender in genders:
    for j,month in enumerate(all_months):

         # Get top speaker by gender/month
         speaker, occurrences = speaker_df[gender][month].sort_values(by='numOccurrences', ascending = False).iloc[0]
         top_speakers_df.loc[i] = [gender, speaker, month_str[j], int(occurrences)]
         i+=1
    i+=1

# Convert numOccurrences column to int
top_speakers_df.quotations = top_speakers_df.quotations.astype(int)

# Add column of zeros for plot
top_speakers_df['yaxis'] = [0]*len(top_speakers_df)

In [331]:
top_speakers_df

Unnamed: 0,gender,speaker,date,quotations,yaxis
0,male,Barack Obama,2015-1,37099,0
1,male,Barack Obama,2015-2,32215,0
2,male,Jesus `` Chuy '' Garcia,2015-3,33467,0
3,male,Barack Obama,2015-4,27154,0
4,male,Barack Obama,2015-5,14155,0
...,...,...,...,...,...
384,genderfluid,Miley Cyrus,2019-12,347,0
385,genderfluid,Miley Cyrus,2020-1,180,0
386,genderfluid,Miley Cyrus,2020-2,132,0
387,genderfluid,Miley Cyrus,2020-3,1147,0


In [12]:
fig = px.scatter(
    top_speakers_df, 
    x='gender', 
    y = 'yaxis', 
    size = 'quotations', 
    animation_frame='date', 
    text = 'speaker',
    hover_name='speaker',
    hover_data={'gender':False, 'yaxis':False, 'speaker':False, 'quotations':True, 'date':True},
    color='gender',
    size_max=200,
    title='Highest Quoted Speakers',
    width=1200,
    height=700
)

#fig["layout"].pop("updatemenus") # optional, drop animation buttons
fig.update_traces(textposition='top center', textfont_size=15) # Change position of text above bubbles
fig.update_layout(uniformtext_minsize=40, uniformtext_mode='hide')

fig.update_layout(
    yaxis=dict( # Disable yaxis
        visible = False
    ),
    xaxis=dict( # Remove xaxis title
        title=''
    ),
    hoverlabel=dict( # Change font on hover tool
        font_size=16,
    )
)

fig.write_html("plotly/top_speakers.html")


In [302]:
px.data.gapminder()['pop']


0        8425333
1        9240934
2       10267083
3       11537966
4       13079460
          ...   
1699     9216418
1700    10704340
1701    11404948
1702    11926563
1703    12311143
Name: pop, Length: 1704, dtype: int64

In [306]:
top_speakers_df['numOccurrences']

0      37099
1      32215
2      33467
3      27154
4      14155
       ...  
384      347
385      180
386      132
387     1147
388      230
Name: numOccurrences, Length: 384, dtype: int64