**This is the final notebook for Milestone 3, where all the data analysis (besides the pre-processing done in Milestone 2) is done.**

In [4]:
!pip install --upgrade plotly
!pip install spacy



In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import timeit
import bz2
import datetime
import sys
import spacy
#from src.prep_utilities import * 
#from src.prep_pipeline import *

# Notebook-wide setup: inline figures and automatic reloading of edited modules.
%matplotlib inline
%load_ext autoreload
%autoreload 2
# Fetch the language-model data used by the notebook. The captured output of this
# cell shows the spaCy `en_core_web_sm` model being installed; presumably the
# script also downloads the NLTK resources -- verify in src/load_models_data.py.
!python ./src/load_models_data.py
# Root folder of all input and output data files (kept with a trailing slash,
# because paths are built by string concatenation)
data_folder = './data/'

Collecting en-core-web-sm==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl (13.7 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
# Load the English pipeline once and keep a handle to it, so it can be reused
# instead of being loaded and immediately discarded.
nlp = spacy.load('en_core_web_sm')
nlp

<spacy.lang.en.English at 0x217ab779310>

# General Gender Bias Analyses

In this section, we plan to analyse:
 - Evolution of percentage of speakers by gender, over time
 - Evolution of percentage of quotations by gender, over time
 - Most quoted speakers by gender, over time

We'll do a month-by-month analysis. For each month/gender combination, we'll save a DataFrame with the speakers and their total number of quotations that month.


In [3]:
# Genders tracked throughout the analysis
genders = [
    'male',
    'female',
    'transgender male',
    'transgender female',
    'non-binary',
    'genderfluid',
]

# Time span of the analysis: the calendar years and every monthly period in them
min_month, max_month = "2015-01", "2020-04"
years = range(2015, 2021)
all_months = pd.period_range(start=min_month, end=max_month, freq='M')


In [3]:
# Nested lookup speaker_df[gender][month] -> dataframe of speakers (index) and
# their quotation counts. Every entry starts out empty and is filled in by the
# analysis of the yearly files.
speaker_df = {}

for gender in genders:
    speaker_df[gender] = {
        month: pd.DataFrame([], columns=['speaker', 'numOccurrences']).set_index('speaker')
        for month in all_months
    }

# Preview one of the (still empty) entries
speaker_df['male'][all_months[0]]

Unnamed: 0_level_0,numOccurrences
speaker,Unnamed: 1_level_1


The pre-processed data is loaded one year at a time, in chunks. In the analyses, we cycle through each year and through each of its chunks, accumulating the results as we go.

In [4]:
# Count, for every gender/month pair, how often each speaker is quoted.
# Each year's pre-processed file is streamed in chunks and the per-chunk counts
# are merged into `speaker_df[gender][month]`.
chunk_size = 10_000  # rows per chunk (an integer, as `chunksize` expects)

for year in years:
    start = timeit.default_timer()

    # Location of this year's pre-processed quotations
    data_file = 'quotes-'+ str(year)+'-prep.json.bz2'
    data_path = data_folder + data_file

    # Months of this year that belong to the analysis window. Derived from
    # `all_months` once per year, so the shorter final year needs no special case.
    months = all_months[all_months.year == year]
    months_in_year = set(months)

    print(f"Analysing year {year}...", end=" ")

    # The context manager closes the compressed file even if the run is interrupted
    with bz2.open(data_path, "rb") as f:
        data = pd.read_json(f, lines=True, chunksize=chunk_size)

        for chunk in data:
            # Single pass over the chunk: counts per (gender, month, speaker).
            # Only `numOccurrences` is summed, so text and date columns never
            # reach `.sum()`.
            chunk_months = chunk['date'].dt.to_period('M')
            counts = chunk.groupby(['gender', chunk_months, 'speaker'])['numOccurrences'].sum()

            for (gender, month), part in counts.groupby(level=[0, 1]):
                # Skip genders that are not tracked and dates outside this year's months
                if gender not in speaker_df or month not in months_in_year:
                    continue

                # Keep only the speaker level so the frame matches the stored layout
                part = part.droplevel([0, 1]).to_frame()
                previous = speaker_df[gender][month]

                if previous.empty:
                    # Nothing accumulated yet: avoids concatenating with an empty,
                    # object-typed placeholder frame
                    speaker_df[gender][month] = part
                else:
                    speaker_df[gender][month] = pd.concat([previous, part]).groupby("speaker").sum()

    stop = timeit.default_timer()
    print(f"Done in {stop-start:.2f}s")
        

Analysing year 2015... Done in 1406.01s
Analysing year 2016... Done in 805.98s
Analysing year 2017... Done in 1922.07s
Analysing year 2018... 

KeyboardInterrupt: 

In [17]:
# Female speakers in all_months[12] (January 2016), most quoted first
speaker_df['female'][all_months[12]].sort_values('numOccurrences', ascending=False)

Unnamed: 0_level_0,numOccurrences
speaker,Unnamed: 1_level_1
Natalie Cole,1598
Cheryl Boone Isaacs,418
Jane Mayer,272
Nikki Haley,151
Sandra Bland,145
...,...
Julie Goodenough,1
Julie Schneider,1
Julie Smith,1
Julie Snyder,1


Since we don't want to run the previous cell every time we reload the notebook, we'll save each of the gender/month combinations to a compressed JSON file.

In [5]:
# Persist every gender/month dataframe so the expensive aggregation does not
# have to be repeated after a kernel restart. All months are written: the
# temporary "only 2018 onwards" restriction used while the aggregation was run
# in several sessions is gone.
general_analysis_folder = data_folder + 'general_analysis/'

for gender in genders:
    for month in all_months:
        out_path = general_analysis_folder + gender +'-' + str(month.year) + '-' + str(month.month) + '.json.bz2'

        # reset_index turns the speaker names into a regular column so they are
        # kept in the records; pandas takes care of the bz2 compression
        speaker_df[gender][month].reset_index().to_json(out_path, lines=True, orient='records', compression='bz2')

Now that we've created the files with the analysis data, let's read them again:

In [4]:
# Load the dataframes of top speakers by gender/month back from disk
speaker_df = {}
general_analysis_folder = data_folder + 'general_analysis/'
missing_files = []  # gender/month files that could not be found

for gender in genders:
    speakers_df_temp = {}

    for month in all_months:
        in_path = general_analysis_folder + gender +'-' + str(month.year) + '-' + str(month.month) + '.json.bz2'

        try:
            with bz2.open(in_path, "rb") as f:
                speakers_df_temp[month] = pd.read_json(f, lines=True)
        except FileNotFoundError:
            # Month not saved (yet): leave it out instead of aborting the whole load
            missing_files.append(in_path)

    speaker_df[gender] = speakers_df_temp

# Report skipped files so that an incomplete load does not go unnoticed
if missing_files:
    print(f"{len(missing_files)} gender/month file(s) not found and skipped, e.g. {missing_files[0]}")

## Plots

We'll use the `bokeh` package to generate interactive plots.

In [218]:
# Bokeh building blocks used by the plotting cells of this section
from bokeh.io import push_notebook, show, output_notebook
from bokeh.layouts import row
from bokeh.plotting import figure
from bokeh.models import DatetimeTickFormatter
from bokeh.models import HoverTool, Legend
from bokeh.palettes import brewer


# Send Bokeh output to the notebook, so that show() renders figures inline
output_notebook()

In [347]:
## Prepare data to be plotted ##

# x data: first day of every month in the analysis window
month_str = [str(month.year)+'-'+str(month.month) for month in all_months]
month_dt = [datetime.datetime.strptime(x, '%Y-%m') for x in month_str]

# Months that can be plotted: those available for every gender. This replaces
# the hard-coded "first 36 months" slice, so the plot follows the loaded data.
plot_months = [
    month for month in all_months
    if all(month in speaker_df[gender] for gender in genders)
]

# y data: number of quotations per month (rows) and gender (columns)
quotation_counts = pd.DataFrame(
    {
        gender: [
            # .get(): a month without any saved rows may lack the column entirely
            speaker_df[gender][month].get('numOccurrences', pd.Series(dtype='float64')).sum()
            for month in plot_months
        ]
        for gender in genders
    },
    index=plot_months,
    columns=genders,
    dtype='float64',
)
monthly_totals = quotation_counts.sum(axis=1)

# A month without any quotation would mean dividing by zero: leave it out
has_quotations = monthly_totals > 0
quotation_counts = quotation_counts[has_quotations]
monthly_totals = monthly_totals[has_quotations]

total_quotations = monthly_totals.to_dict()  # total quotations by month

# Share of each gender in the monthly quotations, in percent; built in one step
# so the columns are numeric, with a 0..n-1 row index
perc_quotations = quotation_counts.div(monthly_totals, axis=0).mul(100).reset_index(drop=True)
perc_quotations['date'] = [datetime.datetime(month.year, month.month, 1) for month in quotation_counts.index]


In [348]:
# Build the interactive line chart: one line per gender showing its share of
# the monthly quotations.
TOOLS = "hover,box_zoom,pan,zoom_in,zoom_out,undo,redo"
colors = brewer['Dark2'][6]

# Figure, with the legend placed to the right of the plotting area
p = figure(plot_width = 800, plot_height = 600,tools=TOOLS,title="Percentage of Quotations by Gender", x_axis_label='Time', y_axis_label='Percentage')
p.add_layout(Legend(), 'right')

# Tooltip shown when the mouse hovers over a data point
hover = p.select(dict(type=HoverTool))

hover.tooltips = [
    ("Month", "@x{%B %Y}"), # month and year of the data point
    ("Percentage", "@y{0.00}"), # percentage rounded to two decimals
]
hover.formatters = {'@x': 'datetime'}

# One line per gender, each with its own colour and legend entry
for gender, color in zip(genders, colors):
    p.line(perc_quotations['date'], perc_quotations[gender], legend_label=gender, color=color, line_width=2)

# Label every tick as "Month Year", whatever the zoom level
p.xaxis.formatter=DatetimeTickFormatter(
        hours=["%B %Y"],
        days=["%B %Y"],
        months=["%B %Y"],
        years=["%B %Y"],
    )
p.xaxis.major_label_orientation = np.pi/4
p.legend.orientation = "vertical"
p.legend.location = "top"
# Clicking a legend entry toggles the visibility of the corresponding line
p.legend.click_policy = 'hide'

In [349]:
# Render the figure `p` in the notebook output
show(p)

### Sentiment Analysis

In [4]:
from nltk.sentiment import SentimentIntensityAnalyzer
# Single analyzer instance shared by the sentiment cells; the 'compound' entry of
# its polarity_scores() result is used as the sentiment score of a quotation
sia = SentimentIntensityAnalyzer()

In [11]:
# Output folder for the sentiment results, and the pre-processed 2020 quotations
# that serve as input
sent_analysis_folder = data_folder + 'sentiment_analysis/'
data_file = 'quotes-2020-prep.json.bz2'
data_path = data_folder + data_file

In [56]:
def sent_description(sentiment_score):
    """Translate a numeric sentiment score into a descriptive label.

    The score is compared against descending thresholds:

    - ``score >= 0.6``         -> 'Strongly positive'
    - ``0.4 <= score < 0.6``   -> 'Positive'
    - ``-0.4 < score < 0.4``   -> 'Neutral'
    - ``-0.6 < score <= -0.4`` -> 'Negative'
    - ``score <= -0.6``        -> 'Strongly negative'

    Returns an empty string when no comparison holds (e.g. for NaN).
    """
    # Each check only runs when all previous ones failed, so the upper bound
    # of every band is implied by the check before it.
    if sentiment_score >= 0.6:
        return 'Strongly positive'
    if sentiment_score >= 0.4:
        return 'Positive'
    if sentiment_score > -0.4:
        return 'Neutral'
    if sentiment_score > -0.6:
        return 'Negative'
    if sentiment_score <= -0.6:
        return 'Strongly negative'
    return ''

In [58]:
# Score the sentiment of every quotation of the input file, chunk by chunk, and
# append the results to a single compressed JSON-lines file.
start = timeit.default_timer()
start_progress = timeit.default_timer()
progress_step = 100

# Both files are managed by `with`, so they are closed even if the run is interrupted
with bz2.open(data_path, "rb") as f:
    data = pd.read_json(f, lines=True, chunksize=10000)

    with bz2.open(sent_analysis_folder +'2020'+ '.json.bz2', "w") as fn:
        for i_chunk, chunk in enumerate(data):

            # Print progress
            if i_chunk%progress_step == 0:
                stop_progress = timeit.default_timer()
                print(f'Time since last: {stop_progress-start_progress:.1f}s\n')
                print(f"Processing chunks {i_chunk}-{i_chunk+progress_step-1}")
                start_progress = timeit.default_timer()

            # Process chunk: drop the columns that are not needed, working on a
            # new frame so the chunk itself is left untouched
            sent = chunk.drop(columns=['numOccurrences','probas','phase'])
            sent['sentiment_score'] = sent['quotation'].apply(lambda x: sia.polarity_scores(x)['compound'])
            sent['sent_description'] = sent['sentiment_score'].apply(sent_description)

            # Write the processed chunk (previously an undefined name was
            # written here) to the json.bz2 file
            sent.to_json(fn, lines=True, orient='records')

stop = timeit.default_timer()
print(f'Total time: {stop-start:.1f}s\n')

Time since last: 2.5s

Processing chunks 0-99


KeyboardInterrupt: 