**This is the final notebook for Milestone 3, where all the data analysis (besides the pre processing done in Milestone 2) is done.**

In [282]:
!pip install --upgrade plotly



In [2]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import timeit
import bz2
import datetime
import sys
from empath import Empath
import json
#from src.prep_utilities import * 
#from src.prep_pipeline import *

# Load nltk models
#!python ./src/load_models_data.py

%matplotlib inline
%load_ext autoreload
%autoreload 2
!python ./src/load_models_data.py
data_folder = './data/'

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[K     |████████████████████████████████| 13.9 MB 13.3 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# General Gender Bias Analyses

In this section, we plan to analyse:
 - Evolution of percentage of speakers by gender, over time
 - Evolution of percentage of quotations by gender, over time
 - Most quoted speakers by gender, over time

We'll do a month-by-month analysis. For each month/gender combination, we'll save a Dataframe with the speakers and their total number of quotations that month.


In [3]:
# Time ranges
years = range(2015,2021)
min_month = "2015-01"
max_month = "2020-04"
all_months = pd.period_range(min_month, max_month, freq='M')

genders = ['male', 'female', 'transgender male', 'transgender female', 'non-binary', 'genderfluid']


In [352]:
# Create the dictionaries we'll use to store analysis

speaker_df = {} # Save the dataframes of top speakers by gender/month

for gender in genders:  
    speakers_df_temp = {}
    
    for month in all_months:

        speakers_df_temp[month] = pd.DataFrame([], columns = ['speaker', 'numOccurrences']).set_index('speaker')

    speaker_df[gender] = speakers_df_temp
    
speaker_df['male'][all_months[0]]

Unnamed: 0_level_0,numOccurrences
speaker,Unnamed: 1_level_1


The pre-processed data will be loaded each year, by chunks. In the analyses, we will cycle through each year, and through each chunk, and save the data.

In [353]:
# Loop through years
for year in years:
    start = timeit.default_timer()

    # data location and chunk size
    data_file = 'quotes-'+ str(year)+'-prep.json.bz2'
    data_path = data_folder + data_file
    chunk_size = 1e4

    # Load by chunks
    f = bz2.open(data_path, "rb")
    data=pd.read_json(f, lines=True, chunksize=chunk_size)
    
    print(f"Analysing year {year}...", end=" ")
    

    # Loop through chunks
    for i_chunk, chunk in enumerate(data):
        
        ## Run analysis ##
        
        # Create range of months for this year
        if year != 2020:
            months = pd.period_range(str(year)+'-'+'01', str(year)+'-'+'12', freq='M')
        else:
            months = pd.period_range(str(year)+'-'+'01', str(year)+'-'+'04', freq='M')

        # Loop through months
        for month in months:
            # Mask to select desired month
            month_mask = (chunk['date'].dt.to_period('m') == month)
            
            # Loop through genders
            for gender in genders:
                
                # Mask to select desired gender
                gender_mask = (chunk['gender'] == gender)
                
                # Concatenate the speakers in this chunk with our dictionary
                df = chunk[gender_mask & month_mask].groupby("speaker").sum()
                speaker_df[gender][month] = pd.concat([speaker_df[gender][month],df]).groupby("speaker").sum()

    stop = timeit.default_timer()
    print(f"Done in {stop-start:.2f}s")
        

Analysing year 2015... Done in 1503.41s
Analysing year 2016... Done in 878.96s
Analysing year 2017... Done in 2293.26s
Analysing year 2018... Done in 2267.08s
Analysing year 2019... Done in 1588.40s
Analysing year 2020... Done in 179.75s


In [354]:
speaker_df['female'][all_months[12]].sort_values(by='numOccurrences', ascending = False)
#speaker_df['male'][all_months[0]].sum()

Unnamed: 0_level_0,numOccurrences
speaker,Unnamed: 1_level_1
Natalie Cole,3318
Cheryl Boone Isaacs,435
Clover Moore,325
Jane Mayer,276
Gillian Armstrong,267
...,...
Cecilia Rodriguez,1
Mona Chalabi,1
Mona Charen,1
Catherine Zeta-Jones,1


Since we don't want to run the previous cell everytime we reload the notebook, we'll save each of the gender/month combinations to a json file.

In [355]:
general_analysis_folder = data_folder + '/general_analysis/'

for gender in genders:
    for month in all_months:
        with bz2.open(general_analysis_folder + gender +'-' + str(month.year) + '-' + str(month.month) + '.json.bz2', "w") as f:

            # Write to file, reset index to keep the speaker's names
            write = speaker_df[gender][month].reset_index().to_json(f, lines=True, orient='records') 
### 2015,16,17 done

Now that we've created the files with the analysis data, let's read them again:

In [6]:
speaker_df = {} # Load the dataframes of top speakers by gender/month
general_analysis_folder = data_folder + '/general_analysis/'

for gender in genders:  
    speakers_df_temp = {}
    
    for month in all_months:
        speakers_df_temp[month] = pd.read_json(general_analysis_folder + gender +'-' + str(month.year) + '-' + str(month.month) + '.json.bz2', lines = True,compression = 'bz2')
        
        # Join rows of presidential aliases (President Trump, Donald Trump, etc...)
        speakers_df_temp[month] = speakers_df_temp[month].replace(["President Barack Obama", "President Obama"], "Barack Obama")
        speakers_df_temp[month] = speakers_df_temp[month].replace(["President Donald Trump", "President Trump"], "Donald Trump")
        speakers_df_temp[month] = speakers_df_temp[month].groupby('speaker').sum().reset_index()

    speaker_df[gender] = speakers_df_temp


In [4]:
### TEST
speaker, num = speaker_df['female'][all_months[0]].sort_values(by='numOccurrences', ascending = False).iloc[0]

#speaker_df['male'][all_months[0]][speaker_df['male'][all_months[0]]['speaker'].isin(['President Barack Obama', 'President Obama'])].sum()
#speaker_df['male'][all_months[0]].replace(["President Barack Obama", "President Obama"], "Barack Obama").groupby('speaker').sum().sort_values(by='numOccurrences', ascending = False)


## Plots


### Percentage of quotations and speakers by gender

We will use the package `plotly` to make an interactive plot with our data in the cells below.

To view the plot, double click `plotly/general_quotations_speakers.html`.

First, we create DataFrames to hold the plot data, and then we plot it.

In [7]:
## Prepare data to be plotted ##

# x data
month_str = [str(month.year)+'-'+str(month.month) for month in all_months]
month_dt = [datetime.datetime.strptime(x, '%Y-%m') for x in month_str]

# y data
total_quotations = {} # total quotations by month
total_speakers = {} # total number of speakers by month

for month in all_months:
    total_quotations[month] = 0
    total_speakers[month] = 0

    for gender in genders:
        total_quotations[month] += speaker_df[gender][month]['numOccurrences'].sum()
        total_speakers[month] += len(speaker_df[gender][month])

perc_quotations = pd.DataFrame([], columns = genders) # df with the dates and percentage of quotations by gender
perc_speakers =  pd.DataFrame([], columns = genders) # df with the dates and percentage of speakers by gender

for i,month in enumerate(all_months):
    perc_quotations.loc[i] = [100*speaker_df[gender][month]['numOccurrences'].sum()/total_quotations[month] for gender in genders]
    perc_speakers.loc[i] = [100*len(speaker_df[gender][month])/total_speakers[month] for gender in genders]
    
perc_quotations['date'] = month_dt # Dates for x axis
perc_speakers['date'] = month_dt


Run the cell below to generate the "Percentage of Quotations by Gender" plot. Double click `plotly/perc_quotations.html` to view.

In [8]:
import plotly.express as px
import plotly.graph_objects as go

# Color palette
palette = px.colors.qualitative.Plotly

# Create figure
fig = go.Figure()
fig.update_layout(title = 'Percentage of Quotations by Gender')
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Percentage of Quotations')

# Add plots (they have to be added by order, in order not to mess up the 'visible' lists)
for i,gender in enumerate(genders):
    fig.add_trace(
        go.Scatter(x = perc_quotations['date'], y = perc_quotations[gender], name = gender, mode='lines', line=dict(color=palette[i], width=3))
        )


visible_quotations = [True if i<6 else False for i in range(12)]
visible_speakers = [not x for x in visible_quotations]

# Add x range slider
fig.update_layout(
    xaxis=dict(
        rangeslider=dict(
            visible=True
        ),
        type="date"
    )
)

fig.write_html("plotly/perc_quotations.html")


Run the cell below to generate the "Percentage of Speakers by Gender" plot. Double click `plotly/perc_speakers.html` to view.

In [9]:

# Color palette
palette = px.colors.qualitative.Plotly

# Create figure
fig = go.Figure()
fig.update_layout(title = 'Percentage of Speakers by Gender')
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Percentage of Speakers')

# Add plots
for i,gender in enumerate(genders):
    fig.add_trace(
        go.Scatter(x = perc_speakers['date'], y = perc_speakers[gender], name = gender, mode='lines', line=dict(color=palette[i], width=3))
        )

# Add x range slider
fig.update_layout(
    xaxis=dict(
        rangeslider=dict(
            visible=True
        ),
        type="date"
    )
)
fig.write_html("plotly/perc_speakers.html")


### Top speakers by gender/month plot

Once again, we'll use `plotly` for the interactive plot for the highest quoted speaker by gender, in each month.

For this, we'll create a new DataFrame with columns `gender`, `speaker`, `date` and `quotations` (which is the same as `numOccurrences`).

In [10]:
top_speakers_df = pd.DataFrame([], columns = ['gender', 'speaker', 'date', 'quotations'])

i = 0
for gender in genders:
    for j,month in enumerate(all_months):

         # Get top speaker by gender/month
         speaker, occurrences = speaker_df[gender][month].sort_values(by='numOccurrences', ascending = False).iloc[0]
         top_speakers_df.loc[i] = [gender, speaker, month_str[j], int(occurrences)]
         i+=1
    i+=1

# Convert numOccurrences column to int
top_speakers_df.quotations = top_speakers_df.quotations.astype(int)


In [11]:
top_speakers_df

Unnamed: 0,gender,speaker,date,quotations
0,male,Barack Obama,2015-1,37099
1,male,Barack Obama,2015-2,32215
2,male,Jesus `` Chuy '' Garcia,2015-3,33467
3,male,Barack Obama,2015-4,27154
4,male,Barack Obama,2015-5,14155
...,...,...,...,...
384,genderfluid,Miley Cyrus,2019-12,347
385,genderfluid,Miley Cyrus,2020-1,180
386,genderfluid,Miley Cyrus,2020-2,132
387,genderfluid,Miley Cyrus,2020-3,1147


We'll now create a bar plot with the highest quoted speakers of each gender, and add an animation slider to move between months. We save it to `plotly/top_speakers.html`.

In [12]:
fig = px.bar(
    top_speakers_df, 
    x='gender', 
    y = 'quotations', 
    animation_frame='date', 
    text = 'speaker',
    hover_name='speaker',
    hover_data={'gender':False, 'speaker':False, 'quotations':True, 'date':True},
    color='gender',
    title='Highest Quoted Speakers'
)

fig.update_yaxes(range=[0, 400000])

fig.update_layout(
    uniformtext_minsize=13,
    uniformtext_mode='show', 
    showlegend=False,
    hovermode='x',
    yaxis=dict( # Disable yaxis
        visible = True
    ),
    xaxis=dict( # Remove xaxis title
        title=''
    ),
    hoverlabel=dict( # Change font on hover tool
        font_size=16,
    )
)

fig.write_html("plotly/top_speakers.html")


# Topic Analysis

Like on the section above, we'll start by retrieving the relevant information from the pre-processed data. The code to do that is presented below.

To analyze the data, we'll run it through empath to determine what is the number of words spoken about each category. Before we do that, we'll add two new categories to empath.

In [None]:
lexicon = Empath()

lexicon.create_category('climate_change', ['global_warming','green_house','death','water','fossil_fuel','burning','summit','environment','energy','renewable','consumption','petrol','gas','wind','solar_power','earth'], model='nytimes')
lexicon.create_category('lgbt', ['rights', 'gay', 'trans', 'discriminantion', 'phobia', 'lesbian', 'transsexual','cis','queer','asexual','heterosexual','straight'], model='nytimes')

Now we can process all the data by chuncks, and store the results to use later. The results will be written on one file per year, inside `./data_processed` and they will be called `empath_<year>.txt`.

The internal structure of these files is the following.
- First line: Distribution of the number of words per gender, per month. This information is displayed inside of a dictionary, where the keys are the genders found for that year, and to each key we have associated a list with 12 entries, one for each month, containing the number of words said by that gender in that month.
- Second line: Number of chunks processed. No real functional purpose but it allowed to restart the program from a certain point if it got interrupted.
- Third line: All the information extracted from the data. This information is displayed inside of a dictionary, where the keys correspond to the genders found for that year. Each key points to a list with 12 entries, one for each month. And in each entry of the list there is another dictionary, which is the output of empath for that month, where the keys correspond to the topics 'eating', 'alcohol', 'cleaning', 'sports',...

In [None]:
dfs_quotes = []
# read all the pre-processed files and store them
for file in glob.glob(data_folder + data_file):
    dfs_quotes.append(pd.read_json(file, lines=True, chunksize=1e4))

start = timeit.default_timer()
# create a set with all the stopwords so we can remove them
stop_words = set(stopwords.words())
# iterate through all the files, each one corresponding to one year
for year, file in enumerate(dfs_quotes):
    themes = {}
    n_words = {}
    i = 0
    # we need to read the file in chuncks, they are too big
    for chunk in file:
        tokens = {}
        # extract the info about quotes, dates and genders
        quotes = chunk['tokens'].tolist()
        date = chunk['date'].tolist()
        gender = chunk['gender'].tolist()
        clean_quotes = []
        for index, words in enumerate(quotes):
            # remove stopwords and join the split tokens
            processed_quote = [word for word in words if word not in stop_words]
            clean_quotes.append(' '.join(processed_quote))
            # if we have not yet added this gender to the number of words, we add it
            if gender[index] not in n_words.keys():
                n_words[gender[index]] = [0 for _ in range(12)]
            # and then we sum the words in this quote
            n_words[gender[index]][int(str(date[index])[5:7]) - 1] += len(processed_quote)

        # now we divide all the quotes by their dates and genders, to make it easier to process them
        for index, quote in enumerate(clean_quotes):
            if gender[index] not in tokens.keys():
                tokens[gender[index]] = [[] for _ in range(12)]
            tokens[gender[index]][int(str(date[index])[5:7]) - 1].append(quote)
        
        # and finally we iterate through all the genders and all the months....
        for gender in tokens.keys():
            # we create the necessary entries in the dict
            if gender not in themes.keys():
                themes[gender] = [{} for _ in range(12)]
                # and we analyze the quotes by topics
                for month,quotes in enumerate(tokens[gender]):
                    themes[gender][month]= lexicon.analyze(quotes, normalize = False)
            # if the gender was already in the dictionary, we add the new info to the info that was already there
            else:
                for month,quotes in enumerate(tokens[gender]):
                    themes_partial = lexicon.analyze(quotes, normalize = False)
                    themes[gender][month] = {k: themes[gender][month].get(k, 0) + themes_partial.get(k, 0) for k in themes[gender][month].keys() | themes_partial.keys()}

        i += 1
        # we write the info to the file every 10 chunks so we don't have to start over if it crashes
        if i % 10 == 0:
            with open(f'./data_processed/empath_{year + 2015}.txt', 'w') as f:
                f.write(f'Num words: {n_words}\n')
                f.write(f'Chunks processed: {i}\n')
                f.write(json.dumps(themes))
            print(i, end = ',')

    # at the end we write it all one last time
    with open(f'./data_processed/empath_{year + 2015}.txt', 'w') as f:
        f.write(f'Num words: {n_words} \n')
        f.write(f'Chunks processed: {i} \n')
        f.write(json.dumps(themes))
    print()


print(f'Time to analyze all chunks {timeit.default_timer() - start}!!')        

After the analysis is done, we can simply get the relevant data from the saved files, which is much faster.

In [None]:
empath_files = glob.glob('./data_processed/empath*.txt')

n_words = []
empath_data = []

for file in empath_files:
    with open(file, 'r') as f:
        txt = f.read()
        txt = txt.split('\n')
        
        n_words.append(eval(txt[0][11:]))
        empath_data.append(eval(txt[2]))

We also save this data as a normalized version, because it will be useful for plotting purposes.

In [None]:
empath_data_norm = copy.deepcopy(empath_data)

for year, data in enumerate(empath_data_norm):
    for gender in data:
        for month, datum in enumerate(data[gender]):
            for key in datum:
                if n_words[year][gender][month] != 0:
                    datum[key] = datum.get(key, 1) / n_words[year][gender][month] * 100   

Now we convert these dictionaries into a list of tuples, so that we can sort them, while keeping only the main list of genders.

In [None]:
sorted_data = []
genders = ['male', 'female', 'transgender male', 'transgender female', 'non-binary', 'genderfluid']

for data in empath_data:
    temp_data = {}
    for gender in data:
        if gender in genders:
            temp_data[gender] = [[] for _ in range(12)]
            for month, datum in enumerate(data[gender]):
                temp_data[gender][month] = list(datum.items())
                temp_data[gender][month].sort(key=lambda tup:tup[1], reverse=True)
    sorted_data.append(temp_data)

sorted_data_norm = []
for data in empath_data_norm:
    temp_data = {}
    for gender in data:
        if gender in genders:
            temp_data[gender] = [[] for _ in range(12)]
            for month, datum in enumerate(data[gender]):
                temp_data[gender][month] = list(datum.items())
                temp_data[gender][month].sort(key=lambda top:top[1], reverse=True)
    sorted_data_norm.append(temp_data)

With them sorted, we can prepare the information to start plotting it. For that, let's only consider a subset of key topics:
- Business
- Sports
- Government
- Climate Change
- LGBT
- Money
- Family
- Health

For only these topics, we'll save their information, both in raw and normalized format.

In [None]:
topics = {'business': {},
          'sports': {},
          'government': {},
          'climate_change': {},
          'lgbt': {},
          'money': {},
          'family': {},
          'health': {}
         }

for year, data in enumerate(sorted_data):
    for gender in data:
        for datum in data[gender]:
            for topic in datum:
                if topic[0] in topics.keys():
                    if gender in topics[topic[0]].keys():
                        topics[topic[0]][gender].append(topic[1])
                    else:
                        topics[topic[0]][gender] = [topic[1]]
                        
topics_norm = {'business': {},
               'sports': {},
               'government': {},
               'climate_change': {},
               'lgbt': {},
               'money': {},
               'family': {},
               'health': {}
              }

for year, data in enumerate(sorted_data_norm):
    for gender in data:
        for datum in data[gender]:
            for topic in datum:
                if topic[0] in topics_norm.keys():
                    if gender in topics_norm[topic[0]].keys():
                        topics_norm[topic[0]][gender].append(topic[1])
                    else:
                        topics_norm[topic[0]][gender] = [topic[1]]

Now we can finally get started on the plots. The first one we'll create is a dynamic graph showing the evolution of the distribution of topics per gender. 

In [None]:
# this cell will plot the distribution of topics per gender
import plotly.express as px
import plotly.graph_objects as go

# we prepare our data for plotting
genders_show = [' '.join([word.capitalize() for word in gender.split()]) for gender in genders]

plot_values_norm = {'gender': [],
                    'topic': [],
                    'percentage': [],
                    'date':[]
                   }

for gender in genders:
    for topic in topics_norm:
        for index, val in enumerate(topics_norm[topic][gender]):
            if all(np.array(topics_norm[topic][gender][index:]) == 0):
                break
            month = index % 12 + 1
            year = 2015 + index // 12
            plot_values_norm['gender'].append(gender)
            plot_values_norm['topic'].append(topic)
            plot_values_norm['percentage'].append(val)
            plot_values_norm['date'].append(f'{year}-{month:02}')
            
plot_values_norm = pd.DataFrame.from_dict(plot_values_norm)

fig = px.bar(plot_values_norm,
             x='gender',
             y='percentage',
             color='topic',
             title='Distribution of Key Topics per Gender',
             animation_frame='date',
             height=750,
             range_y=[min(plot_values_norm['percentage']), max(plot_values_norm['percentage'])],
             barmode='group',
             hover_name='topic',
             hover_data={'gender':False, 'topic':False, 'percentage':True, 'date':False},
             )

fig.update_layout(
    # xaxis_title="Gender",
    yaxis_title="Percentage of Words about Topic in Quotes by Gender",
)

fig.write_html("./topic_in_gender.html")

The second plot is also a dynamic graph, but this time showing the distribution of genders inside a given topic.

In [None]:
# this cell is for showing the gender distribution for any topic
# this cell will be for the overall topic evolution
import plotly.express as px
import plotly.graph_objects as go

# we prepare our data for plotting
genders_show = [' '.join([word.capitalize() for word in gender.split()]) for gender in genders]

plot_values_abs = {'gender': [],
                   'topic': [],
                   'counts': [],
                   'date':[]
                  }

for gender in genders:
    for topic in topics:
        for index, val in enumerate(topics[topic][gender]):
            if all(np.array(topics[topic][gender][index:]) == 0):
                break
            month = index % 12 + 1
            year = 2015 + index // 12
            plot_values_abs['gender'].append(gender)
            plot_values_abs['topic'].append(topic)
            plot_values_abs['counts'].append(val)
            plot_values_abs['date'].append(f'{year}-{month:02}')
            
plot_values_abs = pd.DataFrame.from_dict(plot_values_abs)

fig = px.bar(plot_values_abs,
             x='topic',
             y='counts',
             color='gender',
             title='Distribution of Genders per Key Topic',
             animation_frame='date',
             height=750,
             range_y=[max(min(plot_values_abs['counts']),1), max(plot_values_abs['counts'])],
             log_y = True,
             barmode='group',
             hover_name='gender',
             hover_data={'gender':False, 'topic':False, 'counts':True, 'date':False},
             )

fig.update_layout(
    #xaxis_title="Topic",
    yaxis_title="Word Counts by Gender per Topic",
)

fig.write_html("./gender_in_topic.html")