**This is the final notebook for Milestone 3, where all the data analysis (besides the pre processing done in Milestone 2) is done.**

In [None]:
import pandas as pd
import math 
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import timeit
import bz2
import datetime
import sys
from empath import Empath
import json
import glob
#from src.prep_utilities import * 
#from src.prep_pipeline import *

# Load nltk models
#!python ./src/load_models_data.py

%matplotlib inline
%load_ext autoreload
%autoreload 2
!python ./src/load_models_data.py
data_folder = './data/'

# General Gender Bias Analyses

In this section, we plan to analyse:
 - Evolution of percentage of speakers by gender, over time
 - Evolution of percentage of quotations by gender, over time
 - Most quoted speakers by gender, over time

We'll do a month-by-month analysis. For each month/gender combination, we'll save a Dataframe with the speakers and their total number of quotations that month.


In [19]:
# Time ranges
years = range(2015,2021)
min_month = "2015-01"
max_month = "2020-04"
all_months = pd.period_range(min_month, max_month, freq='M')

genders = ['male', 'female', 'transgender male', 'transgender female', 'non-binary', 'genderfluid']


In [20]:
# Create the dictionaries we'll use to store analysis

speaker_df = {} # Save the dataframes of top speakers by gender/month

for gender in genders:  
    speakers_df_temp = {}
    
    for month in all_months:

        speakers_df_temp[month] = pd.DataFrame([], columns = ['speaker', 'numOccurrences']).set_index('speaker')

    speaker_df[gender] = speakers_df_temp


The pre-processed data will be loaded each year, by chunks. In the analyses, we will cycle through each year, and through each chunk, and save the data.

In [21]:
# Loop through years
for year in years:
    start = timeit.default_timer()

    # data location and chunk size
    data_file = 'quotes-'+ str(year)+'-prep.json.bz2'
    data_path = data_folder + data_file
    chunk_size = 1e4

    # Load by chunks
    f = bz2.open(data_path, "rb")
    data=pd.read_json(f, lines=True, chunksize=chunk_size)
    
    print(f"Analysing year {year}...", end=" ")
    

    # Loop through chunks
    for i_chunk, chunk in enumerate(data):
        
        ## Run analysis ##
        
        # Create range of months for this year
        if year != 2020:
            months = pd.period_range(str(year)+'-'+'01', str(year)+'-'+'12', freq='M')
        else:
            months = pd.period_range(str(year)+'-'+'01', str(year)+'-'+'04', freq='M')

        # Loop through months
        for month in months:
            # Mask to select desired month
            month_mask = (chunk['date'].dt.to_period('m') == month)
            
            # Loop through genders
            for gender in genders:
                
                # Mask to select desired gender
                gender_mask = (chunk['gender'] == gender)
                
                # Concatenate the speakers in this chunk with our dictionary
                df = chunk[gender_mask & month_mask].groupby("speaker").sum()
                speaker_df[gender][month] = pd.concat([speaker_df[gender][month],df]).groupby("speaker").sum()

    stop = timeit.default_timer()
    print(f"Done in {stop-start:.2f}s")
        

Analysing year 2015... Done in 1399.95s
Analysing year 2016... Done in 829.45s
Analysing year 2017... Done in 2002.01s
Analysing year 2018... Done in 2067.94s
Analysing year 2019... Done in 1489.73s
Analysing year 2020... Done in 164.27s


Since we don't want to run the previous cell everytime we reload the notebook, we'll save each of the gender/month combinations to a json file.

In [22]:
general_analysis_folder = './data_processed/'

for gender in genders:
    for month in all_months:
        with bz2.open(general_analysis_folder + gender +'-' + str(month.year) + '-' + str(month.month) + '.json.bz2', "w") as f:

            # Write to file, reset index to keep the speaker's names
            write = speaker_df[gender][month].reset_index().to_json(f, lines=True, orient='records') 
            

Now that we've created the files with the analysis data, let's read them again:

In [23]:
speaker_df = {} # Load the dataframes of top speakers by gender/month
general_analysis_folder = './data_processed/'

for gender in genders:  
    speakers_df_temp = {}
    for month in all_months:
        speakers_df_temp[month] = pd.read_json(general_analysis_folder + gender +'-' + str(month.year) + '-' + str(month.month) + '.json.bz2', lines = True,compression = 'bz2')
        
        # Join rows of presidential aliases (President Trump, Donald Trump, etc...)
        speakers_df_temp[month] = speakers_df_temp[month].replace(["President Barack Obama", "President Obama"], "Barack Obama")
        speakers_df_temp[month] = speakers_df_temp[month].replace(["President Donald Trump", "President Trump"], "Donald Trump")
        speakers_df_temp[month] = speakers_df_temp[month].groupby('speaker').sum().reset_index()

    speaker_df[gender] = speakers_df_temp

## Plots


### Percentage of quotations and speakers by gender

We will use the package `plotly` to make an interactive plot with our data in the cells below.

To view the plot, double click `plotly/general_quotations_speakers.html`.

First, we create DataFrames to hold the plot data, and then we plot it.

In [25]:
## Prepare data to be plotted ##

# x data
month_str = [str(month.year)+'-'+str(month.month) for month in all_months]
month_dt = [datetime.datetime.strptime(x, '%Y-%m') for x in month_str]

# y data
total_quotations = {} # total quotations by month
total_speakers = {} # total number of speakers by month

# Dictionaries with total values for each month
for month in all_months:
    total_quotations[month] = 0
    total_speakers[month] = 0

    for gender in genders:
        total_quotations[month] += speaker_df[gender][month]['numOccurrences'].sum()
        total_speakers[month] += len(speaker_df[gender][month])

perc_quotations = pd.DataFrame([], columns = genders) # df with the dates and percentage of quotations by gender
perc_speakers =  pd.DataFrame([], columns = genders) # df with the dates and percentage of speakers by gender

for i,month in enumerate(all_months):
    perc_quotations.loc[i] = [100*speaker_df[gender][month]['numOccurrences'].sum()/total_quotations[month] for gender in genders]
    perc_speakers.loc[i] = [100*len(speaker_df[gender][month])/total_speakers[month] for gender in genders]
    
perc_quotations['date'] = month_dt # Dates for x axis
perc_speakers['date'] = month_dt


Run the cell below to generate the "Percentage of Occurrences by Gender" plot. Double click `plotly/perc_quotations.html` to view.

In [26]:
import plotly.express as px
import plotly.graph_objects as go

# Color palette
palette = px.colors.qualitative.Plotly

# Create figure
fig = go.Figure()
fig.update_layout(title = 'Percentage of Occurrences by Gender')
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Percentage of Occurrences')

# Add plots (they have to be added by order, in order not to mess up the 'visible' lists)
for i,gender in enumerate(genders):
    fig.add_trace(
        go.Scatter(x = perc_quotations['date'], y = perc_quotations[gender], name = gender, mode='lines', line=dict(color=palette[i], width=3))
        )


visible_quotations = [True if i<6 else False for i in range(12)]
visible_speakers = [not x for x in visible_quotations]

# Add x range slider
fig.update_layout(
    xaxis=dict(
        rangeslider=dict(
            visible=True
        ),
        type="date"
    )
)

fig.write_html("./plotly/perc_quotations.html")


Run the cell below to generate the "Percentage of Speakers by Gender" plot. Double click `plotly/perc_speakers.html` to view.

In [27]:

# Color palette
palette = px.colors.qualitative.Plotly

# Create figure
fig = go.Figure()
fig.update_layout(title = 'Percentage of Speakers by Gender')
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Percentage of Speakers')

# Add plots
for i,gender in enumerate(genders):
    fig.add_trace(
        go.Scatter(x = perc_speakers['date'], y = perc_speakers[gender], name = gender, mode='lines', line=dict(color=palette[i], width=3))
        )

# Add x range slider
fig.update_layout(
    xaxis=dict(
        rangeslider=dict(
            visible=True
        ),
        type="date"
    )
)
fig.write_html("./plotly/perc_speakers.html")


### Top speakers by gender/month plot

Once again, we'll use `plotly` for the interactive plot for the highest quoted speaker by gender, in each month.

For this, we'll create a new DataFrame with columns `gender`, `speaker`, `date` and `occurrences` (which is the same as `numOccurrences`).

In [28]:
top_speakers_df = pd.DataFrame([], columns = ['gender', 'speaker', 'date', 'occurrences'])

i = 0
for gender in genders:
    for j,month in enumerate(all_months):
        # Get top speaker by gender/month
        speaker, occurrences = speaker_df[gender][month].sort_values(by='numOccurrences', ascending = False).iloc[0]
        top_speakers_df.loc[i] = [gender, speaker, month_str[j], int(occurrences)]
        i+=1
    i+=1

# Convert numOccurrences column to int
top_speakers_df.occurrences = top_speakers_df.occurrences.astype(int)


In [29]:
top_speakers_df

Unnamed: 0,gender,speaker,date,occurrences
0,male,Barack Obama,2015-1,37099
1,male,Barack Obama,2015-2,32215
2,male,Jesus `` Chuy '' Garcia,2015-3,33467
3,male,Barack Obama,2015-4,27154
4,male,Barack Obama,2015-5,14155
...,...,...,...,...
384,genderfluid,Miley Cyrus,2019-12,347
385,genderfluid,Miley Cyrus,2020-1,180
386,genderfluid,Miley Cyrus,2020-2,132
387,genderfluid,Miley Cyrus,2020-3,1147


We'll now create a bar plot with the highest quoted speakers of each gender, and add an animation slider to move between months. We save it to `plotly/top_speakers.html`.

In [12]:
fig = px.bar(
    top_speakers_df, 
    x='gender', 
    y = 'quotations', 
    animation_frame='date', 
    text = 'speaker',
    hover_name='speaker',
    hover_data={'gender':False, 'speaker':False, 'quotations':True, 'date':True},
    color='gender',
    title='Highest Quoted Speakers'
)

fig.update_yaxes(range=[0, 400000])

fig.update_layout(
    uniformtext_minsize=13,
    uniformtext_mode='show', 
    showlegend=False,
    hovermode='x',
    yaxis=dict( # Disable yaxis
        visible = True
    ),
    xaxis=dict( # Remove xaxis title
        title=''
    ),
    hoverlabel=dict( # Change font on hover tool
        font_size=16,
    )
)

fig.write_html("./plotly/top_speakers.html")


# Topic Analysis

Like on the section above, we'll start by retrieving the relevant information from the pre-processed data. The code to do that is presented below.

To analyze the data, we'll run it through empath to determine what is the number of words spoken about each category. Before we do that, we'll add two new categories to empath.

In [None]:
lexicon = Empath()

lexicon.create_category('climate_change', ['global_warming','green_house','death','water','fossil_fuel','burning','summit','environment','energy','renewable','consumption','petrol','gas','wind','solar_power','earth'], model='nytimes')
lexicon.create_category('lgbt', ['rights', 'gay', 'trans', 'discriminantion', 'phobia', 'lesbian', 'transsexual','cis','queer','asexual','heterosexual','straight'], model='nytimes')

Now we can process all the data by chuncks, and store the results to use later. The results will be written on one file per year, inside `./data_processed` and they will be called `empath_<year>.txt`.

The internal structure of these files is the following.
- First line: Distribution of the number of words per gender, per month. This information is displayed inside of a dictionary, where the keys are the genders found for that year, and to each key we have associated a list with 12 entries, one for each month, containing the number of words said by that gender in that month.
- Second line: Number of chunks processed. No real functional purpose but it allowed to restart the program from a certain point if it got interrupted.
- Third line: All the information extracted from the data. This information is displayed inside of a dictionary, where the keys correspond to the genders found for that year. Each key points to a list with 12 entries, one for each month. And in each entry of the list there is another dictionary, which is the output of empath for that month, where the keys correspond to the topics 'eating', 'alcohol', 'cleaning', 'sports',...

In [None]:
dfs_quotes = []
# read all the pre-processed files and store them
for file in glob.glob(data_folder + data_file):
    dfs_quotes.append(pd.read_json(file, lines=True, chunksize=1e4))

start = timeit.default_timer()
# create a set with all the stopwords so we can remove them
stop_words = set(stopwords.words())
# iterate through all the files, each one corresponding to one year
for year, file in enumerate(dfs_quotes):
    themes = {}
    n_words = {}
    i = 0
    # we need to read the file in chuncks, they are too big
    for chunk in file:
        tokens = {}
        # extract the info about quotes, dates and genders
        quotes = chunk['tokens'].tolist()
        date = chunk['date'].tolist()
        gender = chunk['gender'].tolist()
        clean_quotes = []
        for index, words in enumerate(quotes):
            # remove stopwords and join the split tokens
            processed_quote = [word for word in words if word not in stop_words]
            clean_quotes.append(' '.join(processed_quote))
            # if we have not yet added this gender to the number of words, we add it
            if gender[index] not in n_words.keys():
                n_words[gender[index]] = [0 for _ in range(12)]
            # and then we sum the words in this quote
            n_words[gender[index]][int(str(date[index])[5:7]) - 1] += len(processed_quote)

        # now we divide all the quotes by their dates and genders, to make it easier to process them
        for index, quote in enumerate(clean_quotes):
            if gender[index] not in tokens.keys():
                tokens[gender[index]] = [[] for _ in range(12)]
            tokens[gender[index]][int(str(date[index])[5:7]) - 1].append(quote)
        
        # and finally we iterate through all the genders and all the months....
        for gender in tokens.keys():
            # we create the necessary entries in the dict
            if gender not in themes.keys():
                themes[gender] = [{} for _ in range(12)]
                # and we analyze the quotes by topics
                for month,quotes in enumerate(tokens[gender]):
                    themes[gender][month]= lexicon.analyze(quotes, normalize = False)
            # if the gender was already in the dictionary, we add the new info to the info that was already there
            else:
                for month,quotes in enumerate(tokens[gender]):
                    themes_partial = lexicon.analyze(quotes, normalize = False)
                    themes[gender][month] = {k: themes[gender][month].get(k, 0) + themes_partial.get(k, 0) for k in themes[gender][month].keys() | themes_partial.keys()}

        i += 1
        # we write the info to the file every 10 chunks so we don't have to start over if it crashes
        if i % 10 == 0:
            with open(f'./data_processed/empath_{year + 2015}.txt', 'w') as f:
                f.write(f'Num words: {n_words}\n')
                f.write(f'Chunks processed: {i}\n')
                f.write(json.dumps(themes))
            print(i, end = ',')

    # at the end we write it all one last time
    with open(f'./data_processed/empath_{year + 2015}.txt', 'w') as f:
        f.write(f'Num words: {n_words} \n')
        f.write(f'Chunks processed: {i} \n')
        f.write(json.dumps(themes))
    print()


print(f'Time to analyze all chunks {timeit.default_timer() - start}!!')        

After the analysis is done, we can simply get the relevant data from the saved files, which is much faster.

In [None]:
empath_files = glob.glob('./data_processed/empath*.txt')

n_words = []
empath_data = []

for file in empath_files:
    with open(file, 'r') as f:
        txt = f.read()
        txt = txt.split('\n')
        
        n_words.append(eval(txt[0][11:]))
        empath_data.append(eval(txt[2]))

We also save this data as a normalized version, because it will be useful for plotting purposes.

In [None]:
empath_data_norm = copy.deepcopy(empath_data)

for year, data in enumerate(empath_data_norm):
    for gender in data:
        for month, datum in enumerate(data[gender]):
            for key in datum:
                if n_words[year][gender][month] != 0:
                    datum[key] = datum.get(key, 1) / n_words[year][gender][month] * 100   

Now we convert these dictionaries into a list of tuples, so that we can sort them, while keeping only the main list of genders.

In [None]:
sorted_data = []
genders = ['male', 'female', 'transgender male', 'transgender female', 'non-binary', 'genderfluid']

for data in empath_data:
    temp_data = {}
    for gender in data:
        if gender in genders:
            temp_data[gender] = [[] for _ in range(12)]
            for month, datum in enumerate(data[gender]):
                temp_data[gender][month] = list(datum.items())
                temp_data[gender][month].sort(key=lambda tup:tup[1], reverse=True)
    sorted_data.append(temp_data)

sorted_data_norm = []
for data in empath_data_norm:
    temp_data = {}
    for gender in data:
        if gender in genders:
            temp_data[gender] = [[] for _ in range(12)]
            for month, datum in enumerate(data[gender]):
                temp_data[gender][month] = list(datum.items())
                temp_data[gender][month].sort(key=lambda top:top[1], reverse=True)
    sorted_data_norm.append(temp_data)

With them sorted, we can prepare the information to start plotting it. For that, let's only consider a subset of key topics:
- Business
- Sports
- Government
- Climate Change
- LGBT
- Money
- Family
- Health

For only these topics, we'll save their information, both in raw and normalized format.

In [None]:
topics = {'business': {},
          'sports': {},
          'government': {},
          'climate_change': {},
          'lgbt': {},
          'money': {},
          'family': {},
          'health': {}
         }

for year, data in enumerate(sorted_data):
    for gender in data:
        for datum in data[gender]:
            for topic in datum:
                if topic[0] in topics.keys():
                    if gender in topics[topic[0]].keys():
                        topics[topic[0]][gender].append(topic[1])
                    else:
                        topics[topic[0]][gender] = [topic[1]]
                        
topics_norm = {'business': {},
               'sports': {},
               'government': {},
               'climate_change': {},
               'lgbt': {},
               'money': {},
               'family': {},
               'health': {}
              }

for year, data in enumerate(sorted_data_norm):
    for gender in data:
        for datum in data[gender]:
            for topic in datum:
                if topic[0] in topics_norm.keys():
                    if gender in topics_norm[topic[0]].keys():
                        topics_norm[topic[0]][gender].append(topic[1])
                    else:
                        topics_norm[topic[0]][gender] = [topic[1]]

Now we can finally get started on the plots. The first one we'll create is a dynamic graph showing the evolution of the distribution of topics per gender. 

In [None]:
# this cell will plot the distribution of topics per gender
import plotly.express as px
import plotly.graph_objects as go

# we prepare our data for plotting
genders_show = [' '.join([word.capitalize() for word in gender.split()]) for gender in genders]

plot_values_norm = {'gender': [],
                    'topic': [],
                    'percentage': [],
                    'date':[]
                   }

for gender in genders:
    for topic in topics_norm:
        for index, val in enumerate(topics_norm[topic][gender]):
            if all(np.array(topics_norm[topic][gender][index:]) == 0):
                break
            month = index % 12 + 1
            year = 2015 + index // 12
            plot_values_norm['gender'].append(gender)
            plot_values_norm['topic'].append(topic)
            plot_values_norm['percentage'].append(val)
            plot_values_norm['date'].append(f'{year}-{month:02}')
            
plot_values_norm = pd.DataFrame.from_dict(plot_values_norm)

fig = px.bar(plot_values_norm,
             x='gender',
             y='percentage',
             color='topic',
             title='Distribution of Key Topics per Gender',
             animation_frame='date',
             height=750,
             range_y=[min(plot_values_norm['percentage']), max(plot_values_norm['percentage'])],
             barmode='group',
             hover_name='topic',
             hover_data={'gender':False, 'topic':False, 'percentage':True, 'date':False},
             )

fig.update_layout(
    # xaxis_title="Gender",
    yaxis_title="Percentage of Words about Topic in Quotes by Gender",
)

fig.write_html("./plotly/topic_in_gender.html")

The second plot is also a dynamic graph, but this time showing the distribution of genders inside a given topic.

In [None]:
# this cell is for showing the gender distribution for any topic
# this cell will be for the overall topic evolution
import plotly.express as px
import plotly.graph_objects as go

# we prepare our data for plotting
genders_show = [' '.join([word.capitalize() for word in gender.split()]) for gender in genders]

plot_values_abs = {'gender': [],
                   'topic': [],
                   'counts': [],
                   'date':[]
                  }

for gender in genders:
    for topic in topics:
        for index, val in enumerate(topics[topic][gender]):
            if all(np.array(topics[topic][gender][index:]) == 0):
                break
            month = index % 12 + 1
            year = 2015 + index // 12
            plot_values_abs['gender'].append(gender)
            plot_values_abs['topic'].append(topic)
            plot_values_abs['counts'].append(val)
            plot_values_abs['date'].append(f'{year}-{month:02}')
            
plot_values_abs = pd.DataFrame.from_dict(plot_values_abs)

fig = px.bar(plot_values_abs,
             x='topic',
             y='counts',
             color='gender',
             title='Distribution of Genders per Key Topic',
             animation_frame='date',
             height=750,
             range_y=[max(min(plot_values_abs['counts']),1), max(plot_values_abs['counts'])],
             log_y = True,
             barmode='group',
             hover_name='gender',
             hover_data={'gender':False, 'topic':False, 'counts':True, 'date':False},
             )

fig.update_layout(
    #xaxis_title="Topic",
    yaxis_title="Word Counts by Gender per Topic",
)

fig.write_html("./plotly/gender_in_topic.html")

# Sentiment Analysis

In [None]:
#For this part of our analysis we will be using the scores generated by the sentiment intensity analyzer of nltk
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [None]:
# Loop through years
for year in years:
    print(f"Analysing year {year}...", end=" ")
    
    # data location
    data_file = 'quotes-'+str(year)+'-prep.json.bz2'
    data_path = data_folder + data_file
    sent_analysis_folder = data_folder + 'sentiment_analysis/'
    
    # Load by chunks
    f = bz2.open(data_path, "rb")
    data=pd.read_json(f, lines=True, chunksize=10000)
    
    start = timeit.default_timer()
    start_progress = timeit.default_timer()
    progress_step = 100
    
    with bz2.open(sent_analysis_folder +str(year)+ '.json.bz2', "w") as fn:
        for i_chunk, chunk in enumerate(data):
            # Print progress
            if i_chunk%progress_step == 0:
                stop_progress = timeit.default_timer()
                print(f'Time since last: {stop_progress-start_progress:.1f}s\n')
                print(f"Processing chunks {i_chunk}-{i_chunk+progress_step-1}")
                start_progress = timeit.default_timer()
                
            # Process chunk
            chunk.drop(columns=['probas','phase'], inplace=True)
            chunk['sentiment_score']=chunk['quotation'].apply(lambda x: sia.polarity_scores(x)['compound'])
            chunk['year']=chunk['date'].apply(lambda x: x.year)
            chunk['month']=chunk['date'].apply(lambda x:x.month)
            
            # Write to json.bz2 file
            write = chunk.to_json(fn, lines=True, orient='records')
            
    stop = timeit.default_timer()
    print(f'Total time: {stop-start:.1f}s\n')

In order to observe the differences between conservative and liberal news sources, to represent these two categories we have created 2 lists that contain some of the most popular liberal and conservative news sources according to ThoughtCo and Aelieve Digital Marketing.

In [None]:
#Define lists of representative liberal and conservative news sources
libr=['cnn', 'huffingtonpost', 'nytimes', 'politico', 'slate', 'abcnews', 'dailykos', 'washingtonpost', 
      'time', 'theatlantic']
cons=['nationalreview', 'spectator', 'theamericanconservative', 'washingtontimes', 'thenewamerican', 'freebeacon',
      'frontpagemag', 'theblaze', 'humanevents', 'cnsnews']

In [None]:
### Never ever run again

df_repr=pd.DataFrame([])  #to keep only the representative websites in the list above
df_sent=pd.DataFrame([])  #to keep the aggregated values

# Loop through years
for year in years:
    print(f"Analysing year {year}...", end=" ")
    
    # Load by chunks
    f = bz2.open(sent_analysis_folder +str(year)+ '.json.bz2', "rb")
    data=pd.read_json(f, lines=True, chunksize=10000)
    
    start = timeit.default_timer()
    start_progress = timeit.default_timer()
    progress_step = 100

    for i_chunk, chunk in enumerate(data):
        # Print progress
        if i_chunk%progress_step == 0:
            stop_progress = timeit.default_timer()
            print(f'Time since last: {stop_progress-start_progress:.1f}s\n')
            print(f"Processing chunks {i_chunk}-{i_chunk+progress_step-1}")
            start_progress = timeit.default_timer()
            
        # Process chunk
        # aggregate over year-month-gender and save to df_sent
        df_sent=df_sent.append(chunk.groupby(['year','month','gender']).agg({'sentiment_score':'mean', 'numOccurrences':'sum'}))
        # keep only the quotes appeared in the representative lists
        chunk=chunk[chunk['websites'].apply(lambda x: 1 if any(i in x for i in libr+cons) else 0)==1]
        # explode each website list and drop the ones that are not in the lists
        chunk=chunk.explode('websites')
        chunk=chunk[chunk['websites'].isin(libr+cons)].drop_duplicates(['quoteID', 'quotation', 'speaker', 'date', 'numOccurrences', 'websites', 'gender', 'sentiment_score'])
        df_repr=df_repr.append(chunk)
                     
    stop = timeit.default_timer()
    print(f'Total time: {stop-start:.1f}s\n')

df_sent.reset_index(inplace=True)
df_repr.reset_index(inplace=True, drop=True)

In [None]:
# Save the representative data into json

start = timeit.default_timer()
start_progress = timeit.default_timer()
progress_step = 100

with bz2.open(sent_analysis_folder + 'representatives' + '.json.bz2', "w") as f:
    for i in range(0, math.ceil(df_repr.shape[0]/10000)):
        temp=df_repr[i*10000 : (i+1)*10000]
        
        # Print progress
        stop_progress = timeit.default_timer()
        print(f"Saving rows {i*10000}-{(i+1)*10000}")
        print(f'Time since last: {stop_progress-start_progress:.1f}s\n')
        start_progress = timeit.default_timer()
            
        # Write to json.bz2 file
        write = temp.to_json(f, lines=True, orient='records')

stop = timeit.default_timer()
print(f'Total time: {stop-start:.1f}s\n')

In [None]:
#Keep only the genders subject to analysis
keep_gender_list=['male', 'female', 'transgender male', 'transgender female', 'non-binary', 'genderfluid']
df_sent=df_sent[df_sent['gender'].isin(keep_gender_list)].reset_index(drop=True)
df_repr=df_repr[df_repr['gender'].isin(keep_gender_list)].reset_index(drop=True)

In [None]:
# Create a summary table of representative set and save 
df_repr_sum=df_repr.groupby(['year','month','gender', 'websites']).agg({'sentiment_score':'mean', 'quoteID':'count'}).reset_index()
df_repr_sum.rename(columns={'quoteID': "quoteID_count"},inplace=True)

with bz2.open(sent_analysis_folder + 'representatives_summary' + '.json.bz2', "w") as f:
    write = df_repr_sum.to_json(f, lines=True, orient='records')

Since the aggregation of the overall summary table df_sent was done chunk by chunk, we need to correct it.

In [None]:
# Correct the aggregation in summary table, keep both sentiment_score and numOccurrences in the table final table

df_sent2=df_sent.groupby(['year','month','gender']).apply(lambda x: (x['sentiment_score']*x['numOccurrences']).sum()/x['numOccurrences'].sum()).reset_index()
temp=df_sent.groupby(['year','month','gender'])['numOccurrences'].agg('sum').reset_index()
df_sent2=df_sent2.merge(temp, how='inner', on=['year','month','gender']) 
df_sent2.rename(columns={0: "avg_sentiment_score"},inplace=True)

In [None]:
#combinde 'year'&'month'
df_sent2['year']=df_sent2['year'].apply(str)
df_sent2['month']=df_sent2['month'].apply(str)
df_sent2['date']=df_sent2['year']+'-'+df_sent2['month']

df_repr_sum['year']=df_repr_sum['year'].apply(str)
df_repr_sum['month']=df_repr_sum['month'].apply(str)
df_repr_sum['date']=df_repr_sum['year']+'-'+df_repr_sum['month']

Let's see how the average sentiment scores have changed over the time for each gender category.

In [None]:
import plotly.express as px

fig = px.line(df_sent2, x="date", y="avg_sentiment_score", color='gender', 
              color_discrete_sequence=["#636EFA", "#EF553B",  "#00CC96", "#AB63FA", "#FFA15A", "#19D3F3"],
              category_orders={"gender":["male", "female", "transgender male", "transgender female", "non-binary", "genderfluid"]}
             )

fig.update_layout(title_text='Average sentiment scores of quotes from different genders over time', title_x=0.5,
                  xaxis_title='Date', yaxis_title='Average sentiment score', legend_title_text='Gender')

fig.write_html('./plotly/sent_vs_time_allgenders_sep.html')
fig.show()

To be able to make more significant comparisons we combine transgender-male, transgender-female, genderfluid and non-binary genders in a single group named 'others' and plot the graph again.

In [None]:
# Create the others group and rearrange the summary table 
df_sent2['gender']=np.where(df_sent2['gender'].isin(['male','female']), df_sent2['gender'] ,'others')

df_sent2_others=df_sent2.groupby(['year','month','gender','date']).apply(lambda x: (x['avg_sentiment_score']*x['numOccurrences']).sum()/x['numOccurrences'].sum()).reset_index()
temp=df_sent2.groupby(['year','month','gender','date'])['numOccurrences'].agg('sum').reset_index()
df_sent2_others=df_sent2_others.merge(temp, how='inner', on=['year','month','gender', 'date'])

df_sent2_others.rename(columns={0: "avg_sentiment_score"},inplace=True)

In [None]:
fig = px.line(df_sent2_others, x="date", y="avg_sentiment_score", color='gender', 
              color_discrete_sequence=["#636EFA", "#EF553B",  "#FF97FF"],
              category_orders={"gender":["male", "female", "others"]}
             )

fig.update_layout(title_text='Average sentiment scores of quotes from different genders over time', title_x=0.5,
                  xaxis_title='Date', yaxis_title='Average sentiment score', legend_title_text='Gender')

fig.write_html('./plotly/sent_vs_time_allgenders.html')
fig.show()

We can see that the average sentiment scores attached to male and female quotes lie between 0.15 and 0.26, which is close to neutral but still positive. On the other hand, quotes belonging to transgender-male, transgender-female, genderfluid and non-binary genders, which are aggregated in a single group named 'others', tend to have more oscillating average sentiment scores. For these genders, we can observe both positive and negative average scores that cover a wider range from -0.15 to 0.32. 

When we focus on quotes from males and females, we see that almost consistently scores of male quotes lie above the scores of female quotes. Whereas there is no such consistent pattern for other genders. Let's take a closer look into this consistent difference between male and female quotes.

Let's see if this situation continues when we divide news sources into two categories as liberal and conservative. First, we make an overall comparison between liberal (denoted by L) and conservative (denoted by C) news sources, then take a deep dive into each website separately. 

In [None]:
#Create a new column specifying the class of the news source
df_sent3=df_sent2[df_sent2['gender'].isin(['male','female'])]
df_repr_sum2=df_repr_sum[df_repr_sum['gender'].isin(['male','female'])]
df_repr_sum2['L/C']=df_repr_sum2['websites'].apply(lambda x: np.where(x in libr,'L','C'))

#Aggregate on to the new column
df_repr_sum3=df_repr_sum2.groupby(['year','month','gender','L/C']).apply(lambda x: (x['sentiment_score']*x['quoteID_count']).sum()/x['quoteID_count'].sum()).reset_index()
temp=df_repr_sum2.groupby(['year','month','gender','L/C'])['quoteID_count'].agg('sum').reset_index()
df_repr_sum3=df_repr_sum3.merge(temp, how='inner', on=['year','month','gender','L/C'])

df_repr_sum3.rename(columns={0: "avg_sentiment_score"},inplace=True)

#Combine year and month
df_repr_sum3['year']=df_repr_sum3['year'].apply(str)
df_repr_sum3['month']=df_repr_sum3['month'].apply(str)
df_repr_sum3['date']=(df_repr_sum3['year']+'-'+df_repr_sum3['month'])
df_repr_sum3['date']=pd.to_datetime(df_repr_sum3['date'], format='%Y-%m')
df_repr_sum3.sort_values(by='date', ascending=True, inplace=True)

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

#Subsets for the subplots
fm_l=df_repr_sum3[(df_repr_sum3['gender']=='female')&(df_repr_sum3['L/C']=='L')]
fm_c=df_repr_sum3[(df_repr_sum3['gender']=='female')&(df_repr_sum3['L/C']=='C')]
ml_l=df_repr_sum3[(df_repr_sum3['gender']=='male')&(df_repr_sum3['L/C']=='L')]
ml_c=df_repr_sum3[(df_repr_sum3['gender']=='male')&(df_repr_sum3['L/C']=='C')]

fig = make_subplots(rows=2, cols=1, subplot_titles=("Liberal", "Conservative"))

fig.append_trace(go.Scatter(x=ml_l['date'], y=ml_l['avg_sentiment_score'], name='Male-L', marker={ 'color':'#636EFA'}), 
                 row=1, col=1)
fig.append_trace(go.Scatter(x=fm_l['date'], y=fm_l['avg_sentiment_score'], name='Female-L', marker={ 'color':'#EF553B'}),
                 row=1, col=1)
fig.append_trace(go.Scatter(x=ml_c['date'], y=ml_c['avg_sentiment_score'], name='Male-C', marker={ 'color':'#636EFA'}),
                 row=2, col=1)
fig.append_trace(go.Scatter(x=fm_c['date'], y=fm_c['avg_sentiment_score'], name='Female-C', marker={ 'color':'#EF553B'}),
                 row=2, col=1)

fig.update_xaxes(title_text="Date", row=2, col=1)
fig.update_yaxes(title_text="Average sentiment score")
fig.update_layout(title_text="Average sentiment scores of quotes over time in liberal and conservative news sources", title_x=0.5, 
                  legend_title_text='Gender')

fig.write_html('./plotly/sent_vs_time_vs_lib&cons_male&female.html')
fig.show()

From the plot above, we can see that in liberal news sources, sentiment scores of male and female quotes are close to each other, but the female quotes portrayed in these news sources have slightly higher positive scores compared to males'. On the other hand, in conservative news sources, we see that the gap between the male and female quotes' sentiment scores is a little wider compared to the gap in the liberal graph. Also, we can see that contrary to the quotes in liberal news sources, in conservative news sources the coverage given to male quotes have higher positive sentiment scores than the female quotes.

In [None]:
l_compare=fm_l.merge(ml_l, how='inner', on=['year','month','L/C'])
c_compare=fm_c.merge(ml_c, how='inner', on=['year','month','L/C'])

l_compare['diff']=l_compare['avg_sentiment_score_x']-l_compare['avg_sentiment_score_y']
c_compare['diff']=c_compare['avg_sentiment_score_x']-c_compare['avg_sentiment_score_y']

print('Mean female-male sentiment score diff in L: ', np.abs(l_compare['diff']).mean())
print('Mean female-male sentiment score diff in C: ', np.abs(c_compare['diff']).mean())

print('\nFemale sent avg in L: ', fm_l['avg_sentiment_score'].mean())
print('Female sent avg in C: ', fm_c['avg_sentiment_score'].mean())

print('\nMale sent avg in L: ', ml_l['avg_sentiment_score'].mean())
print('Male sent avg in C: ', ml_c['avg_sentiment_score'].mean())

In [None]:
fig = go.Figure(data=[go.Table(header=dict(values=['Mean of', 'Value']),
                 cells=dict(values=[['Female-male quotes\' sentiment score difference in L',
                                     'Female-male quotes\' sentiment score difference in C',
                                     'Female quotes\' sentiment score in L', 
                                     'Female quotes\' sentiment score in C',
                                     'Male quotes\' sentiment score in L',
                                     'Male quotes\' sentiment score in C'], 
                                    [round(np.abs(l_compare['diff']).mean(),3), 
                                     round(np.abs(c_compare['diff']).mean(),3), 
                                     round(fm_l['avg_sentiment_score'].mean(),3), 
                                     round(fm_c['avg_sentiment_score'].mean(),3), 
                                     round(ml_l['avg_sentiment_score'].mean(),3),
                                     round(ml_c['avg_sentiment_score'].mean(),3)]
                                   ]
                           )
                              )
                     ]
               )
fig.write_html("./plotly/mean_table.html")
fig.show()

This table shows that the mean difference between the average sentiment scores of female and male quotes in conservative news sources is twice of the liberal news sources. Also, we see that there is a 14% difference in the mean sentiment scores of female quotes in liberal and conservative news sources and a 38% difference for the male quotes. These results and the graphs above brings up two questions:

**'Do the conservative news sources tend to give more coverage to quotes with higher positivity from males?'**

**'Do the liberal news sources tend to give more coverage to quotes with higher positivity from females?'**

Let's have a look at the distributions of the sentiment scores of quotes from males and females in liberal and conservative news sources.

In [None]:
#We go back to the summary table df_repr and divide it to 4 new tables:
#female&liberal, female&conservative, male&liberal, male&conservative
df_repr=df_repr[df_repr['gender'].isin(['female','male'])].reset_index(drop=True)
df_repr['L/C']=df_repr['websites'].apply(lambda x: np.where(x in libr,'L','C'))

df_repr_fm_l=df_repr[(df_repr['gender']=='female')&(df_repr['L/C']=='L')].reset_index()
df_repr_fm_c=df_repr[(df_repr['gender']=='female')&(df_repr['L/C']=='C')].reset_index()

df_repr_ml_l=df_repr[(df_repr['gender']=='male')&(df_repr['L/C']=='L')].reset_index()
df_repr_ml_c=df_repr[(df_repr['gender']=='male')&(df_repr['L/C']=='C')].reset_index()

In [None]:
fig = go.Figure()

fig.add_trace(go.Box(y=df_repr_ml_l['sentiment_score'], name='Male-Liberal', marker_color="#636EFA"))
fig.add_trace(go.Box(y=df_repr_ml_c['sentiment_score'], name='Male-Conservative', marker_color="#636EFA"))
fig.add_trace(go.Box(y=df_repr_fm_l['sentiment_score'], name='Female-Liberal', marker_color="#EF553B"))
fig.add_trace(go.Box(y=df_repr_fm_c['sentiment_score'], name='Female-Conservative', marker_color="#EF553B"))


fig.update_layout(title_text='Sentiment score distribution in liberal and conservative news sources ', title_x=0.5,
                  xaxis_title='Gender & News Source', yaxis_title='Sentiment score', 
                  legend_title_text='Genders')
fig.write_html("./plotly/box_sent_lib&cons_male&female.html")
fig.show()

In [None]:
print('Var of male quotes in liberal news sources: ', df_repr_ml_l['sentiment_score'].var())
print('Var of male quotes in conservative news sources: ', df_repr_ml_c['sentiment_score'].var())

print('\nVar of female quotes in liberal news sources: ', df_repr_fm_l['sentiment_score'].var())
print('Var of female quotes in conservative news sources: ', df_repr_fm_c['sentiment_score'].var())

From the boxplot we can directly see that there is a general shift towards more positive sentiment scores of quotes belonging to men in conservative news sources. On the other side, we see that although the median and third quartile values of sentiment scores of female quotes are very similar in both liberal and conservative news sources, the first quartile value, which falls in the area of negative sentiment, is higher in the liberal news sources.

Since the variance of the distributions are similar, to see if the differences we have seen in the sentiment scores with respect to news source category L/C in the sample we have created with the chosen news sources are significant for the whole population, we perform a one-sided independent student t-test with the following null hypotheses:

    - Test 1 - H0: The mean sentiment score of quotes from males in conservative news sources is less than that of liberal 
    news sources.
    - Test 2 - H0: The mean sentiment score of quotes from females in liberal news sources is less than that of conservative
    news sources.

In [None]:
from scipy import stats

#Student's t-test
statistic, pvalue = stats.ttest_ind(df_repr_ml_c['sentiment_score'], df_repr_ml_l['sentiment_score'], alternative='greater')
print("Student's t-test p-value for 'sentiment_score' of males' quotes in L and C: " + str(pvalue/2))

statistic2, pvalue2 = stats.ttest_ind(df_repr_fm_l['sentiment_score'], df_repr_fm_c['sentiment_score'], alternative='greater')
print("Student's t-test p-value for 'sentiment_score' of females' quotes in L and C: " + str(pvalue2/2))

In [None]:
fig = go.Figure(data=[go.Table(header=dict(values=['Test number', 'p-value/2']),
                 cells=dict(values=[['Test - 1',
                                     'Test - 2'], 
                                    [round(pvalue/2,5), 
                                     round(pvalue2/2,5)]
                                   ]
                           )
                              )
                     ]
               )
fig.write_html("./plotly/p_table.html")
fig.show()

Since for both tests, the p-values are less than the significance level threshold of 0.005, we have enough evidence to reject the null hypotheses and conclude that:

    - The mean sentiment score of quotes from males portrayed in conservative news sources is higher than that of liberal 
      news sources. In other words, conservative news sources tend to give more coverage to quotes with higher positivity 
      from males compared to liberals
    - The mean sentiment score of quotes from females portrayed in liberal news sources is greater than that of conservative 
      news sources. In other words, liberal news sources tend to give more coverage to quotes with higher positivity from 
      females compared to conservatives.

Now that we had an overview of the sentiment score differences with respect to gender and news source category, let's have a closer look into the behavior of individual news sources within our representative news source lists.

In [None]:
website_sum=df_repr.groupby(['websites','gender', 'L/C']).agg({'sentiment_score':'mean', 'quoteID':'count'}).reset_index()

website_sum.rename(columns={'quoteID': "quoteID_count"},inplace=True)
website_sum.rename(columns={'sentiment_score': "avg_sentiment_score"},inplace=True)

In [None]:
import plotly.express as px

fig = px.scatter(website_sum[website_sum['L/C']=='L'], x = 'websites', y = 'avg_sentiment_score', 
                 size = 'quoteID_count', color = 'gender', size_max=50, 
                 color_discrete_map={"male": "#636EFA", "female": "#EF553B"},
                 category_orders={"gender":["male", "female"]})

fig.update_layout(title = "Sentiment scores in liberal news sources", title_x=0.5,
                  xaxis_title='News source websites', yaxis_title='Average sentiment score', legend_title_text='Gender')

fig.write_html("./plotly/websites_L.html")
fig.show()

We can see that among the selected liberal news sources The New York Times is the one that portrays quotes with the most similar sentiment scores from males and females, followed by Slate, The Atlantic, Politico and CNN. Whereas Huffington Post is the one with the biggest difference in sentiment scores, followed by Time Magazine. We can also note that 9 out of 10 of the representative liberal news sources have quotes with higher sentiment scores from females, only 1 has a reversed trend, which is The Washington Post.

In [None]:
import plotly.express as px

fig = px.scatter(website_sum[website_sum['L/C']=='C'], x = 'websites', y = 'avg_sentiment_score', 
                 size = 'quoteID_count', color = 'gender',size_max=50, 
                 color_discrete_map={"male": "#636EFA", "female": "#EF553B"},
                 category_orders={"gender":["male", "female"]})

fig.update_layout(title = "Sentiment scores in conservative news sources", title_x=0.5,
                  xaxis_title='News source websites', yaxis_title='Average sentiment score', legend_title_text='Gender')

fig.write_html("./plotly/websites_C.html")
fig.show()

We can see that among the selected conservative news sources National Review is the one that portrays quotes with the most similar sentiment scores from males and females, followed by The American Conservative, The American Spectator and The Blaze. Whereas Human Events and The Washington Times are the ones having the biggest difference in sentiment scores. The Washington Times also constitutes the majority of the quotes in the conservative group and therefore is an important factor in the trend towards more positive quotes from men in conservative news sources. By looking at the sizes of the data points in the graphs, in addition to the sentimental differences, we can see that conservative news sources make less use of quotations than liberal news sources.

# Text Complexity

To measure the text complexity, the process is very similar all the other parts. We'll start by analyzing the data and extracting the relevant information, in this case the complexity information, per gender, per date, per website.

These files are going to be saved in the `./data_processed` folder, under the names `complexity_<year>`. Furthermore, so we can do a more significant statistical analysis, we'll also save the standard deviation for all of these, in the files called `std_complexity_<year>`.

The internal organization of these files is as follows. They separate the data by website, gender and date. So in each file we have one dictionary where the keys are the websites. In each entry we have another dictionary, where the keys are the genders. Finally, for each entry we have a list with 12 elements, one for each month, where we'll save the text complexity / standard deviation for that month. In short, if `root` is the root dictionary extracted from the folder, then to go all the way into it we need `root[website][gender][month]`.

In [None]:
# calculate the means
from src.text_scores import *

libr=['cnn', 'huffingtonpost', 'huffpost', 'nytimes', 'politico', 'slate', 'abcnews', 'dailykos', 'washingtonpost', 'time', 'theatlantic']
cons=['nationalreview', 'spectator', 'theamericanconservative', 'washingtontimes', 'thenewamerican', 'freebeacon', 'frontpagemag', 'theblaze', 'humanevents', 'cnsnews']

websites = libr + cons
genders = ['male', 'female', 'transgender male', 'transgender female', 'non-binary', 'genderfluid']


dfs_quotes = []
for file in glob.glob(data_folder + data_file):
    dfs_quotes.append(pd.read_json(file, lines=True, chunksize=1e4))

start = timeit.default_timer()
for year, file in enumerate(dfs_quotes):
    complexity = {}
    i = 0
    for chunk in file:
        tokens = {}
        quotes = chunk['tokens'].tolist()
        date = chunk['date'].tolist()
        gender = chunk['gender'].tolist()
        sources = chunk['websites'].tolist()
        
        for index, quote in enumerate(quotes):
            if gender[index] not in genders:
                continue
            for source in sources[index]:
                if source not in websites:
                    continue
                if source not in tokens.keys():
                    tokens[source] = {}
                if gender[index] not in tokens[source].keys():
                    tokens[source][gender[index]] = [[] for _ in range(12)]
                tokens[source][gender[index]][int(str(date[index])[5:7]) - 1].append(quote)
        
        for website in tokens.keys():
            if website not in complexity.keys():
                complexity[website] = {}
            for gender in tokens[website].keys():
                if gender not in complexity[website].keys():
                    complexity[website][gender]= [[0,0] for _ in range(12)]
                for month, quotes in enumerate(tokens[website][gender]):
                    for quote in quotes:
                        quote_join = ' '.join(quote)
                        if len(quote_join) != 0:
                            complexity[website][gender][month][0] += dale_chall_score(quote_join)
                            complexity[website][gender][month][1] += 1

        i += 1
        if i % 10 == 0:
            with open(f'complexity_{year + 2015}.txt', 'w') as f:
                f.write(f'Chunks processed: {i}\n')
                f.write(json.dumps(complexity))
            print(i, end = ',')

            
    # now we decompose the means
    for website in complexity.keys():
        for gender in complexity[website].keys():
            for month, scores in enumerate(complexity[website][gender]):
                if scores[1] != 0:
                    complexity[website][gender][month] = scores[0] / scores[1]
                else:
                    complexity[website][gender][month] = 0
    with open(f'complexity_{year + 2015}.txt', 'w') as f:
        f.write(json.dumps(complexity))
    print()


print(f'Time to analyze all chunks {timeit.default_timer() - start}!!')        

In [None]:
# calculate the standard deviations
from src.text_scores import *

dfs_quotes = []
for file in glob.glob(data_folder + data_file):
    dfs_quotes.append(pd.read_json(file, lines=True, chunksize=1e4))

complexity_means = []
for file in glob.glob('./complexity*.txt'):
    with open(file, 'r') as f:
        complexity_means.append(eval(f.read()))

        
start = timeit.default_timer()
for year, file in enumerate(dfs_quotes):
    complexity_std = {}
    i = 0
    for chunk in file:
        tokens = {}
        quotes = chunk['tokens'].tolist()
        date = chunk['date'].tolist()
        gender = chunk['gender'].tolist()
        sources = chunk['websites'].tolist()
        
        for index, quote in enumerate(quotes):
            if gender[index] not in genders:
                continue
            for source in sources[index]:
                if source not in websites:
                    continue
                if source not in tokens.keys():
                    tokens[source] = {}
                if gender[index] not in tokens[source].keys():
                    tokens[source][gender[index]] = [[] for _ in range(12)]
                tokens[source][gender[index]][int(str(date[index])[5:7]) - 1].append(quote)
        
        for website in tokens.keys():
            if website not in complexity_std.keys():
                complexity_std[website] = {}
            for gender in tokens[website].keys():
                if gender not in complexity_std[website].keys():
                    complexity_std[website][gender]= [[0,0] for _ in range(12)]
                for month, quotes in enumerate(tokens[website][gender]):
                    # print('sgmi', website, gender, month, i)
                    for quote in quotes:
                        quote_join = ' '.join(quote)
                        if len(quote_join) != 0:
                            complexity_std[website][gender][month][0] += (dale_chall_score(quote_join) - complexity_means[year][website][gender][month])**2
                            complexity_std[website][gender][month][1] += 1

        i += 1
        if i % 10 == 0:
            with open(f'std_complexity_{year + 2015}.txt', 'w') as f:
                f.write(f'Chunks processed: {i}\n')
                f.write(json.dumps(complexity_std))
            print(i, end = ',')

    # now we decompose the means
    for website in complexity_std.keys():
        for gender in complexity_std[website].keys():
            for month, scores in enumerate(complexity_std[website][gender]):
                if scores[1] == 0 or scores[1] == 1:
                    complexity_std[website][gender][month] = 0
                else:
                    complexity_std[website][gender][month] = np.sqrt(scores[0] / (scores[1] * (scores[1] - 1)))
    with open(f'std_complexity_{year + 2015}.txt', 'w') as f:
        f.write(json.dumps(complexity_std))
    print()


print(f'Time to analyze all chunks {timeit.default_timer() - start}!!')        

After all the data has already been processed, we just need to access it in the save files, which is much faster.

In [None]:
complexity_data_files = glob.glob('./data_processed/complexity*.txt')
complexity_std_files  = glob.glob('./data_processed/std_*.txt')

complexity_data = []
complexity_std  = []

for file in complexity_data_files:
    with open(file, 'r') as f:
        txt = f.read()
        complexity_data.append(eval(txt))
        
for file in complexity_std_files:
    with open(file, 'r') as f:
        txt = f.read()
        complexity_std.append(eval(txt))

Next we save it into a DataFrame, because `plotly` handles information much more easily if it comes in a DataFrame. Besides, we're much more interested in the differences of complexity between men and women, so that's what we'll keep.

In [None]:
# Now let's sort all of this into a dataframe
clean_data = {'date': [],
              'gender': [],
              'website': [],
              'complexity': [],
              'std': []
             }

for year, data in enumerate(complexity_data):
    for website in data.keys():
        for gender in data[website].keys():
            for month, complexity in enumerate(data[website][gender]):
                if complexity > 1:
                    clean_data['date'].append(f"{month + 1:02}-{year + 2015}")
                    clean_data['gender'].append(gender)
                    clean_data['website'].append(website)
                    clean_data['complexity'].append(float(complexity))
                    clean_data['std'].append(complexity_std[year][website][gender][month])
                                
df = pd.DataFrame.from_dict(clean_data)
print(df)

df_diff = {'date': [],
           'website': [],
           'diff_comp': [],
           'diff_std': [],
           'male': [],
           'female': []
          }

for date in df['date'].unique():
    for website in df['website'].unique():
        try:
            male = df[(df['date'] == date) & (df['website'] == website) & (df['gender'] == 'male')]
            female = df[(df['date'] == date) & (df['website'] == website) & (df['gender'] == 'female')]
            diff_data = male['complexity'].values[0] - female['complexity'].values[0]
            diff_std  = np.sqrt(male['std'].values[0]**2 + female['std'].values[0]**2)
            
            df_diff['date'].append(date)
            df_diff['website'].append(website)
            df_diff['diff_comp'].append(diff_data)
            df_diff['diff_std'].append(diff_std)
            df_diff['male'].append(male['complexity'].values[0])
            df_diff['female'].append(female['complexity'].values[0])
        except:
            pass
df_diff = pd.DataFrame.from_dict(df_diff)
display(df_diff)

Since we're not particularly interested in a temporal analysis of the complexity, we'll cluster all the temporal data, keeping only the division into websites.

In [None]:
df_diff['count'] = df_diff['website']
df_grouped = df_diff.groupby(['website'], as_index=False).agg({
    'diff_comp' : np.mean,
    'diff_std': np.std,
    'count': np.size,
    'male': np.mean,
    'female': np.mean
})

df_grouped['diff_std'] /= np.sqrt(df_grouped['count'])
df_grouped = df_grouped.drop(['count'], axis=1)

display(df_grouped)

Finally, we divide into liberal and conservative newspapers and plot the two graphs.


In [None]:
libr=['cnn', 'huffingtonpost', 'huffpost', 'nytimes', 'politico', 'slate', 'abcnews', 'dailykos', 'washingtonpost', 'time', 'theatlantic']
cons=['nationalreview', 'spectator', 'theamericanconservative', 'washingtontimes', 'thenewamerican', 'freebeacon', 'frontpagemag', 'theblaze', 'humanevents', 'cnsnews']

df_libr = df_grouped[df_grouped['website'].isin(libr)]
df_cons = df_grouped[df_grouped['website'].isin(cons)]

In [None]:
# this cell is for showing the gender distribution for any topic
# this cell will be for the overall topic evolution
import plotly.express as px
import plotly.graph_objects as go

# we prepare our data for plotting
fig = px.bar(df_libr,
             x='website',
             y='diff_comp',
             title='Difference Of Text Complexity (Men - Women) on Liberal Websites',
             width=750,
             error_y = 'diff_std',
             hover_name = 'website',
             hover_data={'male':True, 'female': True, 'website':False,'diff_comp':True}
             )

fig.update_layout(
    xaxis_title="Website",
    yaxis_title="Complexity Men - Complexity Women",
    xaxis={'categoryorder':'total descending'}
)

fig.write_html("./plotly/text_complexity_libr.html")

fig = px.bar(df_cons,
             x='website',
             y='diff_comp',
             title='Difference Of Text Complexity (Men - Women) on Conservative Websites',
             width=750,
             error_y = 'diff_std',
             hover_name = 'website',
             hover_data={'male':True, 'female': True, 'website':False,'diff_comp':True}
             )

fig.update_layout(
    xaxis_title="Website",
    yaxis_title="Complexity Men - Complexity Women",
    xaxis={'categoryorder':'total descending'}
)

fig.write_html("./plotly/text_complexity_cons.html")