In [None]:
import numpy as np
import pandas as pd
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
%matplotlib inline

Load all the datasets and extract the climate quotes for each year into a separate pickle file

In [None]:
# RUN ONLY ONCE: every run re-reads each yearly Quotebank dump and rewrites the per-year pickles.
# Iteratively extract the climate quotations for each year from 2015 to 2020.

for year in range(2015, 2021):
    print(year)
    # Stream the bz2-compressed JSON-lines dump in chunks of 10000 rows rather than loading it whole.
    df_reader = pd.read_json('data/quotebank/quotes-{}.json.bz2'.format(year), lines=True, compression='bz2', chunksize=10000)

    climate_quotes = []
    for chunk in df_reader:
        # Keep rows whose quotation contains 'climate' (case-insensitive); missing quotations count as no match.
        # NOTE(review): `df` is a notebook-level name, so the last filtered chunk stays bound to it after this cell.
        df = chunk[chunk.quotation.str.contains('climate', case=False, na=False)]
        climate_quotes.append(df)
    print('Chunks for year {} done'.format(year))
    # Combine the filtered chunks into a single frame for the year.
    climate_df = pd.concat(climate_quotes)
    # Persist the yearly frame as a pickle file under data/.
    climate_df.to_pickle('data/climate_df_{}.pkl'.format(year))
    print('Pickle done')

2015
Chunks for year 2015 done
Pickle done
2016
Chunks for year 2016 done
Pickle done
2017


In [None]:
# load the per-year climate dataframe pickles into a dictionary keyed by the year as a string ('2015' ... '2020')
climate_dfs = {
    str(year_number): pd.read_pickle('data/climate_df_{}.pkl'.format(year_number))
    for year_number in range(2015, 2021)
}

"Top 10" functions

In [None]:
#PROBLEM: presidents appear in several forms within the speaker column (full name, president+full name, etc.)
def top_speakers(df):
    """Return the ten speakers with the most rows in ``df``.

    Speaker names are first normalised with ``clean_speakers``. Rows are then
    counted per speaker, ordered from most to fewest, and the speaker labelled
    'none' (unknown speaker) is excluded.

    Returns a DataFrame with the columns ``speaker`` and ``speaker_count``,
    holding at most ten rows.
    """
    cleaned = clean_speakers(df)

    per_speaker = cleaned.groupby('speaker')['speaker'].count()
    ranked = per_speaker.sort_values(ascending=False).to_frame().add_suffix('_count')
    # drop the entry that stands for unknown speakers
    known_mask = ranked.index != 'none'
    known = ranked[known_mask].reset_index(drop=False)

    return known.head(10)


def top_quotations(df):
    """Return the ten most repeated quotations in ``df``.

    Speaker names are first normalised with ``clean_speakers``. Rows are
    ordered by ``numOccurrences`` from highest to lowest and quotations whose
    speaker is 'none' (unidentified) are excluded.

    Returns a DataFrame with the columns ``speaker``, ``quotation`` and
    ``numOccurrences``, re-indexed from 0 and holding at most ten rows.
    """
    cleaned = clean_speakers(df)

    kept_columns = ['speaker', 'quotation', 'numOccurrences']
    ranked = cleaned.sort_values('numOccurrences', ascending=False)[kept_columns]
    # discard quotations of unidentified speakers
    attributed = ranked[ranked['speaker'] != 'none']

    return attributed.reset_index(drop=True).head(10)

In [None]:
# quick visualization
def visualize_top(x, y):
    """Draw a bar chart of ``y`` against ``x`` on a new 18x8 figure.

    The bars are coloured from a fixed list of ten colours. Nothing is returned.
    """
    bar_colors = [
        'orange', 'blue', 'green', 'red', 'brown',
        'yellow', 'pink', 'gray', 'lime', 'darkblue',
    ]
    plt.figure(figsize=(18, 8))
    plt.bar(x, y, color=bar_colors)


In [None]:
# removing doubled speakers
#idea: check the names contained in other names and merge the rows count (e.g. 'donald trump' is in 'president donald trump')

def clean_speakers(df):
    """Return a copy of ``df`` whose ``speaker`` column is lower-cased.

    The input frame is left untouched, so calling this on a frame that is
    kept around elsewhere does not silently rewrite its speaker names and
    does not raise SettingWithCopyWarning when ``df`` is a slice.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``speaker``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index as ``df``, with every
        speaker name converted to lower case.
    """
    # assign() builds a new frame instead of writing into the caller's object
    return df.assign(speaker=df['speaker'].str.lower())

# check for almost same quotations
def clean_quotations(df):
    """Print, for every row, how many quotations in ``df`` contain that row's quotation.

    Containment is a plain substring test, and every quotation is also
    compared with itself. One count is printed per row, in row order.
    Nothing is returned and ``df`` is not modified.
    """
    all_quotes = list(df['quotation'])
    for candidate in all_quotes:
        matches = 0
        for other in all_quotes:
            if candidate in other:
                matches += 1
        print(matches)
            

In [None]:
# Count rows per speaker, restricted to speakers whose name contains 'president'.
# NOTE(review): `df` is not assigned in this cell; it presumably comes from an earlier cell or leftover kernel state — confirm which frame is intended.
df[df.speaker.apply(lambda s: 'president' in s)].groupby('speaker').speaker.count()

speaker
president barack obama         67
president bill clinton          7
president carter                2
president donald j. trump       1
president donald trump        167
president george h.w. bush      1
president george w. bush        1
president moon                  1
president obama                18
president trump                94
Name: speaker, dtype: int64

In [None]:
# show top speakers from 2017
top_speakers(climate_dfs['2017'])
#visualize_top(top_speakers(climate_dfs['2017']).speaker, top_speakers(climate_dfs['2017']).speaker_count)

Unnamed: 0,speaker,speaker_count
0,president donald trump,354
1,al gore,323
2,jerry brown,292
3,scott pruitt,265
4,catherine mckenna,215
5,president trump,199
6,emmanuel macron,198
7,michael mann,190
8,donald trump,172
9,angela merkel,156


In [None]:
# show top quotations from 2020
top_quotations(climate_dfs['2020'])

Unnamed: 0,speaker,quotation,numOccurrences
0,jeff bezos,I want to work alongside others both to amplif...,333
1,russell crowe,Make no mistake. The tragedy unfolding in Aust...,217
2,george eustice,We'll never be able to protect every single ho...,210
3,jens holm,has worked hard to make politicians open their...,204
4,richard boyd barrett,It wouldn't be sustainable for very long but I...,179
5,joe solmonese,"In our current climate of uncertainty, we beli...",179
6,rebecca long-bailey,offshore bank account and places it on the bal...,176
7,jeff bezos,I want to work alongside others both to amplif...,159
8,scott morrison,This is a longer-term risk framework model whi...,144
9,scott morrison,There is no dispute in this country about the ...,142


In [None]:
from tld import get_tld

def get_domain(url):
    """Return the ``domain`` attribute of the object ``get_tld`` produces for ``url``."""
    return get_tld(url, as_object=True).domain

# function extracting the top 10 newspapers mentioning climate 
def top_newspapers(df):
    """Return the ten most frequent source domains among the URLs in ``df``.

    Every entry of the ``urls`` column is iterated as a collection of URLs.
    Each URL is reduced to its domain with ``get_domain`` and the domains are
    counted, most frequent first.

    Returns a DataFrame with the columns ``newspaper`` and ``count``, holding
    at most ten rows.
    """
    # flatten the per-row URL collections into a single list
    all_urls = []
    for url_list in df['urls']:
        all_urls.extend(url_list)

    domain_counts = pd.Series(all_urls).apply(get_domain).value_counts()
    ranking = domain_counts.rename_axis('newspaper').reset_index(name='count')

    return ranking.head(10)


In [None]:
# show top newspapers from 2018
top_newspapers(climate_dfs['2018'])

Unnamed: 0,newspaper,count
0,msn,2093
1,phys,1469
2,breitbart,1340
3,yahoo,1243
4,eurekalert,1120
5,commondreams,1047
6,sfgate,978
7,carbonbrief,977
8,washingtontimes,881
9,businessinsider,828


In [None]:
import numpy as np
import pandas as pd
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
%matplotlib inline

Load all the datasets and extract the climate quotes for each year into a separate pickle file

In [None]:
# RUN ONLY ONCE: every run re-reads each yearly Quotebank dump and rewrites the per-year pickles.
# Iteratively extract the climate quotations for each year from 2015 to 2020.

for year in range(2015, 2021):
    print(year)
    # Stream the bz2-compressed JSON-lines dump in chunks of 10000 rows rather than loading it whole.
    df_reader = pd.read_json('data/quotebank/quotes-{}.json.bz2'.format(year), lines=True, compression='bz2', chunksize=10000)

    climate_quotes = []
    for chunk in df_reader:
        # Keep rows whose quotation contains 'climate' (case-insensitive); missing quotations count as no match.
        # NOTE(review): `df` is a notebook-level name, so the last filtered chunk stays bound to it after this cell.
        df = chunk[chunk.quotation.str.contains('climate', case=False, na=False)]
        climate_quotes.append(df)
    print('Chunks for year {} done'.format(year))
    # Combine the filtered chunks into a single frame for the year.
    climate_df = pd.concat(climate_quotes)
    # Persist the yearly frame as a pickle file under data/.
    climate_df.to_pickle('data/climate_df_{}.pkl'.format(year))
    print('Pickle done')

2015
Chunks for year 2015 done
Pickle done
2016
Chunks for year 2016 done
Pickle done
2017


In [None]:
# load the per-year climate dataframe pickles into a dictionary keyed by the year as a string ('2015' ... '2020')
climate_dfs = {
    str(year_number): pd.read_pickle('data/climate_df_{}.pkl'.format(year_number))
    for year_number in range(2015, 2021)
}

"Top 10" functions

In [None]:
#PROBLEM: presidents appear in several forms within the speaker column (full name, president+full name, etc.)
def top_speakers(df):
    """Return the ten speakers with the most rows in ``df``.

    Speaker names are first normalised with ``clean_speakers``. Rows are then
    counted per speaker, ordered from most to fewest, and the speaker labelled
    'none' (unknown speaker) is excluded.

    Returns a DataFrame with the columns ``speaker`` and ``speaker_count``,
    holding at most ten rows.
    """
    cleaned = clean_speakers(df)

    per_speaker = cleaned.groupby('speaker')['speaker'].count()
    ranked = per_speaker.sort_values(ascending=False).to_frame().add_suffix('_count')
    # drop the entry that stands for unknown speakers
    known_mask = ranked.index != 'none'
    known = ranked[known_mask].reset_index(drop=False)

    return known.head(10)


def top_quotations(df):
    """Return the ten most repeated quotations in ``df``.

    Speaker names are first normalised with ``clean_speakers``. Rows are
    ordered by ``numOccurrences`` from highest to lowest and quotations whose
    speaker is 'none' (unidentified) are excluded.

    Returns a DataFrame with the columns ``speaker``, ``quotation`` and
    ``numOccurrences``, re-indexed from 0 and holding at most ten rows.
    """
    cleaned = clean_speakers(df)

    kept_columns = ['speaker', 'quotation', 'numOccurrences']
    ranked = cleaned.sort_values('numOccurrences', ascending=False)[kept_columns]
    # discard quotations of unidentified speakers
    attributed = ranked[ranked['speaker'] != 'none']

    return attributed.reset_index(drop=True).head(10)

In [None]:
# quick visualization
def visualize_top(x, y):
    """Draw a bar chart of ``y`` against ``x`` on a new 18x8 figure.

    The bars are coloured from a fixed list of ten colours. Nothing is returned.
    """
    bar_colors = [
        'orange', 'blue', 'green', 'red', 'brown',
        'yellow', 'pink', 'gray', 'lime', 'darkblue',
    ]
    plt.figure(figsize=(18, 8))
    plt.bar(x, y, color=bar_colors)


In [None]:
# removing doubled speakers
#idea: check the names contained in other names and merge the rows count (e.g. 'donald trump' is in 'president donald trump')

def clean_speakers(df):
    """Return a copy of ``df`` whose ``speaker`` column is lower-cased.

    The input frame is left untouched, so calling this on a frame that is
    kept around elsewhere does not silently rewrite its speaker names and
    does not raise SettingWithCopyWarning when ``df`` is a slice.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``speaker``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index as ``df``, with every
        speaker name converted to lower case.
    """
    # assign() builds a new frame instead of writing into the caller's object
    return df.assign(speaker=df['speaker'].str.lower())

# check for almost same quotations
def clean_quotations(df):
    """Print, for every row, how many quotations in ``df`` contain that row's quotation.

    Containment is a plain substring test, and every quotation is also
    compared with itself. One count is printed per row, in row order.
    Nothing is returned and ``df`` is not modified.
    """
    all_quotes = list(df['quotation'])
    for candidate in all_quotes:
        matches = 0
        for other in all_quotes:
            if candidate in other:
                matches += 1
        print(matches)
            

In [None]:
# Count rows per speaker, restricted to speakers whose name contains 'president'.
# NOTE(review): `df` is not assigned in this cell; it presumably comes from an earlier cell or leftover kernel state — confirm which frame is intended.
df[df.speaker.apply(lambda s: 'president' in s)].groupby('speaker').speaker.count()

speaker
president barack obama         67
president bill clinton          7
president carter                2
president donald j. trump       1
president donald trump        167
president george h.w. bush      1
president george w. bush        1
president moon                  1
president obama                18
president trump                94
Name: speaker, dtype: int64

In [None]:
# show top speakers from 2017
top_speakers(climate_dfs['2017'])
#visualize_top(top_speakers(climate_dfs['2017']).speaker, top_speakers(climate_dfs['2017']).speaker_count)

Unnamed: 0,speaker,speaker_count
0,president donald trump,354
1,al gore,323
2,jerry brown,292
3,scott pruitt,265
4,catherine mckenna,215
5,president trump,199
6,emmanuel macron,198
7,michael mann,190
8,donald trump,172
9,angela merkel,156


In [None]:
# show top quotations from 2020
top_quotations(climate_dfs['2020'])

Unnamed: 0,speaker,quotation,numOccurrences
0,jeff bezos,I want to work alongside others both to amplif...,333
1,russell crowe,Make no mistake. The tragedy unfolding in Aust...,217
2,george eustice,We'll never be able to protect every single ho...,210
3,jens holm,has worked hard to make politicians open their...,204
4,richard boyd barrett,It wouldn't be sustainable for very long but I...,179
5,joe solmonese,"In our current climate of uncertainty, we beli...",179
6,rebecca long-bailey,offshore bank account and places it on the bal...,176
7,jeff bezos,I want to work alongside others both to amplif...,159
8,scott morrison,This is a longer-term risk framework model whi...,144
9,scott morrison,There is no dispute in this country about the ...,142


In [None]:
from tld import get_tld

def get_domain(url):
    """Return the ``domain`` attribute of the object ``get_tld`` produces for ``url``."""
    return get_tld(url, as_object=True).domain

# function extracting the top 10 newspapers mentioning climate 
def top_newspapers(df):
    """Return the ten most frequent source domains among the URLs in ``df``.

    Every entry of the ``urls`` column is iterated as a collection of URLs.
    Each URL is reduced to its domain with ``get_domain`` and the domains are
    counted, most frequent first.

    Returns a DataFrame with the columns ``newspaper`` and ``count``, holding
    at most ten rows.
    """
    # flatten the per-row URL collections into a single list
    all_urls = []
    for url_list in df['urls']:
        all_urls.extend(url_list)

    domain_counts = pd.Series(all_urls).apply(get_domain).value_counts()
    ranking = domain_counts.rename_axis('newspaper').reset_index(name='count')

    return ranking.head(10)


In [None]:
# show top newspapers from 2018
top_newspapers(climate_dfs['2018'])

Unnamed: 0,newspaper,count
0,msn,2093
1,phys,1469
2,breitbart,1340
3,yahoo,1243
4,eurekalert,1120
5,commondreams,1047
6,sfgate,978
7,carbonbrief,977
8,washingtontimes,881
9,businessinsider,828
