In [157]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Getting quotes related to climate change

In [2]:
# Alternation pattern: a quotation is kept if it contains any of these phrases.
reg_query = "|".join((
    "climate",
    "global warming",
    "greenhouse gas",
    "gas emissions",
    "greenhouse effect",
    "pesticide",
    "pollution",
    "carbon emissions",
))


In [4]:
# Show the assembled "|"-separated pattern.
reg_query

'climate|global warming|greenhouse gas|gas emissions|greenhouse effect|pesticide|pollution|carbon emissions'

In [7]:
# Open the bz2-compressed JSON-lines dump lazily: with `chunksize` set,
# read_json returns an iterator of DataFrames (100000 lines each) rather than
# one frame. The iterator is single-use; recreate it before iterating again.
df_reader = pd.read_json(
    'data/quotebank/quotes-2020.json.bz2',
    lines=True,
    compression='bz2',
    chunksize=100000,
)

In [6]:
%%time
# Keep, chunk by chunk, only the rows whose quotation matches the keyword
# pattern (case-insensitive; missing quotations count as non-matches).
# Iterating consumes `df_reader`.
climate_like_quotes = [
    chunk.loc[chunk['quotation'].str.contains(reg_query, case=False, na=False)]
    for chunk in df_reader
]

CPU times: user 5min 22s, sys: 6.49 s, total: 5min 28s
Wall time: 5min 28s


In [8]:
# Stack the per-chunk matches row-wise into a single frame; the original row
# labels are kept (no re-indexing).
climate_like_df = pd.concat(climate_like_quotes, axis=0, ignore_index=False)

In [9]:
# (rows, columns) of the combined frame of matching quotes.
climate_like_df.shape

(22690, 9)

In [100]:
# Spot-check a single matched quotation, selected by position (not by label).
climate_like_df.quotation.iloc[1401]

"We have international students who are worried about what's going on in Australia... I think some of the worry is the air pollution. People have seen the pictures of the red sky."

In [11]:
# Cache the filtered frame on disk so the multi-minute scan of the raw dump
# does not have to be repeated.
climate_like_df.to_pickle('data/climate_like_2020.pkl')

In [4]:
# Load a pickled 2020 frame from disk.
# NOTE(review): this reads 'climate_df_2020.pkl', not the
# 'climate_like_2020.pkl' written by the to_pickle cell, and `climate_df` is
# not referenced again in the cells visible here — confirm which file/name is
# intended. Only unpickle files you produced yourself.
climate_df = pd.read_pickle("data/climate_df_2020.pkl")

# Basic analysis

In [26]:
# One pickled frame per year, 2015 through 2020 inclusive, keyed by the year.
climate_dfs = {
    year: pd.read_pickle('data/climate_df_{}.pkl'.format(year))
    for year in range(2015, 2021)
}

In [179]:
# Stack the yearly frames (2015-2020, in chronological order) into one frame.
# A single pd.concat replaces the repeated DataFrame.append calls: `append`
# was deprecated in pandas 1.4 and removed in 2.0, and it re-copied the whole
# accumulated frame on every iteration. Concatenating the real frames directly
# also avoids seeding the result with an empty placeholder frame.
# Row labels from the yearly frames are preserved (no re-indexing).
full_climate_df = pd.concat([climate_dfs[y] for y in range(2015, 2021)])

## Cleaning dataset

In [3]:
%%time
# Load the speaker attributes table (the "wiki data") from parquet.
# NOTE(review): presumably one row per Wikidata QID — confirm against the file.
# `wiki_data` is not used by any cell visible in this part of the notebook.
wiki_data = pd.read_parquet('parquet-data/speaker_attributes.parquet')

CPU times: user 12.2 s, sys: 2.79 s, total: 15 s
Wall time: 11.9 s


In [155]:
# Inspect the entry labelled 1266 to see which fields a quote record carries.
full_climate_df.loc[1266]

quoteID                                           2015-09-09-008765
quotation         As siltation is a natural process in the Meghn...
speaker                                         Anisul Islam Mahmud
qids                                                    [Q23762578]
date                                            2015-09-09 07:40:37
numOccurrences                                                   11
probas                [[Anisul Islam Mahmud, 0.942], [None, 0.058]]
urls              [http://trust.org/item/20150909073825-39czr, h...
phase                                                             E
Name: 1266, dtype: object

In [180]:
def replace_name(q, qids_to_name={}):
    """Rewrite a quote's speaker to the first name seen for its primary QID.

    The first time a given ``q.qids[0]`` is encountered, the row's current
    ``speaker`` is recorded as the canonical name for that QID. Every later
    row with the same first QID has its ``speaker`` overwritten with that
    recorded name. The result therefore depends on the order in which rows
    are processed.

    Parameters
    ----------
    q : row-like object exposing ``qids`` and ``speaker`` attributes
        (e.g. a row Series handed over by ``DataFrame.apply(..., axis=1)``).
        It is modified in place when a canonical name already exists.
    qids_to_name : dict, optional
        Mapping from QID to canonical speaker name, updated in place.
        The mutable default is deliberate: it is the cache shared by all
        calls that do not pass their own dict, and it persists until this
        function is redefined. Pass an explicit dict to start from a clean
        mapping.

    Returns
    -------
    The same ``q`` object, possibly with ``speaker`` replaced.
    """
    qids = q.qids
    # Rows with no usable identifier (missing value, scalar NaN, or an empty
    # list) cannot be matched to anyone, so they are returned untouched
    # instead of raising on len().
    if qids is None or not hasattr(qids, "__len__") or len(qids) == 0:
        return q

    primary_qid = qids[0]
    if primary_qid in qids_to_name:
        q.speaker = qids_to_name[primary_qid]
    else:
        qids_to_name[primary_qid] = q.speaker
    return q
    

In [181]:
%%time
# Normalise speaker names row by row. A fresh dict is passed explicitly as the
# QID -> name cache (DataFrame.apply forwards extra keyword arguments to the
# function, and the same dict object is reused for every row), so the result
# does not depend on mappings left over in replace_name's default argument
# from an earlier run of this cell.
full_climate_df = full_climate_df.apply(replace_name, axis=1, qids_to_name={})

CPU times: user 22.2 s, sys: 234 ms, total: 22.4 s
Wall time: 22.4 s


## Getting top speakers

In [119]:
# Per-year ranking of speakers by number of climate quotes.
# The yearly tables are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration.
yearly_counts = []
for y in range(2015, 2021):
    count = (
        climate_dfs[y]
        .query('speaker != "None"')  # drop quotes attributed to the "None" placeholder
        .groupby('speaker')['quoteID']
        .count()
        .sort_values(ascending=False)
        .to_frame()
        .rename(columns={"quoteID": "count"})
        .reset_index()
    )
    count.insert(0, 'year', y)
    # Rank is the 1-based position after sorting, so tied speakers still get
    # distinct consecutive ranks.
    count.insert(3, 'rank', count.index + 1)
    yearly_counts.append(count)

# Each yearly table keeps its own 0..n-1 row labels, as before.
top_speakers = pd.concat(yearly_counts)

In [125]:
# Replace the repeating per-year row labels with one running index; the old
# labels are kept as an 'index' column. Not idempotent: running this cell
# again adds yet another index column.
top_speakers = top_speakers.reset_index(drop=False)

In [126]:
# Display the per-year speaker ranking table.
top_speakers

Unnamed: 0,index,year,speaker,count,rank
0,0,2015,President Barack Obama,731,1
1,1,2015,President Obama,567,2
2,2,2015,Narendra Modi,362,3
3,3,2015,Bernie Sanders,345,4
4,4,2015,Ban Ki-moon,331,5
...,...,...,...,...,...
72171,5809,2020,Kevin Sheekey,1,5810
72172,5810,2020,Kevin Stiroh,1,5811
72173,5811,2020,Kevin Vann,1,5812
72174,5812,2020,Kevin Walters,1,5813


In [183]:
# Quote count per speaker over all years, most quoted first, excluding quotes
# attributed to the "None" placeholder. The count column keeps the name
# 'quoteID'.
selected_speakers = (
    full_climate_df
    .query('speaker != "None"')
    .groupby('speaker')['quoteID']
    .count()
    .sort_values(ascending=False)
    .to_frame()
)

In [184]:
# Ten most quoted speakers.
selected_speakers.head(10)

Unnamed: 0_level_0,quoteID
speaker,Unnamed: 1_level_1
President Barack Obama,2304
Donald Trump,1714
Bernie Sanders,1413
António Guterres,1147
Jerry Brown,894
Jay Inslee,880
Catherine Mckenna,856
Greta Thunberg,855
Narendra Modi,835
Al Gore,791


In [185]:
# Look up the total for one exact speaker label.
selected_speakers.query('speaker == "President Barack Obama"')

Unnamed: 0_level_0,quoteID
speaker,Unnamed: 1_level_1
President Barack Obama,2304


In [5]:
# Quote count per speaker in the keyword-matched frame, most quoted first.
# The "None" placeholder speaker is not filtered out here, so it is counted
# like any other label.
speakers = (
    climate_like_df
    .groupby('speaker')['quoteID']
    .count()
    .sort_values(ascending=False)
)

In [6]:
# First 15 entries of the per-speaker counts.
speakers.head(15)

speaker
None                7227
Bernie Sanders       185
Greta Thunberg       146
Scott Morrison       123
Larry Fink            95
Jane Fonda            68
Tom Steyer            68
Joe Biden             66
Joaquin Phoenix       65
Antonio Guterres      65
Pete Buttigieg        64
Malcolm Turnbull      59
Michael Mann          58
Elizabeth Warren      56
Boris Johnson         56
Name: quoteID, dtype: int64