In [None]:
import numpy as np
import pandas as pd
from os import path
from PIL import Image
#from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
%matplotlib inline

Load all the datasets and extract the climate quotes for each year into a separate pickle file

In [None]:
# RUN ONLY ONCE
# iteratively extract climate quotation datasets for each year

for year in range(2015, 2021):
    print(year)
    df_reader = pd.read_json('data/quotebank/quotes-{}.json.bz2'.format(year), lines=True, compression='bz2', chunksize=10000)

    climate_quotes = []
    for chunk in df_reader:
        df = chunk[chunk.quotation.str.contains('climate', case=False, na=False)]
        climate_quotes.append(df)
    print('Chunks for year {} done'.format(year))
    climate_df = pd.concat(climate_quotes)
    # pickle pickle pickle
    climate_df.to_pickle('data/climate_df_{}.pkl'.format(year))
    print('Pickle done')

2015
Chunks for year 2015 done
Pickle done
2016
Chunks for year 2016 done
Pickle done
2017


Read the climate dataframes pickles for each year into a dictionary

In [None]:
climate_dfs = {}
for i in range(2015, 2021):
    climate_dfs['{}'.format(i)] = pd.read_pickle('data/climate_df_{}.pkl'.format(i))

**Cleaning dataset**

In [108]:
# set a unique label_name for speakers with several versions of name
# RESOLVES THE TRUMP PROBLEM
def clean_speaker(df, name):
    
    df_copy = df.copy()
    
    df_name = df[df['speaker'].apply(lambda x: name in x)]
    # get the qid of the speaker in wiki_data
    wiki_qid = df_name['qids'].iloc[0][0]
    # get the label of speaker
    label_name = wiki_data[wiki_data['id'] == wiki_qid]['label'].iloc[0]
    #replace speaker with his label
    df_copy = df.replace(df_name['speaker'].iloc[0], label_name)
    
    return df_copy

In [109]:
def clean_df(df):
    """Cleans the dataset for speakers only for now"""
    
    # remove all the 'None' speakers
    df = df[df['speaker'] != 'None']
    
    #clean different Trump names
    trump_names = df[df['speaker'].apply(lambda x: ('trump' in x) and ('president' in x) or x=='donald trump' or x=='donald j. trump')]['speaker'].unique()
    for t in trump_names:
        climate_dfs['2018'] = clean_speaker(climate_dfs['2018'], t)
        
    # clean different Obama names
    obama_names = df[df['speaker'].apply(lambda x: ('barack' in x) or ('president' in x and 'obama' in x))]['speaker'].unique()
    for i in obama_names:
        climate_dfs['2018'] = clean_speaker(climate_dfs['2018'], i)
    
    # all names to lower case
    df['speaker'] = df['speaker'].str.lower()
    
    return df

We clean climate dataset 2018

In [110]:
climate_dfs['2018'] = clean_df(climate_dfs['2018'])

**"Top 10" functions**

In [None]:
#PROBLEM: presidents appear in several forms within the speaker column (full name, president+full name, etc.)
def top_speakers(df):
    """function to extract the top 10 speakers"""
    df = clean_speakers(df)
    
    speakers = df.groupby('speaker')['speaker'].count().sort_values(ascending=False).to_frame().add_suffix('_count')
    # remove unknown speakers 
    top_speakers = speakers[speakers.index != 'none'].reset_index(drop=False)
    
    return top_speakers.head(10)


def top_quotations(df):
    """function to extract the top 10 msot repeated quotations"""
    df = clean_speakers(df)
    
    quotations = df.sort_values('numOccurrences', ascending=False)[['speaker', 'quotation', 'numOccurrences']]
    # remove quotations of unidentified speakers
    top_quotations = quotations[quotations['speaker'] != 'none'].reset_index(drop=True)
    
    return top_quotations.head(10)

In [None]:
# quick visualization
def visualize_top(x, y):
    plt.figure(figsize=(18, 8))
    plt.bar(x, y, color=['orange', 'blue', 'green' , 'red', 'brown', 'yellow', 'pink', 'gray', 'lime', 'darkblue'])


In [None]:
# removing doubled speakers
#idea: check the names contained in other names and merge the rows count (e.g. 'Donald trump' is in 'president donal trump')

def clean_speakers(df):
    # all names to lower case
    df['speaker'] = df['speaker'].str.lower()

    return df

# check for almost same quotations
def clean_quotations(df):
    for i in range(df.shape[0]):
        print(sum([df.iloc[i].quotation in x.quotation for idx, x in df.iterrows()]))
            

In [None]:
# show top speakers from 2017
top_speakers(climate_dfs['2018'])
#visualize_top(top_speakers(climate_dfs['2017']).speaker, top_speakers(climate_dfs['2017']).quotation_count)

Unnamed: 0,speaker,speaker_count
0,catherine mckenna,273
1,jerry brown,228
2,antonio guterres,194
3,michael mann,193
4,narendra modi,186
5,president donald trump,159
6,al gore,146
7,justin trudeau,140
8,patricia espinosa,133
9,emmanuel macron,132


In [None]:
# show top quotations from 2015
top_quotations(climate_dfs['2018'])

Unnamed: 0,speaker,quotation,numOccurrences
0,valeri liukin,"the present climate causes me, and more import...",578
1,greta thunberg,School strike for the Climate.,425
2,greta thunberg,"School Strike For Climate,",302
3,antonio guterres,Even as we witness devastating climate impacts...,296
4,jim steele,What's happened with the more warming climate ...,288
5,president donald trump,"There is a cooling, and there's a heating. I m...",287
6,president donald trump,"I'm not denying climate change,",285
7,kim jong un,"warm climate of reconciliation and dialogue,",283
8,dave robertson,I would say that the obvious inability in the ...,278
9,katharine hayhoe,"As a climate scientist, it is almost surreal.",256


In [None]:
from tld import get_tld

def get_domain(url):
    res = get_tld(url, as_object=True)
    return res.domain

# function extracting the top 10 newspapers mentioning climate 
def top_newspapers(df):
    top_newspapers = pd.Series([x for _list in df['urls'] for x in _list]).apply(lambda u: get_domain(u)).value_counts()
    top_newspapers = top_newspapers.to_frame().reset_index(drop=False)
    top_newspapers.columns = ['newspaper', 'count']
    
    return top_newspapers.head(10)


In [None]:
top_newspapers(climate_dfs['2019'])

Unnamed: 0,newspaper,count
0,msn,4664
1,commondreams,2560
2,breitbart,2501
3,einnews,2191
4,businessinsider,1741
5,phys,1638
6,smh,1622
7,brisbanetimes,1398
8,miragenews,1363
9,washingtontimes,1355


## Working with wikidata

In [None]:
# load wiki data
wiki_data = pd.read_parquet('parquet-data/speaker_attributes.parquet')

# load the labels for wiki data
wiki_labels = pd.read_csv('data/wikidata_labels_descriptions_quotebank.csv.bz2', compression='bz2', index_col='QID')


In [None]:
wiki_data

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
0,"[Washington, President Washington, G. Washingt...",[+1732-02-22T00:00:00Z],"[Q161885, Q30]",[Q6581097],1395141751,,W000178,"[Q82955, Q189290, Q131512, Q1734662, Q294126, ...",[Q327591],,Q23,George Washington,"[Q698073, Q697949]",item,[Q682443]
1,"[Douglas Noel Adams, Douglas Noël Adams, Dougl...",[+1952-03-11T00:00:00Z],[Q145],[Q6581097],1395737157,[Q7994501],,"[Q214917, Q28389, Q6625963, Q4853732, Q1884422...",,,Q42,Douglas Adams,,item,
2,"[Paul Marie Ghislain Otlet, Paul Marie Otlet]",[+1868-08-23T00:00:00Z],[Q31],[Q6581097],1380367296,,,"[Q36180, Q40348, Q182436, Q1265807, Q205375, Q...",,,Q1868,Paul Otlet,,item,
3,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",[+1946-07-06T00:00:00Z],[Q30],[Q6581097],1395142029,,,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,Q207,George W. Bush,"[Q327959, Q464075, Q3586276, Q4450587]",item,"[Q329646, Q682443, Q33203]"
4,"[Velázquez, Diego Rodríguez de Silva y Velázqu...",[+1599-06-06T00:00:00Z],[Q29],[Q6581097],1391704596,,,[Q1028181],,,Q297,Diego Velázquez,,item,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9055976,[Barker Howard],,[Q30],[Q6581097],1397399351,,,[Q82955],,,Q106406560,Barker B. Howard,,item,
9055977,[Charles Macomber],,[Q30],[Q6581097],1397399471,,,[Q82955],,,Q106406571,Charles H. Macomber,,item,
9055978,,[+1848-04-01T00:00:00Z],,[Q6581072],1397399751,,,,,,Q106406588,Dina David,,item,
9055979,,[+1899-03-18T00:00:00Z],,[Q6581072],1397399799,,,,,,Q106406593,Irma Dexinger,,item,


In [None]:
wiki_labels.head(5)

Unnamed: 0_level_0,Label,Description
QID,Unnamed: 1_level_1,Unnamed: 2_level_1
Q31,Belgium,country in western Europe
Q45,Portugal,country in southwestern Europe
Q75,Internet,global system of connected computer networks
Q148,People's Republic of China,sovereign state in East Asia
Q155,Brazil,country in South America


**Extracting gender**

In [None]:
# get the gender of the speaker
# id: the qid of the speaker corresponding to id in wiki_data
def get_gender(id):
    
    qid = wiki_data[wiki_data['id'] == id]['gender'].iloc[0][0]
    
    gender = wiki_labels.loc[qid]['Label']
    
    return gender

In [None]:
climate_dfs['2018']['gender'] = climate_dfs['2018']['qids'].apply(lambda q: get_gender(q[0]))

IndexError: list index out of range