In [1]:
# Root folders: raw inputs live under PATH_ROOT, generated artefacts under PATH_OUTPUT.
PATH_ROOT = 'data'
PATH_OUTPUT = 'output'

# Quotebank dump location. PATH_TO_QUOTES keeps a `{year}` placeholder that is
# filled in later with str.format(year=...).
PATH_QUOTEBANK = ''.join([PATH_ROOT, '/Quotebank'])
PATH_TO_QUOTES = ''.join([PATH_QUOTEBANK, '/quotes-{year}.json.bz2'])

In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import seaborn as sns

#NLP libraries
import spacy, nltk, gensim, sklearn
import pyLDAvis.gensim_models

import matplotlib.pyplot as plt
%matplotlib inline  

import pickle



# 1. Loading Data

### Retrieve the labeled speakers

In [3]:
# First retrieve the dtypes of the corresponding columns.
# The path is built from PATH_OUTPUT so it stays in sync with the CSV read below.
# NOTE(review): pickle.load can execute arbitrary code - only load pickles
# produced by this project, never files from an untrusted source.
with open(PATH_OUTPUT + '/speakers_labeled_dtypes.pickle', 'rb') as handle:
    speakers_labeled_dtypes = pickle.load(handle)

In [4]:
# Load the labeled speakers table, applying the column dtypes loaded from the pickle.
retrieved_labeled_speakers = pd.read_csv(
    filepath_or_buffer=PATH_OUTPUT + '/speakers_labeled.csv',
    dtype=speakers_labeled_dtypes,
)
retrieved_labeled_speakers.head()

Unnamed: 0,qid,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,label,candidacy,type,religion,age,age_group
0,Q23,['Washington' 'President Washington' 'G. Washi...,+1732-02-22T00:00:00Z,Great Britain,male,1395141751,,W000178,politician,independent politician,,George Washington,1792 United States presidential election,item,Episcopal Church,289.0,>100
1,Q23,['Washington' 'President Washington' 'G. Washi...,+1732-02-22T00:00:00Z,Great Britain,male,1395141751,,W000178,politician,independent politician,,George Washington,1788–89 United States presidential election,item,Episcopal Church,289.0,>100
2,Q23,['Washington' 'President Washington' 'G. Washi...,+1732-02-22T00:00:00Z,Great Britain,male,1395141751,,W000178,military officer,independent politician,,George Washington,1792 United States presidential election,item,Episcopal Church,289.0,>100
3,Q23,['Washington' 'President Washington' 'G. Washi...,+1732-02-22T00:00:00Z,Great Britain,male,1395141751,,W000178,military officer,independent politician,,George Washington,1788–89 United States presidential election,item,Episcopal Church,289.0,>100
4,Q23,['Washington' 'President Washington' 'G. Washi...,+1732-02-22T00:00:00Z,Great Britain,male,1395141751,,W000178,farmer,independent politician,,George Washington,1792 United States presidential election,item,Episcopal Church,289.0,>100


We can, for example, keep only the speakers whose occupation is politician.

In [5]:
# Keep only the rows whose occupation is exactly 'politician'.
politicians_df = retrieved_labeled_speakers.loc[
    retrieved_labeled_speakers['occupation'].eq('politician')
]
politicians_df.head()

Unnamed: 0,qid,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,label,candidacy,type,religion,age,age_group
0,Q23,['Washington' 'President Washington' 'G. Washi...,+1732-02-22T00:00:00Z,Great Britain,male,1395141751,,W000178,politician,independent politician,,George Washington,1792 United States presidential election,item,Episcopal Church,289.0,>100
1,Q23,['Washington' 'President Washington' 'G. Washi...,+1732-02-22T00:00:00Z,Great Britain,male,1395141751,,W000178,politician,independent politician,,George Washington,1788–89 United States presidential election,item,Episcopal Church,289.0,>100
16,Q23,['Washington' 'President Washington' 'G. Washi...,+1732-02-22T00:00:00Z,United States of America,male,1395141751,,W000178,politician,independent politician,,George Washington,1792 United States presidential election,item,Episcopal Church,289.0,>100
17,Q23,['Washington' 'President Washington' 'G. Washi...,+1732-02-22T00:00:00Z,United States of America,male,1395141751,,W000178,politician,independent politician,,George Washington,1788–89 United States presidential election,item,Episcopal Church,289.0,>100
45,Q207,['George Walker Bush' 'Bush Jr.' 'Dubya' 'GWB'...,+1946-07-06T00:00:00Z,United States of America,male,1395142029,,,politician,Republican Party,,George W. Bush,2000 United States presidential election,item,United Methodist Church,75.0,70s


In [6]:
### Retrieve environment related speakers

Let's load the Quotebank quotations from the year 2020 that are related to the environment.

In [7]:
# Pre-filtered 2020 quotations; the frame is indexed by quoteID.
# The path is built from PATH_ROOT instead of repeating the 'data' folder name.
quotations_environment_related = pd.read_json(path_or_buf=PATH_ROOT + '/quotes-2020-filtered.json')
quotations_environment_related

Unnamed: 0,quotation
2020-01-01-000132,[ o ] ur collective failure to act strongly an...
2020-01-01-000363,2020 will see more youngsters defying conventi...
2020-01-01-000495,A circular economy development path could sign...
2020-01-01-000858,"A path of resolve, of sustainable solutions. A..."
2020-01-01-001549,All other waste or recyclables should be taken...
...,...
2020-04-16-069260,You're looking at anywhere from a 20 to 40% de...
2020-04-16-069275,"You're not locking carbon in a vault,"
2020-04-17-000463,The wind industry's rapid and continued growth...
2020-04-17-000605,"Wind power provides long-term price stability,..."


In [8]:
#retrieved_labeled_speakers.index = retrieved_labeled_speakers.index.astype('int64')
#retrieved_labeled_speakers.index

In [9]:
# Speaker columns attached to every quotation by default.
default_speakers_attributes = ['label', 'aliases', 'nationality', 'gender', 'ethnic_group', 'occupation', 'party',
                               'US_congress_bio_ID', 'academic_degree', 'candidacy', 'religion', 'age', 'age_group']

def merge_quotations_with_speakers(speakers_df, quotations_df,
                                   speakers_attributes= default_speakers_attributes,
                                   quotations_attributes = ['quoteID', 'qids', 'urls'],
                                   year=2020,
                                   chunksize=500000):
    '''
    Merge the quotations with their speakers attributes based on the qid of the speaker.

    The Quotebank dump located at PATH_TO_QUOTES (formatted with `year`) is read
    chunk by chunk. For every chunk, only the quotations whose quoteID appears in
    the index of `quotations_df` are kept, the list of speaker qids is exploded
    into one row per qid, and the result is joined with `speakers_df` on the qid.

    Args:
        speakers_df: DataFrame of speakers; must contain a 'qid' column and every
            column listed in `speakers_attributes`.
        quotations_df: DataFrame whose index holds the quoteIDs to keep.
        speakers_attributes: speaker columns to attach to each quotation.
        quotations_attributes: columns to keep from the Quotebank dump; must
            include 'quoteID' and 'qids', which the filter and the merge rely on.
        year: year used to fill in the PATH_TO_QUOTES template.
        chunksize: number of lines read from the dump per chunk.

    Returns:
        The final df: one row per (quotation, qid, matching speaker row), with the
        quotation columns followed by 'qid' and the speaker attributes. Quotations
        with several qids and speakers with several rows therefore produce several
        rows. If the dump yields no chunk at all, an empty DataFrame with those
        columns is returned.
    '''
    # Loop-invariant work: slice the speakers and grab the wanted ids only once.
    speaker_columns = ['qid'] + list(speakers_attributes)
    speakers_subset = speakers_df[speaker_columns]
    wanted_quote_ids = quotations_df.index
    quotation_columns = list(quotations_attributes)

    merged_chunks = []

    with pd.read_json(path_or_buf=PATH_TO_QUOTES.format(year=year), compression='bz2',
                      lines=True, chunksize=chunksize) as df_reader:
        for chunk in df_reader:
            # filter the columns
            chunk = chunk[quotation_columns]
            # keep only the quotations selected by the caller
            chunk = chunk[chunk.quoteID.isin(wanted_quote_ids)]
            # Explode the chunk's speakers' qids (one row per candidate speaker)
            chunk = chunk.explode('qids')
            # Merge chunk and speakers on qid; unmatched qids are dropped by the
            # default join performed by DataFrame.merge
            merged_chunks.append(chunk.merge(right=speakers_subset, left_on='qids', right_on='qid'))

    # pd.concat raises on an empty list, so return an empty frame explicitly.
    if not merged_chunks:
        return pd.DataFrame(columns=quotation_columns + speaker_columns)

    return pd.concat(merged_chunks, ignore_index=True)

In [10]:
# Attach the speaker attributes to every environment-related quotation of 2020.
final_df = merge_quotations_with_speakers(
    speakers_df=retrieved_labeled_speakers,
    quotations_df=quotations_environment_related,
)
final_df

Unnamed: 0,quoteID,qids,urls,qid,label,aliases,nationality,gender,ethnic_group,occupation,party,US_congress_bio_ID,academic_degree,candidacy,religion,age,age_group
0,2020-01-24-004182,Q7199798,[http://aninews.in/news/world/europe/piyush-go...,Q7199798,Piyush Goyal,,India,male,,politician,Bharatiya Janata Party,,,,Hinduism,57.0,50s
1,2020-02-07-004808,Q206017,[http://www.desmogblog.com/patrick-michaels],Q206017,Patrick Michaels,['Patrick J. Michaels' 'Patrick J Michaels' 'P...,United States of America,male,,lobbyist,,,,,,71.0,70s
2,2020-02-07-004808,Q206017,[http://www.desmogblog.com/patrick-michaels],Q206017,Patrick Michaels,['Patrick J. Michaels' 'Patrick J Michaels' 'P...,United States of America,male,,university teacher,,,,,,71.0,70s
3,2020-02-07-001100,Q206017,[http://www.desmogblog.com/patrick-michaels],Q206017,Patrick Michaels,['Patrick J. Michaels' 'Patrick J Michaels' 'P...,United States of America,male,,lobbyist,,,,,,71.0,70s
4,2020-02-07-001100,Q206017,[http://www.desmogblog.com/patrick-michaels],Q206017,Patrick Michaels,['Patrick J. Michaels' 'Patrick J Michaels' 'P...,United States of America,male,,university teacher,,,,,,71.0,70s
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237392,2020-02-04-109912,Q264374,[http://www.newcastlestar.com.au/story/6614761...,Q264374,Heather Wheeler,['Heather Kay Wheeler'],United Kingdom,female,,politician,Conservative Party,,,2019 United Kingdom general election,,62.0,60s
237393,2020-02-04-109912,Q264374,[http://www.newcastlestar.com.au/story/6614761...,Q264374,Heather Wheeler,['Heather Kay Wheeler'],United Kingdom,female,,politician,Conservative Party,,,2010 United Kingdom general election,,62.0,60s
237394,2020-02-04-109912,Q264374,[http://www.newcastlestar.com.au/story/6614761...,Q264374,Heather Wheeler,['Heather Kay Wheeler'],United Kingdom,female,,politician,Conservative Party,,,2015 United Kingdom general election,,62.0,60s
237395,2020-02-04-109912,Q264374,[http://www.newcastlestar.com.au/story/6614761...,Q264374,Heather Wheeler,['Heather Kay Wheeler'],United Kingdom,female,,politician,Conservative Party,,,2017 United Kingdom general election,,62.0,60s
