In [1]:
import numpy as np
import pandas as pd
import json
from urllib.request import urlopen
import sqlite3


# I- Load the data

### Load Quotebank data

First, let's recover the quotation of interest : as project is based on the caracterisation of the speaker, we decide to pre-select the quotations that are related to a speaker (i.e speaker value is different from 'None'). 

##### *2020 quotes extractions*

In [2]:
def chunk_filtering(chunk):
    template=[] #creation of an empty list :it's always cheaper to append to a list and create a DataFrame than append on a empty dataframe.
    template.append(chunk[chunk["speaker"].apply(lambda x: x!= "None")]) #select the quotation with value in speaker column different from 'None'
    return pd.concat(template, ignore_index=True) # return a dataframe with our data of interest
    

with pd.read_json('data/quotes-2020.json.bz2', lines=True, compression='bz2', chunksize=1000) as df_reader:
    for chunk in df_reader:
        chunk_clean=chunk_filtering(chunk)
        chunk_clean.to_csv(path_or_buf='data/clean_quotes-2020.bz2', compression='bz2', mode = 'a') # create a new csv files compress with bz2 containing all the dataframe recover from the chunk; 

In [3]:
quotes_2020= pd.read_csv('data/clean_quotes-2020.bz2', compression='bz2')
quotes_2020.head()

Unnamed: 0.1,Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
0,0.0,2020-01-16-000088,[ Department of Homeland Security ] was livid ...,Sue Myrick,['Q367796'],2020-01-16 12:00:13,1,"[['Sue Myrick', '0.8867'], ['None', '0.0992'],...",['http://thehill.com/opinion/international/478...,E
1,1.0,2020-01-24-000168,[ I met them ] when they just turned 4 and 7. ...,Meghan King Edmonds,['Q20684375'],2020-01-24 20:37:09,4,"[['Meghan King Edmonds', '0.5446'], ['None', '...",['https://people.com/parents/meghan-king-edmon...,E
2,2.0,2020-01-17-000357,[ The delay ] will have an impact [ on Slough ...,Dexter Smith,['Q5268447'],2020-01-17 13:03:00,1,"[['Dexter Smith', '0.924'], ['None', '0.076']]",['http://www.sloughexpress.co.uk/gallery/sloug...,E
3,3.0,2020-04-02-000239,[ The scheme ] treats addiction as an illness ...,Barry Coppinger,['Q4864119'],2020-04-02 14:18:20,1,"[['Barry Coppinger', '0.9017'], ['None', '0.09...",['http://www.theweek.co.uk/106479/why-police-a...,E
4,4.0,2020-03-19-000276,[ These ] actions will allow households who ha...,Ben Carson,['Q816459'],2020-03-19 19:14:00,1,"[['Ben Carson', '0.9227'], ['None', '0.0773']]",['https://mortgageorb.com/hud-fha-suspend-fore...,E


##### *2019 quotes extractions*

In [None]:
with pd.read_json('data/quotes-2019.json.bz2', lines=True, compression='bz2', chunksize=100000) as df_reader:
    for chunk in df_reader:
        chunk_clean=chunk_filtering(chunk)
        chunk_clean.to_csv(path_or_buf='data/clean_quotes-2019.bz2', compression='bz2', mode = 'a') # create a new csv files compress with bz2 containing all the dataframe recover from the chunk; 

In [None]:
quotes_2019= pd.read_csv('data/clean_quotes-2019.bz2', compression='bz2')
quotes_2019.shape

##### *2018 quotes extractions*

In [None]:
with pd.read_json('data/quotes-2018.json.bz2', lines=True, compression='bz2', chunksize=100000) as df_reader:
    for chunk in df_reader:
        chunk_clean=chunk_filtering(chunk)
        chunk_clean.to_csv(path_or_buf='data/clean_quotes-2018.bz2', compression='bz2', mode = 'a') # create a new csv files compress with bz2 containing all the dataframe recover from the chunk; 

In [None]:
quotes_2018= pd.read_csv('data/clean_quotes-2018.bz2', compression='bz2')
quotes_2018.shape

##### *2017 quotes extractions*

In [None]:
with pd.read_json('data/quotes-2017.json.bz2', lines=True, compression='bz2', chunksize=100000) as df_reader:
    for chunk in df_reader:
        chunk_clean=chunk_filtering(chunk)
        chunk_clean.to_csv(path_or_buf='data/clean_quotes-2017.bz2', compression='bz2', mode = 'a') # create a new csv files compress with bz2 containing all the dataframe recover from the chunk; 

In [None]:
quotes_2017= pd.read_csv('data/clean_quotes-2018.bz2', compression='bz2')
quotes_2017.shape

##### *2016 quotes extractions*

In [None]:
with pd.read_json('data/quotes-2016.json.bz2', lines=True, compression='bz2', chunksize=100000) as df_reader:
    for chunk in df_reader:
        chunk_clean=chunk_filtering(chunk)
        chunk_clean.to_csv(path_or_buf='data/clean_quotes-2016.bz2', compression='bz2', mode = 'a') # create a new csv files compress with bz2 containing all the dataframe recover from the chunk; 

In [None]:
quotes_2016= pd.read_csv('data/clean_quotes-2016.bz2', compression='bz2')
quotes_2016.shape

##### *2015 quotes extractions*

In [4]:
with pd.read_json('data/quotes-2015.json.bz2', lines=True, compression='bz2', chunksize=100000) as df_reader:
    for chunk in df_reader:
        chunk_clean=chunk_filtering(chunk)
        chunk_clean.to_csv(path_or_buf='data/clean_quotes-2015.bz2', compression='bz2', mode = 'a') # create a new csv files compress with bz2 containing all the dataframe recover from the chunk; 

(13829, 10)

In [None]:
quotes_2015= pd.read_csv('data/clean_quotes-2016.bz2', compression='bz2')
quotes_2015.shape

At result we obtained 5 set of Data each with a sufficient size, on total we obtained data to analyze.

## Load additional data Relative to speakers

The provided speaker_attributes.parquet file contains attributes in terms of QIDs, thereby being uninterpretable by humans (df_qid).
To map the QIDs to meaningful labels, we used the provied wikidata_labels_descriptions_quotebank.csv.bz2 containg the labels and value fo the respective QID containing the df_qid (df_label_qid)
By combaning the information of both we can obtained usefule information about speakers. 

In [18]:
df_qid = pd.read_parquet("speaker_attributes.parquet",engine= "pyarrow" )
dd_label_qid = pd.read_csv('data/wikidata_labels_descriptions_quotebank.csv.bz2', compression='bz2', index_col='QID')

In [10]:
parquet.head()

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
0,"[Washington, President Washington, G. Washingt...",[+1732-02-22T00:00:00Z],"[Q161885, Q30]",[Q6581097],1395141751,,W000178,"[Q82955, Q189290, Q131512, Q1734662, Q294126, ...",[Q327591],,Q23,George Washington,"[Q698073, Q697949]",item,[Q682443]
1,"[Douglas Noel Adams, Douglas Noël Adams, Dougl...",[+1952-03-11T00:00:00Z],[Q145],[Q6581097],1395737157,[Q7994501],,"[Q214917, Q28389, Q6625963, Q4853732, Q1884422...",,,Q42,Douglas Adams,,item,
2,"[Paul Marie Ghislain Otlet, Paul Marie Otlet]",[+1868-08-23T00:00:00Z],[Q31],[Q6581097],1380367296,,,"[Q36180, Q40348, Q182436, Q1265807, Q205375, Q...",,,Q1868,Paul Otlet,,item,
3,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",[+1946-07-06T00:00:00Z],[Q30],[Q6581097],1395142029,,,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,Q207,George W. Bush,"[Q327959, Q464075, Q3586276, Q4450587]",item,"[Q329646, Q682443, Q33203]"
4,"[Velázquez, Diego Rodríguez de Silva y Velázqu...",[+1599-06-06T00:00:00Z],[Q29],[Q6581097],1391704596,,,[Q1028181],,,Q297,Diego Velázquez,,item,


In [20]:
df.head()

Unnamed: 0_level_0,Label,Description
QID,Unnamed: 1_level_1,Unnamed: 2_level_1
Q31,Belgium,country in western Europe
Q45,Portugal,country in southwestern Europe
Q75,Internet,global system of connected computer networks
Q148,People's Republic of China,sovereign state in East Asia
Q155,Brazil,country in South America


In [19]:
df.loc['Q31']['Label']

'Belgium'

# II- Filter the data

As a good data scientist, the first thing to do is to clean up the data . 
move the missing rows; if there are
Let's check if the idenfier is unique, and we haven't duplicate rows;
Let's check if the alias match the label for the df_qid data. 


We also need to extract quotation that refers to our subject of interest : climate change. 
To do so, we decided to creat a list of key_world (based of https://www.climaterealityproject.org/blog/key-terms-you-need-understand-climate-change) and extract contation containing these world. 


In [None]:
key_world = ["carbon dioxide", "greenhouse gas", "global warming",
             "climate change",  "fossil fuels", "sea-level rise", "renewable energy"]

