In [7]:
import numpy as np
import pandas as pd
import json
from urllib.request import urlopen
import sqlite3
import string

#load the statistical libraries
from statsmodels.stats import diagnostic
from scipy import stats

# General information Remark

### In the loading part we will recover data from 2015 to 2020, however, first visulation (part III) will only be on the data from 2020.

# I- Load the data

### Load Quotebank data

First, let's recover the quotation of interest : as project is based on the caracterisation of the speaker, we decide to pre-select the quotations that are related to a speaker (i.e speaker value is different from 'None'). 
Moreover, we select the quotations whose subject is related to climate change : to do so we create a list of key word based on https://www.climaterealityproject.org/blog/key-terms-you-need-understand-climate-change and select quotes that contains at least one of these word.  (cf chunk_filtering method) . We are aware that this methode incude biais,and we thought to later utlise NPL in order to filter quotation related to climate from other

> ##### A/ Select data representative for climate interest

In [11]:
#declaration of a key_world list
key_word = ["carbon dioxide", "greenhouse gas", "global warming",
             "climate change",  "fossil fuels", "sea-level rise",
             "renewable energy", "CO2","methane","PPM","COP","GIEC", 
             "biofuels","business as usual", "carbon footprint", "carbon neutral", "carbon sequestration"] 

In [18]:
def chunk_filtering(chunk, lst):
    template=[] #creation of an empty list :it's always cheaper to append to a list and create a DataFrame than append on a empty dataframe.
    for i in lst: 
        template.append(chunk.loc[chunk["quotation"].apply(lambda x : i in x) & 
                                  chunk["speaker"].apply(lambda x: x!= "None")&chunk["qids"].apply(lambda x: len(np.array(x))==1)].drop(['phase'], axis=1))#select the quotation with value in speaker column different from 'None' 
                                                                                #and quotations containing the key word and drop Phase column
        
    return (pd.concat(template, ignore_index=True))# return a dataframe with our data of interest

##### *2020 quotes extractions*
 > The original dataset is of 792,3 Mo, so we decided to divide the dataset into chucks of 1000 rows and process each of them (by using the chunck_filtering). 
Then we load the process chunck into a new csv compressed bz2 file. 

In [19]:
df_reader = pd.read_json('data/quotes-2020.json.bz2', lines=True, compression='bz2', chunksize=1000)
for i, chunk in enumerate(df_reader):
        chunk_clean=chunk_filtering(chunk, key_word) #recover interested row of the chunk
        header = i == 0 #we kept the name of the column only for the first chunk
        mode = 'w' if i == 0 else 'a' # For appending data to an existing CSV file (so for every chunk exepct the first one), 
                                        #we can use mode = a
            
        chunk_clean.to_csv(path_or_buf="data/clean_quotes-2020.bz2",compression='bz2',header=header, mode=mode, index = False )

In [29]:
quotes_2020= pd.read_csv('data/clean_quotes-2020.bz2', compression='bz2')
quotes_2020

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls
0,2020-02-24-013709,For every doubling of carbon dioxide concentra...,E. Calvin Beisner,['Q19877395'],2020-02-24 16:02:23,1,"[['E. Calvin Beisner', '0.677'], ['None', '0.3...",['https://www.heartland.org/news-opinion/news/...
1,2020-02-24-028340,If you're a doctor that cares about the wellbe...,Fiona Stanley,['Q1653736'],2020-02-24 12:45:00,4,"[['Fiona Stanley', '0.9473'], ['None', '0.0527']]",['http://watoday.com.au/business/banking-and-f...
2,2020-01-24-004182,"Also, spoke about a range of emerging sectors ...",Piyush Goyal,['Q7199798'],2020-01-24 19:02:14,2,"[['Piyush Goyal', '0.6385'], ['Peter Voser', '...",['http://aninews.in/news/world/europe/piyush-g...
3,2020-02-10-076321,the National Energy and Climate Plans are how ...,Kadri Simson,['Q13570003'],2020-02-10 05:51:51,1,"[['Kadri Simson', '0.9269'], ['None', '0.0504'...",['https://www.politico.eu/newsletter/brussels-...
4,2020-01-24-110153,When we're talking about... trying to promote ...,Stephen Poloz,['Q15127111'],2020-01-24 01:50:08,1,"[['Stephen Poloz', '0.5145'], ['None', '0.4855']]",['http://thestar.com/politics/federal/2020/01/...
...,...,...,...,...,...,...,...,...
7755,2020-01-22-057467,Last year Formula 1 launched its first-ever su...,Chase Carey,['Q5087105'],2020-01-22 00:00:00,3,"[['Chase Carey', '0.7369'], ['Jean Todt', '0.1...",['http://formula1.com/en/latest/article.formul...
7756,2020-02-05-114487,We will be an energetic champion of free trade...,Dominic Raab,['Q268584'],2020-02-05 16:30:31,2,"[['Dominic Raab', '0.8813'], ['None', '0.0998'...",['http://forbesadvocate.com.au/story/6616761/a...
7757,2020-02-10-057505,"Obama got health care, Trump got his tax cut, ...",Tom Steyer,['Q16189531'],2020-02-10 11:00:08,1,"[['Tom Steyer', '0.8143'], ['None', '0.1857']]",['https://thebulletin.org/2020/02/on-the-new-h...
7758,2020-02-19-061285,Our colleagues will also continue to work toge...,Markus Dohle,['Q1901431'],2020-02-19 08:37:21,1,"[['Markus Dohle', '0.7837'], ['None', '0.2163']]",['http://thebookseller.com/news/dohle-prh-well...


In [9]:
#quotes_2020[quotes_2020['qids'].apply(lambda x : len(x.split())==1)]

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls
0,2020-02-24-013709,For every doubling of carbon dioxide concentra...,E. Calvin Beisner,['Q19877395'],2020-02-24 16:02:23,1,"[['E. Calvin Beisner', '0.677'], ['None', '0.3...",['https://www.heartland.org/news-opinion/news/...
1,2020-02-24-028340,If you're a doctor that cares about the wellbe...,Fiona Stanley,['Q1653736'],2020-02-24 12:45:00,4,"[['Fiona Stanley', '0.9473'], ['None', '0.0527']]",['http://watoday.com.au/business/banking-and-f...
2,2020-01-24-004182,"Also, spoke about a range of emerging sectors ...",Piyush Goyal,['Q7199798'],2020-01-24 19:02:14,2,"[['Piyush Goyal', '0.6385'], ['Peter Voser', '...",['http://aninews.in/news/world/europe/piyush-g...
4,2020-02-10-076321,the National Energy and Climate Plans are how ...,Kadri Simson,['Q13570003'],2020-02-10 05:51:51,1,"[['Kadri Simson', '0.9269'], ['None', '0.0504'...",['https://www.politico.eu/newsletter/brussels-...
9,2020-01-24-110153,When we're talking about... trying to promote ...,Stephen Poloz,['Q15127111'],2020-01-24 01:50:08,1,"[['Stephen Poloz', '0.5145'], ['None', '0.4855']]",['http://thestar.com/politics/federal/2020/01/...
...,...,...,...,...,...,...,...,...
11086,2020-01-22-057467,Last year Formula 1 launched its first-ever su...,Chase Carey,['Q5087105'],2020-01-22 00:00:00,3,"[['Chase Carey', '0.7369'], ['Jean Todt', '0.1...",['http://formula1.com/en/latest/article.formul...
11087,2020-02-05-114487,We will be an energetic champion of free trade...,Dominic Raab,['Q268584'],2020-02-05 16:30:31,2,"[['Dominic Raab', '0.8813'], ['None', '0.0998'...",['http://forbesadvocate.com.au/story/6616761/a...
11088,2020-02-10-057505,"Obama got health care, Trump got his tax cut, ...",Tom Steyer,['Q16189531'],2020-02-10 11:00:08,1,"[['Tom Steyer', '0.8143'], ['None', '0.1857']]",['https://thebulletin.org/2020/02/on-the-new-h...
11089,2020-02-19-061285,Our colleagues will also continue to work toge...,Markus Dohle,['Q1901431'],2020-02-19 08:37:21,1,"[['Markus Dohle', '0.7837'], ['None', '0.2163']]",['http://thebookseller.com/news/dohle-prh-well...


In [5]:
print( " We extract {} quotes from the 2020 files".format(len(quotes_2020)))

 We extract 11091 quotes from the 2020 files


In [3]:
quotes_2020.head(10)

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls
0,2020-02-24-013709,For every doubling of carbon dioxide concentra...,E. Calvin Beisner,['Q19877395'],2020-02-24 16:02:23,1,"[['E. Calvin Beisner', '0.677'], ['None', '0.3...",['https://www.heartland.org/news-opinion/news/...
1,2020-02-24-028340,If you're a doctor that cares about the wellbe...,Fiona Stanley,['Q1653736'],2020-02-24 12:45:00,4,"[['Fiona Stanley', '0.9473'], ['None', '0.0527']]",['http://watoday.com.au/business/banking-and-f...
2,2020-01-24-004182,"Also, spoke about a range of emerging sectors ...",Piyush Goyal,['Q7199798'],2020-01-24 19:02:14,2,"[['Piyush Goyal', '0.6385'], ['Peter Voser', '...",['http://aninews.in/news/world/europe/piyush-g...
3,2020-01-29-062975,Many make the link today between their experie...,Peter Maurer,"['Q117796', 'Q42426597']",2020-01-29 09:04:36,5,"[['Peter Maurer', '0.8787'], ['None', '0.1213']]",['http://whbl.com/news/articles/2020/jan/29/hu...
4,2020-02-10-076321,the National Energy and Climate Plans are how ...,Kadri Simson,['Q13570003'],2020-02-10 05:51:51,1,"[['Kadri Simson', '0.9269'], ['None', '0.0504'...",['https://www.politico.eu/newsletter/brussels-...
5,2020-01-06-069057,"This budget also allows us to earmark £ 500,00...",John Whitehead,"['Q55436000', 'Q58150131', 'Q6263827', 'Q62638...",2020-01-06 11:49:00,1,"[['John Whitehead', '0.7971'], ['None', '0.202...",['https://www.buryfreepress.co.uk/news/draft-b...
6,2020-01-06-069057,"This budget also allows us to earmark £ 500,00...",John Whitehead,"['Q55436000', 'Q58150131', 'Q6263827', 'Q62638...",2020-01-06 11:49:00,1,"[['John Whitehead', '0.7971'], ['None', '0.202...",['https://www.buryfreepress.co.uk/news/draft-b...
7,2020-01-22-106994,We have seen years with extremely high carbon ...,Rob Jackson,"['Q7340237', 'Q7340238']",2020-01-22 18:57:00,3,"[['Rob Jackson', '0.622'], ['None', '0.378']]",['https://www.nbcnews.com/science/environment/...
8,2020-03-10-005294,"As a state, we will pursue every option availa...",Kate Brown,"['Q16727692', 'Q6375399']",2020-03-10 20:47:28,2,"[['Kate Brown', '0.7275'], ['None', '0.2726']]",['http://www.courthousenews.com/oregon-governo...
9,2020-01-24-110153,When we're talking about... trying to promote ...,Stephen Poloz,['Q15127111'],2020-01-24 01:50:08,1,"[['Stephen Poloz', '0.5145'], ['None', '0.4855']]",['http://thestar.com/politics/federal/2020/01/...


##### *2019 quotes extractions*
> The original dataset is of 3.32 Go, so we decided to divide the dataset into chucks of 1000 rows and process each of them (by using the chunck_filtering). Then we load the process chunck into a new csv compressed bz2 file.

In [9]:
df_reader = pd.read_json('data/quotes-2019.json.bz2', lines=True, compression='bz2', chunksize=1000)
for i, chunk in enumerate(df_reader):
        chunk_clean=chunk_filtering(chunk, key_word) #recover interested row of the chunk
        header = i == 0 #we kept the name of the column only for the first chunk
        mode = 'w' if i == 0 else 'a' # For appending data to an existing CSV file (so for every chunk exepct the first one), 
                                        #we can use mode = a
            
        chunk_clean.to_csv(path_or_buf="data/clean_quotes-2019.bz2",compression='bz2',header=header, mode=mode, index = False )

In [30]:
quotes_2019= pd.read_csv('data/clean_quotes-2019.bz2', compression='bz2') # load into the quotes_2019 df

In [None]:
print( " We extracted {} quotes from the 2019 files".format(len(quotes_2019)))

(47280, 10)

##### *2018 quotes extractions*
> The original dataset is of 4.48 Go, so we decided to divide the dataset into chucks of 1000 rows and process each of them ((by using the chunck_filtering). Then we load the process chunck into a new csv compressed bz2 file.

In [14]:
df_reader = pd.read_json('data/quotes-2018.json.bz2', lines=True, compression='bz2', chunksize=1000)
for i, chunk in enumerate(df_reader):
        chunk_clean=chunk_filtering(chunk, key_word) #recover interested row of the chunk
        header = i == 0 #we kept the name of the column only for the first chunk
        mode = 'w' if i == 0 else 'a' # For appending data to an existing CSV file (so for every chunk exepct the first one), 
                                        #we can use mode = a
            
        chunk_clean.to_csv(path_or_buf="data/clean_quotes-2018.bz2",compression='bz2',header=header, mode=mode, index = False )

In [35]:
quotes_2018= pd.read_csv('data/clean_quotes-2018.bz2', compression='bz2') #load the data to quotes_2018 df

In [18]:
print( " We extracted {} quotes from the 2018 files".format(len(quotes_2018)))

 We extracted 35847 quotes from the 2018 files


##### *2017 quotes extractions*
> The original dataset is of 4.84 Go, so we decided to divide the dataset into chucks of 1000 rows and process each of them (by using the chunck_filtering). Then we load the process chunck into a new csv compressed bz2 file

In [17]:
df_reader = pd.read_json('data/quotes-2017.json.bz2', lines=True, compression='bz2', chunksize=1000)
for i, chunk in enumerate(df_reader):
        chunk_clean=chunk_filtering(chunk, key_word) #recover interested row of the chunk
        header = i == 0 #we kept the name of the column only for the first chunk
        mode = 'w' if i == 0 else 'a' # For appending data to an existing CSV file (so for every chunk exepct the first one), 
                                        #we can use mode = a
            
        chunk_clean.to_csv(path_or_buf="data/clean_quotes-2017.bz2",compression='bz2',header=header, mode=mode, index = False )

In [31]:
quotes_2017= pd.read_csv('data/clean_quotes-2017.bz2', compression='bz2')

In [16]:
print( " We extracted {} quotes from the 2017 files".format(len(quotes_2017)))

 We extracted 35324 quotes from the 2017 files


##### *2016 quotes extractions*
 > The original dataset is of 2.16 Go, so we decided to divide the dataset into chucks of 1000 rows and process each of them(by using the chunck_filtering). Then we load the process chunck into a new csv compressed bz2 file.

In [20]:
df_reader = pd.read_json('data/quotes-2016.json.bz2', lines=True, compression='bz2', chunksize=1000)
for i, chunk in enumerate(df_reader):
        chunk_clean=chunk_filtering(chunk, key_word) #recover interested row of the chunk
        header = i == 0 #we kept the name of the column only for the first chunk
        mode = 'w' if i == 0 else 'a' # For appending data to an existing CSV file (so for every chunk exepct the first one), 
                                        #we can use mode = a
            
        chunk_clean.to_csv(path_or_buf="data/clean_quotes-2016.bz2",compression='bz2',header=header, mode=mode, index = False )

In [32]:
quotes_2016= pd.read_csv('data/clean_quotes-2016.bz2', compression='bz2')

In [17]:
print( " We extracted {} quotes from the 2016 files".format(len(quotes_2016)))

 We extracted 18344 quotes from the 2016 files


##### *2015 quotes extractions*
> The original dataset is of 3.11 Go, so we decided to divide the dataset into chucks of 1000 rows and process each of them(by using the chunck_filtering). Then we load the process chunck into a new csv compressed bz2 file.

In [4]:
df_reader = pd.read_json('data/quotes-2015.json.bz2', lines=True, compression='bz2', chunksize=1000)
for i, chunk in enumerate(df_reader):
        chunk_clean=chunk_filtering(chunk, key_word) #recover interested row of the chunk
        header = i == 0 #we kept the name of the column only for the first chunk
        mode = 'w' if i == 0 else 'a' # For appending data to an existing CSV file (so for every chunk exepct the first one), 
                                        #we can use mode = a
            
        chunk_clean.to_csv(path_or_buf="data/clean_quotes-2015.bz2",compression='bz2',header=header, mode=mode, index = False )

(13829, 10)

In [33]:
quotes_2015= pd.read_csv('data/clean_quotes-2015.bz2', compression='bz2')

In [20]:
print( " We extracted {} quotes from the 2015 files".format(len(quotes_2015)))

 We extracted 35176 quotes from the 2015 files


In [109]:
print(" At result, we extracted {} quotes fromes quotebank data".format((len(quotes_2015)+len(quotes_2016)+len(quotes_2017)
                                                                         +len(quotes_2018)+len(quotes_2019)+len(quotes_2020)))

Even with key_word selection we success to extrat interesting data from the Quotebank data with a sufficient size. Let's add another dataset that will give us characteristic information about the speaker

> ##### B/ Select data representative for climate septic

We want to asses climate scepticism among our speakers. We selected 10 speakers that are said to be climate sceptic according to https://www.businessinsider.com/the-ten-most-important-climate-change-skeptics-2009-7?IR=T#dont-miss-11. We want to find our list of keywords from their quotations.

In [36]:
lst = ['Freeman Dyson', 'Bjorn Lomborg', 'Myron Ebell', 'Kiminori Itoh', 'Ivar Giaever', 
       'Will Happer', 'Ian Plimer', 'Michael Chrichton', 'Alan Carlin', 'Patrick Michaels'] #list of the name taken from the article
#iteration in the list of name in order to find if our people of interest are in our quotes list and 
#we then create one df per year with their correspondings quotes

template = []

for i in lst:
      template.append(quotes_2020.loc[quotes_2020['speaker'].apply(lambda x : i == x)])  
        
df_2020 = pd.concat(template, ignore_index=True)

template = []

for i in lst:
      template.append(quotes_2019.loc[quotes_2019['speaker'].apply(lambda x : i == x)])  
        
df_2019 = pd.concat(template, ignore_index=True)

template = []

for i in lst:
      template.append(quotes_2018.loc[quotes_2018['speaker'].apply(lambda x : i == x)])  
        
df_2018 = pd.concat(template, ignore_index=True)

template = []

for i in lst:
      template.append(quotes_2017.loc[quotes_2017['speaker'].apply(lambda x : i == x)])  
        
df_2017 = pd.concat(template, ignore_index=True)

template = []

for i in lst:
      template.append(quotes_2016.loc[quotes_2016['speaker'].apply(lambda x : i == x)])  
        
df_2016 = pd.concat(template, ignore_index=True)

template = []

for i in lst:
      template.append(quotes_2015.loc[quotes_2015['speaker'].apply(lambda x : i == x)])  
        
df_2015 = pd.concat(template, ignore_index=True)

In [37]:
lst = ['Freeman Dyson', 'Bjorn Lomborg', 'Myron Ebell', 'Kiminori Itoh', 'Ivar Giaever', 
       'Will Happer', 'Ian Plimer', 'Michael Chrichton', 'Alan Carlin', 'Patrick Michaels'] #list of the name taken from the article
#iteration in the list of name in order to find if our people of interest are in our quotes list and 
#we then create one df per year with their correspondings quotes

template = []

for i in lst:
      template.append(quotes_2020.loc[quotes_2020['speaker'].apply(lambda x : i == x)])  
        
df_2020 = pd.concat(template, ignore_index=True)
quotations_2020 = df_2020['quotation'].tolist()

In [38]:
#we will now construct a list with the quotations only
quotations_2020 = df_2020['quotation'].tolist()
quotations_2019 = df_2019['quotation'].tolist()
quotations_2018 = df_2018['quotation'].tolist()
quotations_2017 = df_2017['quotation'].tolist()
quotations_2016 = df_2016['quotation'].tolist()
quotations_2015 = df_2015['quotation'].tolist()

In [39]:
#we imported these librairies in order to handle language expression and word counting 
import nltk
import nltk.corpus
from nltk.tokenize import word_tokenize 
nltk.download('punkt')
from nltk.probability import FreqDist
nltk.download('words')
import string

[nltk_data] Downloading package punkt to /Users/maria/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /Users/maria/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [40]:
#two functions
from nltk import word_tokenize
from nltk.corpus import stopwords
a = set(stopwords.words('english'))

def remov_punc(lst): #removes the punctuations from a sentence
    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_°~''' #list of punctuations 
    remov_punc = []
    t = 0
    
    for i in lst :
        t=t+1
        for d in i:
            if d in punc:
                i = i.replace(d, " ")
        remov_punc.append(i)
    return remov_punc
    
def words_freq(lst): #calculate each word frequency
    ls=[]
    for i in lst: 
        text = i
        text1 = word_tokenize(text.lower())
        imp_words = [x for x in text1 if x not in a]
        ls.append(imp_words)
    return ls

def words__highest_freq(lst): #return the highest word frequency
    ls_freq = []
    for i in lst: 
        fdist = FreqDist(i)
        fdist1 = fdist.most_common(1)
        ls_freq.append(fdist1)
    return ls_freq #this is a list with the highest frequency for each most written words



In [41]:
words_freq_2020 = words_freq(remov_punc(quotations_2020))
words_freq_2019 = words_freq(remov_punc(quotations_2019))
words_freq_2018 = words_freq(remov_punc(quotations_2018))
words_freq_2017 = words_freq(remov_punc(quotations_2017))
words_freq_2016 = words_freq(remov_punc(quotations_2016))
words_freq_2015 = words_freq(remov_punc(quotations_2015))

w_freq = words_freq_2020 + words_freq_2019 + words_freq_2018 + words_freq_2017 + words_freq_2016 + words_freq_2015
w_h_freq = words__highest_freq(w_freq)
w_h_freq

keywords_sceptic = []

for i in w_h_freq:
    for d in i: 
        if d[1] >=3 : 
            if d[0] not in keywords_sceptic:
                keywords_sceptic.append(d[0])
        
keywords_sceptic #our list of keywords according to their representation in the climate sceptic speaker quotations

['science',
 'increasing',
 'local',
 'demonstration',
 'climate',
 'energy',
 'emissions',
 'power',
 'degrees',
 'percent',
 'c',
 'silly',
 'paris',
 'co2',
 'consensus',
 'global',
 'effects',
 'models',
 'r',
 'ipcc',
 'years',
 'year']

In [42]:
# on 2020
df_reader = pd.read_json('data/quotes-2020.json.bz2', lines=True, compression='bz2', chunksize=1000)
for i, chunk in enumerate(df_reader):
        chunk_clean=chunk_filtering(chunk, keywords_sceptic  ) #recover interested row of the chunk
        header = i == 0 #we kept the name of the column only for the first chunk
        mode = 'w' if i == 0 else 'a' # For appending data to an existing CSV file (so for every chunk exepct the first one), 
                                        #we can use mode = a
            
        chunk_clean.to_csv(path_or_buf="data/clean_quotes_sceptic-2020.bz2",compression='bz2',header=header, mode=mode, index = False )


In [47]:
quotes_2020_sceptic= pd.read_csv('data/clean_quotes_sceptic-2020.bz2', compression='bz2')

## Load additional data Relative to speakers

The provided speaker_attributes.parquet file contains attributes in terms of QIDs, thereby being uninterpretable by humans (df_qid).
To map the QIDs to meaningful labels, we used the provied wikidata_labels_descriptions_quotebank.csv.bz2 containg the labels and value fo the respective QID containing the df_qid (df_label_qid)
By combaning the information of both we can obtained usefule information about speakers. 

In [5]:
df_qid = pd.read_parquet("speaker_attributes.parquet",engine= "pyarrow" )
df_label_qid = pd.read_csv('data/wikidata_labels_descriptions_quotebank.csv.bz2', compression='bz2', index_col='QID')

Before extract the label of qid, let's check which column we want to keep in frame with our project

In [None]:
#df_qid.head(3)

Let's verify that academic_degree has revelant values

In [90]:
#print("There's no academic degree revelant value ? {}".format(all(df_qid.academic_degree.isna())))

There's no academic degree revelant value ? False


We decided to drop lastrevid, US_congress_bio_ID, type. Moreover, it's seems that academic_degree value are rare, let's check that too.

In [8]:
df_qid.drop(['lastrevid', 'US_congress_bio_ID', 'type'], axis=1, inplace=True)

In [9]:
#We found out that some of the QIDs used in the speaker attribute file are actually redirection from an original QID. 
#We will manulally add their corresponding information using the orginal QID. We found the corespondance manualy between the two. 
#Here, there are in order, respectively the redirection QID, and its corresponding original one. One of he QID was only present 
#as a redirection, so we manually added this one (Q3186984), and its corresponding info. 

redirect_QID=['Q3268166', 'Q11815360', 'Q12014399', 'Q16287483',
              'Q20432251', 'Q21550646', 'Q13365117', 'Q13424794',
             'Q1248362', 'Q6859927', 'Q15145782',
             'Q15991263', 'Q12455619', 'Q5568256', 
             'Q6363085', 'Q11819457', 'Q12334852', 'Q15145783']
actual_QID=['Q1113899', 'Q1919436', 'Q250867', 'Q6051619',
             'Q26934816', 'Q18431816', 'Q12840545', 'Q5157338',
            'Q3455803', 'Q715222', 'Q1052281',
            'Q2743689', 'Q7019111', 'Q3738699', 
            'Q380075', 'Q3391743', 'Q476246', 'Q2449503']

#There is a QID that was deleted from Wikidata, Q99753484, so we will remove this QID later 

lst=[['Journalist', 'monthly magazine of the United Kingdom‘s National Union of Journalists (NUJ)']]
indexes=['Q3186984']
col=['Label', 'Description']
for i in range(len(redirect_QID)):
    lst.append([df_label_qid.loc[actual_QID[i]]['Label'], 
                df_label_qid.loc[actual_QID[i]]['Description']])
    indexes.append(redirect_QID[i])

additional_df= pd.DataFrame(lst, columns= col, index=indexes)
df_label_qid_co=df_label_qid.append(additional_df, ignore_index= False)

As this function make several minute (more than 20 min) to run, we decided to create a compressed csv files in order to run these cells once

In [11]:
def add_column (df_climate, df_septic) : 
    for i, skr in enumerate(df_qid.label) : 
        if (df_climate['speaker'].isin([skr]).any() & df_septic['speaker'].isin([skr]).any()) : 
            df_qid.loc[i, 'climate']='None'
        else if (df_climate['speaker'].isin([skr]).any()) :
            df_qid.loc[i, 'climate']='climate'
        else if (df_septic['speaker'].isin([skr]).any()) : 
            df_qid.loc[i, 'climate']='climate_septic'
        else : df_qid.loc[i, 'climate']='None'
        

In [12]:
#Function that loops through a column to replace QID with their label, and skip None values. We will deal with them later, 
#when we will use the different data.
def extraction_label(df) : 
    liste=[]
    for row  in df: 
        if row is None: 
            continue #skip None values
        template=[]
        for value in row: #iterating over the values of a cell, as there are multiple QIDs in some of them.
            if value == 'Q99753484': #To filter the deleted QID
                continue
            template.append(df_label_qid_co.loc[value]['Label']) #Map the QID to its corresponding label. 
        liste.append(template)    
    return pd.Series(liste)
    

In [13]:
#Applying the function to every column containing QIDs. 
df_qid [['nationality', 'gender', 'ethnic_group','occupation', 'party', 'academic_degree', 'candidacy', 'religion']] = df_qid[['nationality', 'gender', 'ethnic_group','occupation', 'party', 'academic_degree', 'candidacy', 'religion']].apply(extraction_label)
add_column(quotes_2020, quotes_2020_septic)
df_qid.to_csv(path_or_buf="data/speaker_attribute.bz2", compression = 'bz2', index = False) 

KeyboardInterrupt: 

In [53]:
test=pd.read_csv('data/speaker-2020.bz2', compression = 'bz2')
test.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0.1,Unnamed: 0,aliases,date_of_birth,nationality,gender,ethnic_group,occupation,party,academic_degree,id,label,candidacy,religion
0,0,['Washington' 'President Washington' 'G. Washi...,['+1732-02-22T00:00:00Z'],"['Great Britain', 'United States of America']",['male'],['White British'],"['politician', 'military officer', 'farmer', '...",['independent politician'],['Doctor of Sciences in Physics and Mathematics'],Q23,George Washington,"['1792 United States presidential election', '...",['Episcopal Church']
1,1,['Douglas Noel Adams' 'Douglas Noël Adams' 'Do...,['+1952-03-11T00:00:00Z'],['United Kingdom'],['male'],['French'],"['playwright', 'screenwriter', 'novelist', ""ch...",['Republican Party'],['laurea'],Q42,Douglas Adams,"['2000 United States presidential election', '...","['United Methodist Church', 'Episcopal Church'..."
2,2,['Paul Marie Ghislain Otlet' 'Paul Marie Otlet'],['+1868-08-23T00:00:00Z'],['Belgium'],['male'],['Poles'],"['writer', 'lawyer', 'librarian', 'information...",['independent politician'],['doctorate'],Q1868,Paul Otlet,['1946 Chilean presidential election'],['Catholicism']
3,3,['George Walker Bush' 'Bush Jr.' 'Dubya' 'GWB'...,['+1946-07-06T00:00:00Z'],['United States of America'],['male'],['French'],"['politician', 'motivational speaker', 'autobi...",['Radical Party'],['Doktor Nauk in Juridical Science'],Q207,George W. Bush,['2005 Polish presidential election'],['Catholicism']
4,4,['Velázquez' 'Diego Rodríguez de Silva y Veláz...,['+1599-06-06T00:00:00Z'],['Spain'],['male'],['Greeks'],['painter'],['Democratic Party'],"['Bachelor of Arts', 'Master of Business Admin...",Q297,Diego Velázquez,['2014 Indian general election in Vadodara Lok...,['Catholicism']


In [142]:
import ast
test2=quotes_2020.drop_duplicates(['qids'], keep = 'first')['qids']
test['climate'] = 0
test2.map(lambda y : ast.literal_eval(y)[0])

0       Q19877395
1        Q1653736
2        Q7199798
3       Q13570003
4       Q15127111
          ...    
7734     Q5106681
7741     Q5489500
7747      Q512051
7752    Q11530057
7754     Q5247771
Name: qids, Length: 3378, dtype: object

In [146]:
test2 = test2.map(lambda y : ast.literal_eval(y)[0])

In [158]:

test[pd.Index(test.id).isin(pd.Index(test2))]

Index([                                                                                                                                                                                                                                                                                                                                                                                                                         (18, '['Namo' 'Modi' 'Narendra Bhai' 'Narendra Damodardas Modi'\n 'Narendrabhai Damodardas Modi' 'Narendrabhai']', '['+1950-09-17T00:00:00Z']', '['India']', '['male']', '['Japanese people']', '['politician', 'writer', 'social worker', 'bibliographer']', '['Communist Party of Germany', 'Socialist Unity Party of Germany']', '['Doctor in Engineering']', 'Q1058', 'Narendra Modi', '['2013 Austrian legislative election']', '['agnosticism']', 0),
                                                                                                                                            

Unnamed: 0.1,Unnamed: 0,aliases,date_of_birth,nationality,gender,ethnic_group,occupation,party,academic_degree,id,label,candidacy,religion,climate


In [95]:
type(test.id[0])
type(('"'+"[" + "'" + "Q19877395" +"'"  + "]" + '"'))

str

In [135]:
print(('"'+"[" + "'" + "Q19877395" +"'"  + "]" + '"') == test2[0]  )
for idx, p in enumerate(test2):
    test2[idx] = p[2:-2].strip()
    


False


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test2[idx] = p[2:-2].strip()


In [136]:
test2.to_string

<bound method Series.to_string of 0       Q19877395
1        Q1653736
2        Q7199798
3       Q13570003
4       Q15127111
          ...    
3367    Q27967852
3370     Q1086748
3373     Q5106681
3376    Q11530057
3377     Q5247771
Name: qids, Length: 4854, dtype: object>

In [129]:
type(test2[0])

str

In [138]:
if (test.id[0].strip()  in test2) :
    print("stp")
    
            #test.loc[0, 'climate']='climate'

In [123]:
test.head()

Unnamed: 0.1,Unnamed: 0,aliases,date_of_birth,nationality,gender,ethnic_group,occupation,party,academic_degree,id,label,candidacy,religion,climate
0,0,['Washington' 'President Washington' 'G. Washi...,['+1732-02-22T00:00:00Z'],"['Great Britain', 'United States of America']",['male'],['White British'],"['politician', 'military officer', 'farmer', '...",['independent politician'],['Doctor of Sciences in Physics and Mathematics'],Q23,George Washington,"['1792 United States presidential election', '...",['Episcopal Church'],0
1,1,['Douglas Noel Adams' 'Douglas Noël Adams' 'Do...,['+1952-03-11T00:00:00Z'],['United Kingdom'],['male'],['French'],"['playwright', 'screenwriter', 'novelist', ""ch...",['Republican Party'],['laurea'],Q42,Douglas Adams,"['2000 United States presidential election', '...","['United Methodist Church', 'Episcopal Church'...",0
2,2,['Paul Marie Ghislain Otlet' 'Paul Marie Otlet'],['+1868-08-23T00:00:00Z'],['Belgium'],['male'],['Poles'],"['writer', 'lawyer', 'librarian', 'information...",['independent politician'],['doctorate'],Q1868,Paul Otlet,['1946 Chilean presidential election'],['Catholicism'],0
3,3,['George Walker Bush' 'Bush Jr.' 'Dubya' 'GWB'...,['+1946-07-06T00:00:00Z'],['United States of America'],['male'],['French'],"['politician', 'motivational speaker', 'autobi...",['Radical Party'],['Doktor Nauk in Juridical Science'],Q207,George W. Bush,['2005 Polish presidential election'],['Catholicism'],0
4,4,['Velázquez' 'Diego Rodríguez de Silva y Veláz...,['+1599-06-06T00:00:00Z'],['Spain'],['male'],['Greeks'],['painter'],['Democratic Party'],"['Bachelor of Arts', 'Master of Business Admin...",Q297,Diego Velázquez,['2014 Indian general election in Vadodara Lok...,['Catholicism'],0


In [45]:
def add_column (ls_climate, ls_skeptic, chunk) : 
    chunk['climate']='None'
    for i, skr in enumerate(chunk.id) :   
        if (ls_climate.isin([skr]).any()) :
            chunk.loc[i, 'climate']='climate'
        if (ls_skeptic.isin([skr]).any()) : 
            chunk.loc[i, 'climate']='climate_skeptic'

In [None]:
df_reader = pd.read_csv('data/speaker-2020.bz2',  compression='bz2', chunksize=1000)
for i, chunk in enumerate(df_reader):
        add_column(quotes_2020.drop_duplicates(['qids'], keep = 'first')['qids'],quotes_2020_sceptic.drop_duplicates(['qids'], keep = 'first')['qids'],chunk)
        header = i == 0 #we kept the name of the column only for the first chunk
        mode = 'w' if i == 0 else 'a' # For appending data to an existing CSV file (so for every chunk exepct the first one), 
                                        #we can use mode = a
            
        chunk.to_csv(path_or_buf="data/clean_quotes_clim.bz2",compression='bz2',header=header, mode=mode, index = False )

## Load data about natural disasters and important political event

We would like to compare how climate change speech change in media relatively to the important events occuring in the world. To do so we would like to have a list of important natural disasters that have occured between 2015 and 2020 as well as a list of political and diplomatic events more or less related to climate.

The final goal would be the create these lists from Wikipedia scraping data. For now, we made two non-exhaustive "handmade" ones :

In [2]:
data = {'Name':['Earthquake in Nepal','Earthquake in Nepal','Heat waves in Inda and Pakistan','Flood in India','Typhoon inundate Myanmar, Bangladesh, India','Massive floods in Malawi and Mozambique','Drought in Ethiopia','Hurricane in Haiti','Earthquake in Ecuador','Hurricane in Puerto Rico','Hurricane season in the US','Amazon rainforest wildfires','Fires in Australia','Covid-19 pandemic','Floods in Nepal','Fires in the US'],'Date':['2015-4-25','2015-5-12','2015-4-1','2015-11-1','2015-7-1','2015-3-14','2015-2-1','2016-10-4','2016-4-16','2017-9-16','2016-6-1','2019-6-1','2019-6-1','2019-11-16','2020-7-11','2020-7-24']}
natural_disasters = pd.DataFrame(data)
print(natural_disasters)

data = {'Name':['COP21','Trump Election','Trump announce quitting \"l\'Accord de Paris\"','CLimate strikes begun in Sweden','Official letter for quitting\"l\'Accord de Paris\"','Official retreat of the US from \"l\'Accord de Paris\"','Biden Election'],'Date':['2015-12-12','2016-11-8','2017-6-1','2018-8-20','2019-11-4','2020-11-4','2020-11-3']}
political_events = pd.DataFrame(data)
print(political_events)

                                           Name        Date
0                           Earthquake in Nepal   2015-4-25
1                           Earthquake in Nepal   2015-5-12
2               Heat waves in Inda and Pakistan    2015-4-1
3                                Flood in India   2015-11-1
4   Typhoon inundate Myanmar, Bangladesh, India    2015-7-1
5       Massive floods in Malawi and Mozambique   2015-3-14
6                           Drought in Ethiopia    2015-2-1
7                            Hurricane in Haiti   2016-10-4
8                         Earthquake in Ecuador   2016-4-16
9                      Hurricane in Puerto Rico   2017-9-16
10                   Hurricane season in the US    2016-6-1
11                  Amazon rainforest wildfires    2019-6-1
12                           Fires in Australia    2019-6-1
13                            Covid-19 pandemic  2019-11-16
14                              Floods in Nepal   2020-7-11
15                              Fires in

# II- Filter the data

As a good data scientist, the first thing to do is to clean up the data : we need to filtered missing and duplicates rows if there are presented. We will only filter data from Quotebank extraction as we speakers_file is only ??

> ##### *check for missing row*

In [51]:
print("Is there some missing rows ? {} ".format(np.array([quotes_2020.isnull().any(axis=1)]).all()))

Is there some missing rows ? False 


> ##### *check for duplicate* 
We define a function that receive a dataframe (quotes_2020 ... quotes_2015) and remove their duplicates rows according to duplicate quotation if the speakers and the date is the same.

In [18]:
def check_duplicates (df): 
    
    if df["quotation"].is_unique  == False & df["speaker"].is_unique == False & df["date"].is_unique == False: 
        df.drop_duplicates(['quotation'], keep='first', inplace=True) #remove the duplicate rows directly on the df 

In [19]:
check_duplicates(quotes_2020)
#check_duplicates(quotes_2019)
#check_duplicates(quotes_2018)
#check_duplicates(quotes_2017)
#check_duplicates(quotes_2016)
#check_duplicates(quotes_2017)
#check_duplicates(quotes_2015)
print( "We still get {} quotes from the 2020 dataset".format(len(quotes_2020)))
#print( "We still get {} quotes from the 2020 dataset".format(len(quotes_2019)))
#print( "We still get {} quotes from the 2020 dataset".format(len(quotes_2018)))
#print( "We still get {} quotes from the 2020 dataset".format(len(quotes_2017)))
#print( "We still get {} quotes from the 2020 dataset".format(len(quotes_2016)))
#print( "We still get {} quotes from the 2020 dataset".format(len(quotes_2015)))

We still get 10486 quotes from the 2020 dataset


> ##### *check for correlations* 
We define a function that receive a dataframe (quotes_2020 ... quotes_2015) and remove their duplicates rows according to duplicate quotation if the speakers and the date is the same.

## Creation of a dataframe containing speakers_attribute and there interest for the climat 

As a primary analysis, we decided to only look at speaker cited from quotation (so interest to climate problematic) and seek if caracterstic might contribute to their interest. To do so, we simply decide to create a new column to the speakers dataframe which value is one if the label value correspond to a speakers in our defined quotes dataframe or 0 if not.
> NB : we will only use the speakers extrated from 2020 for our first analyse

In [None]:
#let's check if the label is unique
speakers.label.is_unique

We can observe that  'label' from the speaker dataframe  is non unique meaning that they are multiple personne with the same name that have different caracteristics. This cause us trouble cause we can't propely indentify one quotation with a unique personne with singular caracteristic, but instead one quotation can be attribute to different personne with different caracteristics. 
For this primary task we decide to non discrimate the label and attribute them the same climate interest (i.e if a speaker is named x in the quotes data, then all label named x in the speakers df will be considere as interest for climate change). 

# III-Exploration of our data

Let's see some distribution and statitics: 
 - aged people vs yound people 
 - party politics  
 - confident intervals
ect... 

stat : correlation coeff ; m

In [None]:
climate['age'].hist(bins = 50)

In [None]:
climate['age'].hist(bins = 50).describe()

In [None]:
#does data comes from normal distribution ?

In [None]:
diagnostic.kstest_normal(climate['age'].values, dist = 'norm')

In [None]:
#does data comes from exponential distribution ?
#how about exponential?
diagnostic.kstest_normal(climate['age'].values, dist = 'exp')

In [None]:
#is party politics is correlated to climate preocupation ? 

In [None]:
stats.pearsonr(df['IncomePerCap'],df['Employed'])

In [None]:
sns.pairplot(lalonde_data)

# IV-Methods

pour changer nos datas in order to have more robust and less biased dataset

- bert pour trouver les climatos sceptics

to do on our final data set

boot strapping pour avoir du train and test datas

- matching climate vs scpetic / climate vs not climate via propensity score to predict who talks about climate without unobserved correlation

-> categorization

- tree 
- boosting 
- random forest 