In [171]:
import numpy as np
import pandas as pd
import json
from urllib.request import urlopen
import sqlite3
import string

#load the statistical libraries
from statsmodels.stats import diagnostic
from scipy import stats
import ast

# General information Remark

### In the loading part we will recover data from 2015 to 2020, however, first visulation (part III) will only be on the data from 2020.

# I- Load the data

### Load Quotebank data

First, let's recover the quotation of interest : as project is based on the caracterisation of the speaker, we decide to pre-select the quotations that are related to a speaker (i.e speaker value is different from 'None'). 
Moreover, we select the quotations whose subject is related to climate change : to do so we create a list of key word based on https://www.climaterealityproject.org/blog/key-terms-you-need-understand-climate-change and select quotes that contains at least one of these word.  (cf chunk_filtering method) . We are aware that this methode incude biais,and we thought to later utlise NPL in order to filter quotation related to climate from other

> ##### A/ Select data representative for climate interest

In [172]:
#declaration of a key_world list
key_word = ["carbon dioxide", "greenhouse gas", "global warming",
             "climate change",  "fossil fuels", "sea-level rise",
             "renewable energy", "CO2","methane","PPM","COP","GIEC", 
             "biofuels","business as usual", "carbon footprint", "carbon neutral", "carbon sequestration"] 

In [173]:
def chunk_filtering(chunk, lst):
    template=[] #creation of an empty list :it's always cheaper to append to a list and create a DataFrame than append on a empty dataframe.
    for i in lst: 
        template.append(chunk.loc[chunk["quotation"].apply(lambda x : i in x) & 
                                  chunk["speaker"].apply(lambda x: x!= "None")&chunk["qids"].apply(lambda x: len(np.array(x))==1)].drop(['phase'], axis=1))#select the quotation with value in speaker column different from 'None' 
                                                                                #and quotations containing the key word and drop Phase column
        
    return (pd.concat(template, ignore_index=True))# return a dataframe with our data of interest

In [None]:
dico={}
for date in [2020, 2019, 2018, 2017, 2016, 2015]:
    dico[date] = pd.read_json(f'data/quotes-{date}.json.bz2', lines=True, compression='bz2', chunksize=1000)

In [None]:
for date, df in dico.items() : 
    for i, chunk in enumerate(df) : 
        chunk_clean=chunk_filtering(chunk, keywords_sceptic) #recover interested row of the chunk
        header = i == 0 #we kept the name of the column only for the first chunk
        mode = 'w' if i == 0 else 'a' # For appending data to an existing CSV file (so for every chunk exepct the first one), 
                                        #we can use mode = a
            
        chunk_clean.to_csv(path_or_buf=f"data/clean_quotes-{date}.bz2",compression='bz2',header=header, mode=mode, index = False )

In [None]:
dico={}
for date in [2020, 2019, 2018, 2017, 2016, 2015]:
    dico[date]=[f"quotes_{date}",pd.read_csv(f'data/clean_quotes-{date}.bz2', compression='bz2')]

In [None]:
quotes_2020= pd.read_csv('data/clean_quotes_sceptic-2020.bz2', compression='bz2')

In [193]:
print(" At result, we extracted {} quotes fromes quotebank data".format((len(quotes_2015)+len(quotes_2016)+len(quotes_2017)
                                                                         +len(quotes_2018)+len(quotes_2019)+len(quotes_2020))))

 At result, we extracted 131472 quotes fromes quotebank data


Even with key_word selection we success to extrat interesting data from the Quotebank data with a sufficient size. Let's add another dataset that will give us characteristic information about the speaker

> ##### B/ Select data representative for climate septic

We want to asses climate scepticism among our speakers. We selected 10 speakers that are said to be climate sceptic according to https://www.businessinsider.com/the-ten-most-important-climate-change-skeptics-2009-7?IR=T#dont-miss-11. We want to find our list of keywords from their quotations.

In [232]:
lst = ['Freeman Dyson', 'Bjorn Lomborg', 'Myron Ebell', 'Kiminori Itoh', 'Ivar Giaever', 
       'Will Happer', 'Ian Plimer', 'Michael Chrichton', 'Alan Carlin', 'Patrick Michaels'] #list of the name taken from the article
#iteration in the list of name in order to find if our people of interest are in our quotes list and 
#we then create one df per year with their correspondings quotes

template = []
dico={2020 : quotes_2020, 2019 : quotes_2019, 2018 : quotes_2018, 2017 : quotes_2017, 2016 : quotes_2016, 2015 : quotes_2015}

for key, quotes in dico.items():
    for i in lst:
        template.append(quotes.loc[quotes['speaker'].apply(lambda x : i == x)])  
    df_quotes= pd.concat(template, ignore_index=True)

In [233]:
#we imported these librairies in order to handle language expression and words counting 
import nltk
import nltk.corpus
from nltk.tokenize import word_tokenize 
nltk.download('punkt')
from nltk.probability import FreqDist
nltk.download('words')
import string

[nltk_data] Downloading package punkt to /Users/maria/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /Users/maria/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [259]:
#two functions to asses the highest frequency of word appearance
from nltk import word_tokenize
from nltk.corpus import stopwords
a = set(stopwords.words('english'))

def remov_punc(lst): #removes the punctuations from a sentence
    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_°~''' #list of punctuations 
    remov_punc = []
    t = 0
    
    for i in lst :
        t=t+1
        for d in i:
            if d in punc:
                i = i.replace(d, " ")
        remov_punc.append(i)
    return remov_punc
    
def words_freq(lst): #calculate each word frequency
    ls=[]
    for i in lst: 
        text = i
        text1 = word_tokenize(text.lower())
        imp_words = [x for x in text1 if x not in a]
        ls.append(imp_words)
    return ls

def words__highest_freq(lst): #return the highest word frequency
    ls_freq = []
    for i in lst: 
        fdist = FreqDist(i)
        fdist1 = fdist.most_common(1)
        ls_freq.append(fdist1)
    return ls_freq #this is a list with the highest frequency for each most written words



In [260]:
list_quotes=df_quotes['quotation'].tolist()

In [262]:
#construct lists with all the words frquency 
w_freq= words_freq(remov_punc(list_quotes))
w_h_freq = words__highest_freq(w_freq)
w_h_freq

keywords_sceptic = []

for i in w_h_freq: #keep the highest frequency 
    for d in i: 
        if d[1] >=3 : 
            if d[0] not in keywords_sceptic:
                keywords_sceptic.append(d[0])
        
keywords_sceptic #our list of keywords according to their representation in the climate sceptic speaker quotations

['science',
 'increasing',
 'local',
 'demonstration',
 'climate',
 'energy',
 'emissions',
 'power',
 'degrees',
 'percent',
 'c',
 'silly',
 'paris',
 'co2',
 'consensus',
 'global',
 'effects',
 'models',
 'r',
 'ipcc',
 'years',
 'year']

In [271]:
keywords_sceptic.remove('c')
keywords_sceptic.remove('r')

In [272]:
keywords_sceptic

['science',
 'increasing',
 'local',
 'demonstration',
 'climate',
 'energy',
 'emissions',
 'power',
 'degrees',
 'percent',
 'silly',
 'paris',
 'co2',
 'consensus',
 'global',
 'effects',
 'models',
 'ipcc',
 'years',
 'year']

In [274]:
dico={}
for date in [2020, 2019, 2018, 2017, 2016, 2015]:
    dico[date] = pd.read_json(f'data/quotes-{date}.json.bz2', lines=True, compression='bz2', chunksize=1000)

Now we load our data with this new list of keywords. We use the same technique as above. 

In [None]:
for date, df in dico.items() : 
    for i, chunk in enumerate(df) : 
        chunk_clean=chunk_filtering(chunk, keywords_sceptic) #recover interested row of the chunk
        header = i == 0 #we kept the name of the column only for the first chunk
        mode = 'w' if i == 0 else 'a' # For appending data to an existing CSV file (so for every chunk exepct the first one), 
                                        #we can use mode = a
            
        chunk_clean.to_csv(path_or_buf=f"data/clean_quotes_sceptic-{date}.bz2",compression='bz2',header=header, mode=mode, index = False )

In [201]:
quotes_2020_sceptic= pd.read_csv('data/clean_quotes_sceptic-2020.bz2', compression='bz2')

In [203]:
quotes_2019_sceptic= pd.read_csv('data/clean_quotes_sceptic-2019.bz2', compression='bz2')

In [None]:
quotes_2018_sceptic= pd.read_csv('data/clean_quotes_sceptic-2018.bz2', compression='bz2')

In [None]:
quotes_2017_sceptic= pd.read_csv('data/clean_quotes_sceptic-2017.bz2', compression='bz2')

In [None]:
quotes_2016_sceptic= pd.read_csv('data/clean_quotes_sceptic-2016.bz2', compression='bz2')

In [None]:
quotes_2015_sceptic= pd.read_csv('data/clean_quotes_sceptic-2015.bz2', compression='bz2')

## Load additional data Relative to speakers

### Extracted labels from QID 

The provided speaker_attributes.parquet file contains attributes in terms of QIDs, thereby being uninterpretable by humans (df_qid).
To map the QIDs to meaningful labels, we used the provied wikidata_labels_descriptions_quotebank.csv.bz2 containg the labels and value fo the respective QID containing the df_qid (df_label_qid)
By combaning the information of both we can obtained usefule information about speakers. 

#### *Load parquet file*

In [5]:
df_qid = pd.read_parquet("speaker_attributes.parquet",engine= "pyarrow" )
df_label_qid = pd.read_csv('data/wikidata_labels_descriptions_quotebank.csv.bz2', compression='bz2', index_col='QID')

#### *Somes visualisation and sort of the parquet file*

First let's check if the identifier are unique

In [None]:
df_qid.id.is_unique

Before extract the label of qid, let's check which column we want to keep in frame with our project

In [None]:
df_qid.head(3)

Let's verify that academic_degree has revelant values

In [90]:
print("There's no academic degree revelant value ? {}".format(all(df_qid.academic_degree.isna())))

There's no academic degree revelant value ? False


It seems that academic degree value is revelant, we decided to trop lastrevid, US_congress_bio_ID, type, Alisiase

In [8]:
df_qid.drop(['lastrevid', 'US_congress_bio_ID', 'type', 'Aliases'], axis=1, inplace=True)

#### *Transformation of the df_qid with the label value from df_label_qid*

In [None]:
def transform(y):
    if y is None: return None
    x = set(y)
    x.discard("Q99753484")
    return x
    
df_qid=df_qid['occupation'].apply(lambda y : transform(y))

In [None]:
#We found out that some of the QIDs used in the speaker attribute file are actually redirection from an original QID. 
#We will manulally add their corresponding information using the orginal QID. We found the corespondance manualy between the two. 
#Here, there are in order, respectively the redirection QID, and its corresponding original one. One of he QID was only present 
#as a redirection, so we manually added this one (Q3186984), and its corresponding info. 

redirect_QID=['Q3268166', 'Q11815360', 'Q12014399', 'Q16287483',
              'Q20432251', 'Q21550646', 'Q13365117', 'Q13424794',
             'Q1248362', 'Q6859927', 'Q15145782',
             'Q15991263', 'Q12455619', 'Q5568256', 
             'Q6363085', 'Q11819457', 'Q12334852', 'Q15145783']
actual_QID=['Q1113899', 'Q1919436', 'Q250867', 'Q6051619',
             'Q26934816', 'Q18431816', 'Q12840545', 'Q5157338',
            'Q3455803', 'Q715222', 'Q1052281',
            'Q2743689', 'Q7019111', 'Q3738699', 
            'Q380075', 'Q3391743', 'Q476246', 'Q2449503']

#There is a QID that was deleted from Wikidata, Q99753484, so we will remove this QID:


lst=[['Journalist', 'monthly magazine of the United Kingdom‘s National Union of Journalists (NUJ)']]
indexes=['Q3186984']
col=['Label', 'Description']
for i in range(len(redirect_QID)):
    lst.append([df_label_qid.loc[actual_QID[i]]['Label'], 
                df_label_qid.loc[actual_QID[i]]['Description']])
    indexes.append(redirect_QID[i])

additional_df= pd.DataFrame(lst, columns= col, index=indexes)
df_label=df_label_qid.append(additional_df, ignore_index= False)

In [None]:
#Applying the function to every column containing QIDs. 
cols=['nationality', 'gender', 'ethnic_group','occupation', 'party', 'academic_degree', 'candidacy', 'religion']

df_qid[cols] = df_qid[cols].applymap(lambda d: d if d is not None else [])
df_qid[cols] = df_qid[cols].applymap(lambda y: [df_label.loc[Q].Label for Q in y])

### Let's have additional data (skeptic/climate)

We will now match speaker from the df_qid with a value septic/climate. To do so, we will extracted 2 listes of qid from the quotes_years data : where the first one (qid_climate) contains qid related to speakers that talk about climate and the second list (qid_skeptic) contained qid related to speakers that seems climatospetic

#### *Qid_climate list*

In [53]:
dic={2020 : quotes_2020, 2019 : quotes_2019, 2018 : quotes_2018 , 2017 : quotes_2017, 2016 : quotes_2016, 2015 : quotes_2015}
qid_climate=[]

for key, file in dic.items() :
    qid_climate.append(file.drop_duplicates(['qids'], keep='first')['qids'].tolist())

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0.1,Unnamed: 0,aliases,date_of_birth,nationality,gender,ethnic_group,occupation,party,academic_degree,id,label,candidacy,religion
0,0,['Washington' 'President Washington' 'G. Washi...,['+1732-02-22T00:00:00Z'],"['Great Britain', 'United States of America']",['male'],['White British'],"['politician', 'military officer', 'farmer', '...",['independent politician'],['Doctor of Sciences in Physics and Mathematics'],Q23,George Washington,"['1792 United States presidential election', '...",['Episcopal Church']
1,1,['Douglas Noel Adams' 'Douglas Noël Adams' 'Do...,['+1952-03-11T00:00:00Z'],['United Kingdom'],['male'],['French'],"['playwright', 'screenwriter', 'novelist', ""ch...",['Republican Party'],['laurea'],Q42,Douglas Adams,"['2000 United States presidential election', '...","['United Methodist Church', 'Episcopal Church'..."
2,2,['Paul Marie Ghislain Otlet' 'Paul Marie Otlet'],['+1868-08-23T00:00:00Z'],['Belgium'],['male'],['Poles'],"['writer', 'lawyer', 'librarian', 'information...",['independent politician'],['doctorate'],Q1868,Paul Otlet,['1946 Chilean presidential election'],['Catholicism']
3,3,['George Walker Bush' 'Bush Jr.' 'Dubya' 'GWB'...,['+1946-07-06T00:00:00Z'],['United States of America'],['male'],['French'],"['politician', 'motivational speaker', 'autobi...",['Radical Party'],['Doktor Nauk in Juridical Science'],Q207,George W. Bush,['2005 Polish presidential election'],['Catholicism']
4,4,['Velázquez' 'Diego Rodríguez de Silva y Veláz...,['+1599-06-06T00:00:00Z'],['Spain'],['male'],['Greeks'],['painter'],['Democratic Party'],"['Bachelor of Arts', 'Master of Business Admin...",Q297,Diego Velázquez,['2014 Indian general election in Vadodara Lok...,['Catholicism']


#### *Qid_skeptic list*

In [None]:
dic={2020 : quotes_2020_sceptic, 2019 : quotes_2019_sceptic, 2018 : quotes_2018_sceptic , 2017 : quotes_2017_sceptic, 2016 : quotes_2016_sceptic, 2015 : quotes_2015_sceptic}
qid_skeptic=[]

for key, file in dic.items() :
    qid_skeptic.append(file.drop_duplicates(['qids'], keep='first')['qids'].tolist())

#### *Add an additional column to df_qid*

In [None]:
df_qid['climate']='None'

In [None]:
qid_climate=qid_climate.map(lambda y : ast.literal_eval(y)[0])
df_qid.at[df_qid[pd.Index(df_qid.id).isin(pd.Index(qid_climate))].index, 'climate']=='involved'

In [None]:
qid_skeptic=qid_skeptic.map(lambda y : ast.literal_eval(y)[0])
df_qid.at[df_qid[pd.Index(df_qid.id).isin(pd.Index(qid_skeptic))].index, 'climate']=='skeptic'

#### *Load resulting data into a csv compressed filled*

In [None]:
df_qid.to_csv("data/speaker_attribute.bz2", compression = 'bz2', index=False)

Let's visualize !

In [None]:
speakers=pd.read_csv("data/speaker_attribute.bz2", compression='bz2', index = False)

In [None]:
import ast
test2=quotes_2020.drop_duplicates(['qids'], keep = 'first')['qids']
test['climate'] = 0
test2.map(lambda y : ast.literal_eval(y)[0])

# II- Filter the data

As a good data scientist, the first thing to do is to clean up the data : first, we need to convert the value from the datafram into proper python string : we can use literal_eval to safely evaluating strings containing Python expressions from untrusted sources without the need to parse the values oneself. We also need to check for missing row and correlation.

> ##### *check for missing row*
We consider that a row is missing if we don't have information about speakers attributes (i.e other than label, qid)

In [51]:
print("Is there some missing rows ? {} ".format(np.array([speakers.drop(['label', 'id']).isnull().any(axis=1)]).all()))

Is there some missing rows ? False 


> ##### *evaluating strings*

In [None]:
speakers=speakers.fillna("[]").head(20)[['date_of_birth', 'nationality','gender','ethnic_group', 
                                       'occupation','party', 'academic_degree','candidacy', 'religion']].applymap(lambda y: ast.literal_eval(str(y)))

> ##### changes the date_of_birth 
we will juste recover year of the birth 

In [None]:
speakers.date_of_birth=speakers.date_of_birth.map(lambda y : str(y).split("-")[0][3:]).fillna(0)

> ##### *check for correlations* 

In [None]:
speakers.corr()
print(corr)

# III-Exploration of our data

Let's see some distribution and statitics: 
 - aged people vs yound people 
 - party politics  
 - confident intervals
ect... 

stat : correlation coeff ; m

In [None]:
climate['age'].hist(bins = 50)

In [None]:
climate['age'].hist(bins = 50).describe()

In [None]:
#does data comes from normal distribution ?

In [None]:
diagnostic.kstest_normal(climate['age'].values, dist = 'norm')

In [None]:
#does data comes from exponential distribution ?
#how about exponential?
diagnostic.kstest_normal(climate['age'].values, dist = 'exp')

In [None]:
#is party politics is correlated to climate preocupation ? 

In [None]:
stats.pearsonr(df['IncomePerCap'],df['Employed'])

In [None]:
sns.pairplot(lalonde_data)

# IV-Methods

## Datas loading and treatment 

### BERT

We would like to have an automatic classification of climate scepticism people. According to https://aclanthology.org/2021.naacl-main.175.pdf, neutralization is used in climate skepticism.  

Naturalization example : 

- Sure, we should reduce greenhouse gases, but if our cli- mate policies hurt our ability to create more wealth and bring power to the world’s poor, then we are ridding the patient of the disease, but only by killing him
- It’s very convenient for alarmist greens to blame the fires of Australia and California on global warming. In reality, global warming is just a natural cycle and the policies they themselves advocate are the culprits.
- The IPCC falsely attributes natural warming and urban warming to greenhouse gas (GHG) emission warming. It ignores the compelling evidence of natural climate change before 1950 that correlates well with indicators of solar activity

They could categorize these neutralization techniques arguments into two groups
- Policy (cost, economy, carbon tax) -> blame alarmist green 
    - Condemnation of the Condemner (manipulation of poltics)
    - Appeal to Higher Loyalties (progress is more important than taxes)
    - Justification by Comparison (pollution comparison)
- Science (ability of scientist) -> natural cycle
    - Denial of responsability (natural cycle)
    - Denial of injury1 (not significant)
    - Denial of injury2 (increase of CO2 is good)
    - Denial of victim 

We want to categorize our skeptic people in two subgroups, social and scientific. 
For the code part we will try to inspire ourself from https://arxiv.org/pdf/2110.12010.pdf.

### Bootstrapping

We would like to use the bootsrapping method in order to resample our datas and obtain a data_set_test and a data_set_train. This will able us to fit our model that we will construct in order to explore our data.

### Propensity score matching

In order to avoid unobserved corelation, we want to use propensity score matching. And from this new dataset go furtherer in our project.

## Project

### Regression

We will first try to do some regression to see if we can predict some tendancy with accurancy. We will try to see if some attributs are predictable or if they are all relevant

### Classifier

We will try different type of tree categorization in order to have more robust methods to test our hypothesis. 

- boosting 
- random forest 

Based on these models we will try to have a better understanding on the climate tendancy in the population. (cf README) for more details.