# Applied data analysis
## Self-confidence through quotations:



# Set the environment properly 

### Mount the Google Drive in order to access the files located on our drive


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; every data path used below lives under it.
# NOTE(review): `_mount` is an underscore-prefixed (private) helper — confirm that
# bypassing the public `drive.mount` entry point is intentional.
drive._mount('/content/drive')

Mounted at /content/drive


### Install and import every package that will be necessary for the analysis

In [None]:
!pip install pickle-mixin
!pip install pyarrow
!pip install pathlib


import pyarrow
import pandas as pd
import numpy as np
import pathlib
from datetime import datetime
import sys
import bz2
import json
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

Collecting pickle-mixin
  Downloading pickle-mixin-1.0.2.tar.gz (5.1 kB)
Building wheels for collected packages: pickle-mixin
  Building wheel for pickle-mixin (setup.py) ... [?25l[?25hdone
  Created wheel for pickle-mixin: filename=pickle_mixin-1.0.2-py3-none-any.whl size=6008 sha256=93cfd4ab62e0fb564e3c236fa419e76a203220ca2c85212cd6839651cdb750c9
  Stored in directory: /root/.cache/pip/wheels/d0/70/0b/673e09a7ed429660d22352a1b117b4f616a8fc054bdd7eb157
Successfully built pickle-mixin
Installing collected packages: pickle-mixin
Successfully installed pickle-mixin-1.0.2


# Preprocessing of the data



In [None]:
def rem_mult(array):
    """Collapse a one-element NumPy array to the single value it holds.

    Some attributes are stored as an array of values.  To be sure that the
    value used later on is unambiguous, multi-valued (or empty) arrays are
    replaced by ``np.nan``.

    Args:
        array: Any value; only ``np.ndarray`` instances are transformed.

    Returns:
        ``array[0]`` when ``array`` is an ndarray of size 1, ``np.nan`` when it
        is an ndarray of any other size, and the input itself otherwise.
    """
    if not isinstance(array, np.ndarray):
        return array
    return array[0] if array.size == 1 else np.nan


#### Extracting the relevant data from the original files and saving them in a new file


In [None]:
# Source file (original 2020 quotes) and destination file (reduced copy).
path_to_file = '/content/drive/MyDrive/ADAprojet2021/Quotebank/quotes-2020.json.bz2'
path_to_out = '/content/drive/MyDrive/ADAprojet2021/quotes-2020-clean.json.bz2'

# Word count of every quote that is kept; plotted in the next cell.
quotes_size = []
with bz2.open(path_to_out, 'wb') as d_file, bz2.open(path_to_file, 'rb') as s_file:
    for raw_line in s_file:
        record = json.loads(raw_line)  # one JSON object per line

        # Skip quotes without a speaker and quotes linked to zero or several qids.
        if record['speaker'] == 'None' or len(record['qids']) != 1:
            continue

        # Strip the "[ " / " ]" bracket markers from the quotation text.
        text = record['quotation'].replace("[ ", "").replace(" ]", "")
        n_words = len(text.split())
        quotes_size.append(n_words)

        # Keep only the fields needed later, plus the word count of the quote.
        kept = {
            'speaker': record['speaker'],
            'qids': record['qids'],
            'quotation': text,
            'size': n_words,
        }
        d_file.write((json.dumps(kept) + '\n').encode('utf-8'))
            

KeyboardInterrupt: ignored

## First step: pre-process the quotation

  Since the whole analysis for this project is based on the quotations, it is important to keep only the quotations of comparable size.


In [None]:
# Distribution of the quotation length (number of words per quote), with a KDE overlay.
# The style is set *before* plotting: seaborn styles only apply to figures
# created after the call, so setting it afterwards leaves this figure unstyled.
sns.set_style('white')
sns.displot(data=quotes_size, kde=True, height=8, aspect=1.6)
plt.xlabel('Number of word in the quote')
plt.title("Distribution of the number of word in the different quote", x = 0.5, y = 1, fontsize = 15)
plt.tight_layout()
plt.show()

According to the distribution, the majority of the quotes are at most 100 words long. In order to make sure that we analyse complete sentences, we only keep quotes that are between 5 and 100 words long.


In [None]:
# Based on the distribution of the number of words per quote, keep only the
# quotes containing between 5 and 100 words, so that at least one complete
# sentence is analysed.
path_to_file = '/content/drive/MyDrive/ADAprojet2021/quotes-2020-clean.json.bz2'
path_to_out = '/content/drive/MyDrive/ADAprojet2021/quotes-2020-processed.json.bz2'

# Word counts of the retained quotes; plotted in the next cell.
quotes_size1 = []

with bz2.open(path_to_out, 'wb') as d_file, bz2.open(path_to_file, 'rb') as s_file:
    for raw_line in s_file:
        record = json.loads(raw_line)

        # Discard quotes whose stored size is outside the [5, 100] range.
        if not 5 <= record['size'] <= 100:
            continue

        # The record is copied unchanged; its word count is recomputed for the plot.
        quotes_size1.append(len(record['quotation'].split()))
        d_file.write((json.dumps(record) + '\n').encode('utf-8'))

# Disabled exploration kept from the original cell (third quartile of the sizes):
#Q75 = np.array(quotes_size).quantile(0.75)
#print(Q75)

In [None]:
# Distribution of the quotation length after the 5-100 word filter.
# The style is set *before* plotting: seaborn styles only apply to figures
# created after the call.
sns.set_style('white')
sns.displot(data=quotes_size1, kde=True, height=8, aspect=1.6)
plt.xlabel('Number of word in the quote')
plt.title("Distribution of the number of word in the different quote", x = 0.5, y = 1, fontsize = 15)
plt.tight_layout()
plt.show()

## Second step : Pre-process the speakers attributes

During this analysis we are going to use some attributes of the speakers, such as their gender, their QIDs, their nationality and their US Congress bio ID, if there is one.

These extracted attributes will be saved in a new file.

The qIDs are kept so they will allow us to navigate between the quote file and the speaker file.

In [None]:
def dateofbirth(date):
    """Convert a date-of-birth string into a ``datetime`` holding only the year.

    Because this is a linguistic study, the English spoken by the speakers
    should be fairly homogeneous.  Speakers are therefore selected on their
    date of birth so that they are contemporary and may speak similarly.

    Args:
        date: String whose first character is a sign and whose characters
            1 to 4 are the four-digit year.

    Returns:
        ``datetime(year, 1, 1)`` (day and month are always set to the first of
        January), or ``np.nan`` when the date starts with ``'-'``, when the
        year is ``'0000'``, or when the year is before 1900.
    """
    year_text = date[1:5]

    # Negative dates (before year 0) and the '0000' placeholder are discarded.
    if date[0] == '-' or year_text == '0000':
        return np.nan

    parsed = datetime.strptime(year_text, '%Y')
    return parsed if parsed.year >= 1900 else np.nan

def gender(id):
    """Normalise a gender QID found in the sorted data.

    Args:
        id: Gender identifier (a QID string, or any other value).

    Returns:
        ``np.nan`` for the two QIDs considered irrelevant, the canonical male
        or female QID for their listed variants, and ``id`` unchanged in
        every other case.
    """
    # Per the original notes: "erkek" (which is a last name) and homosexuality.
    if id in ('Q106299064', 'Q6636'):
        return np.nan
    # Per the original notes: male organism and cis male are mapped to male.
    if id in ('Q44148', 'Q15145778'):
        return 'Q6581097'
    # Same mapping for the female variants.
    if id in ('Q43445', 'Q15145779'):
        return 'Q6581072'
    return id

def us_to_bool(nat):
    """Tell whether a nationality array contains the QID 'Q30' (US citizen).

    Args:
        nat: Array of nationality QIDs, or any other value when missing.

    Returns:
        A plain Python ``bool``: ``True`` when ``nat`` is an ``np.ndarray``
        containing ``'Q30'``, ``False`` otherwise (including non-array input).
    """
    if not isinstance(nat, np.ndarray):
        return False
    # np.isin with a scalar first argument yields a 0-d array; convert it so the
    # column holds one consistent type instead of a mix of arrays and bools.
    return bool(np.isin('Q30', nat))

def congress(id):
    """Return ``True`` when ``id`` is a string, ``False`` for any other value.

    Used to turn the US Congress bio ID column into a membership flag.
    """
    return isinstance(id, str)


In [None]:

# Local alternative kept from the original cell:
#files = pathlib.Path('./parquet').glob('part-*-0d587965-3d8f-41ce-9771-5b8c9024dce9-c000.snappy.parquet')
files = pathlib.Path("/content/drive/MyDrive/ADAprojet2021/Project datasets/speaker_attributes.parquet/").glob("part-*-0d587965-3d8f-41ce-9771-5b8c9024dce9-c000.snappy.parquet")
columns = ['id', 'label', 'gender','date_of_birth', 'nationality', 'US_congress_bio_ID']

# Read every parquet part once and concatenate in a single call; growing the
# frame inside the loop re-copies all previously loaded rows on each iteration.
frames = [pd.read_parquet(path, columns=columns) for path in files]
if not frames:
    raise FileNotFoundError("No speaker_attributes parquet part file was found")
df = pd.concat(frames, join='outer', ignore_index=True)

# keep=False drops *every* row whose id occurs more than once.  This is done once
# on the full table: de-duplicating after each file let an id seen three times
# survive, making the result depend on the order in which files were read.
df = df.drop_duplicates(subset='id', keep=False)

# Collapse single-valued arrays to a value, then normalise the gender QIDs.
df.gender = df.gender.apply(rem_mult).apply(gender)
df.date_of_birth = df.date_of_birth.apply(rem_mult)

# Speakers without an id, a usable gender or a single date of birth are dropped.
df = df.dropna(subset=['id', 'gender', 'date_of_birth'])
df = df.astype({'gender': 'category'})

# Keep only the birth year; dateofbirth returns NaN for rejected dates,
# and those speakers are dropped as well.
df.date_of_birth = df.date_of_birth.apply(dateofbirth)
df = df.dropna(subset=['date_of_birth'])

# Replace the nationality and congress-ID columns by boolean flags.
df.nationality = df.nationality.apply(us_to_bool)
df.US_congress_bio_ID = df.US_congress_bio_ID.apply(congress)


display(df)
print(df.memory_usage(deep=True))
genders = df.gender.unique()


path_to_out = '/content/drive/MyDrive/ADAprojet2021/speaker_attributes_processed.json.bz2'
df.to_json(path_to_out, orient = 'records', compression = 'bz2')

Since we are doing an observational study, it is important to have a good understanding of how the speakers are distributed across the different features.

In [None]:
# Histogram of the number of speakers per gender category, on a log-scaled y axis.
plt.figure(figsize=(50,20))
sns.histplot(df.gender)
plt.title("Histogram showing the distribution of gender ", fontsize=30)
plt.xlabel("Gender", fontsize=25)
plt.ylabel("Numbers of individuals", fontsize=25)
# Log scale on the y axis.
plt.yscale('log')
# Human-readable names shown on the x axis in place of the gender QIDs.
# NOTE(review): the 23 labels are assigned purely by position (ticks 0..22);
# nothing here checks that this order matches the order in which the
# categories are drawn — confirm against df.gender before trusting the axis.
labels = ['transgender female', 'intersex', 'neutrois', 'genderqueer', 'eunuch', 'genderfluid', 'transgender person', 'shemale',
          'transgender male', 'transmasculine', 'two-spirit', 'muxe', 'non-binary', 'third gender', 'agender', 'neutral sex',
         'female', 'male', 'pangender', 'khatoey', 'bigender', 'demiboy', 'X-gender']
plt.xticks(ticks = np.arange(23), labels = labels, rotation=45, fontsize=20)
plt.show()

In [None]:

# Birth year of every retained speaker.
x = [birth_date.year for birth_date in df.date_of_birth]

# The style is set *before* plotting: seaborn styles only apply to figures
# created after the call.
sns.set_style('white')
sns.displot(data=x, kde=True, height=8, aspect=1.6)
plt.xlabel('Year of birth')
plt.xlim(1900,2021)
plt.title("Distribution of the birth years", x = 0.5, y = 1, fontsize = 15)
plt.tight_layout()
plt.show()

## Joining both datasets
In this step, we remove the quotes of speakers that were removed from the speaker dataframe (e.g. because we didn't know their gender), and we count the number of quotes per speaker, so as to remove speakers that don't have any quote assigned.

In [None]:
# Prepare the merge of the quotes with the speaker information; it is used to
# remove the quotes of speakers for which we did not have sufficient info.
path_to_file = '/content/drive/MyDrive/ADAprojet2021/quotes-2020-processed.json.bz2'
path_to_pickle = '/content/picklefile.pkl'

# Chunked reader over the processed quotes (100000 records per chunk).
df_reader = pd.read_json(
    path_to_file,
    orient='records',
    lines=True,
    compression='bz2',
    chunksize=100000,
)

# Per-speaker quote counter, initialised to zero for every speaker.
df['quote_nbr'] = np.zeros(len(df))

# Number of chunks processed so far.
nbr_chunk = 0

In [None]:
def list_to_first(qids):
    """Return the first element of ``qids`` (list of qids -> single qid)."""
    return qids[0]


# Ids of the retained speakers, built once instead of on every chunk.
known_ids = set(df.id)
# The original left-merge re-created a 0..n-1 index; keep that outcome.
df = df.reset_index(drop=True)

# Warning: the file is opened in append mode ('ab'), so delete the existing
# pickle file before re-running this cell.
with bz2.open(path_to_pickle, 'ab') as pickle_file:
    for chunk in df_reader:
        # Each quote carries a one-element list of qids; keep the qid itself.
        chunk.qids = chunk.qids.apply(list_to_first)
        # Keep only the quotes whose speaker is still in the speaker table.
        chunk = chunk[chunk.qids.isin(known_ids)]

        # Number of quotes per speaker within this chunk (index: qid).
        occurences = chunk.qids.value_counts()

        # Speakers absent from this chunk map to NaN; count them as 0.  Adding
        # the NaN directly would turn their running total into NaN for good.
        df['quote_nbr'] = df['quote_nbr'] + df['id'].map(occurences).fillna(0)

        nbr_chunk = nbr_chunk + 1
        pickle.dump(chunk, pickle_file)

## Natural language processing:

Preprocessing of the citations: Put them in the right format, extract the tense of the citation to properly assign the right score.


In [None]:
#NLP libraries
import spacy, nltk, gensim, sklearn
from nltk import pos_tag, word_tokenize
# Download NLTK resources (presumably the ones needed by word_tokenize and pos_tag).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Not sure yet whether this will be useful, but it is kept for the moment
nlp = spacy.load('en_core_web_sm')


In [None]:
path = '/content/drive/MyDrive/ADAprojet2021/quotes-2020-processed.json.bz2'
# 'verbs' and 'tense' are two separate columns (a missing comma used to fuse
# them into a single 'verbstense' column).
quote_columns = ['qids', 'quotation', 'verbs', 'tense', 'score']

# POS tags associated with each tense.
present = ["VBP", "VBZ","VBG"]
past = ["VBD", "VBN"]
tense_tags = set(past + present)

# Rows are collected in a list and the frame is built once at the end:
# DataFrame.append copies the whole frame on each call and no longer exists
# in pandas 2.x.
rows = []
with bz2.open(path, 'rb') as s_file:
  for instance in s_file:
    instance = json.loads(instance)
    text = nltk.word_tokenize(instance['quotation'])
    tags = nltk.pos_tag(text)

    # Tags of the verbs carrying a tense, in order of appearance.
    tense = [pos for word, pos in tags if pos in tense_tags]

    # If every verb is conjugated in the same tense, label the quote with it;
    # otherwise the list of tags is kept.  A quote without any tagged verb is
    # labelled 'present' (the empty set is a subset of any set), as before.
    if set(tense).issubset(present):
      tense = 'present'
    elif set(tense).issubset(past):
      tense = 'past'

    rows.append({'qids': instance['qids'], 'quotation': instance['quotation'], 'tense': tense})

quotes = pd.DataFrame(rows, columns=quote_columns)

Expressions that we want to target in the quotations, and their associated scores:

In [None]:
# Confidence expressions and their scores, used as a starting point
# (taken from the Wesson & Pulford paper).
# TODO: the past-tense expressions are still missing.
# Original note: phrases that were in two parts (I think ... isn't it,
# I think ... but I can't be sure) were also removed.
# NOTE(review): both of those phrases are still present in the list below — confirm.
_expressions = ["I'm absolutely certain", "I'm positive", "I'm certain", "I know for a fact", "I know", "I'm confident", "I have no doubt", "I'm sure", "I have no doubt, I mean I'm sure", "I'm fairly confident", "I remember", "I believe", "I would say", "I suspect", "I could be mistaken but I'm sure", "I think", "I'm not completely confident, but I think", "I can't say for sure, but", "I think ... but I can't be sure", "I'm not certain but","I'm not sure but","... I think", "I guess", "I could be wrong, but I think", "I think, I think", "I think ... isn't it", "I'm guessing, but I would say", "I suppose", "Oh, I don't know, I suppose", "I'm not sure, it's kind of ..."]

# Per the original note, the scores of the article follow the UK grading
# system (0-7, 7 being the best).
_raw_scores = [6.61, 6.57, 6.55, 6.5, 6.45, 6.43, 6.3, 6.02, 5.95, 5.32, 5.18, 4.86, 4.7, 4.68, 4.68, 4.66, 4.2, 4.16, 4.16, 4.14, 3.84, 3.75, 3.75, 3.68, 3.61, 3.48, 3.39, 3.34, 3.02, 2.91]

sentences = pd.DataFrame({'sentence': _expressions, 'score': _raw_scores})

# Rescale to 0-1, which is more intuitive.
sentences['score'] = sentences['score'] / 7

# Case-fold the expressions so that the comparison with the (case-folded)
# quotations does not miss a match because of letter case.
sentences['sentence'] = sentences['sentence'].str.casefold()

display(sentences)

Unnamed: 0,sentence,score
0,i'm absolutely certain,0.944286
1,i'm positive,0.938571
2,i'm certain,0.935714
3,i know for a fact,0.928571
4,i know,0.921429
5,i'm confident,0.918571
6,i have no doubt,0.9
7,i'm sure,0.86
8,"i have no doubt, i mean i'm sure",0.85
9,i'm fairly confident,0.76


The cell below, which generates a small test data frame, is kept in case someone needs a subset of the data to try things out.

In [None]:
# Extract a small part of the whole file (the first 15 quotes) for testing.
path = '/content/drive/MyDrive/ADAprojet2021/quotes-2020-processed.json.bz2'

# Rows are collected in a list and the frame is built once: DataFrame.append
# no longer exists in pandas 2.x and copied the whole frame on every call.
rows = []
with bz2.open(path, 'rb') as s_file:
  for instance in s_file:
    instance = json.loads(instance)
    rows.append({'qids': instance['qids'], 'quotation': instance['quotation']})
    if len(rows) == 15:
      break

# Number of quotes actually read (at most 15).
i = len(rows)
# 'findings' and 'score' are left empty here; they are filled in later cells.
test = pd.DataFrame(rows, columns=['qids', 'quotation', 'findings', 'score'])


Find the expressions of confidence in the quotations


In [None]:
from nltk.text import Text
import re 

# Case-fold the quotations so that the comparison with the (case-folded)
# expressions does not miss a match because of letter case.

# Same step for the full data set, currently disabled:
#quotes['quotation']= quotes['quotation'].apply(lambda x: x.casefold())

test['quotation'] = [quotation.casefold() for quotation in test['quotation']]

display(test)

Unnamed: 0,qids,quotation,findings,score
0,[Q367796],department of homeland security was livid and ...,,
1,[Q20684375],i met them when they just turned 4 and 7. they...,,
2,[Q5268447],the delay will have an impact on slough but th...,,
3,[Q4864119],the scheme treats addiction as an illness and ...,,
4,[Q816459],these actions will allow households who have a...,,
5,[Q30164281],1. fm is entitled to go straight to press conf...,,
6,[Q56255401],"11 straight weeks of pre-season,",,
7,[Q26923564],2019 was a landmark year for fiverr as we comp...,,
8,[Q4749380],7pm is when most hospitals change shifts. that...,,
9,[Q970000],a city is not an accident but the result of co...,,


In [None]:
# Should be moved earlier in the notebook.
# Apparently the warning comes from a disagreement between numpy and native Python, so all warnings are simply ignored with the following line:
import warnings; warnings.simplefilter('ignore')


In [None]:
# Combine all the expressions into a single alternation, longest first.  The
# lookahead wrapper makes overlapping occurrences all get reported.
rx = r"(?=\b({})\b)".format("|".join(map(re.escape, sorted(sentences.sentence, key=len, reverse=True))))
pattern = re.compile(rx)


def _unique_findings(quotation):
    """Return the sorted, de-duplicated expressions found in ``quotation``."""
    # A plain list of str is returned.  np.unique on an empty match list gives
    # a float array, and testing a string against it appears to be what
    # triggered the numpy/str comparison warning silenced earlier.
    return sorted(set(pattern.findall(quotation)))


# (expression, score) pairs, read from the table once instead of per row.
_expression_scores = list(zip(sentences['sentence'], sentences['score']))


def _scores_for(found):
    """Return the scores of the expressions in ``found``, in table order."""
    return [score for expression, score in _expression_scores if expression in found]


# Keep the expressions found in each quotation to allow score assessing.
# The same two steps are meant to be applied to `quotes` for the full data set.
test['findings'] = test['quotation'].apply(_unique_findings)

# Score assessing.
test['score'] = test['findings'].apply(_scores_for)
#display(quotes)
display(test)


  from ipykernel import kernelapp as app


Unnamed: 0,qids,quotation,findings,score
0,[Q367796],department of homeland security was livid and ...,"[of, the]","[4, 5]"
1,[Q20684375],i met them when they just turned 4 and 7. they...,[],[]
2,[Q5268447],the delay will have an impact on slough but th...,[the],[4]
3,[Q4864119],the scheme treats addiction as an illness and ...,[the],[4]
4,[Q816459],these actions will allow households who have a...,"[of, the]","[4, 5]"
5,[Q30164281],1. fm is entitled to go straight to press conf...,"[is, not]","[1, 2]"
6,[Q56255401],"11 straight weeks of pre-season,",[of],[5]
7,[Q26923564],2019 was a landmark year for fiverr as we comp...,[the],[4]
8,[Q4749380],7pm is when most hospitals change shifts. that...,"[is, why]","[1, 3]"
9,[Q970000],a city is not an accident but the result of co...,"[is, not, of, the]","[1, 2, 4, 5]"


Some expressions take into account the tense of the sentences, hence we create a new column that indicates the tense in order to properly assess the score 

In [None]:
# POS tags associated with each tense.
present = ["VBP", "VBZ","VBG"]
past = ["VBD", "VBN"]
_tensed_tags = past + present


def _tense_label(tagged_words):
    """Return 'present' if every tensed verb tag is a present tag, else 'past'."""
    verb_tags = [pos for word, pos in tagged_words if pos in _tensed_tags]
    return 'present' if set(verb_tags) <= set(present) else 'past'


def _scores_of(found):
    """Return the scores of the expressions contained in ``found``, in table order."""
    return [sentences['score'][ind] for ind in sentences.index if sentences['sentence'][ind] in found]


# (word, POS tag) pairs of every quotation.
#quotes['tags'] = quotes['quotations'].apply(lambda x: nltk.pos_tag(nltk.word_tokenize(x)))
test['tags'] = test['quotation'].apply(lambda quotation: nltk.pos_tag(nltk.word_tokenize(quotation)))

# Tense label of every quotation.
#quotes['tense'] = quotes['tags'].apply(lambda x: [pos for word,pos in tags if np.isin(pos, past + present)])
#quotes['tense'] = quotes['tense'].apply(lambda x: 'present' if set(x).issubset(set(present)) == True else 'past' )
test['tense'] = test['tags'].apply(_tense_label)

#quotes['score'] = quotes['findings'].apply(lambda x: [sentences['score'][ind] for ind in sentences.index if(sentences['sentence'][ind] in x)])

# In the future, past and present quotations will have to be differentiated in order to associate the right score:
#quotes[quotes.tense == 'present']['score'] = quotes[quotes.tense == 'present']['findings'].apply(lambda x: [sentences['score'][ind] for ind in sentences.index if(sentences[sentences.tense == 'present']['sentence'][ind] in x)])
#quotes[quotes.tense == 'past']['score'] = quotes[quotes.tense == 'past']['findings'].apply(lambda x: [sentences['score'][ind] for ind in sentences.index if(sentences[sentences.tense == 'past']['sentence'][ind] in x)])

test['score'] = test['findings'].apply(_scores_of)
display(test)

# CODE DE FLORETTE:


In [None]:
# If needed:
#pip install -U sentence-transformers

# Semantic-comparison smoke test on a few random sentences.
# The list has its own name so that it does not overwrite the `sentences`
# DataFrame of confidence expressions used by the cells above.
demo_sentences = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "He found a leprechaun in his walnut shell."
]

from sentence_transformers import SentenceTransformer

# NOTE(review): this is a path on one author's machine; the trailing comment
# names an alternative model identifier.
model = SentenceTransformer(r'C:\Users\flore\Desktop\EPFL\MA1\ada\PROJET\pretrained_model') #'bert-base-nli-mean-tokens'

sentence_embeddings = model.encode(demo_sentences)

from sklearn.metrics.pairwise import cosine_similarity

# Similarity of the first sentence with each of the others
# (the closing parenthesis was missing, which made the cell a SyntaxError).
cosine_similarity(
    [sentence_embeddings[0]],
    sentence_embeddings[1:]
)

In [None]:
# Semantic-comparison test between a few of our quotations and one of the
# reference phrases of the paper.
# This follows a BERT example found online, with the pretrained model all-MiniLM-L6-v2.


# Note: normally the model does not need to be on the computer; giving the name
# 'all-MiniLM-L6-v2' is enough and SentenceTransformer fetches it from the
# internet.  There was a bug on the day this was written, so the model was
# downloaded locally and its path is used instead.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer(r'C:\Users\flore\Desktop\EPFL\MA1\ada\PROJET\pretrained_model') #with model all-MiniLM-L6-v2' # or use 'bert-base-nli-mean-tokens'


# Extract a small part of the whole file (the first 10 quotes) for testing.
path = r'C:\Users\flore\Desktop\EPFL\MA1\ada\PROJET\quotes-2020-precessed.json.bz2' # path on the author's computer

# Rows are collected in a list and the frame is built once: DataFrame.append
# no longer exists in pandas 2.x and copied the whole frame on every call.
rows = []
with bz2.open(path, 'rb') as s_file:
  for instance in s_file:
    instance = json.loads(instance)
    print(instance)
    #print(instance['quotation'])
    rows.append({'quoteID': instance['quoteID'], 'quotation': instance['quotation']})
    if len(rows) == 10:
      break

# Number of quotes actually read (at most 10).
i = len(rows)
# Same three columns as before: 'qids' was declared but never filled, and
# 'quoteID' was added by the appended rows.
# NOTE(review): the processed file written earlier in this notebook has no
# 'quoteID' field — confirm which file this cell is meant to read.
test = pd.DataFrame(rows, columns=['qids', 'quotation', 'quoteID'])

# Embedding of the 10 quotations.
sentence_embeddings = model.encode(test['quotation'])

# Comparison with one of the phrases of the paper: cosine similarity.
from sklearn.metrics.pairwise import cosine_similarity
to_compare = ["I'm sure it's"]
to_compare_embedding = model.encode(to_compare)

cosine_similarity(
    [to_compare_embedding[0]],
    sentence_embeddings[0:]
)

# The output is an array of similarity coefficients (per the author, they are
# all low here, since no quotation contains anything close to "I'm sure it's").

ModuleNotFoundError: ignored

In [None]:
# Second check: compare against the beginning of a quotation, to see whether
# that quotation does get a higher similarity score.
to_compare_2 = ["Department of Homeland Security was livid and"]
to_compare_2_embedding = model.encode(to_compare_2)

cosine_similarity([to_compare_2_embedding[0]], sentence_embeddings[0:])
# Per the author's observation, the first coefficient is indeed much higher.