# TO RUN : imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import bz2
import urllib
from datetime import datetime, date

data_folder = 'data/'

# SELECT DATA OF INTEREST 
The project focusing on the debate of the right to bear arms in the USA, the first task consists in selecting the data related to this topic. To do so, we use a lexical field (named ***lexical_field*** in the code)  related to the topic, and select only the quotations, which contain one or more words of this lexical field. <br> The determined word bank is the set of the following words: 'gun', 'firearm', 'mass shooting', '2nd Amendment', 'homicide', 'gun shot', 'armed robbery', 'rifles', 'Second Amendment', 'Columbine', 'gun control'. The way the words have been chosen is explained in the `read.me`.  <br> <br> 
The selected quotes are then stored in a new data file named `quotes-20__-extended.json.bz2` in form of a dataframe with new columns. The added columns contain information about the speakers (gender, nationality, occupations, age (computed from the date of birth), ethnic group, political party and religion). Such information are taken from the second dataset `speaker_attributes.parquet`, built from wikidata information. Quotations that are not related to any seaker are not kept. <br> <br>
This data preprocessing being long, we decide to only treat the quotations of 2017 for Milestone 2. The corresponding file have a total of more than 26 millions quotes, it contains way enough infomation to compute the first statistics and check if our project is feasible. <br>
This is also why we decide to save the most information possible about the speaker. The quotations from the other years will be studied in Milestone 3.  <br> <br>

In [2]:
lexical_field = ['gun','firearm','mass shooting','2nd Amendment','homicide','gun shot','armed robbery','rifles','Second Amendment','Columbine', 'gun control']

speakers = pd.read_parquet(data_folder + 'speaker_attributes.parquet')
label = pd.read_csv(data_folder + 'wikidata_labels_descriptions_quotebank.csv.bz2', compression='bz2', index_col='QID')

In [3]:
path_to_file = data_folder + 'quotes-2019.json.bz2' 
path_to_out = data_folder + 'quotes-2019-extended.json.bz2'

iter = 0
nb_occ = 0

with bz2.open(path_to_file, 'rb') as s_file:
    with bz2.open(path_to_out, 'wb') as d_file:
        for instance in s_file:
            instance = json.loads(instance) # loading a sample
            
            #To count the number of citations
            iter += 1
            
            #To count the total number of occurences
            if instance['numOccurrences'] is not None:
                nb_occ += instance['numOccurrences']

            #To check the progress of the algorithm, because it takes multiple hours to run
            if (iter % 100000 == 0):
                print('nombre de citations lus: {}'.format(iter))

            #We check if the quotation contains words from the lexical field
            if any(substring.lower() in instance['quotation'].lower() for substring in lexical_field) and instance['qids'] != []: #We keep only quotation containing words of the lexical field and where there is a speaker
                
                #We load additional information about the speaker
                speaker = speakers.loc[speakers['id'] == instance['qids'][0]].squeeze()

                #We add nationality
                if speaker.nationality is not None:
                    instance['nationality'] = []
                    for i in speaker['nationality']:
                        nat = label.loc[i]['Label']
                        instance['nationality'].append(nat)
                else:
                    instance['nationality'] = None
                    
                #We add the gender
                if speaker.gender is not None:
                    instance['gender'] = []
                    for i in speaker['gender']:
                        gend = label.loc[i]['Label']
                        instance['gender'].append(gend)
                else:
                    instance['gender'] = None

                #We add the occupations
                
               
                    
                
                
                if speaker.occupation is not None:
                    instance['occupation'] = []
                    for i in speaker['occupation']:
                        if i != 'Q99753484':
                            occ = label.loc[i]['Label']
                            instance['occupation'].append(occ)
                else:
                    instance['occupation'] = None

                #We add the age (computed from the date of birth)
                #We use a try since the date is wrong (e.g. month = 0) at some places
                try:
                    born = datetime.strptime(speaker.date_of_birth[0][1:11], "%Y-%m-%d").date()
                    today = date.today()
                    age = today.year - born.year - ((today.month, today.day) < (born.month, born.day))
                except:
                    age = None
                instance['age'] = age

                #We add the ethnic group
                if speaker.ethnic_group is not None:
                    instance['ethnic_group'] = []
                    for i in speaker['ethnic_group']:
                        ethnic = label.loc[i]['Label']
                        instance['ethnic_group'].append(ethnic)
                else:
                    instance['ethnic_group'] = None

                #We add the party
                if speaker.party is not None:
                    instance['party'] = []
                    for i in speaker['party']:
                        part = label.loc[i]['Label']
                        instance['party'].append(part)
                else:
                    instance['party'] = None

                #We add the religion
                if speaker.religion is not None:
                    instance['religion'] = []
                    for i in speaker['religion']:
                        relig = label.loc[i]['Label']
                        instance['religion'].append(relig)
                else:
                    instance['religion'] = None

                d_file.write((json.dumps(instance)+'\n').encode('utf-8')) # writing in the new file


print('iter = {i}'.format(i = iter))
print('nb_occ = {n}'.format(n = nb_occ))

nombre de citations lus: 100000
nombre de citations lus: 200000
nombre de citations lus: 300000
nombre de citations lus: 400000
nombre de citations lus: 500000
nombre de citations lus: 600000
nombre de citations lus: 700000
nombre de citations lus: 800000
nombre de citations lus: 900000
nombre de citations lus: 1000000
nombre de citations lus: 1100000
nombre de citations lus: 1200000
nombre de citations lus: 1300000
nombre de citations lus: 1400000
nombre de citations lus: 1500000
nombre de citations lus: 1600000
nombre de citations lus: 1700000
nombre de citations lus: 1800000
nombre de citations lus: 1900000
nombre de citations lus: 2000000
nombre de citations lus: 2100000
nombre de citations lus: 2200000
nombre de citations lus: 2300000
nombre de citations lus: 2400000
nombre de citations lus: 2500000
nombre de citations lus: 2600000
nombre de citations lus: 2700000
nombre de citations lus: 2800000
nombre de citations lus: 2900000
nombre de citations lus: 3000000
nombre de citations

KeyError: 'Q6363085'

**Useful statistics :**   
Number of "different" quotes in the 2017 newpapers:  26 611 588.
Some of these quotes being mentionned in several articles (number of occurence >1), the total number of quotes in 2017 is: 136 326 717.