# Import

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import bz2
import urllib
from datetime import datetime, date

# Folder holding the input datasets and receiving the generated files.
# File paths are built by plain string concatenation, so keep the trailing slash.
data_folder = 'data/'

In [38]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the VADER lexicon (no-op when it is already up to date locally),
# then build the sentiment analyzer that relies on it.
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/theopatron/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Creation of extended files

### FOR THE TIMELINE
Keep quotations mentioning guns.

### FOR THE ANALYSIS
Keep quotations mentioning guns. <br/>
Keep only those that have a speaker. <br/>
Keep only American speakers. <br/>
Add columns corresponding to the characteristics of the speaker. <br/>

In [39]:
# Terms whose presence in a quotation (plain substring test) marks it as gun-related.
lexical_field = [
    'gun',
    'firearm',
    'mass shooting',
    'murder',
    'homicide',
    'gun shot',
    'armed robbery',
    'rifles',
    'Second Amendment',
    'Columbine',
    'gun control',
]

# Speaker attribute table (one row per speaker, looked up through its 'id' column).
speakers = pd.read_parquet(data_folder + 'speaker_attributes.parquet')

# QID -> human-readable label table, indexed by QID; the 'Label' column is used for translation.
label = pd.read_csv(
    data_folder + 'wikidata_labels_descriptions_quotebank.csv.bz2',
    index_col='QID',
    compression='bz2',
)

In [40]:
# Build the "extended" quotes file: keep gun-related quotations attributed to an
# American speaker and enrich each kept record with the speaker's characteristics
# (gender, nationality, occupation, age, ethnic group, party, religion).
path_to_file = data_folder + 'quotes-2017.json.bz2'
path_to_out = data_folder + 'quotes-2017-extended.json.bz2'

# Wikidata QIDs the filter relies on.
QID_USA = 'Q30'
GENDER_BY_QID = {'Q6581097': 'male', 'Q6581072': 'female'}

# Set to an integer (e.g. 100_000) to stop after that many input lines for a quick
# test run; None processes the whole file.
max_lines = None

# Row position of every speaker keyed by QID, built once so that each lookup is a
# hash access instead of a full scan of the speakers table. When an id appears
# several times, the first row is kept.
speaker_pos = pd.Series(range(len(speakers)), index=pd.Index(speakers['id']))
speaker_pos = speaker_pos[~speaker_pos.index.duplicated()]


def _is_missing(value):
    """Return True when a speaker attribute is absent (None or a scalar NaN)."""
    return value is None or (isinstance(value, float) and value != value)


def _label_of(qid):
    """Return the label of one QID, or the raw QID when the label table lacks it."""
    return label.loc[qid, 'Label'] if qid in label.index else qid


def _labels_for(qids):
    """Translate a sequence of QIDs into labels; None when the attribute is missing."""
    if _is_missing(qids):
        return None
    return [_label_of(qid) for qid in qids]


def _gender_label(gender_qids):
    """Return the label of the first listed gender, or None when none is recorded.

    Male and female keep the short names 'male' / 'female'; any other gender is
    reported with its own label instead of being folded into 'female'.
    """
    if _is_missing(gender_qids) or len(gender_qids) == 0:
        return None
    first = gender_qids[0]
    return GENDER_BY_QID.get(first) or _label_of(first)


def _age_today(date_of_birth):
    """Return the age in whole years as of today, or None when it cannot be computed.

    Characters 1-10 of the first recorded date are parsed as YYYY-MM-DD (the leading
    sign character is skipped). Missing, empty or unparsable dates give None.
    """
    # NOTE(review): the age is relative to the day the notebook runs, not to the
    # date of the quotation, so re-running later shifts every age - confirm intent.
    try:
        born = datetime.strptime(date_of_birth[0][1:11], "%Y-%m-%d").date()
    except (TypeError, IndexError, ValueError):
        return None
    today = date.today()
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))


n_lines_read = 0  # number of input lines consumed so far

with bz2.open(path_to_file, 'rb') as s_file, bz2.open(path_to_out, 'wb') as d_file:
    for line in s_file:
        if max_lines is not None and n_lines_read >= max_lines:
            break
        n_lines_read += 1
        instance = json.loads(line)  # one quotation record

        # Keep only quotations that have at least one candidate speaker ...
        qids = instance['qids']
        if not qids:
            continue

        # ... and that contain a term of the lexical field.
        # NOTE(review): this is a case-sensitive substring test, so a sentence-initial
        # "Guns" is missed while "begun" matches 'gun' - confirm this is intended.
        quotation = instance['quotation']
        if not any(term in quotation for term in lexical_field):
            continue

        # NOTE(review): only the first candidate QID is used; for homonyms this may
        # attach the attributes of the wrong person.
        pos = speaker_pos.get(qids[0])
        if pos is None:
            continue  # speaker unknown to the attributes table
        speaker = speakers.iloc[pos]

        # Keep American speakers only.
        nationality = speaker['nationality']
        if _is_missing(nationality) or QID_USA not in list(nationality):
            continue

        instance['gender'] = _gender_label(speaker['gender'])
        instance['nationality'] = _labels_for(nationality)
        instance['occupation'] = _labels_for(speaker['occupation'])
        instance['age'] = _age_today(speaker['date_of_birth'])
        instance['ethnic_group'] = _labels_for(speaker['ethnic_group'])
        instance['party'] = _labels_for(speaker['party'])
        instance['religion'] = _labels_for(speaker['religion'])

        # One JSON document per line in the compressed output file.
        d_file.write((json.dumps(instance) + '\n').encode('utf-8'))

# Analysis

In [36]:
def process_chunk(chunk):
    """Report how many rows `chunk` holds, then render it with the notebook's rich display."""
    n_rows = len(chunk)
    print(f'Processing chunk with {n_rows} rows')
    display(chunk)


# Preview: read the extended file lazily and look at its first chunk only.
extended_path = data_folder + 'quotes-2017-extended.json.bz2'
with pd.read_json(extended_path, chunksize=100000, compression='bz2', lines=True) as df_reader:
    chunk = next(df_reader, None)
    if chunk is not None:
        process_chunk(chunk)

Processing chunk with 23 rows


Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,gender,nationality,occupation,age,ethnic_group,party,religion
0,2017-11-19-055725,The easiest one is to make guns better for hom...,David Hemenway,[Q1107796],2017-11-19 14:50:20,1,"[[David Hemenway, 0.791], [None, 0.209]]",[http://www.wbur.org/npr/462252799/research-su...,E,male,[United States of America],"[economist, university teacher]",76.0,,,
1,2017-06-16-034936,I ask that you will continue to pray for him a...,Gov. John Bel Edwards,[Q6221385],2017-06-16 03:52:10,1,"[[Gov. John Bel Edwards, 0.6379], [None, 0.186...",[http://fox8live.com/story/35678095/scalise-sh...,E,male,[United States of America],"[lawyer, politician]",55.0,,[Democratic Party],
2,2017-07-30-042440,It's not just a crime issue. It's an everyday ...,Greg Mitchell,[Q3116261],2017-07-30 11:57:14,1,"[[Greg Mitchell, 0.9228], [None, 0.0772]]",[http://abc7chicago.com/news/man-charged-with-...,E,male,[United States of America],"[journalist, writer, blogger]",74.0,,,
3,2017-06-14-157004,"You have a baseball bat, they have a rifle, yo...",Mo Brooks,[Q1941306],2017-06-14 12:14:33,272,"[[Mo Brooks, 0.8501], [None, 0.1003], [Bernie ...",[http://www.naplesnews.com/story/news/nation/2...,E,male,[United States of America],"[politician, lawyer, clerk]",67.0,,[Republican Party],
4,2017-11-17-007224,And that's when the youth pick up guns.,Eric King,"[Q5386890, Q5386891, Q57174567]",2017-11-17 03:31:00,2,"[[Eric King, 0.6366], [None, 0.3448], [Michael...",[http://www.wtvm.com/story/36866822/dougherty-...,E,male,[United States of America],[American football player],39.0,,,
5,2017-06-15-152823,"When I go to states like Arizona, when I go to...",Jedediah Bila,[Q16841387],2017-06-15 19:44:18,1,"[[Jedediah Bila, 0.8488], [Whoopi Goldberg, 0....",[http://rare.us/rare-news/the-media/the-claws-...,E,female,[United States of America],"[writer, journalist, columnist, politician, te...",42.0,,,
6,2017-09-27-135272,They did have to draw their guns and they did ...,Tracey Adams,"[Q239492, Q26110575]",2017-09-27 22:25:14,3,"[[Tracey Adams, 0.9225], [None, 0.0757], [Anto...",[http://www.wptv.com/news/region-the-glades/so...,E,female,[United States of America],"[pornographic actor, film actor]",63.0,,,
7,2017-11-07-077322,"It's just so absurd, Chris. The idea that -- I...",Alisyn Camerota,[Q4727358],2017-11-07 12:28:01,1,"[[Alisyn Camerota, 0.756], [None, 0.2236], [Pr...",[https://www.rawstory.com/2017/11/cnns-alisyn-...,E,female,[United States of America],[journalist],55.0,,,[Catholicism]
8,2017-08-30-196641,"You yearn for it and yearn for it, but there's...",Emily Fridlund,[Q41962473],2017-08-30 22:00:58,1,"[[Emily Fridlund, 0.9369], [None, 0.0631]]",[http://georgetownvoice.com/2017/08/30/the-voi...,E,female,[United States of America],[writer],,,,
9,2017-11-09-102929,"Nobody needs a machine gun, coming from a guy ...",Sturgill Simpson,[Q16736403],2017-11-09 14:19:28,1,"[[Sturgill Simpson, 0.7394], [None, 0.2606]]",[https://www.huffingtonpost.com/entry/sturgill...,E,male,[United States of America],"[singer, singer-songwriter]",43.0,,,
