# Import

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import bz2
import urllib
from datetime import datetime, date

# Folder (relative to the notebook) holding every input and output data file;
# the trailing slash is required because paths are built by concatenation.
data_folder = "data/"

In [38]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the VADER lexicon (nltk reports "already up-to-date" when it is cached).
nltk.download('vader_lexicon')

# Shared sentiment analyzer instance.
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/theopatron/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Creation of extended files

### FOR THE TIMELINE
Keep quotations mentioning guns

### FOR THE ANALYSIS
Keep quotations mentioning guns <br/>
Keep only those that have a speaker <br/>
Keep only American speakers <br/>
Add columns corresponding to the characteristics of the speaker <br/>

In [39]:
# Gun-related terms; a quotation is kept when it contains at least one of them
# (the filtering cell compares them case-insensitively, as substrings).
lexical_field = [
    'gun',
    'firearm',
    'mass shooting',
    '2nd Amendment',
    'murder',
    'homicide',
    'gun shot',
    'armed robbery',
    'rifles',
    'Second Amendment',
    'Columbine',
    'gun control',
]

# Speaker attribute table (looked up by its 'id' column in the filtering cell).
speakers = pd.read_parquet(
    data_folder + 'speaker_attributes.parquet'
)

# QID -> label table, indexed by QID so labels can be fetched with .loc.
label = pd.read_csv(
    data_folder + 'wikidata_labels_descriptions_quotebank.csv.bz2',
    compression='bz2',
    index_col='QID',
)

In [45]:
# Build the "extended" 2017 quote file.
#
# Streams the raw quote file line by line and keeps a quote only when
#   1. it has at least one candidate speaker QID,
#   2. its text contains a term of `lexical_field` (case-insensitive substring),
#   3. the first candidate speaker is found in `speakers` and has the
#      United States (Q30) among their nationalities.
# Each kept quote is enriched with gender, nationality, occupation, age,
# ethnic_group, party and religion, then written as one JSON line.
path_to_file = data_folder + 'quotes-2017.json.bz2'
path_to_out = data_folder + 'quotes-2017-extended.json.bz2'

US_QID = 'Q30'            # Wikidata item: United States of America
MALE_QID = 'Q6581097'     # Wikidata item: male
FEMALE_QID = 'Q6581072'   # Wikidata item: female
MAX_LINES = None          # set to e.g. 100000 to try the cell on a prefix of the file

# Loop-invariant values, computed once instead of once per input line.
lexical_terms = [term.lower() for term in lexical_field]
labels_by_qid = label['Label']
# NOTE(review): age is measured at notebook run time, not at the quote's date,
# so the stored value changes from one run to the next -- confirm this is intended.
today = date.today()


def _label_of(qid):
    """Return the label of `qid`, or `qid` itself when no usable label exists.

    Falling back to the raw QID keeps the run alive (and the value traceable)
    when a QID is absent from the label table or its label is empty.
    """
    found = labels_by_qid.get(qid)
    if isinstance(found, pd.Series):
        # Duplicated QID in the label index: keep the first entry.
        found = found.iloc[0] if len(found) else None
    if found is None or pd.isna(found):
        return qid
    return found


def _labels_or_none(qids):
    """Translate a collection of QIDs into a list of labels; None stays None."""
    if qids is None:
        return None
    return [_label_of(qid) for qid in qids]


def _gender_of(gender_qids):
    """Return 'male', 'female', another gender label, or None when unknown.

    Only the first listed gender is used.
    """
    if gender_qids is None or len(gender_qids) == 0:
        return None
    first = gender_qids[0]
    if first == MALE_QID:
        return 'male'
    if first == FEMALE_QID:
        return 'female'
    # Any other gender keeps its own label instead of being reported as 'female'.
    return _label_of(first)


def _age_of(date_of_birth):
    """Return the age in full years as of `today`, or None if it cannot be computed.

    Only the first listed date is used; characters 1-10 of it must form a
    valid YYYY-MM-DD date (the leading sign character is skipped).
    """
    try:
        born = datetime.strptime(date_of_birth[0][1:11], "%Y-%m-%d").date()
    except (TypeError, IndexError, ValueError):
        # Missing value, empty list, or a date that is not a real calendar day.
        return None
    # Subtract one year when this year's birthday has not happened yet.
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))


n_read = 0  # number of input lines consumed (was `iter`, which shadowed the builtin)

with bz2.open(path_to_file, 'rb') as s_file, bz2.open(path_to_out, 'wb') as d_file:
    for line in s_file:
        if MAX_LINES is not None and n_read >= MAX_LINES:
            break
        n_read += 1

        instance = json.loads(line)  # one quote per line

        # Keep only quotes that have a speaker ...
        qids = instance['qids']
        if not qids:
            continue

        # ... and that mention a term of the lexical field.
        quotation = instance['quotation'].lower()
        if not any(term in quotation for term in lexical_terms):
            continue

        # NOTE(review): only the first candidate QID is used; when several
        # people share the speaker's name this may pick the wrong one.
        matches = speakers.loc[speakers['id'] == qids[0]]
        if matches.empty:
            continue
        speaker = matches.iloc[0]

        # Keep American speakers only.
        nationality = speaker['nationality']
        if nationality is None or not any(qid == US_QID for qid in nationality):
            continue

        # Speaker characteristics (same keys, same order as before).
        instance['gender'] = _gender_of(speaker['gender'])
        instance['nationality'] = _labels_or_none(nationality)
        instance['occupation'] = _labels_or_none(speaker['occupation'])
        instance['age'] = _age_of(speaker['date_of_birth'])
        instance['ethnic_group'] = _labels_or_none(speaker['ethnic_group'])
        instance['party'] = _labels_or_none(speaker['party'])
        instance['religion'] = _labels_or_none(speaker['religion'])

        d_file.write((json.dumps(instance) + '\n').encode('utf-8'))

# Analysis

In [44]:
def process_chunk(chunk):
    """Report the size of `chunk` and show a preview of it.

    Prints the number of rows, then displays at most the first 60 rows.
    Returns nothing.
    """
    n_rows = len(chunk)
    print(f'Processing chunk with {n_rows} rows')

    preview = chunk.head(60)
    display(preview)

# Read the extended file lazily, in chunks of up to 100000 rows, and preview
# the first chunk only.
extended_quotes_path = data_folder + 'quotes-2017-extended.json.bz2'

with pd.read_json(
    extended_quotes_path,
    lines=True,
    compression='bz2',
    chunksize=100000,
) as df_reader:
    for chunk in df_reader:
        process_chunk(chunk)
        break  # stop after the first chunk

Processing chunk with 145 rows


Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,gender,nationality,occupation,age,ethnic_group,party,religion
0,2017-03-26-000661,"A big burglar, I never really knew him to carr...",Brian Andrews,"[Q2924842, Q4962982, Q4962983]",2017-03-26 17:01:19,1,"[[Brian Andrews, 0.7167], [Tony Fiore, 0.1666]...",[http://www.providencejournal.com/news/2017032...,E,male,[United States of America],[singer],41.0,,,
1,2017-05-22-015293,"Congress, investigate Seth Rich Murder! @ Juli...",Sean Hannity,[Q557730],2017-05-22 14:40:47,3,"[[Sean Hannity, 0.8718], [None, 0.0946], [Kim ...",[http://news.yahoo.com/news/seth-rich-conspira...,E,male,[United States of America],"[radio personality, television presenter, jour...",59.0,,[Conservative Party of New York State],[Catholicism]
2,2017-09-11-088873,similar rules for whether you can own a gun an...,Kris Kobach,[Q6437211],2017-09-11 22:17:52,2,"[[Kris Kobach, 0.5138], [John Lott, 0.3198], [...",[http://www.kansas.com/news/politics-governmen...,E,male,[United States of America],[lawyer],55.0,,[Republican Party],
3,2017-08-10-034688,He keeps getting better each day at practice. ...,Ruffin McNeill,[Q7377923],2017-08-10 01:00:45,1,"[[Ruffin McNeill, 0.7703], [None, 0.2297]]",[http://newsok.com/marquise-overton-expected-t...,E,male,[United States of America],"[head coach, American football player]",63.0,,,
4,2017-11-19-055725,The easiest one is to make guns better for hom...,David Hemenway,[Q1107796],2017-11-19 14:50:20,1,"[[David Hemenway, 0.791], [None, 0.209]]",[http://www.wbur.org/npr/462252799/research-su...,E,male,[United States of America],"[economist, university teacher]",76.0,,,
5,2017-10-03-036601,How could there be a file of mass shooting rem...,Jimmy Fallon,[Q335680],2017-10-03 15:18:43,3,"[[Jimmy Fallon, 0.4761], [None, 0.4727], [Adam...",[http://news.yahoo.com/entertainment/night-res...,E,male,[United States of America],"[actor, television presenter, writer, singer, ...",47.0,,[Democratic Party],[Catholicism]
6,2017-08-09-125224,The interesting thing that we're doing with th...,Ryan Murphy,"[Q1185730, Q19979249, Q21934130, Q316844, Q345...",2017-08-09 22:09:00,2,"[[Ryan Murphy, 0.8488], [None, 0.1447], [Santo...",[http://etonline.com/tv/223469_fx_reveals_amer...,E,male,[United States of America],[swimmer],26.0,,,
7,2017-08-16-042607,I and millions of other men and women across t...,Dana Loesch,[Q3013272],2017-08-16 00:50:13,1,"[[Dana Loesch, 0.6161], [None, 0.3839]]",[http://theblaze.com/video/congresswoman-blame...,E,female,[United States of America],"[radio personality, journalist]",43.0,,,
8,2017-12-11-100134,The Second Amendment -- you're aware of that? ...,Jeff Sessions,[Q358443],2017-12-11 18:00:00,1,"[[Jeff Sessions, 0.8341], [None, 0.1659]]",[https://www.thefix.com/jeff-sessions-gets-gri...,E,male,[United States of America],"[politician, lawyer, lawyer, political scientist]",74.0,,[Republican Party],[Methodism]
9,2017-03-14-098444,"They ask for money. I said, `I don't have any ...",Kim Kardashian,[Q186304],2017-03-14 19:40:45,1,"[[Kim Kardashian, 0.8586], [None, 0.1414]]",[http://feeds.inquisitr.com/~r/google/yDYq/~3/...,E,female,[United States of America],"[fashion designer, socialite, model, blogger, ...",41.0,"[Armenian American, Dutch American, English Am...",[Democratic Party],[Christianity]
