# Import

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import bz2
import urllib
from datetime import datetime, date

# Relative directory prefix prepended to every data file path read or written in this notebook.
data_folder = 'data/'

In [17]:
import nltk
# Make sure the 'vader_lexicon' resource is available locally
# (the call only reports "already up-to-date" when it has been downloaded before).
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Shared analyzer instance; for now it is only referenced by the disabled
# sentiment-scoring draft inside process_chunk.
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/theopatron/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Creation of extended files

### FOR THE TIMELINE
Keep quotations mentioning guns.

### FOR THE ANALYSIS
Keep quotations mentioning guns. <br/>
Keep only those that have a speaker. <br/>
Keep only American speakers. <br/>
Add columns describing the characteristics of the speaker. <br/>

In [18]:
# Keywords whose presence marks a quotation as gun-related.
lexical_field = [
    'guns',
    'rifle',
    'slaughter',
]  # To be continued

# Speaker attribute table (Wikidata QIDs describing each speaker).
speakers = pd.read_parquet(data_folder + 'speaker_attributes.parquet')

# Lookup table indexed by QID, used to turn QIDs into readable labels.
label = pd.read_csv(
    data_folder + 'wikidata_labels_descriptions_quotebank.csv.bz2',
    index_col='QID',
    compression='bz2',
)

In [92]:
path_to_file = data_folder + 'quotes-2015.json.bz2'
path_to_out = data_folder + 'quotes-2015-extended.json.bz2'

US_QID = 'Q30'  # QID that must appear in a speaker's nationalities for the quote to be kept
GENDER_NAMES = {'Q6581097': 'male', 'Q6581072': 'female'}  # Wikidata gender QIDs
MAX_LINES = 20000  # testing cap: stop once more than this many lines were read; None = whole file

# Row position of each speaker id, built once so every lookup is a hash access
# instead of a full scan of the `speakers` table. Only the first row of a
# duplicated id is kept.
speaker_rows = pd.Series(range(len(speakers)), index=speakers['id'].to_numpy())
speaker_rows = speaker_rows[~speaker_rows.index.duplicated()]


def _label_of(qid):
    """Return the label of a QID, or the QID itself when `label` has no row for it."""
    try:
        return label.loc[qid, 'Label']
    except KeyError:
        # Keep the raw QID rather than aborting the whole extraction.
        return qid


def _labels_of(qids):
    """Translate a collection of QIDs into labels; a missing attribute (None) stays None."""
    if qids is None:
        return None
    return [_label_of(qid) for qid in qids]


def _gender_of(gender_qids):
    """Return 'male'/'female' for the first gender QID, its label for any other gender, None if absent."""
    if gender_qids is None or len(gender_qids) == 0:
        return None
    first = gender_qids[0]
    return GENDER_NAMES.get(first) or _label_of(first)


def _age_on(date_of_birth, reference_day):
    """Return the age in whole years on `reference_day`, or None if the birth date is missing or unparsable."""
    try:
        # Characters 1..10 of the first entry are parsed as YYYY-MM-DD (the leading character is skipped).
        born = datetime.strptime(date_of_birth[0][1:11], "%Y-%m-%d").date()
    except (TypeError, IndexError, ValueError):
        return None
    had_no_birthday_yet = (reference_day.month, reference_day.day) < (born.month, born.day)
    return reference_day.year - born.year - had_no_birthday_yet


def _extend_quote(instance, today):
    """Add speaker attributes to one quote record, or return None if the quote must be dropped.

    A quote is kept only when it contains a word of `lexical_field`, has at
    least one candidate speaker, and that speaker has US_QID among its nationalities.
    """
    # Case-insensitive match so that e.g. a sentence starting with "Guns" is not missed.
    text = instance['quotation'].lower()
    if not any(word.lower() in text for word in lexical_field):
        return None
    if not instance['qids']:
        return None

    # NOTE: only the first candidate QID is used, so homonymous speakers are not disambiguated.
    row = speaker_rows.get(instance['qids'][0])
    if row is None:
        return None
    speaker = speakers.iloc[row]

    nationality = speaker['nationality']
    if nationality is None or not any(qid == US_QID for qid in nationality):
        return None

    instance['gender'] = _gender_of(speaker['gender'])
    instance['nationality'] = _labels_of(nationality)
    instance['occupation'] = _labels_of(speaker['occupation'])
    # NOTE: the age is relative to the day the notebook runs, not to the quote date.
    instance['age'] = _age_on(speaker['date_of_birth'], today)
    # Missing attributes are written as JSON null, which pandas shows as NaN when
    # the file is read back; that is the "NaN in the output file".
    instance['ethnic_group'] = _labels_of(speaker['ethnic_group'])
    instance['party'] = _labels_of(speaker['party'])
    instance['religion'] = _labels_of(speaker['religion'])
    return instance


today = date.today()

with bz2.open(path_to_file, 'rb') as s_file:
    with bz2.open(path_to_out, 'wb') as d_file:
        for n_read, raw_line in enumerate(s_file, start=1):
            extended = _extend_quote(json.loads(raw_line), today)  # loading a sample
            if extended is not None:
                d_file.write((json.dumps(extended) + '\n').encode('utf-8'))  # writing in the new file

            # to test
            if MAX_LINES is not None and n_read > MAX_LINES:
                break

# Analysis

In [93]:
def process_chunk(chunk: pd.DataFrame) -> None:
    """Print the number of rows of one chunk of extended quotes and render the chunk.

    Args:
        chunk: one batch of rows yielded by the chunked ``pd.read_json`` reader.

    Returns:
        None. The function only produces notebook output.
    """
    print(f'Processing chunk with {len(chunk)} rows')
    # `display` is not imported in this file; presumably it is the rich-display
    # helper provided by the notebook kernel. Confirm before running as a script.
    display(chunk)
    # Disabled draft of the sentiment analysis: score every quotation with the
    # VADER analyzer `sid`, keep the 'compound' score, then print each quotation
    # next to its score.
    # print(chunk.quotation[1])
    # chunk['scores'] = chunk['quotation'].apply(lambda review: sid.polarity_scores(review))
    # chunk['compound']  = chunk['scores'].apply(lambda score_dict: score_dict['compound'])
    # for id, row in chunk.iterrows():
    #     print(chunk.quotation[id])
    #     print(chunk.compound[id])

# Stream the extended quotes file in chunks and inspect only the first one.
with pd.read_json(
    data_folder + 'quotes-2015-extended.json.bz2',
    compression='bz2',
    lines=True,
    chunksize=10000,
) as df_reader:
    for chunk in df_reader:
        process_chunk(chunk)
        # A single chunk is enough for a quick look at the data.
        break

Processing chunk with 7 rows


Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,gender,nationality,occupation,age,ethnic_group,party,religion
0,2015-12-08-003771,After the failure of the U.S. Congress to pass...,Roy Cooper,"[Q16106910, Q7372694, Q7372695]",2015-12-08 16:15:05,1,"[[Roy Cooper, 0.928], [None, 0.0574], [Frank P...",[http://digtriad.com/story/news/2015/12/08/sho...,E,male,[United States of America],[politician],76.0,,[Republican Party],
1,2015-11-14-003251,And I will tell you what -- you can say what y...,Donald Trump,"[Q22686, Q27947481]",2015-11-14 22:45:14,6,"[[Donald Trump, 0.798], [None, 0.202]]",[http://news.yahoo.com/paris-wouldve-different...,E,male,[United States of America],"[business magnate, investor, restaurateur, non...",75.0,,"[Republican Party, Independence Party of Ameri...",[Presbyterianism]
2,2015-07-29-058316,"It means higher prices, it means a slaughter o...",Sherrod Brown,[Q381880],2015-07-29 23:37:47,1,"[[Sherrod Brown, 0.4062], [Pat Roberts, 0.3282...",[http://www.nbc15.com/news/state/headlines/Wis...,E,male,[United States of America],"[politician, university teacher]",68.0,,[Democratic Party],"[Lutheranism, Evangelical Lutheran Church in A..."
3,2015-12-07-058906,Law abiding Americans owning guns is not the p...,Ralph Peters,[Q7287957],2015-12-07 20:31:06,1,"[[Ralph Peters, 0.8094], [Stuart Varney, 0.142...",[http://www.ibtimes.co.uk/obama-such-total-py-...,E,male,[United States of America],"[novelist, non-fiction writer]",69.0,,,
4,2015-02-05-088180,There were three long rifles as well as a hand...,Michael O'Keefe,"[Q16093372, Q575761]",2015-02-05 18:23:46,1,"[[Michael O'Keefe, 0.6323], [None, 0.1913], [D...",[http://whdh.com/Global/story.asp?S=28030754],E,male,[United States of America],"[lawyer, politician]",,,[Democratic Party],
5,2015-03-15-036625,that criminals armed with guns are targeting o...,Carl Williams,"[Q130674, Q3658816, Q39079097, Q614934]",2015-03-15 17:09:28,1,"[[Carl Williams, 0.8798], [None, 0.1202]]",[http://news.yahoo.com/jamaica-police-off-duty...,E,male,[United States of America],"[screenwriter, film producer, television produ...",70.0,,,
6,2015-10-08-001431,a couch covered in assault rifles,Katie Nolan,[Q19668331],2015-10-08 17:37:47,1,"[[Katie Nolan, 0.5683], [None, 0.2829], [Greg ...",[http://feeds.huffingtonpost.com/c/35496/f/677...,E,female,[United States of America],"[presenter, journalist, podcaster]",34.0,,,
