In [3]:
%%bash
DIRECTORY='GVDB/gvdb-aggregated-db'
propersize=471

if [ ! -d "$DIRECTORY" ]; then
    mkdir GVDB
    echo "Please download the GVDB from http://gun-violence.org/download/ inside the GVDB folder !"
else
    echo "Gun Violence DB Directory found !"
    sizekb=`du -m "$DIRECTORY" | cut -f1`
    echo "Amount of data in the GunViolence DB (should be $propersize MB): $sizekb MB"
    if [ $sizekb -eq $propersize ] ; then
        echo "You have the right size. You are good to go."
    else
        echo "You might want to check whether your GVDB directory is alright. Its size is $sizemb MB instead of $propersize MB"
    fi
fi

Gun Violence DB Directory found !
Amount of data in the GunViolence DB (should be 471 MB): 471 MB
You have the right size. You are good to go.


In [4]:
# File mappings
DIRECTORY='GVDB/gvdb-aggregated-db'
article_info="%s/Articles-with-extracted-info.tsv" % DIRECTORY
event_info="%s/Events.tsv" % DIRECTORY
print(article_info)

GVDB/gvdb-aggregated-db/Articles-with-extracted-info.tsv


In [15]:
urls_and_paths = [('frames/children_killed', 'http://www.gunviolencearchive.org/children-killed'),
                  ('frames/children_injured', 'http://www.gunviolencearchive.org/children-injured'),
                  ('frames/teens_killed', 'http://www.gunviolencearchive.org/teens-killed'),
                  ('frames/teens_injured', 'http://www.gunviolencearchive.org/teens-injured'),
                  ('frames/accidental_deaths', 'http://www.gunviolencearchive.org/accidental-deaths'),
                  ('frames/accidental_injuries', 'http://www.gunviolencearchive.org/accidental-injuries'),
                  ('frames/accidental_deaths_children', 'http://www.gunviolencearchive.org/accidental-child-deaths'),
                  ('frames/accidental_injuries_children', 'http://www.gunviolencearchive.org/accidental-child-injuries'),
                  ('frames/accidental_deaths_teens', 'http://www.gunviolencearchive.org/accidental-teen-deaths'),
                  ('frames/accidental_injuries_teens', 'http://www.gunviolencearchive.org/accidental-teen-injuries'),
                  ('frames/officer_involved_shootings', 'http://www.gunviolencearchive.org/officer-involved-shootings'),
                  ('frames/mass_shootings_2013', 'http://www.gunviolencearchive.org/reports/mass-shootings/2013'),
                  ('frames/mass_shootings_2014', 'http://www.gunviolencearchive.org/reports/mass-shootings/2014'),
                  ('frames/mass_shootings_2015', 'http://www.gunviolencearchive.org/reports/mass-shootings/2015'),
                  ('frames/mass_shootings', 'http://www.gunviolencearchive.org/mass-shooting')]
CORPUS_NAME = 'the_violent_corpus'

## Article overlap between GVDB and GVA

In [20]:
def get_sources(dataframe):
    """

    :param dataframe:
    :return:
    """
    sources = set()
    for index, row in dataframe.iterrows():
        sources.add(row['source_url'])
        sources.update(row['incident_sources'])
    return sources

In [25]:
import pickle
import pandas

frames = []
for df_path, url in urls_and_paths:
    with open(df_path, 'rb') as infile:
        df = pickle.load(infile)
        frames.append(df)
df = pandas.concat(frames)
gva_sources=get_sources(df)
len(gva_sources)

9641

In [35]:
import csv
import json
with open(article_info, 'r') as csvfile:
    spamreader = csv.reader(csvfile, delimiter='\t', quotechar='"')
    gvdb_sources=set()
    for index,row in enumerate(spamreader):
        url=row[0]
        title=row[1]
        content=row[2]
        annotations=row[3]
        gvdb_sources.add(url)

intersection=gva_sources & gvdb_sources 


In [55]:
from datetime import datetime
from collections import defaultdict

date_distributions=defaultdict(int)
with open(article_info, 'r') as csvfile:
    spamreader = csv.reader(csvfile, delimiter='\t', quotechar='"')
    gvdb_sources=set()
    for index,row in enumerate(spamreader):
        url=row[0]
        title=row[1]
        content=row[2]
        if url in intersection:
            annotations=json.loads(row[3])
            time_and_location=annotations['date-and-time']
            # circumstances annotations (weapons & so) -> skipped these
            # location annotations
            state=time_and_location['state']
            city=time_and_location['city']
            detailed_loc=time_and_location['details']
            # time annotations
            if time_and_location['date']:
                date=datetime.strptime(time_and_location['date'], '%Y-%m-%d')
                date_distributions[date.year]+=1
                print(date)
                if date.year==1969:
                    print(url)
                    print(annotations)
                    break
            time_day=time_and_location['time-day']
            clock_time=time_and_location['clock-time']
            # participants
            shooters=annotations['shooter-section']
            victims=annotations['victim-section']
            #break

2016-01-30 00:00:00
2016-02-02 00:00:00
2015-07-26 00:00:00
2016-02-19 00:00:00
2016-02-24 00:00:00
2016-02-19 00:00:00
2016-02-16 00:00:00
2016-02-20 00:00:00
2015-11-22 00:00:00
2014-05-07 00:00:00
2015-07-03 00:00:00
2015-02-27 00:00:00
2015-08-09 00:00:00
2015-03-14 00:00:00
2014-11-18 00:00:00
2014-08-06 00:00:00
2015-08-05 00:00:00
2013-05-25 00:00:00
2013-08-02 00:00:00
2013-03-21 00:00:00
2016-01-25 00:00:00
2015-08-21 00:00:00
2015-12-25 00:00:00
2013-08-12 00:00:00
2015-08-19 00:00:00
2013-07-19 00:00:00
2013-04-10 00:00:00
2015-05-09 00:00:00
2014-07-30 00:00:00
2014-08-01 00:00:00
2013-08-24 00:00:00
2016-03-12 00:00:00
2013-08-11 00:00:00
2014-10-29 00:00:00
2013-06-09 00:00:00
2013-09-07 00:00:00
2015-03-15 00:00:00
2016-01-16 00:00:00
2015-11-21 00:00:00
2015-06-18 00:00:00
2015-06-04 00:00:00
2015-08-15 00:00:00
2013-02-11 00:00:00
2015-07-29 00:00:00
2014-12-16 00:00:00
2016-03-11 00:00:00
2016-02-09 00:00:00
2015-03-08 00:00:00
2015-01-01 00:00:00
2013-10-27 00:00:00


In [28]:
print(intersection)

{'http://dfw.cbslocal.com/2016/02/06/sheriffs-office-5-year-old-boy-accidentally-shot-in-kaufman-county/', 'http://fox8.com/2015/04/12/1-year-old-child-killed-in-shooting-in-cleveland/', 'http://dfw.cbslocal.com/2015/07/03/boy-accidentally-shoots-himself-in-the-leg/', 'http://abc13.com/news/15-year-old-boy-accidentally-shot-killed-by-brother-in-sw-houston/1161643/', 'http://fox4kc.com/2013/06/23/prospect-shooting-spree-puts-nine-in-hospital-two-critically-hurt/', 'http://foxct.com/2015/03/08/accidental-firearm-discharge-in-monroe-strikes-11-year-old-boy-in-the-cheek/', 'http://heralddemocrat.com/news/texas/5-slayings-3-places-4-hours-stretch-dallas-police-s-homicide-unit', 'http://abc7ny.com/archive/9202495/', 'http://crimeblog.dallasnews.com/2015/03/five-wounded-one-killed-in-lancaster-shooting.html/', 'http://fox40.com/2013/03/16/mass-casualty-incident-declared-in-galt-after-reported-shooting/', 'http://homicide.latimes.com/neighborhood/long-beach', 'http://fox59.com/2015/08/21/child

In [51]:
print(date_distributions)

defaultdict(<class 'int'>, {2016: 87, 1969: 1, 2017: 1, 2013: 46, 2014: 29, 2015: 100})
