In [1]:
%%bash
# Sanity check: is the GVDB dump present, and does its size match the expected one?
DIRECTORY='GVDB/gvdb-aggregated-db'
propersize=471  # expected size of the dump directory, in MB

if [ ! -d "$DIRECTORY" ]; then
    # -p: do not fail when GVDB/ already exists but the dump itself is missing
    mkdir -p GVDB
    echo "Please download the GVDB from http://gun-violence.org/download/ inside the GVDB folder !"
else
    echo "Gun Violence DB Directory found !"
    # -s: report one grand total, even if the directory contains subdirectories
    sizemb=$(du -sm "$DIRECTORY" | cut -f1)
    echo "Amount of data in the GunViolence DB (should be $propersize MB): $sizemb MB"
    if [ "$sizemb" -eq "$propersize" ] ; then
        echo "You have the right size. You are good to go."
    else
        echo "You might want to check whether your GVDB directory is alright. Its size is $sizemb MB instead of $propersize MB"
    fi
fi

Gun Violence DB Directory found !
Amount of data in the GunViolence DB (should be 471 MB): 471 MB
You have the right size. You are good to go.


In [2]:
import pickle
import pandas
from datetime import datetime
from collections import defaultdict
import csv
import json
import Levenshtein

import utils

In [3]:
# Locations of the GVDB TSV files, relative to the working directory.
DIRECTORY = 'GVDB/gvdb-aggregated-db'
article_info = DIRECTORY + "/Articles-with-extracted-info.tsv"
event_info = DIRECTORY + "/Events.tsv"
print(article_info)

GVDB/gvdb-aggregated-db/Articles-with-extracted-info.tsv


In [4]:
# Each entry is a (local pickle path, GVA report URL) pair -- note that,
# despite the variable name, the path comes first in every tuple.
_FRAME_DIR = 'frames/'
_GVA_ROOT = 'http://www.gunviolencearchive.org/'
urls_and_paths = [
    (_FRAME_DIR + frame_name, _GVA_ROOT + url_suffix)
    for frame_name, url_suffix in (
        ('children_killed', 'children-killed'),
        ('children_injured', 'children-injured'),
        ('teens_killed', 'teens-killed'),
        ('teens_injured', 'teens-injured'),
        ('accidental_deaths', 'accidental-deaths'),
        ('accidental_injuries', 'accidental-injuries'),
        ('accidental_deaths_children', 'accidental-child-deaths'),
        ('accidental_injuries_children', 'accidental-child-injuries'),
        ('accidental_deaths_teens', 'accidental-teen-deaths'),
        ('accidental_injuries_teens', 'accidental-teen-injuries'),
        ('officer_involved_shootings', 'officer-involved-shootings'),
        ('mass_shootings_2013', 'reports/mass-shootings/2013'),
        ('mass_shootings_2014', 'reports/mass-shootings/2014'),
        ('mass_shootings_2015', 'reports/mass-shootings/2015'),
        ('mass_shootings', 'mass-shooting'),
    )
]
CORPUS_NAME = 'the_violent_corpus'

## Article overlap between GVDB and GVA

In [5]:
def get_sources(dataframe):
    """
    Collect every source URL of a frame and index incidents by hashed URL.

    :param dataframe: object whose ``iterrows()`` yields ``(index, row)``
        pairs; each row is read at ``'source_url'``, ``'incident_sources'``
        (iterated as a collection of URLs) and ``'incident_uri'``.
    :return: a ``(sources, sources_to_incidents)`` tuple, where ``sources``
        is the set of all URLs seen and ``sources_to_incidents`` is a
        ``defaultdict(set)`` mapping ``utils.hash_uri(url)`` to the set of
        incident URIs that reference that URL.
    """
    all_urls = set()
    incidents_by_hash = defaultdict(set)
    for _, record in dataframe.iterrows():
        main_url = record['source_url']
        extra_urls = record['incident_sources']
        all_urls.add(main_url)
        all_urls.update(extra_urls)
        incident_uri = record['incident_uri']
        # The main URL and every additional source point at the same incident.
        incidents_by_hash[utils.hash_uri(main_url)].add(incident_uri)
        for extra_url in extra_urls:
            incidents_by_hash[utils.hash_uri(extra_url)].add(incident_uri)
    return all_urls, incidents_by_hash

In [6]:
def get_source_title(sources_to_incidents, corpus_dir='the_violent_corpus'):
    """
    Map article titles to source identifiers using the stored JSON documents.

    For every source, one of its incidents is picked and the file
    ``<corpus_dir>/<incident>/<source>.json`` is read; the document's
    ``'title'`` value becomes a key of the result.

    :param sources_to_incidents: mapping of source identifier -> collection
        of incident identifiers. It is left unmodified.
    :param corpus_dir: directory that holds one sub-directory per incident.
    :return: ``defaultdict(str)`` mapping title -> source identifier. If
        several sources share a title, the last one read wins. Sources whose
        document is missing, unreadable or has no title are skipped.
    """
    titles_to_sources = defaultdict(str)
    for source, incidents in sources_to_incidents.items():
        # Peek at one incident instead of pop()-ing it, so the caller's sets
        # are not silently emptied by this lookup.
        incident = next(iter(incidents), None)
        if incident is None:
            continue
        path = '%s/%s/%s.json' % (corpus_dir, incident, source)
        try:
            with open(path) as infile:
                document = json.load(infile)
            titles_to_sources[document['title']] = source
        except (OSError, ValueError, KeyError, TypeError):
            # Best effort: skip documents that are missing, are not valid
            # JSON, or carry no usable title (the original code attributed
            # these failures to "NODATE documents").
            continue
    return titles_to_sources

In [7]:
def normalize(s):
    """Return ``s`` with surrounding whitespace removed and lower-cased."""
    trimmed = s.strip()
    return trimmed.lower()

In [8]:
# Load every pickled GVA frame, stack them into one DataFrame and derive the
# source lookups from it.
# NOTE(review): pickle.load executes arbitrary code from the file -- only run
# this on pickles you produced yourself.
frames = []
for pickle_path, _report_url in urls_and_paths:
    with open(pickle_path, 'rb') as handle:
        frames.append(pickle.load(handle))
df = pandas.concat(frames)
gva_sources, gva_sources_to_incidents = get_sources(df)
gva_titles_to_sources = get_source_title(gva_sources_to_incidents)
len(gva_sources)

9641

In [9]:
# Compare the GVDB articles with the GVA sources.
# * Rows whose URL contains "gold-data" are anonymized, so they can only be
#   matched by title: exact matches print the GVA source, near matches
#   (Levenshtein ratio > 0.9) print both titles and the ratio.
# * All other rows contribute their URL to `gvdb_sources`.

# Normalise the GVA titles once, not once per GVDB row.
normalized_gva_titles = [(normalize(gva_title), gva_source)
                         for gva_title, gva_source in gva_titles_to_sources.items()]

gvdb_sources = set()
# newline='' is what the csv module expects, so that line breaks inside
# quoted fields are handled by the reader itself.
with open(article_info, 'r', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter='\t', quotechar='"')
    for row in spamreader:
        if not row:  # csv.reader yields [] for blank lines
            continue
        url = row[0]
        if "gold-data" in url:  # anonymized URL
            title = normalize(row[1])
            for gva_title, gva_source in normalized_gva_titles:
                if title == gva_title:
                    print(gva_source)
                else:
                    # Only pay for the edit-distance when it can matter.
                    lev_ratio = Levenshtein.ratio(title, gva_title)
                    if lev_ratio > 0.9:
                        print(title, gva_title, lev_ratio)
        else:
            gvdb_sources.add(url)

# URLs that appear in both datasets.
intersection = gva_sources & gvdb_sources


In [None]:

# Count, per year, the annotated incident dates of the GVDB articles whose URL
# is also a GVA source (`intersection`).
# `gvdb_sources` is deliberately NOT reset here: it belongs to the cell that
# builds `intersection`, and clearing it made re-runs depend on cell order.
date_distributions = defaultdict(int)
# newline='' is what the csv module expects for files handed to csv.reader.
with open(article_info, 'r', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter='\t', quotechar='"')
    for row in spamreader:
        if not row:  # csv.reader yields [] for blank lines
            continue
        url = row[0]
        if url in intersection:
            annotations = json.loads(row[3])
            time_and_location = annotations['date-and-time']
            # circumstances annotations (weapons & so) -> skipped these
            # location annotations (read but not used further in this cell)
            state = time_and_location['state']
            city = time_and_location['city']
            detailed_loc = time_and_location['details']
            # time annotations
            if time_and_location['date']:
                date = datetime.strptime(time_and_location['date'], '%Y-%m-%d')
                date_distributions[date.year] += 1
                print(date)
                if date.year == 1969:
                    # NOTE(review): stops at the first 1969 date and dumps the
                    # record, so the counts below are partial whenever this
                    # triggers -- looks like a debugging probe; confirm
                    # whether it should stay.
                    print(url)
                    print(annotations)
                    break
            time_day = time_and_location['time-day']
            clock_time = time_and_location['clock-time']
            # participants (read but not used further in this cell)
            shooters = annotations['shooter-section']
            victims = annotations['victim-section']

In [None]:
# Show the URLs that occur both in the GVA sources and in the GVDB articles.
print(intersection)

In [None]:
# Show the per-year count of dated annotations among the overlapping articles.
print(date_distributions)