In [1]:
import pickle
import pandas
from collections import Counter, defaultdict
import operator
import utils

In [2]:
urls_and_paths = [('frames/children_killed', 'http://www.gunviolencearchive.org/children-killed'),
                  ('frames/children_injured', 'http://www.gunviolencearchive.org/children-injured'),
                  ('frames/teens_killed', 'http://www.gunviolencearchive.org/teens-killed'),
                  ('frames/teens_injured', 'http://www.gunviolencearchive.org/teens-injured'),
                  ('frames/accidental_deaths', 'http://www.gunviolencearchive.org/accidental-deaths'),
                  ('frames/accidental_injuries', 'http://www.gunviolencearchive.org/accidental-injuries'),
                  ('frames/accidental_deaths_children', 'http://www.gunviolencearchive.org/accidental-child-deaths'),
                  ('frames/accidental_injuries_children', 'http://www.gunviolencearchive.org/accidental-child-injuries'),
                  ('frames/accidental_deaths_teens', 'http://www.gunviolencearchive.org/accidental-teen-deaths'),
                  ('frames/accidental_injuries_teens', 'http://www.gunviolencearchive.org/accidental-teen-injuries'),
                  ('frames/officer_involved_shootings', 'http://www.gunviolencearchive.org/officer-involved-shootings'),
                  ('frames/mass_shootings_2013', 'http://www.gunviolencearchive.org/reports/mass-shootings/2013'),
                  ('frames/mass_shootings_2014', 'http://www.gunviolencearchive.org/reports/mass-shootings/2014'),
                  ('frames/mass_shootings_2015', 'http://www.gunviolencearchive.org/reports/mass-shootings/2015'),
                  ('frames/mass_shootings', 'http://www.gunviolencearchive.org/mass-shooting')]
CORPUS_NAME = 'the_violent_corpus'

In [3]:
frames = []
for df_path, url in urls_and_paths:
    with open(df_path, 'rb') as infile:
        df = pickle.load(infile)
        frames.append(df)
df = pandas.concat(frames)
len(df)

5485

In [4]:
total_participants=set()
alltokens=defaultdict(int)
for index, row in df.iterrows():
    if len(row['participants']):
        full_text=utils.concat_all_sources(row['incident_uri'])
        names=set()
        for p in row['participants']:
            if 'Name' in p and not p['Name'].strip().isdigit():
                names.add(p['Name'].strip())
        if all([n in full_text for n in names]):
            for name in names:
                total_participants.add(tuple([row['incident_uri'], name]))
                tokens=name.split()
                for t in tokens:
                    alltokens[t]+=1
print(len(total_participants))
#    if row['incident_uri'] not in uris:
#        pass # do the magic here
#        uris.add(row['incident_uri'])

3880


In [5]:
total_participants

{('386225', 'Mark Weekly'),
 ('724900', 'Alex Dixon'),
 ('590659', 'Brandon Ramirez'),
 ('491438', 'Gabriel Daniels'),
 ('610584', 'Tavaris Gilbert'),
 ('608614', 'Denise Couplin'),
 ('723429', 'Joel Rodriguez'),
 ('280068', 'Robert Thomas'),
 ('591674', 'Mary Bryan'),
 ('440743', 'Barbara Scott'),
 ('510769', 'Rakim Watson'),
 ('201648', 'Juan Pedro Garcia'),
 ('709650', 'Logan Ron Augustine'),
 ('680444', 'Marcos Antonio Jimenez'),
 ('651434', 'Stephon Harris'),
 ('174965', 'Tradon Crawford'),
 ('583067', 'Curtlon Panoske'),
 ('711803', 'Sang Kim'),
 ('726737', 'Deputy Ricky Scott'),
 ('351958', 'Audley Burrell'),
 ('531173', 'Qadarus Javon Grimes'),
 ('723825', 'David Shore'),
 ('727258', 'Levi Harrison'),
 ('640899', 'James Page'),
 ('579490', 'Maleah Ellis'),
 ('748066', 'Sheahonnie Davis'),
 ('579065', 'Juprie Wadley'),
 ('102719', 'James Carson Smith'),
 ('669118', 'Gregory Lambert'),
 ('465117', 'Salahudin Malik Robbins'),
 ('593238', 'Anna Sanchez'),
 ('539890', 'Branden Davis

# Full names stats

In [6]:
d=defaultdict(int)
for incident, participant in total_participants:
    d[participant]+=1
print(sorted(d.items(), key=operator.itemgetter(1)))

[('Zackery James Alexander', 1), ('Roman Kellough', 1), ('Deondre Kinney', 1), ('Houston Grant', 1), ('Devon Quinn', 1), ('Derrick Jackson', 1), ('Chris Mackey', 1), ('Donald Ethan Lawlis', 1), ('Fabriccio Patti', 1), ('Steven Dolan', 1), ('Dewaun Rockingham', 1), ('Lesly Paredes', 1), ('Daniel Lusk', 1), ('Kenneth Moore', 1), ('Lontrell Brantley', 1), ('Rick Foster', 1), ('Hubert Allen Jr.', 1), ('Sherie Lash, a.k.a. Sherie Rhoades', 1), ('Darmequaye Cohill', 1), ('Christopher Rutledge', 1), ('Ginger Reeves', 1), ('Romell L. Jones', 1), ('Frederick Seymore Jr.', 1), ('Jason Jonathan Acosta-Ramos', 1), ('Makayla Smith', 1), ('Taiquan Moss', 1), ('Jacoreian Jennings', 1), ('Brandon Cunningham', 1), ('Leyton McNabb', 1), ('Christopher Miller', 1), ('Darnae Christon', 1), ('Mario Cervantes-Angel', 1), ('Curtis Leon Williams', 1), ('Santos Navares', 1), ('Trevor Bernardoni', 1), ('Kenza Benzakour', 1), ('Kajmere Burton', 1), ('Jayquon Johnson', 1), ('Benjamin Bond', 1), ('Martavis Cooper',

### Distribution

In [7]:
freq = Counter([value for value in d.values()])
freq

Counter({1: 3817, 2: 24, 3: 2, 9: 1})

### MOA

In [8]:
num_forms=len(d.items())
num_form_instances=len(total_participants)
estimated_moa_full=num_form_instances/num_forms
print(estimated_moa_full)

1.0093652445369408


# Tokens stats

### Distribution

In [9]:
freq_token = Counter([value for value in alltokens.values()])
freq_token

Counter({1: 2567,
         2: 675,
         3: 360,
         4: 150,
         5: 97,
         6: 77,
         7: 42,
         8: 37,
         9: 29,
         10: 18,
         11: 18,
         12: 14,
         13: 11,
         14: 13,
         15: 5,
         16: 7,
         17: 3,
         18: 5,
         19: 6,
         20: 4,
         21: 8,
         23: 4,
         24: 3,
         26: 5,
         27: 1,
         28: 3,
         29: 1,
         30: 1,
         31: 2,
         32: 1,
         34: 1,
         36: 2,
         40: 2,
         47: 1,
         48: 1,
         50: 1,
         52: 1,
         54: 1,
         55: 2,
         57: 1,
         58: 1,
         59: 1,
         63: 1,
         68: 1,
         70: 1,
         71: 1,
         85: 1,
         94: 1})

### MOA

In [10]:
len(alltokens)

4188

In [11]:
s=0
for k,v in alltokens.items():
    s+=v
estimated_moa_tokens=s/len(alltokens)
print(estimated_moa_tokens)

2.5515759312320916


# Expansion against existing EL corpora

In [12]:
result=utils.check_overlap(d.keys()) # are full entity names found
result

{('el_corpora/AIDA-YAGO2_entities_and_links.tsv',
  'Bobby',
  'http://en.wikipedia.org/wiki/Bobby_Charlton\n'),
 ('el_corpora/AIDA-YAGO2_entities_and_links.tsv',
  'Carlos Delgado',
  'http://en.wikipedia.org/wiki/Carlos_Delgado\n'),
 ('el_corpora/AIDA-YAGO2_entities_and_links.tsv',
  'David Wilson',
  'http://en.wikipedia.org/wiki/Dave_Wilson_(rugby_union)\n'),
 ('el_corpora/AIDA-YAGO2_entities_and_links.tsv',
  'Frank Thomas',
  'http://en.wikipedia.org/wiki/Frank_Thomas_(baseball,_born_1968)\n'),
 ('el_corpora/AIDA-YAGO2_entities_and_links.tsv',
  'Juan Gonzalez',
  'http://en.wikipedia.org/wiki/Juan_González_(baseball)\n'),
 ('el_corpora/AIDA-YAGO2_entities_and_links.tsv', 'Kim', 'NIL\n'),
 ('el_corpora/AIDA-YAGO2_entities_and_links.tsv',
  'Michael Johnson',
  'http://en.wikipedia.org/wiki/Michael_Johnson_(athlete)\n'),
 ('el_corpora/neel2014-test.tsv',
  'Chris',
  'http://dbpedia.org/resource/Chris_Griffin\n'),
 ('el_corpora/neel2014-test.tsv',
  'Princeton',
  'http://dbpedia.

In [14]:
result2=utils.check_overlap(alltokens.keys()) # are full entity names found
result2

{('el_corpora/neel2014.tsv',
  'Paige',
  'http://dbpedia.org/resource/Paige_(band)\n'),
 ('el_corpora/AIDA-YAGO2_entities_and_links.tsv',
  'Henry',
  'http://en.wikipedia.org/wiki/Thierry_Henry\n'),
 ('el_corpora/rss500-unicode.tsv',
  'John',
  'http://dbpedia.org/resource/John_Calipari\n'),
 ('el_corpora/wes2015.tsv',
  'Cornell',
  'http://dbpedia.org/resource/Cornell_University\n'),
 ('el_corpora/AIDA-YAGO2_entities_and_links.tsv',
  'Simmons',
  'http://en.wikipedia.org/wiki/Phil_Simmons\n'),
 ('el_corpora/neel2015-test.tsv', 'Maria', 'NIL545\n'),
 ('el_corpora/neel2014.tsv', 'Jesus', 'http://dbpedia.org/resource/Jesus\n'),
 ('el_corpora/neel2014.tsv',
  'Joey',
  'http://dbpedia.org/resource/Joey_Tribbiani\n'),
 ('el_corpora/AIDA-YAGO2_entities_and_links.tsv',
  'Dayton',
  'http://en.wikipedia.org/wiki/Dayton_Agreement\n'),
 ('el_corpora/wes2015.tsv',
  'Elizabeth',
  'http://dbpedia.org/resource/Elizabeth_Montagu\n'),
 ('el_corpora/neel2015.tsv',
  'Watford',
  'http://dbpedi

In [16]:
print(len(result))
print(len(result2))

36
1239
