In [1]:
import pandas
from tabulate import tabulate
from IPython.core.display import display, HTML
from statistics import mean
from collections import Counter

In [2]:
import look_up_utils
import createq_utils

In [3]:
from glob import glob
import os
# Collect the file names (not full paths) of every frame stored in the
# GunViolence frames directory.
# glob() yields paths in arbitrary, filesystem-dependent order, so the names
# are sorted to keep the frame list -- and everything derived from it --
# reproducible across machines and re-runs.
# Paths containing 'Icon' are skipped (presumably macOS folder-icon files --
# TODO confirm).
all_frames = sorted(
    os.path.basename(path)
    for path in glob('../EventRegistries/GunViolence/frames/*')
    if 'Icon' not in path
)
all_frames

['accidental_deaths',
 'accidental_deaths_children',
 'accidental_deaths_teens',
 'accidental_injuries',
 'accidental_injuries_children',
 'accidental_injuries_teens',
 'children_injured',
 'children_killed',
 'mass_shootings',
 'mass_shootings_2013',
 'mass_shootings_2014',
 'mass_shootings_2015',
 'officer_involved_shootings',
 'teens_injured',
 'teens_killed']

## Load frames

In [4]:
# Read every selected frame from disk and stack them into one table.
# NOTE: read_pickle executes whatever the pickle file contains, so these
# files must come from a trusted source.
frames = all_frames
df = pandas.concat(list(map(
    pandas.read_pickle,
    ('../EventRegistries/GunViolence/frames/' + frame_name
     for frame_name in frames),
)))

## Create look_up

In [5]:
# Build the look-up structures from the combined frame table `df`.
# NOTE(review): the shape of `look_up` and `parameters2incident_uris` is
# defined by look_up_utils.create_look_up, which is not visible here; both
# are passed unchanged to createq_utils.lookup_and_merge further down.
look_up, parameters2incident_uris = look_up_utils.create_look_up(df)

## Create table

In [6]:
def _min_mean_max(values, ndigits=1):
    """Return ``(min, mean, max)`` of ``values``, each rounded to ``ndigits``.

    An empty ``values`` yields ``(None, None, None)`` instead of raising:
    ``min``/``max`` raise ``ValueError`` and ``statistics.mean`` raises
    ``StatisticsError`` on empty input, which would otherwise abort the
    whole table for a single parameter combination without candidates.
    """
    values = list(values)
    if not values:
        return (None, None, None)
    return (round(min(values), ndigits),
            round(mean(values), ndigits),
            round(max(values), ndigits))


# One entry per (confusion, min_#_a_incidents) combination; becomes one table row.
lists_of_lists = []

# Candidate attributes that are summarised as (min, mean, max) in each row.
# The date-spread attributes are currently disabled.
wanted_attrs = ['answer', 'c2s_ratio',
                #'a_avg_date_spread',
                'a_avg_num_sources',
                #'c_avg_date_spread',
                'c_avg_num_sources']

headers = ['confusion', 'min_#_a_incidents', '#cand', 'granularity'] + wanted_attrs

for confusion_tuple in [('location', 'time'),
                        ('location', 'participant'),
                        ('participant', 'time'),
                        ]:
    # Minimum number of answer incidents to try: 2 and 3.
    for min_num_answer_incidents in range(2, 4):
        # Progress marker, since each lookup may take a while.
        print(confusion_tuple, min_num_answer_incidents)
        candidates = createq_utils.lookup_and_merge(look_up,
                                                    parameters2incident_uris,
                                                    confusion_tuple,
                                                    min_num_answer_incidents,
                                                    df,
                                                    debug=False,
                                                    inspect_one=False)

        # How many candidates exist per granularity value.
        gran_distr = Counter(cand.granularity for cand in candidates)

        one_row = [confusion_tuple, min_num_answer_incidents, len(candidates), gran_distr]

        # Append one (min, mean, max) tuple per wanted attribute, in header order.
        one_row.extend(
            _min_mean_max([getattr(cand, attr) for cand in candidates])
            for attr in wanted_attrs
        )

        lists_of_lists.append(one_row)

# Render the collected rows as an HTML table in the notebook.
stats_df = pandas.DataFrame(lists_of_lists, columns=headers)
table = tabulate(stats_df, headers='keys', tablefmt='html')
display(HTML(table))

('location', 'time') 2
('location', 'time') 3
('location', 'participant') 2
('location', 'participant') 3
('participant', 'time') 2
('participant', 'time') 3


Unnamed: 0,confusion,min_#_a_incidents,#cand,granularity,answer,c2s_ratio,a_avg_num_sources,c_avg_num_sources
0,"('location', 'time')",2,2549,"Counter({('state', 'month'): 814, ('city', 'year'): 560, ('city', 'month'): 452, ('state', 'day'): 424, ('state', 'year'): 212, ('city', 'day'): 82, ('address', 'year'): 3, ('address', 'month'): 2})","(2, 5.2, 155)","(0.5, 172.2, 1001.5)","(0.3, 2.2, 12.3)","(1.6, 2.1, 4.5)"
1,"('location', 'time')",3,1254,"Counter({('state', 'month'): 545, ('city', 'year'): 300, ('state', 'year'): 196, ('city', 'month'): 133, ('state', 'day'): 72, ('city', 'day'): 8})","(3, 8.6, 155)","(10.3, 123.9, 666.0)","(0.3, 2.2, 12.3)","(1.6, 2.1, 2.9)"
2,"('location', 'participant')",2,516,"Counter({('state', 'first'): 216, ('state', 'last'): 213, ('city', 'last'): 42, ('city', 'first'): 37, ('city', 'full_name'): 4, ('state', 'full_name'): 4})","(2, 2.3, 9)","(0.0, 90.2, 191.0)","(1.0, 3.3, 17.5)","(0, 2.3, 4.8)"
3,"('location', 'participant')",3,114,"Counter({('state', 'last'): 56, ('state', 'first'): 53, ('city', 'last'): 4, ('city', 'first'): 1})","(3, 3.5, 9)","(25.0, 73.2, 122.3)","(1.0, 3.2, 8.7)","(1.6, 2.2, 3.2)"
4,"('participant', 'time')",2,3302,"Counter({('first', 'year'): 1403, ('last', 'year'): 1145, ('first', 'month'): 353, ('last', 'month'): 344, ('first', 'day'): 23, ('last', 'day'): 18, ('full_name', 'month'): 8, ('full_name', 'year'): 8})","(2, 3.9, 19)","(2.0, 336.8, 999.5)","(1.0, 3.2, 14.5)","(1.6, 2.2, 3.2)"
5,"('participant', 'time')",3,1740,"Counter({('first', 'year'): 825, ('last', 'year'): 713, ('last', 'month'): 112, ('first', 'month'): 81, ('last', 'day'): 6, ('first', 'day'): 3})","(3, 5.7, 19)","(5.7, 270.9, 666.7)","(1.0, 3.2, 13.7)","(1.7, 2.2, 3.2)"


In [7]:
# Persist the statistics table next to the notebook for later reuse.
stats_df.to_pickle('stats.pickle')