In [1]:
import pandas
from tabulate import tabulate
from IPython.core.display import display, HTML
from statistics import mean
from collections import Counter

In [2]:
import look_up_utils
import createq_utils

In [3]:
from glob import glob
import os
# Collect the frame names: the basename of every entry in the frames directory.
# Entries whose path contains 'Icon' are skipped (presumably macOS folder-icon
# metadata files -- confirm).
#
# glob() returns paths in arbitrary, filesystem-dependent order. Sorting keeps
# the frame order -- and therefore which duplicate incident survives the later
# drop_duplicates() call -- reproducible across machines.
all_frames = sorted(os.path.basename(path)
                    for path in glob('../EventRegistries/GunViolence/frames/*')
                    if 'Icon' not in path)
all_frames

['accidental_deaths',
 'accidental_deaths_children',
 'accidental_deaths_teens',
 'accidental_injuries',
 'accidental_injuries_children',
 'accidental_injuries_teens',
 'children_injured',
 'children_killed',
 'mass_shootings',
 'mass_shootings_2013',
 'mass_shootings_2014',
 'mass_shootings_2015',
 'officer_involved_shootings',
 'teens_injured',
 'teens_killed']

## Load frames

In [4]:
# Every available frame is used; narrow this list to analyse a subset.
frames = all_frames

# Read each frame pickle, stack them into one table, and keep a single row per
# incident_uri (presumably the same incident can occur in several frames).
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df = (
    pandas.concat([
        pandas.read_pickle('../EventRegistries/GunViolence/frames/' + frame)
        for frame in frames
    ])
    .drop_duplicates(subset='incident_uri')
)

## Build look_up

In [5]:
# Build the look-up structures from the merged incident table. create_look_up
# returns two objects; both are handed to createq_utils.lookup_and_merge when
# the statistics table is built further down.
# NOTE(review): judging by its name, parameters2incident_uris maps parameter
# values to incident URIs -- confirm in look_up_utils.
look_up, parameters2incident_uris = look_up_utils.create_look_up(df)

## Create table

In [6]:
# Candidate attributes that are summarised as (min, mean, max) in the table.
wanted_attrs = [
    'answer',
    'c2s_ratio',
    'a_avg_date_spread',
    'a_avg_num_sources',
    'c_avg_date_spread',
    'c_avg_num_sources',
]

# The first four columns describe the configuration and the candidate set;
# the remaining columns follow wanted_attrs one-to-one.
headers = ['confusion', 'min_#_a_incidents', '#cand', 'granularity']
headers.extend(wanted_attrs)

# Accumulates one table row per configuration.
lists_of_lists = []

def _min_mean_max(values):
    """Summarise ``values`` as a ``(min, mean, max)`` tuple rounded to one decimal.

    Returns ``(None, None, None)`` for an empty list: ``min``, ``max`` and
    ``statistics.mean`` all raise on empty input, and a single configuration
    without candidates should not abort the whole table.
    """
    if not values:
        return (None, None, None)
    return (round(min(values), 1),
            round(mean(values), 1),
            round(max(values), 1))


# One table row is produced per (confusion pair, minimum number of answer
# incidents) combination.
for confusion_tuple in [('location', 'time'),
                        ('location', 'participant'),
                        ('participant', 'time'),
                        ]:
    for min_num_answer_incidents in range(2, 4):
        # Progress marker: each lookup_and_merge call can take a while.
        print(confusion_tuple, min_num_answer_incidents)
        candidates = createq_utils.lookup_and_merge(look_up,
                                                    parameters2incident_uris,
                                                    confusion_tuple,
                                                    min_num_answer_incidents,
                                                    df,
                                                    debug=False,
                                                    inspect_one=False,
                                                    set_attr_values=True)

        # How many candidates exist per granularity value.
        gran_distr = Counter(cand.granularity for cand in candidates)

        # Leading columns: configuration, candidate count, granularity spread.
        one_row = [confusion_tuple,
                   min_num_answer_incidents,
                   len(candidates),
                   gran_distr]

        # Trailing columns: (min, mean, max) of every wanted attribute.
        one_row.extend(
            _min_mean_max([getattr(cand, attr) for cand in candidates])
            for attr in wanted_attrs
        )

        lists_of_lists.append(one_row)

# Render the collected rows as an HTML table in the notebook output.
stats_df = pandas.DataFrame(lists_of_lists, columns=headers)
table = tabulate(stats_df, headers='keys', tablefmt='html')
display(HTML(table))

('location', 'time') 2
('location', 'time') 3
('location', 'participant') 2
('location', 'participant') 3
('participant', 'time') 2
('participant', 'time') 3


Unnamed: 0,confusion,min_#_a_incidents,#cand,granularity,answer,c2s_ratio,a_avg_date_spread,a_avg_num_sources,c_avg_date_spread,c_avg_num_sources
0,"('location', 'time')",2,1975,"Counter({('state', 'month'): 689, ('city', 'year'): 424, ('city', 'month'): 331, ('state', 'day'): 268, ('state', 'year'): 200, ('city', 'day'): 58, ('address', 'year'): 3, ('address', 'month'): 2})","(2, 5.0, 129)","(0.5, 134.7, 792.5)","(0, 120.7, 1488)","(0.3, 1.7, 13.0)","(0, 1326.1, 1569)","(1.0, 1.7, 2.5)"
1,"('location', 'time')",3,994,"Counter({('state', 'month'): 450, ('city', 'year'): 233, ('state', 'year'): 182, ('city', 'month'): 95, ('state', 'day'): 30, ('city', 'day'): 4})","(3, 7.9, 129)","(7.3, 100.6, 526.7)","(0, 182.9, 1488)","(0.3, 1.7, 6.3)","(178, 1372.6, 1569)","(1.4, 1.7, 2.2)"
2,"('location', 'participant')",2,479,"Counter({('state', 'last'): 207, ('state', 'first'): 199, ('city', 'last'): 41, ('city', 'first'): 32})","(2, 2.4, 8)","(5.0, 72.8, 163.0)","(0, 558.2, 1472)","(1.0, 2.4, 11.5)","(479, 1462.3, 1538)","(1.3, 1.8, 2.6)"
3,"('location', 'participant')",3,124,"Counter({('state', 'last'): 63, ('state', 'first'): 50, ('city', 'last'): 6, ('city', 'first'): 5})","(3, 3.5, 8)","(10.3, 59.2, 100.7)","(56, 746.2, 1412)","(1.0, 2.4, 6.3)","(1229, 1476.7, 1537)","(1.5, 1.8, 2.6)"
4,"('participant', 'time')",2,3284,"Counter({('first', 'year'): 1391, ('last', 'year'): 1142, ('first', 'month'): 355, ('last', 'month'): 346, ('first', 'day'): 27, ('last', 'day'): 23})","(2, 4.1, 21)","(1.0, 271.6, 790.5)","(0, 229.6, 1412)","(1.0, 2.4, 9.5)","(1, 1232.5, 1569)","(1.0, 1.8, 2.5)"
5,"('participant', 'time')",3,1756,"Counter({('first', 'year'): 825, ('last', 'year'): 732, ('last', 'month'): 118, ('first', 'month'): 75, ('last', 'day'): 3, ('first', 'day'): 3})","(3, 6.0, 21)","(5.3, 217.9, 526.3)","(4, 300.2, 1307)","(1.0, 2.4, 9.3)","(638, 1308.4, 1569)","(1.4, 1.8, 2.3)"


In [7]:
# Persist the summary table in the current working directory for later reuse.
stats_df.to_pickle('stats.pickle')