In [1]:
import pandas
from tabulate import tabulate
from IPython.core.display import display, HTML
from statistics import mean
from collections import Counter

In [2]:
import look_up_utils
import createq_utils

In [3]:
from glob import glob
import os
# Base names of the frame files in the GunViolence registry; any entry whose
# path contains 'Icon' is skipped.
# glob() yields matches in arbitrary, filesystem-dependent order, so the
# names are sorted: the frames are later concatenated in this order and
# de-duplicated on incident_uri, which makes the order affect which
# duplicate row is kept.
all_frames = sorted(os.path.basename(path)
                    for path in glob('../EventRegistries/GunViolence/frames/*')
                    if 'Icon' not in path)
all_frames

['accidental_deaths',
 'accidental_deaths_children',
 'accidental_deaths_teens',
 'accidental_injuries',
 'accidental_injuries_children',
 'accidental_injuries_teens',
 'children_injured',
 'children_killed',
 'mass_shootings',
 'mass_shootings_2013',
 'mass_shootings_2014',
 'mass_shootings_2015',
 'officer_involved_shootings',
 'teens_injured',
 'teens_killed']

## Load frames

In [4]:
# Read every frame named in `all_frames`, stack the results into a single
# DataFrame and keep one row per incident_uri.
# NOTE(review): unpickling can execute arbitrary code; only load frame files
# from a trusted source.
frames = all_frames
df = pandas.concat(
    [pandas.read_pickle('../EventRegistries/GunViolence/frames/' + frame)
     for frame in frames]
).drop_duplicates(subset='incident_uri')

## Load look_up

In [5]:
# Build the two look-up structures from the de-duplicated incident frame.
# NOTE(review): the exact meaning of `discard_ambiguous_names` and
# `allowed_incident_years` is defined in look_up_utils.create_look_up;
# confirm there before changing these values.
look_up, parameters2incident_uris = look_up_utils.create_look_up(
    df,
    discard_ambiguous_names=True,
    allowed_incident_years={2017},
)

## Create table

In [6]:
def _min_mean_max(values, ndigits=1):
    """Return ``(min, mean, max)`` of ``values``, each rounded to ``ndigits``.

    Parameters
    ----------
    values : iterable of numbers
        The observations to summarise.
    ndigits : int, optional
        Number of decimals passed to ``round`` (default 1).

    Returns
    -------
    tuple
        ``(min, mean, max)`` rounded to ``ndigits``; ``(None, None, None)``
        when ``values`` is empty, because ``min``/``max`` raise ValueError
        and ``statistics.mean`` raises StatisticsError on empty input.
    """
    values = list(values)
    if not values:
        return (None, None, None)
    return (round(min(values), ndigits),
            round(mean(values), ndigits),
            round(max(values), ndigits))


# One row per (confusion pair, minimum number of answer incidents); each row
# is later turned into a line of the summary table.
lists_of_lists = []

# Candidate attributes that are summarised as (min, mean, max) in the table.
wanted_attrs = ['answer', 'c2s_ratio',
                'a_avg_date_spread',
                'a_avg_num_sources',
                'c_avg_date_spread',
                'c_avg_num_sources']

headers = ['confusion', 'min_#_a_incidents', '#cand', 'granularity'] + wanted_attrs

for confusion_tuple in [('location', 'time'),
                        ('location', 'participant'),
                        ('participant', 'time'),
                        ]:
    for min_num_answer_incidents in range(2, 4):
        # Progress marker: shows which combination is being processed.
        print(confusion_tuple, min_num_answer_incidents)

        # NOTE(review): lookup_and_merge is assumed to return a sized,
        # re-iterable collection of candidate objects -- confirm in
        # createq_utils.
        candidates = createq_utils.lookup_and_merge(look_up,
                                                    parameters2incident_uris,
                                                    confusion_tuple,
                                                    min_num_answer_incidents,
                                                    df,
                                                    debug=False,
                                                    inspect_one=False,
                                                    set_attr_values=True)

        # How many candidates exist per granularity value.
        gran_distr = Counter(cand.granularity for cand in candidates)

        one_row = [confusion_tuple, min_num_answer_incidents, len(candidates), gran_distr]

        # An empty candidate list yields (None, None, None) cells instead of
        # aborting the whole table.
        one_row.extend(
            _min_mean_max([getattr(cand, attr) for cand in candidates])
            for attr in wanted_attrs
        )

        lists_of_lists.append(one_row)

stats_df = pandas.DataFrame(lists_of_lists, columns=headers)
table = tabulate(stats_df, headers='keys', tablefmt='html')
display(HTML(table))

('location', 'time') 2
('location', 'time') 3
('location', 'participant') 2
('location', 'participant') 3
('participant', 'time') 2
('participant', 'time') 3


Unnamed: 0,confusion,min_#_a_incidents,#cand,granularity,answer,c2s_ratio,a_avg_date_spread,a_avg_num_sources,c_avg_date_spread,c_avg_num_sources
0,"('location', 'time')",2,407,"Counter({('state', 'month'): 96, ('city', 'year'): 88, ('city', 'month'): 81, ('state', 'day'): 79, ('state', 'year'): 41, ('city', 'day'): 22})","(2, 5.1, 58)","(2.3, 93.8, 322.0)","(0, 23.6, 130)","(0.3, 1.5, 6.5)","(2, 112.3, 158)","(1.0, 1.5, 1.9)"
1,"('location', 'time')",3,218,"Counter({('state', 'month'): 80, ('city', 'year'): 46, ('state', 'year'): 38, ('city', 'month'): 35, ('state', 'day'): 16, ('city', 'day'): 3})","(3, 7.8, 58)","(2.3, 70.4, 214.3)","(0, 33.9, 130)","(0.3, 1.5, 3.0)","(23, 122.3, 158)","(1.0, 1.5, 1.7)"
2,"('location', 'participant')",2,15,"Counter({('state', 'last'): 6, ('state', 'first'): 6, ('city', 'last'): 3})","(2, 2.1, 4)","(5.5, 17.1, 29.0)","(2, 29.8, 64)","(1.0, 2.4, 5.0)","(65, 75.7, 80)","(1.2, 1.6, 2.2)"
3,"('location', 'participant')",3,1,"Counter({('state', 'first'): 1})","(4, 4, 4)","(12.2, 12.2, 12.2)","(61, 61, 61)","(5.0, 5.0, 5.0)","(80, 80, 80)","(1.4, 1.4, 1.4)"
4,"('participant', 'time')",2,382,"Counter({('first', 'year'): 137, ('last', 'year'): 109, ('first', 'month'): 74, ('last', 'month'): 56, ('first', 'day'): 6})","(2, 3.1, 9)","(3.5, 188.9, 322.0)","(0, 33.2, 75)","(1.0, 2.1, 6.0)","(22, 134.5, 158)","(1.1, 1.5, 1.6)"
5,"('participant', 'time')",3,170,"Counter({('first', 'year'): 75, ('last', 'year'): 47, ('first', 'month'): 28, ('last', 'month'): 20})","(3, 4.5, 9)","(49.0, 130.4, 214.3)","(5, 44.5, 75)","(1.0, 2.0, 4.6)","(66, 139.7, 158)","(1.5, 1.5, 1.6)"


In [7]:
# Persist the summary table to 'stats.pickle' in the current working directory.
stats_df.to_pickle('stats.pickle')