In [1]:
import pandas
from tabulate import tabulate
from IPython.core.display import display, HTML
from statistics import mean
from collections import Counter

In [2]:
import look_up_utils
import createq_utils

In [3]:
from glob import glob
import os
# Names (basenames) of every entry in the frames directory.
# Paths containing 'Icon' are skipped (presumably macOS folder-icon files -- confirm).
# glob() yields paths in arbitrary, filesystem-dependent order, so the result is
# sorted to keep the frame order reproducible across machines and re-runs.
all_frames = sorted(
    os.path.basename(path)
    for path in glob('../EventRegistries/GunViolence/frames/*')
    if 'Icon' not in path
)
all_frames

['accidental_deaths',
 'accidental_deaths_children',
 'accidental_deaths_teens',
 'accidental_injuries',
 'accidental_injuries_children',
 'accidental_injuries_teens',
 'children_injured',
 'children_killed',
 'mass_shootings',
 'mass_shootings_2013',
 'mass_shootings_2014',
 'mass_shootings_2015',
 'officer_involved_shootings',
 'teens_injured',
 'teens_killed']

## Load frames

In [4]:
# Use every discovered frame; point `frames` at a subset to restrict the analysis.
frames = all_frames

# Each frame file is a pickle; unpickle them all and stack the results into one table.
# NOTE(review): unpickling executes arbitrary code -- only load trusted frame files.
df = pandas.concat(
    [
        pandas.read_pickle('../EventRegistries/GunViolence/frames/' + frame_name)
        for frame_name in frames
    ]
)

## Load look_up

In [5]:
# Build the look-up structures from the combined frame table `df`.
# Both returned objects are passed unchanged to createq_utils.lookup_and_merge
# when the statistics table is built further down.
# NOTE(review): their exact structure is defined in look_up_utils -- not visible here.
look_up, parameters2incident_uris = look_up_utils.create_look_up(df)

## Create table

In [6]:
def _min_mean_max(values):
    """Return ``(min, mean, max)`` of ``values``, each rounded to one decimal.

    An empty ``values`` yields ``(None, None, None)`` instead of raising, so a
    configuration that produces no candidates still gets a row in the table
    (its ``#cand`` column will show 0).
    """
    if not values:
        return (None, None, None)
    return (round(min(values), 1),
            round(mean(values), 1),
            round(max(values), 1))


# One row per (confusion pair, min_confusion) configuration.
lists_of_lists = []

# Candidate attributes that are summarised as (min, mean, max) in the table.
wanted_attrs = ['answer', 'c2s_ratio',
                'a_avg_date_spread', 'a_avg_num_sources',
                'c_avg_date_spread', 'c_avg_num_sources']

headers = ['confusion', 'min_confusion', '#cand', 'granularity'] + wanted_attrs

for confusion_tuple in [('location', 'time'),
                        ('location', 'participant'),
                        ('participant', 'time'),
                        ]:
    for min_confusion in range(2, 4):  # min_confusion values 2 and 3
        # Progress line, so long runs show which configuration is being processed.
        print(confusion_tuple, min_confusion)
        candidates = createq_utils.lookup_and_merge(look_up,
                                                    parameters2incident_uris,
                                                    confusion_tuple,
                                                    min_confusion,
                                                    df,
                                                    debug=False,
                                                    inspect_one=False)

        # How many candidates fall into each granularity value.
        gran_distr = Counter(cand.granularity for cand in candidates)

        one_row = [confusion_tuple, min_confusion, len(candidates), gran_distr]

        for attr in wanted_attrs:
            values = [getattr(cand, attr) for cand in candidates]
            one_row.append(_min_mean_max(values))

        lists_of_lists.append(one_row)

# Render the collected rows as an HTML table in the notebook output.
stats_df = pandas.DataFrame(lists_of_lists, columns=headers)
table = tabulate(stats_df, headers='keys', tablefmt='html')
display(HTML(table))

('location', 'time') 2
('location', 'time') 3
('location', 'participant') 2
('location', 'participant') 3
('participant', 'time') 2
('participant', 'time') 3


Unnamed: 0,confusion,min_confusion,#cand,granularity,answer,c2s_ratio,a_avg_date_spread,a_avg_num_sources,c_avg_date_spread,c_avg_num_sources
0,"('location', 'time')",2,188,"Counter({('city', 'year'): 118, ('city', 'month'): 48, ('address', 'year'): 10, ('address', 'month'): 5, ('address', 'day'): 4, ('city', 'day'): 3})","(2, 3.5, 22)","(0.5, 353.6, 995.5)","(0, 0, 0)","(0.5, 2.1, 11.0)","(0, 0, 0)","(1.7, 2.1, 3.0)"
1,"('location', 'time')",3,19,"Counter({('city', 'year'): 16, ('city', 'month'): 3})","(3, 6.4, 20)","(70.5, 246.4, 662.0)","(0, 0, 0)","(1.0, 1.9, 2.8)","(0, 0, 0)","(1.8, 2.2, 2.3)"
2,"('location', 'participant')",2,767,"Counter({('state', 'first'): 297, ('state', 'last'): 292, ('city', 'last'): 77, ('city', 'first'): 57, ('state', 'full_name'): 24, ('city', 'full_name'): 17, ('address', 'first'): 1, ('address', 'last'): 1, ('address', 'full_name'): 1})","(2, 2.4, 10)","(0.0, 85.3, 196.5)","(0, 0, 0)","(0.5, 3.2, 17.5)","(-1, -0.0, 0)","(0, 2.3, 4.7)"
3,"('location', 'participant')",3,181,"Counter({('state', 'last'): 91, ('state', 'first'): 75, ('city', 'last'): 8, ('city', 'first'): 5, ('city', 'full_name'): 1, ('state', 'full_name'): 1})","(3, 3.6, 10)","(0.3, 71.7, 132.7)","(0, 0, 0)","(1.0, 3.3, 8.7)","(0, 0, 0)","(1.6, 2.3, 4.6)"
4,"('participant', 'time')",2,1595,"Counter({('first', 'year'): 593, ('last', 'year'): 443, ('first', 'month'): 250, ('last', 'month'): 217, ('full_name', 'year'): 27, ('first', 'day'): 23, ('last', 'day'): 19, ('full_name', 'month'): 18, ('full_name', 'day'): 5})","(2, 2.9, 25)","(0.5, 359.9, 997.5)","(0, 0, 0)","(0.5, 3.2, 14.5)","(0, 0, 0)","(1.5, 2.3, 3.9)"
5,"('participant', 'time')",3,548,"Counter({('first', 'year'): 255, ('last', 'year'): 195, ('last', 'month'): 53, ('first', 'month'): 42, ('last', 'day'): 2, ('first', 'day'): 1})","(3, 4.6, 25)","(7.3, 297.8, 664.7)","(0, 0, 0)","(1.0, 3.3, 13.7)","(0, 0, 0)","(1.7, 2.3, 3.2)"


In [7]:
# Persist the statistics table to 'stats.pickle' in the current working directory.
stats_df.to_pickle('stats.pickle')