In [1]:
import pandas
from tabulate import tabulate
from IPython.core.display import display, HTML
from statistics import mean
from collections import Counter

In [2]:
import look_up_utils
import createq_utils
from datetime import datetime

## Load frames

In [3]:
# Read the pickled object stored at this relative path into `df`.
# NOTE(review): presumably a DataFrame of incident frames -- confirm against
# whatever wrote the file. Unpickling can execute arbitrary code, so only
# load this file from a trusted source.
df = pandas.read_pickle('../EventRegistries/GunViolence/frames/all')

## Create look_up

In [4]:
# Build the look-up structures from the frame. create_look_up returns a pair:
# the look-up itself and a mapping named parameters2incident_uris; both are
# handed to createq_utils.lookup_and_merge further down.
look_up, parameters2incident_uris = look_up_utils.create_look_up(
    df,
    discard_ambiguous_names=True,
    allowed_incident_years=set(range(2013, 2018)),  # 2013 through 2017
    check_name_in_article=True,
)

## Create table

In [5]:
# Build one table row per (confusion pair, minimum number of answer incidents)
# combination. Each row holds the combination, the number of candidates, the
# distribution of their granularities and, for every attribute in
# `wanted_attrs`, a (min, mean, max) tuple over the candidates.
lists_of_lists = []
wanted_attrs = ['answer', 'c2s_ratio',
                'a_avg_date_spread',
                'a_avg_num_sources',
                'num_a_sources',
                'c_avg_date_spread',
                'c_avg_num_sources',
                'num_c_sources']

headers = ['confusion', 'min_#_a_incidents', '#cand', 'granularity'] + wanted_attrs


def min_mean_max(values):
    """Return (min, mean, max) of `values`, each rounded to one decimal.

    `values` must be non-empty: min(), max() and statistics.mean() all raise
    on an empty sequence.
    """
    return (round(min(values), 1),
            round(mean(values), 1),
            round(max(values), 1))


for confusion_tuple in [('location', 'time'),
                        ('location', 'participant'),
                        ('participant', 'time'),
                        ]:
    for min_num_answer_incidents in range(2, 4):
        # Timestamped progress line; the logged timestamps show that a single
        # combination can take several minutes.
        print(datetime.now(), confusion_tuple, min_num_answer_incidents)
        candidates = createq_utils.lookup_and_merge(look_up,
                                                    parameters2incident_uris,
                                                    confusion_tuple,
                                                    min_num_answer_incidents,
                                                    df,
                                                    debug=False,
                                                    inspect_one=False,
                                                    set_attr_values=True)

        # Without candidates there is nothing to summarise, and min()/mean()
        # would raise and discard the rows already computed. Log and move on.
        if len(candidates) == 0:
            print('no candidates, skipping', confusion_tuple, min_num_answer_incidents)
            continue

        # How many candidates fall into each granularity value.
        gran_distr = Counter(cand.granularity for cand in candidates)

        one_row = [confusion_tuple, min_num_answer_incidents, len(candidates), gran_distr]

        for attr in wanted_attrs:
            one_row.append(min_mean_max([getattr(cand, attr) for cand in candidates]))

        lists_of_lists.append(one_row)

# Render the collected rows as an HTML table in the notebook output.
stats_df = pandas.DataFrame(lists_of_lists, columns=headers)
table = tabulate(stats_df, headers='keys', tablefmt='html')
display(HTML(table))

2017-04-18 13:01:45.630271 ('location', 'time') 2
2017-04-18 13:08:06.201065 ('location', 'time') 3
2017-04-18 13:11:56.684421 ('location', 'participant') 2
2017-04-18 13:12:15.699459 ('location', 'participant') 3
2017-04-18 13:12:20.900464 ('participant', 'time') 2
2017-04-18 13:22:11.614524 ('participant', 'time') 3


Unnamed: 0,confusion,min_#_a_incidents,#cand,granularity,answer,c2s_ratio,a_avg_date_spread,a_avg_num_sources,num_a_sources,c_avg_date_spread,c_avg_num_sources,num_c_sources
0,"('location', 'time')",2,2581,"Counter({('state', 'month'): 832, ('city', 'year'): 554, ('city', 'month'): 461, ('state', 'day'): 435, ('state', 'year'): 213, ('city', 'day'): 78, ('address', 'year'): 5, ('address', 'month'): 3})","(2, 5.2, 141)","(0.5, 162.6, 877.5)","(0, 93.0, 1488)","(0.0, 1.3, 9.0)","(1, 691.9, 2792)","(0, 1310.7, 1569)","(0.6, 1.3, 2.5)","(0, 6.8, 278)"
1,"('location', 'time')",3,1272,"Counter({('state', 'month'): 560, ('city', 'year'): 309, ('state', 'year'): 194, ('city', 'month'): 137, ('state', 'day'): 64, ('city', 'day'): 8})","(3, 8.6, 141)","(10.7, 120.7, 583.3)","(0, 150.5, 1488)","(0.0, 1.3, 9.0)","(29, 855.2, 2792)","(41, 1364.6, 1569)","(0.6, 1.3, 1.9)","(0, 11.1, 278)"
2,"('location', 'participant')",2,308,"Counter({('state', 'last'): 134, ('state', 'first'): 125, ('city', 'last'): 27, ('city', 'first'): 22})","(2, 2.3, 6)","(7.5, 98.5, 180.0)","(2, 561.3, 1472)","(1.0, 2.4, 9.5)","(31, 303.4, 597)","(869, 1485.7, 1562)","(0.7, 1.4, 2.2)","(2, 5.6, 27)"
3,"('location', 'participant')",3,73,"Counter({('state', 'last'): 42, ('state', 'first'): 26, ('city', 'last'): 3, ('city', 'first'): 2})","(3, 3.3, 6)","(10.7, 78.6, 120.3)","(78, 724.2, 1422)","(1.0, 2.3, 5.3)","(59, 353.0, 597)","(869, 1480.6, 1560)","(1.1, 1.4, 1.8)","(3, 7.8, 27)"
4,"('participant', 'time')",2,2064,"Counter({('first', 'year'): 857, ('last', 'year'): 758, ('last', 'month'): 222, ('first', 'month'): 199, ('last', 'day'): 16, ('first', 'day'): 12})","(2, 3.6, 15)","(2.0, 356.2, 876.0)","(0, 220.8, 1381)","(1.0, 2.5, 10.0)","(4, 1478.5, 2746)","(94, 1227.1, 1569)","(0.9, 1.4, 2.7)","(2, 9.0, 41)"
5,"('participant', 'time')",3,1048,"Counter({('first', 'year'): 463, ('last', 'year'): 458, ('last', 'month'): 82, ('first', 'month'): 45})","(3, 5.2, 15)","(14.3, 275.1, 582.0)","(5, 283.4, 1307)","(1.0, 2.5, 8.0)","(107, 1717.0, 2746)","(653, 1308.9, 1569)","(0.9, 1.4, 2.5)","(3, 13.0, 41)"


In [6]:
# Persist the statistics table to 'stats.pickle' in the working directory.
stats_df.to_pickle('stats.pickle')

In [8]:
# Pool the per-combination averages of `stats_df` into a single
# candidate-weighted average per attribute and print it.
wanted_attrs = ['answer', 'c2s_ratio',
                'a_avg_date_spread',
                'a_avg_num_sources',
                'num_a_sources',
                'c_avg_date_spread',
                'c_avg_num_sources',
                'num_c_sources']

# Only rows generated with this minimum number of answer incidents are pooled.
min_answer_incidents = 2

# Filter once instead of re-scanning the whole frame for every attribute.
selected = stats_df[stats_df['min_#_a_incidents'] == min_answer_incidents]
tot_num_can = int(selected['#cand'].sum())

for wanted_attr in wanted_attrs:
    # Each cell holds a (minimum, average, maximum) tuple; weight the average
    # by the number of candidates in its row.
    # NOTE(review): the stored averages appear to be pre-rounded to one
    # decimal, so the pooled value inherits that rounding -- confirm.
    total = 0
    for (minimum, average, maximum), num_cand in zip(selected[wanted_attr],
                                                     selected['#cand']):
        total += average * int(num_cand)

    # With no matching rows (or no candidates) there is nothing to average;
    # report NaN instead of failing with ZeroDivisionError.
    avg = total / tot_num_can if tot_num_can else float('nan')
    print(wanted_attr, round(avg, 2))

answer 4.35
c2s_ratio 239.29
a_avg_date_spread 175.38
a_avg_num_sources 1.87
num_a_sources 995.53
c_avg_date_spread 1286.74
c_avg_num_sources 1.35
num_c_sources 7.64
