In [1]:
import pandas
from tabulate import tabulate
from IPython.core.display import display, HTML
from statistics import mean
from collections import Counter

In [2]:
import look_up_utils
import createq_utils
from datetime import datetime

## Load frames

In [3]:
# Load the pickled object stored under the GunViolenceArchive "frames"
# directory; it is passed as `df` to the look-up and candidate-generation
# cells below (presumably a pandas DataFrame -- TODO confirm).
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# from a trusted source.
df = pandas.read_pickle('../EventRegistries/GunViolenceArchive/frames/all')

## Load look_up

In [None]:
# Build the look-up structures from `df`. The helper returns a pair that is
# unpacked into `look_up` and `parameters2incident_uris`; both are handed to
# createq_utils.lookup_and_merge further down.
# The allowed incident years are 2013 through 2017 inclusive.
look_up, parameters2incident_uris = look_up_utils.create_look_up(
    df,
    discard_ambiguous_names=True,
    allowed_incident_years=set(range(2013, 2018)),
    check_name_in_article=True,
)

## Create table

In [None]:
# Build one summary row per (confusion pair, minimum number of answer
# incidents) configuration, then render all rows as an HTML table.
lists_of_lists = []

# Candidate attributes that are summarised as a (min, mean, max) triple.
wanted_attrs = ['answer', 'c2s_ratio',
                'a_avg_date_spread',
                'a_avg_num_sources',
                'num_a_sources',
                'c_avg_date_spread',
                'c_avg_num_sources',
                'num_c_sources']

headers = ['confusion', 'min_#_a_incidents', '#cand', 'granularity'] + wanted_attrs


def _min_mean_max(values):
    """Return (min, mean, max) of `values`, each rounded to one decimal.

    An empty list yields (None, None, None) instead of raising, because
    min(), max() and statistics.mean() all fail on empty input and a single
    configuration without candidates would otherwise abort the whole loop.
    Code that aggregates the resulting table should skip rows whose '#cand'
    is 0.
    """
    if not values:
        return (None, None, None)
    return (round(min(values), 1),
            round(mean(values), 1),
            round(max(values), 1))


for confusion_tuple in [('location', 'time'),
                        ('location', 'participant'),
                        ('participant', 'time'),
                        ]:
    for min_num_answer_incidents in range(2, 4):
        # Timestamped progress line so the duration of each configuration
        # can be read off the cell output.
        print(datetime.now(), confusion_tuple, min_num_answer_incidents)
        candidates = createq_utils.lookup_and_merge(look_up,
                                                    parameters2incident_uris,
                                                    confusion_tuple,
                                                    min_num_answer_incidents,
                                                    df,
                                                    debug=False,
                                                    inspect_one=False,
                                                    set_attr_values=True)

        # How many candidates fall under each granularity value.
        gran_distr = Counter(cand.granularity for cand in candidates)

        one_row = [confusion_tuple, min_num_answer_incidents, len(candidates), gran_distr]
        one_row.extend(
            _min_mean_max([getattr(cand, attr) for cand in candidates])
            for attr in wanted_attrs
        )

        lists_of_lists.append(one_row)

stats_df = pandas.DataFrame(lists_of_lists, columns=headers)
table = tabulate(stats_df, headers='keys', tablefmt='html')
display(HTML(table))

2017-04-24 18:12:41.891096 ('location', 'time') 2


In [None]:
# Persist the summary table to 'stats.pickle' in the current working directory
# so it can be reloaded without recomputing it.
stats_df.to_pickle('stats.pickle')

In [None]:
# For every attribute, print the candidate-weighted average of the
# per-configuration means, restricted to the rows of `stats_df` whose
# 'min_#_a_incidents' equals 2.
wanted_attrs = ['answer', 'c2s_ratio',
                'a_avg_date_spread',
                'a_avg_num_sources',
                'num_a_sources',
                'c_avg_date_spread',
                'c_avg_num_sources',
                'num_c_sources']

for wanted_attr in wanted_attrs:
    tot_num_can = 0
    total = 0
    for _, row in stats_df.iterrows():
        num_cand = row['#cand']
        # Rows without candidates add nothing to either sum and may carry no
        # usable mean, so they are skipped together with the rows of other
        # 'min_#_a_incidents' settings.
        if row['min_#_a_incidents'] != 2 or num_cand == 0:
            continue
        # Each attribute cell holds a (min, mean, max) triple; only the mean
        # is needed here.
        _, average, _ = row[wanted_attr]
        tot_num_can += num_cand
        total += average * num_cand

    # Guard the division: no matching row (or only empty ones) would
    # otherwise raise ZeroDivisionError.
    if tot_num_can == 0:
        print(wanted_attr, 'no candidates')
        continue

    avg = total / tot_num_can
    print(wanted_attr, round(avg, 2))