In [1]:
import pickle
import pandas
from collections import namedtuple

In [2]:
import look_up_utils

In [3]:
from tabulate import tabulate
from IPython.core.display import display, HTML

## Load frames

In [4]:
# Frame names handed to look_up_utils.create_look_up.
frames = [
    'mass_shootings',
    'mass_shootings_2013',
    'mass_shootings_2014',
    'mass_shootings_2015',
]

# create_look_up returns a pair: the look-up structure and the mapping
# from parameters to incident uris (see look_up_utils.py).
(look_up,
 parameters2incident_uris) = look_up_utils.create_look_up(frames)
print(look_up)

{('location', 'time'): {('state', 'year'): defaultdict(<class 'dict'>, {('South Dakota', '2015'): {(('South Dakota',), '2015'): {'418318'}}, ('Kentucky', '2014'): {(('Kentucky',), '2014'): {'267792', '162305', '153275', '211348'}}, ('Massachusetts', '2014'): {(('Massachusetts',), '2014'): {'123666', '151475', '175943', '168169'}}, ('Alaska', '2014'): {(('Alaska',), '2014'): {'190599'}}, ('Minnesota', '2014'): {(('Minnesota',), '2014'): {'171802'}}, ('Massachusetts', '2016'): {(('Massachusetts',), '2016'): {'634889', '517058', '574753'}}, ('Colorado', '2017'): {(('Colorado',), '2017'): {'752249'}}, ('Nevada', '2016'): {(('Nevada',), '2016'): {'546581', '567788', '592207'}}, ('Indiana', '2014'): {(('Indiana',), '2014'): {'106394', '103698', '124146', '259935', '146565', '108691', '154721', '107673'}}, ('North Carolina', '2016'): {(('North Carolina',), '2016'): {'663624', '652606', '732927', '576774', '589159', '725338', '733176', '728286', '646995'}}, ('Arizona', '2013'): {(('Arizona',),

In [5]:
# Read every pickled frame from disk and stack them into one dataframe.
df = pandas.concat(list(map(
    pandas.read_pickle,
    ('../EventRegistries/GunViolence/frames/' + frame for frame in frames),
)))

## Set parameter values

In [6]:
# Key into the look-up selecting which pair of properties is analysed.
# Other pairs that were tried:
#   ('location', 'participant')
#   ('time', 'participant')
CONFUSION_TUPLE = ('location', 'time')

# A surface-form combination is kept as a candidate only when it has at
# least this many meanings.
MIN_CONFUSION = 3

In [7]:
def lookup_and_merge(look_up,
                     parameters2incident_uris,
                     confusion_key,
                     min_confusion=None):
    """
    Collect the surface-form combinations that have enough meanings.

    For every combination kept, the incidents that answer it are looked up,
    and for each individual property of the confusion key the number of
    meanings of its surface form and the incidents that do not belong to
    the answer (the noise) are computed.

    :param dict look_up: see look_up_utils.py
    :param dict parameters2incident_uris: see look_up_utils.py
    :param tuple confusion_key: see constant CONFUSION_TUPLE
    :param int min_confusion: minimum number of meanings a combination
        needs in order to be kept; when None the module-level constant
        MIN_CONFUSION is used

    :rtype: list
    :return: list of namedtuple instances (each representing a candidate)
    """
    if min_confusion is None:
        min_confusion = MIN_CONFUSION

    Candidate = namedtuple(
        'Candidate',
        'granularity sf meanings num_answer_e answer_incidents_uris noise_incident_uris oa')

    nice_combos = []
    for granularity, sf2meanings in look_up[confusion_key].items():
        for sf, meanings in sf2meanings.items():
            num_meanings = len(meanings)

            # Filter first: rejected combinations need no further look-ups,
            # which also avoids touching look-up entries that are never used.
            if num_meanings < min_confusion:
                continue

            # obtain answers uris
            answer_incidents_uris = parameters2incident_uris[confusion_key][granularity][sf]

            # obtain noise uris and the number of meanings per property
            noise_incident_uris = {}
            oa = {}
            for index, confusion in enumerate(confusion_key):
                all_meanings = look_up[(confusion,)][granularity[index]][sf[index]]
                oa[index] = len(all_meanings)
                total_incident_uris = set().union(*all_meanings.values())
                noise_incident_uris[index] = total_incident_uris - answer_incidents_uris

            nice_combos.append(Candidate(granularity=granularity,
                                         sf=sf,
                                         meanings=meanings,
                                         num_answer_e=num_meanings,
                                         answer_incidents_uris=answer_incidents_uris,
                                         noise_incident_uris=noise_incident_uris,
                                         oa=oa))
    return nice_combos

# Gather the candidates for the configured confusion tuple and show which
# granularities they cover.
candidates = lookup_and_merge(
    look_up=look_up,
    parameters2incident_uris=parameters2incident_uris,
    confusion_key=CONFUSION_TUPLE,
)
print(set(candidate.granularity for candidate in candidates))

{('city', 'year')}


In [8]:
def incident_uris_stats(df, set_of_uris):
    """
    compute stats for set of uris:

    :param df: gunviolence dataframe; the columns 'incident_uri',
        'source_url' and 'incident_sources' are read
    :param set_of_uris: uris of gunviolence incidents, either a set of uris
        or a dict (or dict subclass) whose values are sets of uris, in
        which case all values are merged into one set

    :rtype: tuple
    :return (num_of_incidents,
             num_of_sources,
             avg_num_of_sources (-1 when there are no uris),
             urls,
             )
    """
    # isinstance also catches dict subclasses such as defaultdict, which
    # would otherwise be mistaken for a plain collection of uris.
    if isinstance(set_of_uris, dict):
        set_of_uris = set().union(*set_of_uris.values())
    num_of_incidents = len(set_of_uris)

    # NOTE: the query string refers to the local name set_of_uris.
    results = df.query('incident_uri in @set_of_uris')

    # Distinct sources: the main url of each incident plus every url
    # listed in its 'incident_sources' entry.
    sources = set(results['source_url'])
    for incident_sources in results['incident_sources']:
        sources.update(incident_sources)

    num_sources = len(sources)
    avg = num_sources / num_of_incidents if num_of_incidents else -1

    return (num_of_incidents, num_sources, avg, sources)

In [9]:
def compute_num_docs(include, df):
    """
    Print the source documents of the given incidents and return the
    average number of documents per incident.

    :param include: collection of incident uris,
        e.g. ('454788', '419250')
    :param df: gunviolence dataframe; the columns 'incident_uri',
        'source_url' and 'incident_sources' are read

    :rtype: float
    :return: number of distinct sources divided by the number of uris in
        include, or -1 when include is empty
    """
    # NOTE: the query string refers to the local name include.
    results = df.query('incident_uri in @include')

    print(include)

    # Distinct sources: the main url of each incident plus every url
    # listed in its 'incident_sources' entry.
    sources = set(results['source_url'])
    for incident_sources in results['incident_sources']:
        sources.update(incident_sources)

    # Guard against an empty selection instead of dividing by zero;
    # -1 is the same sentinel incident_uris_stats uses.
    num_incidents = len(include)
    avg = len(sources) / num_incidents if num_incidents else -1

    print()
    for source in sources:
        print(source)
    print("Average number of documents per incident: %f" % avg)
    return avg

#compute_num_docs(('454788', '419250'), df)

In [10]:
def compute_metrics(c, look_up, confusion_key, index):
    """
    Report how ambiguous one surface form of a candidate is.

    :param c: candidate; its attributes granularity, sf and num_answer_e
        are read
    :param dict look_up: see look_up_utils.py
    :param tuple confusion_key: see constant CONFUSION_TUPLE
    :param int index: position inside confusion_key of the property to
        inspect

    :rtype: tuple
    :return (noise-to-signal ratio, number of meanings)
    """
    confusion = confusion_key[index]
    surface_form = c.sf[index]
    meanings = look_up[(confusion,)][c.granularity[index]][surface_form]

    # All incidents reachable through any meaning of the surface form.
    incidents = set().union(*meanings.values())
    num_noisy_e = len(incidents)

    ns_ratio = num_noisy_e / c.num_answer_e
    oa = len(meanings)
    print('The %s %s has %d meanings over %d incidents.' % (confusion, surface_form, oa, num_noisy_e))
    return ns_ratio, oa


            
    
def compute_confusion_metrics(look_up, confusion_key, candidates, df):
    """
    Build a table with one question per candidate plus statistics about
    its answer incidents (A_*) and its noise incidents (N_*).

    :param dict look_up: see look_up_utils.py (not used in the body)
    :param tuple confusion_key: see constant CONFUSION_TUPLE (not used in
        the body; the question template below is the (location, time) one)
    :param list candidates: candidates as returned by lookup_and_merge
    :param df: gunviolence dataframe, passed on to incident_uris_stats

    :rtype: pandas.DataFrame
    :return: dataframe with one row per candidate, q_id starting at 1
    """
    headers = ['q_id',
               'question',
               'answer',
               'OA_part',
               'OA_loc',
               'A_num_of_incidents', 'A_#_sources', 'A_avg_#_sources',
               'N_num_of_incidents', 'N_#_sources', 'N_avg_#_sources',
               'N2S',
               'A_sources',
               'N_sources'
               ]

    rows = []
    for q_id, candidate in enumerate(candidates, 1):
        # each stats tuple is (num_of_incidents, num_sources, avg, sources)
        noise_stats = incident_uris_stats(df, candidate.noise_incident_uris)
        answer_stats = incident_uris_stats(df, candidate.answer_incidents_uris)

        # oa is indexed by position in the confusion tuple: index 0 feeds
        # the 'OA_loc' column and index 1 the 'OA_part' column.
        oa_loc = candidate.oa[0]
        oa_part = candidate.oa[1]

        # noise-to-signal: noise incidents of both properties over the
        # number of answer incidents
        num_noise = (len(candidate.noise_incident_uris[0])
                     + len(candidate.noise_incident_uris[1]))
        n2s = num_noise / len(candidate.answer_incidents_uris)

        # Question templates per confusion tuple; only one is active.
        # for (location, participant)
        #        question = "Q: How many killing events that involve %s happened in the location %s?" % (c.sf[1], c.sf[0])
        # for (time, participant)
        #        question = "Q: How many killing events that involve %s happened in %s?" % (c.sf[1], c.sf[0])
        # for (location, time)
        question = "Q: How many killing events happened in the location %s in %s?" % (candidate.sf[0], candidate.sf[1])
        answer = candidate.num_answer_e

        rows.append(
            [q_id, question, answer, oa_part, oa_loc]
            + list(answer_stats[:3])
            + list(noise_stats[:3])
            + [n2s, answer_stats[3], noise_stats[3]]
        )

    return pandas.DataFrame(rows, columns=headers)

# One row per candidate question, with answer and noise statistics.
question_df = compute_confusion_metrics(
    look_up=look_up,
    confusion_key=CONFUSION_TUPLE,
    candidates=candidates,
    df=df,
)

In [11]:
# Number of generated questions (rows of the table).
question_df.shape[0]

3

In [12]:
# Alternative rendering of the whole table as HTML:
#table = tabulate(question_df, headers='keys', tablefmt='html')
#display(HTML(table))

# Show only the first question: its text, the complete row and the urls of
# its answer sources.
for index, row in question_df.iloc[:1].iterrows():
    print()
    print(row['question'])
    print(row)
    for source_url in row['A_sources']:
        print(source_url)


Q: How many killing events happened in the location Jackson in 2016?
q_id                                                                  1
question              Q: How many killing events happened in the loc...
answer                                                                3
OA_part                                                               1
OA_loc                                                                4
A_num_of_incidents                                                    4
A_#_sources                                                          14
A_avg_#_sources                                                     3.5
N_num_of_incidents                                                  385
N_#_sources                                                        1179
N_avg_#_sources                                                 3.06234
N2S                                                               96.25
A_sources             {http://www.msnewsnow.com/story/33514100/6-i