In [1]:
import pickle
import pandas
from collections import namedtuple

In [2]:
# Load the cached look-up structure for the 2015 mass-shootings data.
# The keys printed further down are tuples of confusion factors:
# ('location',), ('participant',) and ('location', 'participant').
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only point this at cache files you produced yourself or otherwise trust.
with open('cache/mass_shootings_2015.pickle', 'rb') as infile:
    look_up = pickle.load(infile)

In [3]:
def create_dataframe(df_paths=None):
    """Load pickled incident DataFrames and concatenate them into one frame.

    Parameters
    ----------
    df_paths : str or iterable of str, optional
        Path(s) of pickle files, each expected to contain a pandas DataFrame.
        When omitted, only the 2015 mass-shootings frame is loaded, which is
        what this notebook originally hard-coded. The 2013 and 2014 frames
        ('frames/mass_shootings_2013', 'frames/mass_shootings_2014') were
        disabled in the original cell and can be passed in explicitly.

    Returns
    -------
    pandas.DataFrame
        The loaded frames concatenated in the order the paths were given.

    Raises
    ------
    ValueError
        If ``df_paths`` is an empty iterable (pandas.concat rejects an empty
        list of frames).
    """
    if df_paths is None:
        df_paths = ['../EventRegistries/GunViolence/frames/mass_shootings_2015']
    elif isinstance(df_paths, str):
        # A bare string would otherwise be iterated character by character.
        df_paths = [df_paths]

    frames = []
    for df_path in df_paths:
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # frame files that come from a trusted source.
        with open(df_path, 'rb') as infile:
            frames.append(pickle.load(infile))
    return pandas.concat(frames)

In [4]:
# Build the notebook-wide incident DataFrame; it is passed to the metric
# functions below to count source documents per incident.
df=create_dataframe()

In [5]:
# Inspect which confusion-factor combinations the cached look-up provides.
print(look_up.keys())

dict_keys([('location',), ('participant',), ('location', 'participant')])


In [6]:
# Key into `look_up` selecting the combination of confusion factors to study;
# it must be one of the keys printed above.
CONFUSION_TUPLE=('location', 'participant')
# Strict lower bound on the number of meanings a surface form must have to be
# kept as a candidate (lookup_and_merge keeps entries with more than this).
MIN_CONFUSION=1

In [7]:
def lookup_and_merge(look_up, confusion_key, min_confusion=None):
    """Collect the ambiguous surface forms stored under one confusion key.

    Walks ``look_up[confusion_key]`` (granularity -> surface form -> meanings)
    and keeps every surface form that has more than ``min_confusion``
    meanings. Granularities whose first element is 'state' are skipped.

    Parameters
    ----------
    look_up : mapping
        Nested look-up structure, indexed as
        ``look_up[confusion_key][granularity][sf]``.
    confusion_key : tuple
        Key selecting the confusion-factor combination,
        e.g. ('location', 'participant').
    min_confusion : int, optional
        A surface form is kept only if it has strictly more meanings than
        this. Defaults to the notebook-level ``MIN_CONFUSION`` constant, so
        existing two-argument calls behave as before.

    Returns
    -------
    list of Candidate
        Named tuples with fields ``granularity``, ``sf``, ``meanings``,
        ``num_answer_e`` (number of meanings) and ``incidents`` (the set of
        ``m[1][0]`` over all meanings ``m``).
    """
    if min_confusion is None:
        # Resolved at call time so a changed notebook constant is picked up.
        min_confusion = MIN_CONFUSION

    Candidate = namedtuple('Candidate', 'granularity sf meanings num_answer_e incidents')
    by_granularity = look_up[confusion_key]
    nice_combos = []
    for granularity in by_granularity:
        if granularity[0] == 'state':
            continue
        by_sf = by_granularity[granularity]
        for sf in by_sf:
            meanings = by_sf[sf]
            num_meanings = len(meanings)
            if num_meanings <= min_confusion:
                # Not ambiguous enough; skip before building the incident set.
                continue
            # Each meaning is indexed as m[1][0] to obtain its incident id.
            incidents = {m[1][0] for m in meanings}
            nice_combos.append(Candidate(granularity=granularity,
                                         sf=sf,
                                         meanings=meanings,
                                         num_answer_e=num_meanings,
                                         incidents=incidents))
    return nice_combos

# Gather the candidates for the chosen confusion combination and list the
# granularity of each one as a quick sanity check.
candidates=lookup_and_merge(look_up, CONFUSION_TUPLE)
for c in candidates:
    print(c.granularity)

('city', 'full_name')
('city', 'last')
('city', 'last')
('city', 'last')
('city', 'first')
('city', 'first')
('city', 'first')
('city', 'first')


In [10]:
def compute_num_docs(include, df):
    """Return the average number of source documents per incident.

    Selects the rows of ``df`` whose ``incident_uri`` is in ``include``, sums
    the lengths of their ``incident_sources`` entries and divides by
    ``len(include)``. Because the divisor is the number of requested
    incidents, ids that are absent from ``df`` count as having zero documents.

    Parameters
    ----------
    include : sized collection
        Incident ids to look up (the caller passes a set of id strings).
    df : pandas.DataFrame
        Frame that provides ``incident_uri`` and ``incident_sources``.

    Returns
    -------
    float
        The average; 0.0 when ``include`` is empty (previously this raised
        ZeroDivisionError).
    """
    print(include)
    if len(include) == 0:
        # No incidents requested: report 0.0 instead of dividing by zero.
        avg = 0.0
    else:
        # `@include` makes DataFrame.query read the local variable.
        results = df.query('incident_uri in @include')
        total_docs = sum(len(sources) for sources in results['incident_sources'])
        avg = total_docs / len(include)
    print("Average number of documents per incident: %f" % avg)
    return avg

#compute_num_docs(('454788', '419250'), df)

In [11]:
def compute_metrics(c, look_up, confusion_key, index):
    """Compute and print the ambiguity metrics for one part of a candidate.

    ``index`` picks one confusion factor from ``confusion_key`` together with
    the matching entries of ``c.granularity`` and ``c.sf``. The meanings of
    that surface form are read from the single-factor look-up
    ``look_up[(factor,)]``.

    Returns a tuple ``(ns_ratio, oa)`` where ``oa`` is the number of meanings
    and ``ns_ratio`` is the number of distinct incidents across all meanings
    divided by ``c.num_answer_e``.
    """
    confusion = confusion_key[index]
    meanings = look_up[(confusion,)][c.granularity[index]][c.sf[index]]

    # Distinct incidents reachable through any meaning of this surface form.
    noisy_incidents = {
        incident
        for incident_group in meanings.values()
        for incident in incident_group
    }
    num_noisy_e = len(noisy_incidents)

    ns_ratio = num_noisy_e / c.num_answer_e
    oa = len(meanings)

    report_values = (confusion, c.sf[index], oa, num_noisy_e, oa, ns_ratio)
    print('The %s %s has %d meanings over %d incidents. OA is %d, Noise-to-Signal ratio is %f' % report_values)
    return ns_ratio, oa


            
    
def compute_confusion_metrics(look_up, confusion_key, candidates, df):
    """Print a metrics report and a generated question for every candidate.

    For each candidate the metrics of its first (index 0) and second (index 1)
    confusion factor are computed with compute_metrics, and the average number
    of source documents with compute_num_docs. The question score is the mean
    of those five numbers. Nothing is returned; all results are printed.
    """
    separator = "###########################################################################"
    for candidate in candidates:
        print('CANDIDATE: %s' % str(candidate.sf))

        ns_loc, oa_loc = compute_metrics(candidate, look_up, confusion_key, 0)
        ns_part, oa_part = compute_metrics(candidate, look_up, confusion_key, 1)
        avg_num_docs = compute_num_docs(candidate.incidents, df)

        # Unweighted mean of the five components.
        total = ns_loc + oa_loc + ns_part + oa_part + avg_num_docs
        score = total / 5
        print("Question score: %f" % score)
        print()

        question_args = (candidate.sf[1], candidate.sf[0])
        print("Q: How many killing events that involve %s happened in %s?" % question_args)
        print("A: %d" % candidate.num_answer_e)
        print(separator)
        print()
        

# Print the metrics report and generated question for every candidate.
compute_confusion_metrics(look_up,CONFUSION_TUPLE,candidates, df)

CANDIDATE: ('Daytona Beach', 'Rakim Watson')
The location Daytona Beach has 1 meanings over 3 incidents. OA is 1, Noise-to-Signal ratio is 1.500000
The participant Rakim Watson has 0 meanings over 0 incidents. OA is 0, Noise-to-Signal ratio is 0.000000
{'317165', '588835'}
Average number of documents per incident: 1.500000
Question score: 0.800000

Q: How many killing events that involve Rakim Watson happened in Daytona Beach?
A: 2
###########################################################################

CANDIDATE: ('Daytona Beach', 'Watson')
The location Daytona Beach has 1 meanings over 3 incidents. OA is 1, Noise-to-Signal ratio is 1.500000
The participant Watson has 2 meanings over 2 incidents. OA is 2, Noise-to-Signal ratio is 1.000000
{'317165', '588835'}
Average number of documents per incident: 1.500000
Question score: 1.400000

Q: How many killing events that involve Watson happened in Daytona Beach?
A: 2
#####################################################################

In [16]:
# Debug "Rakim Watson": the report above shows 0 meanings for this participant,
# so list the surface forms stored under the participant 'full_name'
# granularity to check whether the name is present in the look-up at all.
look_up[('participant',)]['full_name'].keys()

dict_keys(['Rakim Watson'])