In [1]:
import json
import pickle
import pandas
from collections import namedtuple
from glob import glob
from datetime import datetime
from statistics import mean

In [2]:
import look_up_utils
import classes
import metrics

In [3]:
from tabulate import tabulate
from IPython.core.display import display, HTML

## Load frames

In [4]:
# Names of the frames to analyse; the same list drives both the look-up
# creation here and the dataframe loading in the next cell.
frames = [
    'mass_shootings',
    'mass_shootings_2013',
    'mass_shootings_2014',
    'mass_shootings_2015',
]

# Build the look-up structures used below (see look_up_utils.py).
look_up, parameters2incident_uris = look_up_utils.create_look_up(frames)

In [5]:
# Read every pickled frame and stack them into one dataframe.
# NOTE(review): read_pickle unpickles the files, so only load frames that
# come from a trusted source.
df = pandas.concat([
    pandas.read_pickle('../EventRegistries/GunViolence/frames/' + frame)
    for frame in frames
])

## Set parameter values

In [6]:
# Pair of event properties that are combined into one question.
# Other pairs that can be selected instead:
#   CONFUSION_TUPLE = ('location', 'time')
#   CONFUSION_TUPLE = ('participant', 'time')
CONFUSION_TUPLE = ('location', 'participant')

# Lower bound used by lookup_and_merge: a surface form is kept only when
# the look-up holds at least this many entries for it.
MIN_CONFUSION = 3

In [7]:
def lookup_and_merge(look_up,
                     parameters2incident_uris,
                     confusion_key,
                     debug=False,
                     min_confusion=None):
    """
    Create one candidate for every (granularity, surface form) combination
    under `confusion_key` that has at least `min_confusion` entries.

    :param dict look_up: see look_up_utils.py
    :param dict parameters2incident_uris: see look_up_utils.py
    :param tuple confusion_key: pair of confusion factors,
    see constant CONFUSION_TUPLE
    :param bool debug: currently unused, kept for interface compatibility
    :param int min_confusion: minimum number of entries a surface form needs
    to become a candidate (defaults to the constant MIN_CONFUSION)

    :rtype: list
    :return list of classes.Candidate instances (each representing a candidate)
    """
    if min_confusion is None:
        min_confusion = MIN_CONFUSION

    all_questions = []

    for granularity in look_up[confusion_key]:
        for sf in look_up[confusion_key][granularity]:
            meanings = look_up[confusion_key][granularity][sf]
            num_answer_e = len(meanings)
            if num_answer_e < min_confusion:
                continue

            # obtain answer uris
            answer_incident_uris = parameters2incident_uris[confusion_key][granularity][sf]

            # per confusion factor: observed ambiguity and noise uris
            noise_incident_uris = {}
            oa = {}
            for index in (0, 1):
                confusion = confusion_key[index]
                all_meanings = look_up[(confusion,)][(granularity[index],)][(sf[index],)]
                oa[index] = metrics.get_observed_ambiguity(all_meanings)

                # every incident reachable through this single factor ...
                total_incident_uris = set()
                for set_of_m in all_meanings.values():
                    total_incident_uris.update(set_of_m)
                # ... minus the incidents that are part of the answer
                noise_incident_uris[index] = total_incident_uris - answer_incident_uris

            c = classes.Candidate(
                # use the key that was actually looked up, not the global constant
                confusion_factors=confusion_key,
                granularity=granularity,
                sf=sf,
                meanings=meanings,
                answer=num_answer_e,
                answer_incident_uris=answer_incident_uris,
                noise_incident_uris=noise_incident_uris,
                oa_list=oa
            )
            all_questions.append(c)
    return all_questions

# Create the candidates for the configured pair of confusion factors and
# show which granularity combinations they cover.
candidates = lookup_and_merge(
    look_up,
    parameters2incident_uris,
    CONFUSION_TUPLE,
    debug=False,
)
print({candidate.granularity for candidate in candidates})

{('state', 'last'), ('state', 'full_name'), ('city', 'full_name'), ('city', 'first'), ('state', 'first'), ('city', 'last')}


In [8]:
def incident_uris_stats(df, set_of_uris,compute_time_metrics=True):
    """
    compute stats for set of uris:

    :param df: gunviolence dataframe (queried on 'incident_uri'; the
    'source_url' and 'incident_sources' values of matching rows are read)
    :param set set_of_uris: uris of gunviolence incidents; a dict with the
    keys 0 and 1 (each mapping to a set) is also accepted, in which case
    both sets are merged
    :param bool compute_time_metrics: if True, read the documents of every
    incident from disk and compute the spread of their 'dct' dates

    :rtype: tuple
    :return (num_of_incidents,
             num_of_sources,
             avg_num_of_sources (-1 if there are no uris),
             urls,
             avg_dct_spread (-1 if no spread was computed)
             )
    """
    if isinstance(set_of_uris, dict):
        set_of_uris = set_of_uris[0].union(set_of_uris[1])
    num_of_incidents = len(set_of_uris)

    results = df.query('incident_uri in @set_of_uris')

    # unique urls: the main source of each incident plus its extra sources
    sources = set()
    for source_url, incident_sources in zip(results['source_url'],
                                            results['incident_sources']):
        sources.add(source_url)
        sources.update(incident_sources)

    num_sources = len(sources)
    avg = num_sources / num_of_incidents if num_of_incidents else -1

    date_spreads = []
    if compute_time_metrics:
        for incident_uri in set_of_uris:
            dates = set()
            path = '../EventRegistries/GunViolence/the_violent_corpus/%s/*.json' % incident_uri
            for json_file in glob(path):
                # close every document right after reading it
                with open(json_file) as infile:
                    document = json.load(infile)
                try:
                    dates.add(datetime.strptime(document['dct'], '%a, %d %b %Y %H:%M:%S GMT'))
                except ValueError: # NODATE documents
                    pass
            date_spreads.append(metrics.get_dct_spread(dates))

    avg_dct_spread = mean(date_spreads) if date_spreads else -1
    return (num_of_incidents, num_sources, avg, sources, avg_dct_spread)

In [9]:
def compute_confusion_metrics(candidates, df):
    """
    Enrich every candidate (in place) with source statistics for its answer
    and noise incidents, the noise-to-answer ratio and a question id.

    :param list candidates: candidates as created by lookup_and_merge
    :param df: gunviolence dataframe, passed on to incident_uris_stats

    :rtype: list
    :return the same candidates, now carrying the attributes
    a_avg_num_sources, a_sources, dct_spread, n_avg_num_sources,
    n_sources, n2s_ratio and qid (question ids start at 1)
    """
    for question_number, candidate in enumerate(candidates, 1):
        # noise stats never need the time metrics; answer stats only need
        # them when 'time' is one of the confusion factors
        noise_stats = incident_uris_stats(df, candidate.noise_incident_uris, False)
        answer_stats = incident_uris_stats(df,
                                           candidate.answer_incident_uris,
                                           'time' in CONFUSION_TUPLE)

        # positions 2, 3 and 4 of the stats tuple hold the average number
        # of sources, the source urls and the average dct spread
        candidate.a_avg_num_sources = answer_stats[2]
        candidate.a_sources = answer_stats[3]
        candidate.dct_spread = answer_stats[4]

        candidate.n_avg_num_sources = noise_stats[2]
        candidate.n_sources = noise_stats[3]

        # the ratio is computed over the union of both noise sets
        merged_noise_uris = candidate.noise_incident_uris[0] | candidate.noise_incident_uris[1]
        candidate.n2s_ratio = metrics.get_ratio___noise_e2answer_e(merged_noise_uris,
                                                                   candidate.answer_incident_uris)

        candidate.qid = question_number

    return candidates

# Attach the confusion metrics to every candidate; the candidate objects are
# updated in place and the same list is returned.
candidates = compute_confusion_metrics(candidates, df)

In [10]:
def display_candidates_in_df(candidates):
    """
    Put the candidates in a dataframe with one row per candidate.

    The two 'OA_' column names are taken from the confusion factors of the
    first candidate, so `candidates` must contain at least one item.

    :param list candidates: candidates that already carry the attributes
    set by compute_confusion_metrics

    :rtype: pandas.DataFrame
    :return dataframe with the columns q_id, question, answer,
    OA_<factor 0>, OA_<factor 1>, A_avg_#_sources, N_avg_#_sources, N2S,
    A_sources, N_sources and A_avg_DCT_spread
    """
    factors = candidates[0].confusion_factors

    # (column name, how to obtain the value from a candidate), in display order
    columns = [
        ('q_id', lambda cand: cand.qid),
        ('question', lambda cand: cand.question),
        ('answer', lambda cand: cand.answer),
        ('OA_' + factors[0], lambda cand: cand.oa_list[0]),
        ('OA_' + factors[1], lambda cand: cand.oa_list[1]),
        ('A_avg_#_sources', lambda cand: cand.a_avg_num_sources),
        ('N_avg_#_sources', lambda cand: cand.n_avg_num_sources),
        ('N2S', lambda cand: cand.n2s_ratio),
        ('A_sources', lambda cand: cand.a_sources),
        ('N_sources', lambda cand: cand.n_sources),
        ('A_avg_DCT_spread', lambda cand: cand.dct_spread),
    ]

    column_names = [column_name for column_name, _ in columns]
    table_rows = [[get_value(cand) for _, get_value in columns]
                  for cand in candidates]

    return pandas.DataFrame(table_rows, columns=column_names)
    
# Tabulate all candidates: one row per candidate.
question_df = display_candidates_in_df(candidates)

In [11]:
# Number of rows in the candidate table (one row per candidate).
len(question_df)

101

In [12]:
# Display the candidate table.
question_df

Unnamed: 0,q_id,question,answer,OA_location,OA_participant,A_avg_#_sources,N_avg_#_sources,N2S,A_sources,N_sources,A_avg_DCT_spread
0,1,How many events happened in California that in...,5,1,16,2.400000,2.143713,33.400000,{https://www.lawa.org/uploadedFiles/LAX/LAWA%2...,{http://www.local10.com/news/south-florida-wom...,-1
1,2,How many events happened in Georgia that invol...,3,1,29,3.000000,2.965116,28.666667,{http://www.macon.com/2014/12/19/3489530_fourt...,"{, http://wjtv.com/2017/01/15/pike-county-inve...",-1
2,3,How many events happened in Florida that invol...,3,1,4,2.000000,2.918605,28.666667,{http://www.local10.com/news/crime/7-injured-i...,"{, http://www.local10.com/news/south-florida-w...",-1
3,4,How many events happened in Tennessee that inv...,3,1,43,3.333333,3.071429,28.000000,{http://www.timesfreepress.com/news/local/stor...,{https://www.justice.gov/usao-edmi/pr/six-vice...,-1
4,5,How many events happened in Louisiana that inv...,5,1,52,3.800000,3.444444,18.000000,{http://www.houmatoday.com/article/20160122/ar...,{http://www.nola.com/crime/index.ssf/2016/10/p...,-1
5,6,How many events happened in California that in...,4,1,43,2.250000,2.352041,49.000000,{http://www.sfgate.com/crime/article/1-killed-...,{http://homicide.latimes.com/neighborhood/flor...,-1
6,7,How many events happened in Illinois that invo...,4,1,7,2.000000,1.991150,28.250000,{http://articles.chicagotribune.com/2013-08-20...,{http://kwqc.com/ap/5-teens-shot-in-drive-by-s...,-1
7,8,How many events happened in Florida that invol...,4,1,43,3.750000,3.056911,30.750000,{http://www.mynews13.com/content/news/cfnews13...,"{, http://www.local10.com/news/south-florida-w...",-1
8,9,How many events happened in Florida that invol...,3,1,15,3.000000,2.865979,32.333333,{http://www.winknews.com/2016/11/23/2-dead-3-i...,"{, http://www.local10.com/news/south-florida-w...",-1
9,10,How many events happened in Florida that invol...,7,1,52,2.857143,3.007937,18.000000,{http://www.firstcoastnews.com/news/local/data...,"{, http://www.local10.com/news/south-florida-w...",-1


In [13]:
#table = tabulate(question_df, headers='keys', tablefmt='html')
#display(HTML(table))
# Inspect the first candidate only: print its question, the complete row and
# every source url of its answer incidents, then stop.
for _, candidate_row in question_df.iterrows():
    print()
    print(candidate_row['question'])
    print(candidate_row)
    for answer_source in candidate_row['A_sources']:
        print(answer_source)
    break


How many events happened in California that involve Hernandez ?
q_id                                                                1
question            How many events happened in California that in...
answer                                                              5
OA_location                                                         1
OA_participant                                                     16
A_avg_#_sources                                                   2.4
N_avg_#_sources                                               2.14371
N2S                                                              33.4
A_sources           {https://www.lawa.org/uploadedFiles/LAX/LAWA%2...
N_sources           {http://www.local10.com/news/south-florida-wom...
A_avg_DCT_spread                                                   -1
Name: 0, dtype: object
https://www.lawa.org/uploadedFiles/LAX/LAWA%20T3%20After%20Action%20Report%20March%2018%202014.pdf
http://www.ksbw.com/news/central-california