In [1]:
import pickle
import pandas
from collections import namedtuple

In [2]:
import look_up_utils
import classes
import metrics

In [3]:
from tabulate import tabulate
from IPython.core.display import display, HTML

## Load frames

In [4]:
frames = ['mass_shootings',
          'mass_shootings_2013',
          'mass_shootings_2014',
          'mass_shootings_2015']
look_up, parameters2incident_uris = look_up_utils.create_look_up(frames)

In [5]:
df = pandas.concat([pandas.read_pickle('../EventRegistries/GunViolence/frames/' + frame)
                    for frame in frames])

## Set parameter values

In [6]:
#CONFUSION_TUPLE=('location', 'participant')
#CONFUSION_TUPLE=('location', 'time')
CONFUSION_TUPLE=('participant', 'time')
MIN_CONFUSION=3

In [19]:
def lookup_and_merge(look_up, 
                     parameters2incident_uris, 
                     confusion_key,
                     debug=False):
    """
    
    :param dict look_up: see look_up_utils.py
    :param dict parameters2incident_uris: see look_up_utils.py
    :param tuple confusion_key: see constant CONFUSION_TUPLE
    
    :rtype: list
    :return list of namedtuple instances (each representing a candidate)
    """
    all_questions=[]
    
    for granularity in (look_up[confusion_key]):
        for sf in look_up[confusion_key][granularity]:
            num_answer_e=len(look_up[confusion_key][granularity][sf])
            if num_answer_e >= MIN_CONFUSION:
                meanings=look_up[confusion_key][granularity][sf]

                # obtain answers uris
                answer_incident_uris = parameters2incident_uris[confusion_key][granularity][sf]

                # obtain noise uris
                total_incident_uris={0:set(), 1:set()}
                noise_incident_uris={0:set(), 1:set()}
                oa={0:set(), 1:set()}
    #            total_incident_uris_part=set()
                for index in [0,1]:
                    confusion=confusion_key[index]

                    if debug:
                        print()
                        print(confusion)
                        print(granularity[index])
                        input('continue?')
                    all_meanings=look_up[(confusion,)][(granularity[index],)][(sf[index],)]
                    oa[index]=metrics.get_observed_ambiguity(all_meanings)
                    for set_of_m in all_meanings.values():
                        total_incident_uris[index].update(set_of_m)
                    noise_incident_uris[index] = total_incident_uris[index] - answer_incident_uris

                c=classes.Candidate(
                            confusion_factors=CONFUSION_TUPLE,
                            granularity=granularity,
                            sf=sf, 
                            meanings=meanings,
                            answer=num_answer_e, 
                            answer_incident_uris=answer_incident_uris,
                            noise_incident_uris=noise_incident_uris,
                            oa_list=oa
                )
                all_questions.append(c)
    return all_questions

candidates=lookup_and_merge(look_up, parameters2incident_uris,
                            CONFUSION_TUPLE,
                            debug=False)
print({c.granularity for c in candidates})

{('first', 'year'), ('last', 'year'), ('first', 'month'), ('full_name', 'year'), ('last', 'month'), ('last', 'day')}


In [20]:
def incident_uris_stats(df, set_of_uris):
    """
    compute stats for set of uris:
    
    :param df: gunviolence dataframe
    :param set set_of_uris: uris of gunviolence incidents
    
    :rtype: tuple
    :return (num_of_incidents,
             num_of_sources,
             avg_num_of_sources,
             urls,
             )
    """
    if type(set_of_uris) == dict:
        set_of_uris = set_of_uris[0].union(set_of_uris[1])
    num_of_incidents = len(set_of_uris)
    
    results = df.query('incident_uri in @set_of_uris')
    sources = set()
    for index, row in results.iterrows():
        sources.add(row['source_url'])
        sources.update(row['incident_sources'])
    
    num_sources = len(sources)
    avg = -1
    if len(set_of_uris):
        avg = num_sources / len(set_of_uris)
    
    return (num_of_incidents, num_sources, avg, sources)

In [21]:
def compute_num_docs(include, df):
#    include=('454788', '419250')
    results=df.query('incident_uri in @include')
    sources = set()
    
    print(include)
    for index, row in results.iterrows():
        sources.add(row['source_url'])
        sources.update(row['incident_sources'])
        
    num_sources = len(sources)
    avg= num_sources / len(include)
    
    print()
    for source in sources:
        print(source)
    print("Average number of documents per incident: %f" % avg)
    return avg

#compute_num_docs(('454788', '419250'), df)

In [22]:
def compute_confusion_metrics(look_up, candidates, df):
    
    for q_id, c in enumerate(candidates, 1):
        n=incident_uris_stats(df, c.noise_incident_uris) # computes noise stats
        a=incident_uris_stats(df, c.answer_incident_uris) # computes answer stats
        
        c.a_avg_num_sources=a[2]
        c.a_sources=a[3]
        c.n_avg_num_sources=n[2]
        c.n_sources=n[3]
        
        c.n2s_ratio = metrics.get_ratio___noise_e2answer_e(c.noise_incident_uris[0] | c.noise_incident_uris[1], c.answer_incident_uris)
#        n2s = (len(c.noise_incident_uris[0])+len(c.noise_incident_uris[1]))/len(c.answer_incidents_uris)
                
        c.qid=q_id
        
    return candidates

candidates = compute_confusion_metrics(look_up,candidates, df)

In [26]:
def display_candidates_in_df(candidates):
    list_of_lists = []
    headers = ['q_id',
               'question',
               'answer',
               'OA_' + candidates[0].confusion_factors[0],
               'OA_' + candidates[0].confusion_factors[1],
               #'A_num_of_incidents', 'A_#_sources', 
               'A_avg_#_sources', 
               #'N_num_of_incidents', 'N_#_sources', 
               'N_avg_#_sources',
               'N2S',
              'A_sources',
              'N_sources'
               ]
    for c in candidates:
        one_row = [
                c.qid,
                c.question,
                c.answer,
                c.oa_list[0],
                c.oa_list[1],
                #a[0],
                #a[1],
                c.a_avg_num_sources,
                #n[0],
                #n[1],
                c.n_avg_num_sources,
                c.n2s_ratio,
                c.a_sources,
                c.n_sources
                ]
        list_of_lists.append(one_row)
            
    question_df = pandas.DataFrame(list_of_lists, columns=headers)
    return question_df
    
question_df=display_candidates_in_df(candidates)

In [27]:
len(question_df)

309

In [28]:
question_df

Unnamed: 0,q_id,question,answer,OA_participant,OA_time,A_avg_#_sources,N_avg_#_sources,N2S,A_sources,N_sources
0,1,How many events happened in 2016 that involve ...,7,27,1,5.571429,3.030151,56.857143,{http://www.telegram.com/news/20160614/leomins...,"{, http://www.knoe.com/content/news/Four-hurt-..."
1,2,How many events happened in 2016 that involve ...,3,11,1,9.333333,3.041026,130.000000,{http://www.telegram.com/news/20160614/leomins...,"{, http://www.knoe.com/content/news/Four-hurt-..."
2,3,How many events happened in 2015 that involve ...,4,10,1,4.750000,2.555224,83.750000,{http://ktla.com/2015/12/14/man-killed-in-gunf...,{http://www.denverpost.com/news/ci_29085984/on...
3,4,How many events happened in 2014 that involve ...,3,7,1,3.333333,1.869565,92.000000,{http://www.nbcmiami.com/news/local/Multiple-P...,"{, http://www.kesq.com/news/party-shooting-lea..."
4,5,How many events happened in 2016 that involve ...,4,13,1,2.250000,3.082051,97.500000,{http://www.walb.com/story/32001315/three-auto...,"{, http://www.knoe.com/content/news/Four-hurt-..."
5,6,How many events happened in 2015 that involve ...,4,13,1,6.000000,2.544379,84.500000,{http://7online.com/news/charleston-shooting-s...,{http://www.denverpost.com/news/ci_29085984/on...
6,7,How many events happened in 2015 that involve ...,4,9,1,2.250000,2.568862,83.500000,{http://www.nwitimes.com/news/local/lake/gary/...,{http://www.denverpost.com/news/ci_29085984/on...
7,8,How many events happened in 2014 that involve ...,6,36,1,2.166667,2.254181,49.833333,{http://www.wsbtv.com/ap/ap/georgia/dublin-ga-...,{http://www.kesq.com/news/party-shooting-leave...
8,9,How many events happened in 2016 that involve ...,3,5,1,2.000000,3.078125,128.000000,{http://www.nbcmiami.com/news/local/House-Part...,"{, http://www.knoe.com/content/news/Four-hurt-..."
9,10,How many events happened in 2016 that involve ...,4,15,1,3.500000,3.066327,98.000000,{http://wavy.com/2016/06/19/four-people-shot-i...,"{, http://www.knoe.com/content/news/Four-hurt-..."


In [None]:
#table = tabulate(question_df, headers='keys', tablefmt='html')
#display(HTML(table))
for index, row in question_df.iterrows():
    print()
    print(row['question'])
    print(row)
    for source_url in row['A_sources']:
        print(source_url)
    break