In [1]:
import pandas
from datetime import datetime, timedelta
from dateutil import parser
from collections import Counter
from SPARQLWrapper import SPARQLWrapper, JSON
import json

In [2]:
# Load the incident dataframe. The cells below read its columns
# 'incident_uri', 'date', 'incident', 'title', 'state' and 'city_or_county'.
# NOTE(review): unpickling can execute arbitrary code, so this file should
# only be loaded when it comes from a trusted source.
df = pandas.read_pickle('firerescue_v2.pickle')

## location utils

In [3]:
def get_json_results(sparql_query):
    """
    Send a SPARQL query through the notebook-level ``sparql`` client
    and hand back the decoded JSON response.

    :param str sparql_query: text of the SPARQL query to run

    :rtype: dict
    :return: json results
    """
    # `sparql` is the module-level SPARQLWrapper instance; it has to be
    # defined before this function is called for the first time.
    sparql.setQuery(sparql_query)
    sparql.setReturnFormat(JSON)

    return sparql.query().convert()

def get_state_sf2dbpedia_uri(df):
    """
    Create a mapping from the state surface forms found in the dataframe
    to DBpedia URIs.

    Candidate URIs are fetched from DBpedia: resources that have
    ``Category:States_of_the_United_States`` as subject and the United
    States as country. A surface form is linked to a candidate when the
    candidate URI ends with '/' followed by the surface form with spaces
    replaced by underscores.

    :param pandas.core.frame.DataFrame df: dataframe with a 'state' column

    :rtype: dict
    :return: mapping state surface form -> dbpedia uri. 'Georgia' and
        'Washington' are seeded by hand and are therefore always present;
        a direct suffix match for the same name replaces the seed.
    """
    state_query = '''
    SELECT DISTINCT ?state
    WHERE {
            ?state <http://purl.org/dc/terms/subject> <http://dbpedia.org/resource/Category:States_of_the_United_States> .
            ?state <http://dbpedia.org/ontology/country> <http://dbpedia.org/resource/United_States>
    }
    '''

    states_info = get_json_results(state_query)
    states_uris = {
        result['state']['value']
        for result in states_info["results"]["bindings"]
        }

    # Hand-written entries for names whose DBpedia resource carries a
    # disambiguating suffix, which the plain suffix match below cannot find.
    gv_sf_state2uri = {
        'Georgia': 'http://dbpedia.org/resource/Georgia_(U.S._state)',
        'Washington': 'http://dbpedia.org/resource/Washington_(state)',
    }

    for gv_sf_state in set(df['state']):
        # Missing values (None / NaN) have no surface form to match and
        # would otherwise fail on the string operations below.
        if not isinstance(gv_sf_state, str):
            continue

        # The suffix depends only on the state, so build it once per state.
        uri_suffix = '/' + gv_sf_state.replace(' ', '_')
        for state_uri in states_uris:
            if state_uri.endswith(uri_suffix):
                gv_sf_state2uri[gv_sf_state] = state_uri

    return gv_sf_state2uri

def possible_db_uris_for_city_county(city_or_county, state_uri):
    """
    Given a city or county name and a state, return the DBpedia URIs that
    could denote that place.

    A resource qualifies when it reaches `state_uri` through zero or more
    ``dbo:isPartOf`` links (so the state resource itself can qualify) and
    one of its labels starts with `city_or_county`.

    :param str city_or_county: e.g. Rochester
    :param str state_uri: e.g. http://dbpedia.org/resource/New_York

    :rtype: set
    :return: set of possible dbpedia uris; empty when `city_or_county` is
        not a string or is blank (no query is sent in that case)
    """
    # A missing or blank name would turn the filter into "label starts with
    # the empty string", which matches every resource under the state.
    if not isinstance(city_or_county, str) or not city_or_county.strip():
        return set()

    # Escape the characters that would terminate or corrupt the
    # double-quoted SPARQL literal the name is embedded in.
    label_prefix = (city_or_county
                    .replace('\\', '\\\\')
                    .replace('"', '\\"')
                    .replace('\n', '\\n')
                    .replace('\r', '\\r'))

    query = '''
    SELECT DISTINCT ?uri
    WHERE {
    ?uri <http://dbpedia.org/ontology/isPartOf>* <%s> .
    ?uri rdfs:label ?label .
    filter strStarts(?label, "%s")
    }
    ''' % (state_uri, label_prefix)

    json_result = get_json_results(query)

    return {
        result['uri']['value']
        for result in json_result["results"]["bindings"]
        }

In [4]:
def weekday_approach(reporting_date, first_sentence):
    """
    Estimate the incident date from a weekday name mentioned in the text.

    The names Monday..Sunday are tried in that order; the first one that
    occurs in `first_sentence` (case-sensitive substring test) is used.
    The estimate is the most recent date strictly before `reporting_date`
    that falls on that weekday, so a mention of the reporting date's own
    weekday maps to exactly one week earlier.

    :param datetime.datetime reporting_date: date the report was published
    :param str first_sentence: text to search for a weekday name

    :rtype: datetime.datetime or None
    :return: estimated incident date (time-of-day fields of
        `reporting_date` are kept), or None when no weekday name is found
        or `first_sentence` is not a string
    """
    weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                'Friday', 'Saturday', 'Sunday']

    # Missing text (None / NaN) cannot mention a weekday.
    if not isinstance(first_sentence, str):
        return None

    for weekday_index, week_day in enumerate(weekdays):
        if week_day in first_sentence:
            # datetime.weekday() counts Monday as 0, matching `weekdays`.
            # Move forward to the first such weekday on or after the
            # reporting date, then step back one full week.
            days_forward = (weekday_index - reporting_date.weekday()) % 7
            return reporting_date + timedelta(days=days_forward - 7)

    return None

def return_wiki_of_location(row, gv_sf_state2uri):
    """
    Look up an English Wikipedia URL for the location of an incident row.

    :param row: row providing the 'state' and 'city_or_county' fields
    :param dict gv_sf_state2uri: mapping state surface form -> dbpedia uri

    :rtype: str
    :return: wikipedia url, or '' when the state is not in the mapping or
        DBpedia yields no candidate for the city / county
    """
    state_name = row['state']
    if state_name not in gv_sf_state2uri:
        return ''

    candidates = possible_db_uris_for_city_county(row['city_or_county'],
                                                  gv_sf_state2uri[state_name])
    if len(candidates) < 1:
        return ''

    # When several resources match, the lexicographically smallest URI wins.
    chosen_uri = min(candidates)

    # DBpedia resource names mirror English Wikipedia page names, so only
    # the URL prefix is swapped.
    return chosen_uri.replace('http://dbpedia.org/resource/',
                              'https://en.wikipedia.org/wiki/')

In [5]:
# Shared SPARQL client pointed at the DBpedia endpoint. get_json_results()
# reads this module-level name, so this cell has to run before any query.
sparql = SPARQLWrapper("http://dbpedia.org/sparql")

In [6]:
# Resolve the state names of the dataframe to DBpedia URIs once, up front.
# This sends a SPARQL query, so `sparql` must already be defined.
gv_sf_state2uri = get_state_sf2dbpedia_uri(df)

In [7]:
# One label per dataframe row describing how it was handled:
# 'no summary', 'weekday' or 'unknown'.
log = []

# incident_uri -> estimated date, estimated location and the source article.
structured_data = dict()

# The location lookup depends only on the (state, city_or_county) pair, so
# its result is remembered to avoid sending the same DBpedia request again.
location_cache = dict()

for index, row in df.iterrows():

    incident_uri = row['incident_uri']
    date_string = row['date']
    reporting_date = datetime.strptime(date_string, "%B %d, %Y")
    summary = row['incident']

    # Rows without text, or whose first text entry is missing, are skipped.
    if not summary or summary[0] is None:
        log.append('no summary')
        continue

    first_sentence = summary[0]

    # weekday attempt
    weekday_attempt = weekday_approach(reporting_date, first_sentence)
    if weekday_attempt:
        log.append('weekday')
        estimated_incident_date = '{:%Y-%m-%d}'.format(weekday_attempt)
    else:
        log.append('unknown')
        estimated_incident_date = ''

    location_key = (row['state'], row['city_or_county'])
    if location_key not in location_cache:
        location_cache[location_key] = return_wiki_of_location(row,
                                                               gv_sf_state2uri)
    estimated_location = location_cache[location_key]

    # Empty entries of the summary are dropped before joining the body text.
    incident_info = {'estimated_incident_date': estimated_incident_date,
                     'estimated_location': estimated_location,
                     'articles': [{'dct': date_string,
                                   'title': row['title'],
                                   'body': ' '.join(paragraph
                                                    for paragraph in summary
                                                    if paragraph)}]}
    structured_data[incident_uri] = incident_info

2009-02-19
2009-02-06
2009-01-18
2009-01-09
2009-01-05
2009-01-07
2008-12-17
2008-12-16
2008-12-08
2008-12-04
2008-11-22
2008-11-17
2008-11-16
2008-11-12
2008-11-09
2008-11-04
2008-11-06
2008-10-24
2008-10-23
2008-10-22
2008-10-13
2008-10-14
2008-10-08
2008-10-02
2008-09-30
2008-09-25
2008-09-09
2008-09-08
2008-08-16
2008-07-22
2008-07-13
2008-07-07
2008-07-02
2008-06-30
2008-06-24
2008-06-23
2008-06-15
2008-06-07
2008-06-08
2008-05-25
2008-05-20
2008-05-12
2008-05-09
2008-05-06
2008-04-30
2008-04-22
2008-03-26
2008-02-20
2008-02-04
2008-01-25
2008-01-27
2008-01-07
2008-01-02
2007-12-18
2007-12-19
2007-12-04
2007-12-04
2007-12-05
2007-12-02
2007-12-01
2007-11-25
2007-11-28
2007-11-14
2007-11-15
2007-11-16
2007-11-14
2007-11-06
2007-11-08
2007-10-31
2007-10-26
2007-10-22
2007-10-15
2007-10-14
2007-10-11
2007-10-09
2007-10-09
2007-10-05
2007-09-28
2007-09-23
2007-09-22
2007-09-20
2007-09-15
2007-09-04
2007-09-03
2007-09-03
2007-08-29
2007-08-22
2007-08-25
2007-08-17
2007-08-18
2007-08-13

In [8]:
# Persist the collected incidents as a single JSON document.
with open('firerescue_structured_data.json', 'w') as outfile:
    outfile.write(json.dumps(structured_data))

In [9]:
# Number of incidents that ended up in the structured output.
len(structured_data)

320

In [10]:
# For each estimated attribute, report how many incidents still carry the
# empty-string default, i.e. for how many no estimate could be made.
for attribute in ['estimated_incident_date', 'estimated_location']:
    default_values = sum(1
                         for incident_info in structured_data.values()
                         if incident_info[attribute] == '')
    print(attribute, default_values)

estimated_incident_date 107
estimated_location 35
