In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON
from collections import defaultdict

## Question
**Do two cities with the same name occur in the same state?**
* the internet says [no](https://www.quora.com/Are-there-any-cases-of-two-US-cities-with-the-same-name-in-a-single-state)
* our dbpedia search says yes (see results below)

In [2]:
def get_json_results(sparql_query, endpoint="http://dbpedia.org/sparql"):
    """
    perform sparql query against a sparql endpoint
    and return the parsed json

    :param str sparql_query: text of the sparql query to execute
    :param str endpoint: url of the sparql endpoint to query
        (defaults to the public dbpedia endpoint)

    :rtype: dict
    :return: json results
    """
    sparql = SPARQLWrapper(endpoint)

    sparql.setQuery(sparql_query)
    # ask the endpoint for JSON so that convert() yields parsed JSON
    sparql.setReturnFormat(JSON)

    return sparql.query().convert()

## queries

```
SELECT DISTINCT ?city ?state ?label
WHERE { ?city <http://dbpedia.org/ontology/country> <http://dbpedia.org/resource/United_States> .
        ?city <http://www.w3.org/2000/01/rdf-schema#label> ?label .
        ?city <http://dbpedia.org/ontology/isPartOf> ?state .
        ?state <http://purl.org/dc/terms/subject> <http://dbpedia.org/resource/Category:States_of_the_United_States> .
        filter(langMatches(lang(?label),"EN"))
}
```


## all us states 

In [3]:
# SPARQL query selecting every distinct resource that is filed under
# Category:States_of_the_United_States and whose dbo:country is United_States.
state_query = '''
SELECT DISTINCT ?state 
WHERE { 
        ?state <http://purl.org/dc/terms/subject> <http://dbpedia.org/resource/Category:States_of_the_United_States> .
        ?state <http://dbpedia.org/ontology/country> <http://dbpedia.org/resource/United_States>
}
'''

In [4]:
# Run the state query once and collect the URI of every returned state.
states_info = get_json_results(state_query)
state_bindings = states_info["results"]["bindings"]

states_uris = set()
for binding in state_bindings:
    states_uris.add(binding['state']['value'])

## query cities per US state

In [5]:
# Query template: the single %s placeholder is filled with a state URI.
# Selects distinct ?city resources whose dbo:country is United_States and that
# are dbo:isPartOf the given state, together with their English rdfs:label.
cities_of_state_template = '''
SELECT DISTINCT ?city ?label
WHERE { ?city <http://dbpedia.org/ontology/country> <http://dbpedia.org/resource/United_States> .
        ?city <http://www.w3.org/2000/01/rdf-schema#label> ?label .
        ?city <http://dbpedia.org/ontology/isPartOf> <%s> .
        filter(langMatches(lang(?label),"EN"))
}
'''

In [8]:
# Group city URIs by (state URI, city name), so that names shared by several
# resources inside one state can be detected afterwards.
state_citylabel2cities = defaultdict(set)
for state_uri in states_uris:

    # one request per state: fill the state URI into the query template
    cities_info = get_json_results(cities_of_state_template % state_uri)

    for info in cities_info["results"]["bindings"]:

        # Keep only the text before the first ', ' (presumably labels look like
        # "<city>, <state>"); a label without ', ' is kept unchanged.
        city_label = info['label']['value'].split(', ', 1)[0]

        state_citylabel2cities[(state_uri, city_label)].add(info['city']['value'])

In [19]:
debug = False

# (state, city label) keys that map to at least two distinct city URIs
duplicated = {
    key: uris
    for key, uris in state_citylabel2cities.items()
    if len(uris) >= 2
}

# how many such keys exist, and which city labels they involve
counter = len(duplicated)
matching_cities = {city_label for (_state, city_label) in duplicated}

if debug:
    # step through every hit interactively
    for (state, city_label), value in duplicated.items():
        print()
        print(state, city_label)
        print(value)
        input('continue?')

In [21]:
# number of (state, city label) keys with two or more distinct city URIs
counter

2187

## check overlap with gun violence 

In [15]:
import pandas

In [16]:
from glob import glob
import os
# Basenames of the frame files, skipping entries whose name contains 'Icon'.
# glob() gives no ordering guarantee, so the names are sorted to keep the
# order (and anything built from it later) reproducible across machines.
all_frames = sorted(
    name
    for name in map(os.path.basename,
                    glob('../EventRegistries/GunViolence/frames/*'))
    if 'Icon' not in name
)
all_frames

['accidental_deaths',
 'accidental_deaths_children',
 'accidental_deaths_teens',
 'accidental_injuries',
 'accidental_injuries_children',
 'accidental_injuries_teens',
 'children_injured',
 'children_killed',
 'mass_shootings',
 'mass_shootings_2013',
 'mass_shootings_2014',
 'mass_shootings_2015',
 'officer_involved_shootings',
 'teens_injured',
 'teens_killed']

In [17]:
# Load every selected frame and stack them into one DataFrame.
# NOTE(review): read_pickle unpickles the files; only use on trusted data.
frames = all_frames
frame_tables = []
for frame in frames:
    frame_path = '../EventRegistries/GunViolence/frames/' + frame
    frame_tables.append(pandas.read_pickle(frame_path))
df = pandas.concat(frame_tables)

In [40]:
# Distinct (state, city_or_county, address) triples of rows whose
# city_or_county value is one of the matching city names.
# NOTE(review): only the city name is compared here; the row's state is not.
gv_matches = {
    (record['state'], record['city_or_county'], record['address'])
    for _, record in df.iterrows()
    if record['city_or_county'] in matching_cities
}

## attempt to disambiguate to dbpedia link

In [33]:
# For every state value in the data, collect the set of city_or_county values.
states2cities = defaultdict(set)
for _, record in df.iterrows():
    state_name, place = record['state'], record['city_or_county']
    states2cities[state_name].add(place)

In [45]:
# total number of distinct (state, city_or_county) combinations
sum(len(cities) for cities in states2cities.values())

1516

## try using geo modules

In [77]:
# number of distinct (state, city_or_county, address) triples that matched
len(gv_matches)

611

In [66]:
import geopy

In [67]:
from geopy.geocoders import Nominatim

In [68]:
# Nominatim's usage policy requires an application-specific user agent, and
# current geopy releases refuse to build the geocoder without one.
geolocator = Nominatim(user_agent="same_city_names_notebook")

In [75]:
# geocode one sample address, allowing up to 10 seconds for the request
query_address = "5105 Cleveland Rd, Jacksonville, Florida"
location = geolocator.geocode(query_address, timeout=10)

In [76]:
# show the geocoding result for the sample address
location

Location(5105, Cleveland Road, Jacksonville, Duval County, Florida, 32209, United States of America, (30.3672231701182, -81.707894903197, 0.0))

In [2]:
# NOTE: the recorded NameError comes from running this cell in a fresh kernel
# (execution count 2) before `location` was assigned; run the geocode cell first.
type(location)

NameError: name 'location' is not defined