## Creating a dictionary linking US cities to states

In [1]:
import sys
# Make the local Anaconda site-packages importable.
# A raw string is used so the backslashes in the Windows path are not parsed
# as (invalid) escape sequences; the resulting path value is unchanged.
# NOTE(review): this absolute path is machine-specific; prefer installing the
# packages into the kernel's own environment instead.
sys.path.append(r'C:\Anaconda3\Lib\site-packages')

# finding the state based on geotags
from geopy.geocoders import Nominatim

# the Geonamescache library contains information
# about continents, cities and US states
import geonamescache

In [2]:
# load the city records into a dictionary: 'c'
gc = geonamescache.GeonamesCache()
c = gc.get_cities()

# keep only the records whose country code is 'US' (filtered once),
# then pull the name and the coordinates out into parallel lists
us_records = [record for record in c.values()
              if record['countrycode'] == 'US']
US_cities = [record['name'] for record in us_records]
US_longs = [record['longitude'] for record in us_records]
US_latts = [record['latitude'] for record in us_records]

#### List of "broken" cities below
These city names appear more than once in the list (i.e., there are cities with the same name in different states), so a dictionary keyed by city name can hold only one state for each of them.

In [19]:
# which city names occur more than once, and how many are there?
import collections
name_counts = collections.Counter(US_cities)
duplicates = [name for name in name_counts if name_counts[name] > 1]
print(len(duplicates))
print(duplicates)

289
['Birmingham', 'Decatur', 'Enterprise', 'Florence', 'Helena', 'Homewood', 'Huntsville', 'Madison', 'Montgomery', 'Oxford', 'Selma', 'Troy', 'Conway', 'Fayetteville', 'Jacksonville', 'Texarkana', 'Dover', 'Middletown', 'Newark', 'Wilmington', 'Bloomingdale', 'Brandon', 'Brownsville', 'Edgewater', 'Gainesville', 'Hollywood', 'Lakeside', 'Leesburg', 'Oak Ridge', 'Palm Springs', 'Parkland', 'Princeton', 'Saint Cloud', 'Sanford', 'Spring Hill', 'University Park', 'Venice', 'West Hollywood', 'Westchester', 'Weston', 'Albany', 'Athens', 'Augusta', 'Brunswick', 'Canton', 'Carrollton', 'Columbus', 'Dublin', 'Duluth', 'Evans', 'Forest Park', 'Martinez', 'Rome', 'Roswell', 'Smyrna', 'Thomasville', 'Union City', 'Woodstock', 'Alton', 'Belleville', 'Charleston', 'Marion', 'Mount Vernon', "O'Fallon", 'Quincy', 'Springfield', 'Avon', 'Bloomington', 'Clarksville', 'Greenfield', 'Greenwood', 'Lawrence', 'New Castle', 'Plainfield', 'Richmond', 'Seymour', 'Shelbyville', 'Gardner', 'Kansas City', 'Man

In [20]:
def get_states(longs, latts, user_agent='us-city-to-state-notebook',
               geolocator=None, min_delay_seconds=1.0):
    ''' Reverse-geocode coordinate pairs into state names.

    Input two 1D lists of floats/ints: `longs` (longitudes) and `latts`
    (latitudes), paired by position. If the lists differ in length the
    extra elements of the longer one are ignored (zip semantics).

    Optional arguments:
      user_agent        -- identifier passed to Nominatim when no
                           `geolocator` is supplied (geopy 2.x refuses to
                           build Nominatim with its default user agent).
      geolocator        -- any object with a `reverse(query)` method that
                           returns an object exposing `.raw['address']`;
                           defaults to a new `Nominatim` instance.
      min_delay_seconds -- pause inserted between consecutive lookups; the
                           public Nominatim service allows at most one
                           request per second. Use 0 to disable.

    Returns a list of strings with one entry per coordinate pair; the entry
    is '' when the lookup failed or the result carries no state.
    '''
    import time  # local import keeps this cell self-contained

    # use a coordinate tool from the geopy library
    if geolocator is None:
        geolocator = Nominatim(user_agent=user_agent)

    # a list of states
    states = []
    for index, (lon, lat) in enumerate(zip(longs, latts)):
        # throttle every request after the first one
        if index and min_delay_seconds > 0:
            time.sleep(min_delay_seconds)
        try:
            # get the state name
            location = geolocator.reverse(str(lat) + ', ' + str(lon))
            state = location.raw['address']['state']
        except Exception:
            # best effort: a failed lookup, an empty result (None) or a
            # missing 'state' key yields an empty string. KeyboardInterrupt
            # is deliberately NOT caught so the long loop can be stopped.
            state = ''
        states.append(state)
    return states

In [11]:
# look up the state for every US city from its coordinates
# WARNING: slow -- one reverse-geocoding lookup is made per city
US_states = get_states(longs=US_longs, latts=US_latts)

  


In [12]:
# map each city name (key) to its state (value), skipping cities
# for which no state was found (empty string)
# NOTE: a city name that occurs several times keeps only the state
# of its last occurrence, because later entries overwrite earlier ones

city_to_state = {
    city: state
    for city, state in zip(US_cities, US_states)
    if state
}

In [14]:
# print every entry as a '"city": "state",' line, ready to paste
# into a dictionary literal
for city, state in city_to_state.items():
    print('"' + city + '":', '"' + state + '",')

"Fort Hunt": "Virginia",
"Bessemer": "Alabama",
"Paducah": "Kentucky",
"Birmingham": "Alabama",
"Center Point": "Alabama",
"Cullman": "Alabama",
"Daphne": "Alabama",
"Decatur": "Illinois",
"Dothan": "Alabama",
"East Florence": "Alabama",
"Enterprise": "Alabama",
"Fairhope": "Alabama",
"Florence": "South Carolina",
"Foley": "Alabama",
"Gadsden": "Alabama",
"Helena": "Montana",
"Homewood": "Alabama",
"Hoover": "Alabama",
"Hueytown": "Alabama",
"Huntsville": "Texas",
"Madison": "Connecticut",
"Millbrook": "Alabama",
"Mobile": "Alabama",
"Montgomery": "Alabama",
"Mountain Brook": "Alabama",
"Northport": "Alabama",
"Opelika": "Alabama",
"Oxford": "Ohio",
"Pelham": "Alabama",
"Phenix City": "Alabama",
"Prattville": "Alabama",
"Prichard": "Alabama",
"Selma": "Alabama",
"Talladega": "Alabama",
"Tillmans Corner": "Alabama",
"Troy": "Alabama",
"Trussville": "Alabama",
"Tuscaloosa": "Alabama",
"Vestavia Hills": "Alabama",
"Bella Vista": "Arkansas",
"Benton": "Arkansas",
"Bentonville": "Arkansas",