# Generating Lookup Tables for Geolocation
This notebook details the steps to generate the lookup tables for geolocation.

In [633]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 300)

# International Cities and Counties
World cities with populations >100K, Canadian cities, Canadian counties, UK cities

In [705]:
# Only need first column of data
usecols=['Unnamed: 0', 'Both sexes\nLes deux sexes', 'Both sexes\nLes deux sexes.1']
un_cities = pd.read_csv('../public_data/uncities.csv', usecols=usecols, skiprows=4)\
    .set_axis(['name', 'city_pop', 'urban_pop'], axis=1)\
    .dropna(subset=['city_pop', 'urban_pop'])  # Drop if neither city or urban population exists
# '...' is alternative name for na
un_cities['pop'] = np.where(un_cities['urban_pop'].str.startswith('.'), un_cities['city_pop'], un_cities['urban_pop'])
un_cities = un_cities[['name', 'pop']]

Unnamed: 0,name,pop


In [682]:
un_cities.iloc[0].str.contains('...')

name    True
pop     True
Name: 4, dtype: bool

In [712]:
usecols = ['Unnamed: 2', 'Population (usual residents) : All usual residents - Unit : Persons']
uk_areas = pd.read_csv('../public_data/ukcensusareas.csv', skiprows=1, usecols=usecols)\
    .set_axis(['name', 'pop'], axis=1)
uk_areas[uk_areas.notna()]

Unnamed: 0,name,pop
0,England,53012456
1,Northern Ireland,1810863
2,Scotland,5295403
3,Wales,3063456
4,North East,2596886
...,...,...
9928,"Garth, Menai (Bangor)",5125
9929,"Haverfordwest: Prendergast, Rudbaxton",2952
9930,"Brackla, Coity",13820
9931,"Crickhowell, Llangattock",3800


In [723]:
load_cols = ['GEO_NAME', 'DIM: Profile of Census Metropolitan Areas/Census Agglomerations (2247)',
            'Dim: Sex (3): Member ID: [1]: Total - Sex']
canada_areas = pd.read_csv('../public_data/canadacensusareas.csv', low_memory=False, usecols=load_cols)\
                  .set_axis(['name', 'variable', 'pop'], axis=1)
canada_areas = canada_areas[canada_areas['variable'] == 'Population, 2016'].drop(columns=['variable'])
canada_areas

Unnamed: 0,name,pop
0,St. John's,205955
2247,Bay Roberts,11083
4494,Grand Falls-Windsor,14171
6741,Gander,13234
8988,Corner Brook,31917
11235,Charlottetown,69325
13482,Summerside,16587
15729,Halifax,403390
17976,Kentville,26222
20223,Truro,45753


In [664]:
un_countries = pd.read_csv('../public_data/uncountries.csv', header=None)
# Set country populations high so that they take precedence over city names
un_countries['population'] = 1000000000
un_countries = un_countries.iloc[:,[0,3]].set_axis(['name', 'pop'], axis=1)
un_countries.head()

Unnamed: 0,name,pop
0,Afghanistan,1000000000
1,Åland Islands,1000000000
2,Albania,1000000000
3,Algeria,1000000000
4,American Samoa,1000000000


In [746]:
foreign_entities_df = pd.DataFrame(columns=['name','pop'])
for df in [un_cities, un_countries, canada_areas, uk_areas]:
    foreign_entities_df = foreign_entities_df.append(df.copy())
foreign_entities_df['pop'] = foreign_entities_df['pop'].astype(str).str.replace(',', '')
foreign_entities_df['name'] = foreign_entities_df['name'].str.lower().str.replace('city','')

# US City Data

In [545]:
states = pd.read_csv('../public_data/usstateabbreviations.csv', names=['abb', 'name'])['name']

# Filter by population
us_cities_df = us_cities_df[us_cities_df['population'] >= 5000]

def extract_state(state):
    split = state.split(',')[-1].strip()
    # Some state strings have multiple punctuation and require extra filtering,
    # e.g., "Lynchburg, Moore county metropolitan government, Tennessee"
    if ';' in split:
        return split.split(';')[-1].strip()
    else:
        return split

In [546]:
import re
def tokenize_cityname(row):
    state = row['state']
    city = row['city_name']
    city = city.split(',')[0]     # Remove State from city name
    # These strings were appended by the census; 'and' is for multiple area labels
    area_names = ['city', 'village', 'borough', 'town', 'CDP', 'municipality', 'and']
    city_and_state = city[:]
    for area_name in area_names:
        city_and_state = city_and_state.replace(' '+area_name, '', 1)
    stripped = city_and_state.strip(' ,')
    # Ensures strings like "Arizona City, Arizona" are changed to "Arizona City"
    # and not "City, Arizona" after strip.
    return state + ' City' if state in stripped else stripped

In [547]:
len(us_cities_df['state'].unique())

52

In [548]:
# Load data and apply operations
us_cities_df = pd.read_csv('../public_data/usacscities.csv', skiprows=1,
                          usecols=['Geographic Area Name', 'Estimate!!Total'])\
                           .set_axis(['city_name', 'population'], axis=1)
us_cities_df['state'] = us_cities_df['city_name'].apply(extract_state)
us_cities_df['city_name_formatted'] = us_cities_df.apply(tokenize_cityname, axis=1)
us_cities_df = us_cities_df.sort_values('population', ascending=False)\
                           .drop_duplicates(subset=['city_name_formatted'], keep='first')
us_cities_df.head()

Unnamed: 0,city_name,population,state,city_name_formatted
17551,"New York city, New York",8426743,New York,New York City
2725,"Los Angeles city, California",3900794,California,Los Angeles
6283,"Chicago city, Illinois",2717534,Illinois,Chicago
25203,"Houston city, Texas",2217706,Texas,Houston
22711,"Philadelphia city, Pennsylvania",1555072,Pennsylvania,Philadelphia


## Format the City Name from the list of cities to include only the city name

In [549]:
re.split('\W+', 'St. George')

['St', 'George']

In [570]:
to_append = us_cities_df[us_cities_df['city_name_formatted'].str.contains('St.')].copy()
to_append['city_name_formatted'] = to_append['city_name_formatted'].apply(lambda s: s.replace('St.', 'St'))
us_cities_df = us_cities_df.append(to_append)

In [571]:
us_cities_df[us_cities_df['city_name_formatted'].str.contains('George')]

Unnamed: 0,city_name,population,state,city_name_formatted
26515,"St. George city, Utah",76915,Utah,St. George
25078,"Georgetown city, Texas",56102,Texas,Georgetown
26932,"George Mason CDP, Virginia",10065,Virginia,George Mason
11435,"Woodlawn CDP (Prince George's County), Maryland",7973,Maryland,Woodlawn (Prince George's County)
10765,"Village St. George CDP, Louisiana",7294,Louisiana,Village St. George
27001,"King George CDP, Virginia",5299,Virginia,King George
25079,"George West city, Texas",2538,Texas,George West
5282,"Georgetown-Quitman County unified government, ...",2326,Georgia,Georgetown-Quitman County unified government
27152,"Prince George CDP, Virginia",1944,Virginia,Prince George
22080,"Georgetown CDP (Luzerne County), Pennsylvania",1782,Pennsylvania,Georgetown (Luzerne County)


# State names and abbreviations

In [593]:
states_df = pd.read_csv('../public_data/usstateabbreviations.csv', header=None).set_axis(['state_abb','state_name'], axis=1)
states_df['abb_with_periods'] = states_df['state_abb'].apply(lambda s: s[0]+'.'+s[1]+'.')
states_df.head(3)

Unnamed: 0,state_abb,state_name,abb_with_periods
0,AK,Alaska,A.K.
1,AL,Alabama,A.L.
2,AR,Arkansas,A.R.


In [606]:
state_strings = set([s.lower() for row in states_df.values.tolist() for s in row])

In [607]:
state_strings

{'a.k.',
 'a.l.',
 'a.r.',
 'a.z.',
 'ak',
 'al',
 'alabama',
 'alaska',
 'ar',
 'arizona',
 'arkansas',
 'az',
 'c.a.',
 'c.o.',
 'c.t.',
 'ca',
 'california',
 'co',
 'colorado',
 'connecticut',
 'ct',
 'd.c.',
 'd.e.',
 'dc',
 'de',
 'delaware',
 'district of columbia',
 'f.l.',
 'fl',
 'florida',
 'g.a.',
 'ga',
 'georgia',
 'h.i.',
 'hawaii',
 'hi',
 'i.a.',
 'i.d.',
 'i.l.',
 'i.n.',
 'ia',
 'id',
 'idaho',
 'il',
 'illinois',
 'in',
 'indiana',
 'iowa',
 'k.s.',
 'k.y.',
 'kansas',
 'kentucky',
 'ks',
 'ky',
 'l.a.',
 'la',
 'louisiana',
 'm.a.',
 'm.d.',
 'm.e.',
 'm.i.',
 'm.n.',
 'm.o.',
 'm.s.',
 'm.t.',
 'ma',
 'maine',
 'maryland',
 'massachusetts',
 'md',
 'me',
 'mi',
 'michigan',
 'minnesota',
 'mississippi',
 'missouri',
 'mn',
 'mo',
 'montana',
 'ms',
 'mt',
 'n.c.',
 'n.d.',
 'n.e.',
 'n.h.',
 'n.j.',
 'n.m.',
 'n.v.',
 'n.y.',
 'nc',
 'nd',
 'ne',
 'nebraska',
 'nevada',
 'new hampshire',
 'new jersey',
 'new mexico',
 'new york',
 'nh',
 'nj',
 'nm',
 'north carol

# Shelving

In [747]:
import shelve
shelf = shelve.open('../public_data/location_data')
shelf['foreign_entities_df'] = foreign_entities_df
shelf['state_strings'] = state_strings

In [None]:
shelf['foreign_entities'].pop()