# Generating Lookup Tables for Geolocation
This notebook details the steps to generate the lookup tables for geolocation.

In [1078]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 300)

# International Cities and Counties
World cities with pops >100K, Canadian cities, Canadian counties, UK cities

In [1079]:
# Only need first column of data
usecols=['Unnamed: 0', 'Both sexes\nLes deux sexes', 'Both sexes\nLes deux sexes.1']
un_cities = pd.read_csv('../public_data/uncities.csv', usecols=usecols, skiprows=4)\
    .set_axis(['name', 'city_pop', 'urban_pop'], axis=1)
# Drop US Cities
first_us_idx = un_cities.index[un_cities['name']=='Abilene (TX)'][0]
last_us_idx = un_cities.index[un_cities['name']=='CHARLOTTE AMALIE'][0]
un_cities = un_cities.drop(index=range(first_us_idx, last_us_idx+1))
un_cities = un_cities.dropna(subset=['city_pop', 'urban_pop'])  # Drop if neither city or urban pop exists
un_cities['pop'] = np.where(un_cities['urban_pop'].str.startswith('.'), un_cities['city_pop'], un_cities['urban_pop'])
un_cities = un_cities[['name', 'pop']]

In [1080]:
usecols = ['Unnamed: 2', 'Population (usual residents) : All usual residents - Unit : Persons']
uk_areas = pd.read_csv('../public_data/ukcensusareas.csv', skiprows=1, usecols=usecols)\
    .set_axis(['name', 'pop'], axis=1)
uk_areas[uk_areas.notna()]

Unnamed: 0,name,pop
0,England,53012456
1,Northern Ireland,1810863
2,Scotland,5295403
3,Wales,3063456
4,North East,2596886
...,...,...
9928,"Garth, Menai (Bangor)",5125
9929,"Haverfordwest: Prendergast, Rudbaxton",2952
9930,"Brackla, Coity",13820
9931,"Crickhowell, Llangattock",3800


In [1081]:
load_cols = ['GEO_NAME', 'DIM: Profile of Census Metropolitan Areas/Census Agglomerations (2247)',
            'Dim: Sex (3): Member ID: [1]: Total - Sex']
canada_areas = pd.read_csv('../public_data/canadacensusareas.csv', low_memory=False, usecols=load_cols)\
                  .set_axis(['name', 'variable', 'pop'], axis=1)
canada_areas = canada_areas[canada_areas['variable'] == 'pop, 2016'].drop(columns=['variable'])
canada_areas

Unnamed: 0,name,pop


In [1082]:
un_countries = pd.read_csv('../public_data/uncountries.csv', header=None)
# Set country pops high so that they take precedence over city names
un_countries['pop'] = 1000000000
un_countries = un_countries.iloc[:,[0,3]].set_axis(['name', 'pop'], axis=1)
un_countries.head()

Unnamed: 0,name,pop
0,Afghanistan,1000000000
1,Åland Islands,1000000000
2,Albania,1000000000
3,Algeria,1000000000
4,American Samoa,1000000000


In [1083]:
un_countries[un_countries['name'].str.contains('None')]

Unnamed: 0,name,pop


In [1084]:
foreign_entities_df = pd.DataFrame(columns=['name','pop'])
for df in [un_cities, un_countries, canada_areas, uk_areas]:
    foreign_entities_df = foreign_entities_df.append(df.copy())
foreign_entities_df['pop'] = foreign_entities_df['pop'].astype(str).str.replace(',', '').astype(int)
#foreign_entities_df['name'] = foreign_entities_df['name'].str.lower().str.replace('city','')
foreign_entities_df['name'] = foreign_entities_df['name'].str.replace('City','').str.strip()
foreign_entities_df['in_us'] = False
foreign_entities_df = foreign_entities_df[~(foreign_entities_df['name']=='')]
foreign_entities_df = foreign_entities_df[foreign_entities_df['name'].str.len() >= 4]
foreign_entities_df.sort_values('pop', ascending=False)

Unnamed: 0,name,pop,in_us
51,Colombia,1000000000,False
141,Mauritius,1000000000,False
157,New Zealand,1000000000,False
156,New Caledonia,1000000000,False
155,Netherlands,1000000000,False
...,...,...,...
905,Bushmills,738,False
297,JAMESTOWN,657,False
3991,ALOFI,639,False
3361,VATICAN CITY,451,False


# US City Data

In [1085]:
states = pd.read_csv('../public_data/usstateabbreviations.csv', names=['abb', 'name'])['name']

def extract_state(state):
    split = state.split(',')[-1].strip()
    # Some state strings have multiple punctuation and require extra filtering,
    # e.g., "Lynchburg, Moore county metropolitan government, Tennessee"
    if ';' in split:
        return split.split(';')[-1].strip()
    else:
        return split

In [1086]:
import re
def tokenize_cityname(row):
    state = row['state']
    city = row['raw_name_string']
    city = city.split(',')[0]     # Remove State from city name
    # These strings were appended by the census; 'and' is for multiple area labels
    area_names = ['city', 'village', 'borough', 'town', 'CDP', 'municipality', 'and']
    city_and_state = city[:]
    for area_name in area_names:
        city_and_state = city_and_state.replace(' '+area_name, '', 1)
    stripped = city_and_state.strip(' ,')
    # Ensures strings like "Arizona City, Arizona" are changed to "Arizona City"
    # and not "City, Arizona" after strip.
    return state + ' City' if state in stripped else stripped

In [1087]:
len(us_cities_df['state'].unique())

52

In [1088]:
# Load data and apply operations
us_cities_df = pd.read_csv('../public_data/usacscities.csv', skiprows=1,
                          usecols=['Geographic Area Name', 'Estimate!!Total'])\
                           .set_axis(['raw_name_string', 'pop'], axis=1)
us_cities_df = us_cities_df[us_cities_df['pop'] >= 5000] # Filter by population
us_cities_df['state'] = us_cities_df['raw_name_string'].apply(extract_state)
us_cities_df['name'] = us_cities_df.apply(tokenize_cityname, axis=1)\
    .str.replace('City','')
    #.str.lower().str.replace('city', '')
us_cities_df = us_cities_df.sort_values('pop', ascending=False)\
                           .drop_duplicates(subset=['name'], keep='first')
us_cities_df['in_us'] = True
print(us_cities_df.shape)
us_cities_df.head()

(5293, 5)


Unnamed: 0,raw_name_string,pop,state,name,in_us
17551,"New York city, New York",8426743,New York,New York,True
2725,"Los Angeles city, California",3900794,California,Los Angeles,True
6283,"Chicago city, Illinois",2717534,Illinois,Chicago,True
25203,"Houston city, Texas",2217706,Texas,Houston,True
22711,"Philadelphia city, Pennsylvania",1555072,Pennsylvania,Philadelphia,True


# Merge US & International Cities

In [1089]:
foreign_entities_df['state'] = None
foreign_entities_df['raw_name_string'] = None
#foreign_entities_df = foreign_entities_df.reindex(us_cities_df.columns, axis=1)
foreign_entities_df.head()
print(foreign_entities_df[foreign_entities_df['pop'] == 1000000000].head())
all_entities = us_cities_df.append(foreign_entities_df)
all_entities = all_entities.sort_values('pop', ascending=False)
all_entities.head()

             name         pop  in_us state raw_name_string
0     Afghanistan  1000000000  False  None            None
1   Åland Islands  1000000000  False  None            None
2         Albania  1000000000  False  None            None
3         Algeria  1000000000  False  None            None
4  American Samoa  1000000000  False  None            None


Unnamed: 0,raw_name_string,pop,state,name,in_us
138,,1000000000,,Marshall Islands,False
115,,1000000000,,Japan,False
201,,1000000000,,Singapore,False
102,,1000000000,,Holy See,False
103,,1000000000,,Honduras,False


## Format the City Name from the list of cities to include only the city name

In [1090]:
def add_variation(df, original, replacement):
    to_append = df[df['name'].str.contains(original)].copy()
    to_append['name'] = to_append['name'].str.replace(original, replacement)
    df = df.append(to_append)
    return df

replacements = {
    'st.': 'st',
    'new york':'nyc',
    'los angeles':'la'
}

for k, v in replacements.items():
    us_cities_df = add_variation(us_cities_df, k, v)

In [1091]:
foreign_entities_df.shape

(13417, 5)

# State names and abbreviations

In [1092]:
states_df = pd.read_csv('../public_data/usstateabbreviations.csv', header=None).set_axis(['state_abb','state_name'], axis=1)
states_df['abb_with_periods'] = states_df['state_abb'].apply(lambda s: s[0]+'.'+s[1]+'.')
states_df.head(3)

Unnamed: 0,state_abb,state_name,abb_with_periods
0,AK,Alaska,A.K.
1,AL,Alabama,A.L.
2,AR,Arkansas,A.R.


In [1093]:
state_strings = set([s.lower() for row in states_df.values.tolist() for s in row])

In [1094]:
state_strings

{'a.k.',
 'a.l.',
 'a.r.',
 'a.z.',
 'ak',
 'al',
 'alabama',
 'alaska',
 'ar',
 'arizona',
 'arkansas',
 'az',
 'c.a.',
 'c.o.',
 'c.t.',
 'ca',
 'california',
 'co',
 'colorado',
 'connecticut',
 'ct',
 'd.c.',
 'd.e.',
 'dc',
 'de',
 'delaware',
 'district of columbia',
 'f.l.',
 'fl',
 'florida',
 'g.a.',
 'ga',
 'georgia',
 'h.i.',
 'hawaii',
 'hi',
 'i.a.',
 'i.d.',
 'i.l.',
 'i.n.',
 'ia',
 'id',
 'idaho',
 'il',
 'illinois',
 'in',
 'indiana',
 'iowa',
 'k.s.',
 'k.y.',
 'kansas',
 'kentucky',
 'ks',
 'ky',
 'l.a.',
 'la',
 'louisiana',
 'm.a.',
 'm.d.',
 'm.e.',
 'm.i.',
 'm.n.',
 'm.o.',
 'm.s.',
 'm.t.',
 'ma',
 'maine',
 'maryland',
 'massachusetts',
 'md',
 'me',
 'mi',
 'michigan',
 'minnesota',
 'mississippi',
 'missouri',
 'mn',
 'mo',
 'montana',
 'ms',
 'mt',
 'n.c.',
 'n.d.',
 'n.e.',
 'n.h.',
 'n.j.',
 'n.m.',
 'n.v.',
 'n.y.',
 'nc',
 'nd',
 'ne',
 'nebraska',
 'nevada',
 'new hampshire',
 'new jersey',
 'new mexico',
 'new york',
 'nh',
 'nj',
 'nm',
 'north carol

# Shelving

In [1097]:
import shelve
shelf = shelve.open('../public_data/location_data')
shelf['foreign_entities_df'] = foreign_entities_df
shelf['state_strings'] = state_strings
shelf['states_df'] = states_df
shelf.close()

In [1096]:
foreign_entities_df[foreign_entities_df['name'].str.contains('Angeles')]

Unnamed: 0,name,pop,in_us,state,raw_name_string
2834,Angeles,411634,False,,
