# Generating Lookup Tables for Geolocation
This notebook details the steps to generate the lookup tables for geolocation.

In [1218]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 300)

# International Cities and Counties
World cities with pops >100K, Canadian cities, Canadian counties, UK cities

In [1219]:
# Only need first column of data
usecols=['Unnamed: 0', 'Both sexes\nLes deux sexes', 'Both sexes\nLes deux sexes.1']
un_cities = pd.read_csv('../public_data/uncities.csv', usecols=usecols, skiprows=4)\
    .set_axis(['name', 'city_pop', 'urban_pop'], axis=1)
# Drop US Cities
first_us_idx = un_cities.index[un_cities['name']=='Abilene (TX)'][0]
last_us_idx = un_cities.index[un_cities['name']=='CHARLOTTE AMALIE'][0]
un_cities = un_cities.drop(index=range(first_us_idx, last_us_idx+1))
un_cities = un_cities.dropna(subset=['city_pop', 'urban_pop'])  # Drop if neither city or urban pop exists
un_cities['pop'] = np.where(un_cities['urban_pop'].str.startswith('.'), un_cities['city_pop'], un_cities['urban_pop'])
un_cities = un_cities[['name', 'pop']]

In [1220]:
usecols = ['Unnamed: 2', 'Population (usual residents) : All usual residents - Unit : Persons']
uk_areas = pd.read_csv('../public_data/ukcensusareas.csv', skiprows=1, usecols=usecols)\
    .set_axis(['name', 'pop'], axis=1)
uk_areas[uk_areas.notna()]

Unnamed: 0,name,pop
0,England,53012456
1,Northern Ireland,1810863
2,Scotland,5295403
3,Wales,3063456
4,North East,2596886
...,...,...
9928,"Garth, Menai (Bangor)",5125
9929,"Haverfordwest: Prendergast, Rudbaxton",2952
9930,"Brackla, Coity",13820
9931,"Crickhowell, Llangattock",3800


In [1221]:
load_cols = ['GEO_NAME', 'DIM: Profile of Census Metropolitan Areas/Census Agglomerations (2247)',
            'Dim: Sex (3): Member ID: [1]: Total - Sex']
canada_areas = pd.read_csv('../public_data/canadacensusareas.csv', low_memory=False, usecols=load_cols)\
                  .set_axis(['name', 'variable', 'pop'], axis=1)
canada_areas = canada_areas[canada_areas['variable'] == 'pop, 2016'].drop(columns=['variable'])
canada_areas

Unnamed: 0,name,pop


In [1222]:
un_countries = pd.read_csv('../public_data/uncountries.csv', header=None)
# Set country pops high so that they take precedence over city names
un_countries['pop'] = 1000000000
un_countries = un_countries.iloc[:,[0,3]].set_axis(['name', 'pop'], axis=1)
un_countries.head()

Unnamed: 0,name,pop
0,Afghanistan,1000000000
1,Åland Islands,1000000000
2,Albania,1000000000
3,Algeria,1000000000
4,American Samoa,1000000000


In [1223]:
un_countries[un_countries['name'].str.contains('None')]

Unnamed: 0,name,pop


In [1224]:
foreign_entities_df = pd.DataFrame(columns=['name','pop'])
for df in [un_cities, un_countries, canada_areas, uk_areas]:
    foreign_entities_df = foreign_entities_df.append(df.copy())
foreign_entities_df['pop'] = foreign_entities_df['pop'].astype(str).str.replace(',', '').astype(int)
#foreign_entities_df['name'] = foreign_entities_df['name'].str.lower().str.replace('city','')
foreign_entities_df['name'] = foreign_entities_df['name'].str.replace('City','').str.strip()
foreign_entities_df['is_foreign'] = True
foreign_entities_df = foreign_entities_df[~(foreign_entities_df['name']=='')]
foreign_entities_df = foreign_entities_df[foreign_entities_df['name'].str.len() >= 4]
foreign_entities_df.sort_values('pop', ascending=False)

Unnamed: 0,name,pop,is_foreign
51,Colombia,1000000000,True
141,Mauritius,1000000000,True
157,New Zealand,1000000000,True
156,New Caledonia,1000000000,True
155,Netherlands,1000000000,True
...,...,...,...
905,Bushmills,738,True
297,JAMESTOWN,657,True
3991,ALOFI,639,True
3361,VATICAN CITY,451,True


# US City Data

In [1225]:
states = pd.read_csv('../public_data/usstateabbreviations.csv', names=['abb', 'name'])['name']

def extract_state(state):
    split = state.split(',')[-1].strip()
    # Some state strings have multiple punctuation and require extra filtering,
    # e.g., "Lynchburg, Moore county metropolitan government, Tennessee"
    if ';' in split:
        return split.split(';')[-1].strip()
    else:
        return split

In [1226]:
import re
def tokenize_cityname(row):
    state = row['state']
    city = row['raw_name_string']
    city = city.split(',')[0]     # Remove State from city name
    # These strings were appended by the census; 'and' is for multiple area labels
    area_names = ['city', 'village', 'borough', 'town', 'CDP', 'municipality', 'and']
    city_and_state = city[:]
    for area_name in area_names:
        city_and_state = city_and_state.replace(' '+area_name, '', 1)
    stripped = city_and_state.strip(' ,')
    # Ensures strings like "Arizona City, Arizona" are changed to "Arizona City"
    # and not "City, Arizona" after strip.
    return state + ' City' if state in stripped else stripped

In [1227]:
len(us_cities_df['state'].unique())

52

In [1228]:
# Load data and apply operations
us_cities_df = pd.read_csv('../public_data/usacscities.csv', skiprows=1,
                          usecols=['Geographic Area Name', 'Estimate!!Total'])\
                           .set_axis(['raw_name_string', 'pop'], axis=1)
us_cities_df = us_cities_df[us_cities_df['pop'] >= 5000] # Filter by population
us_cities_df['state'] = us_cities_df['raw_name_string'].apply(extract_state)
us_cities_df['name'] = us_cities_df.apply(tokenize_cityname, axis=1)\
    .str.replace('City','')
    #.str.lower().str.replace('city', '')
us_cities_df = us_cities_df.sort_values('pop', ascending=False)\
                           .drop_duplicates(subset=['name'], keep='first')
us_cities_df['is_foreign'] = False
print(us_cities_df.shape)
us_cities_df.head()

(5293, 5)


Unnamed: 0,raw_name_string,pop,state,name,is_foreign
17551,"New York city, New York",8426743,New York,New York,False
2725,"Los Angeles city, California",3900794,California,Los Angeles,False
6283,"Chicago city, Illinois",2717534,Illinois,Chicago,False
25203,"Houston city, Texas",2217706,Texas,Houston,False
22711,"Philadelphia city, Pennsylvania",1555072,Pennsylvania,Philadelphia,False


# Merge US & International Cities

In [1229]:
foreign_entities_df['state'] = None
foreign_entities_df['raw_name_string'] = None
#foreign_entities_df = foreign_entities_df.reindex(us_cities_df.columns, axis=1)
foreign_entities_df.head()
print(foreign_entities_df[foreign_entities_df['pop'] == 1000000000].head())
all_entities = us_cities_df.append(foreign_entities_df)
all_entities = all_entities.sort_values('pop', ascending=False)
all_entities['name'] = all_entities['name'].str.strip()
all_entities.head()

             name         pop  is_foreign state raw_name_string
0     Afghanistan  1000000000        True  None            None
1   Åland Islands  1000000000        True  None            None
2         Albania  1000000000        True  None            None
3         Algeria  1000000000        True  None            None
4  American Samoa  1000000000        True  None            None


Unnamed: 0,raw_name_string,pop,state,name,is_foreign
138,,1000000000,,Marshall Islands,True
115,,1000000000,,Japan,True
201,,1000000000,,Singapore,True
102,,1000000000,,Holy See,True
103,,1000000000,,Honduras,True


## Format the City Name from the list of cities to include only the city name

In [1230]:
def add_variation(df, original, replacement):
    to_append = df[df['name'].str.contains(original)].copy()
    to_append['name'] = to_append['name'].str.replace(original, replacement)
    df = df.append(to_append)
    return df

replacements = {
    'st.': 'st',
    'new york':'nyc',
    'los angeles':'la'
}

for k, v in replacements.items():
    us_cities_df = add_variation(us_cities_df, k, v)

In [1231]:
foreign_entities_df.shape

(13417, 5)

# State names and abbreviations

In [1232]:
states_df = pd.read_csv('../public_data/usstateabbreviations.csv', header=None).set_axis(['state_abb','state_name'], axis=1)
states_df['abb_with_periods'] = states_df['state_abb'].apply(lambda s: s[0]+'.'+s[1]+'.')
states_df.head(3)

Unnamed: 0,state_abb,state_name,abb_with_periods
0,AK,Alaska,A.K.
1,AL,Alabama,A.L.
2,AR,Arkansas,A.R.


In [1233]:
#state_strings = set([s.lower() for row in states_df.values.tolist() for s in row])
state_strings = set([s for row in states_df.values.tolist() for s in row])

In [1234]:
states_dict = {}
for i, row in states_df.iterrows():
    states_dict[row['state_abb']] = row['state_name']
    states_dict[row['state_name']] = row['state_name']
    states_dict[row['abb_with_periods']] = row['state_name']
states_dict

{'AK': 'Alaska',
 'Alaska': 'Alaska',
 'A.K.': 'Alaska',
 'AL': 'Alabama',
 'Alabama': 'Alabama',
 'A.L.': 'Alabama',
 'AR': 'Arkansas',
 'Arkansas': 'Arkansas',
 'A.R.': 'Arkansas',
 'AZ': 'Arizona',
 'Arizona': 'Arizona',
 'A.Z.': 'Arizona',
 'CA': 'California',
 'California': 'California',
 'C.A.': 'California',
 'CO': 'Colorado',
 'Colorado': 'Colorado',
 'C.O.': 'Colorado',
 'CT': 'Connecticut',
 'Connecticut': 'Connecticut',
 'C.T.': 'Connecticut',
 'DC': 'District of Columbia',
 'District of Columbia': 'District of Columbia',
 'D.C.': 'District of Columbia',
 'DE': 'Delaware',
 'Delaware': 'Delaware',
 'D.E.': 'Delaware',
 'FL': 'Florida',
 'Florida': 'Florida',
 'F.L.': 'Florida',
 'GA': 'Georgia',
 'Georgia': 'Georgia',
 'G.A.': 'Georgia',
 'HI': 'Hawaii',
 'Hawaii': 'Hawaii',
 'H.I.': 'Hawaii',
 'IA': 'Iowa',
 'Iowa': 'Iowa',
 'I.A.': 'Iowa',
 'ID': 'Idaho',
 'Idaho': 'Idaho',
 'I.D.': 'Idaho',
 'IL': 'Illinois',
 'Illinois': 'Illinois',
 'I.L.': 'Illinois',
 'IN': 'Indiana',

# Shelving

In [1235]:
import shelve
shelf = shelve.open('../public_data/location_data')
shelf['foreign_entities_df'] = foreign_entities_df
shelf['state_strings'] = state_strings
shelf['states_df'] = states_df
shelf['states_dict'] = states_dict
shelf['all_entities'] = all_entities
shelf.close()

In [1236]:
foreign_entities_df[foreign_entities_df['name'].str.contains('Angeles')]

Unnamed: 0,name,pop,is_foreign,state,raw_name_string
2834,Angeles,411634,True,,


In [1237]:
all_entities[all_entities['name'].str.contains('York')]

Unnamed: 0,raw_name_string,pop,state,name,is_foreign
17551,"New York city, New York",8426743,New York,New York,False
6,,5283733,,Yorkshire and The Humber,True
45,,2226058,,West Yorkshire,True
42,,1343601,,South Yorkshire,True
30,,598376,,North Yorkshire,True
84,,334179,,East Riding of Yorkshire,True
87,,198051,,York,True
16367,"West New York town, New Jersey",51860,New Jersey,West New York,False
23259,"York city, Pennsylvania",43853,Pennsylvania,York,False
7419,"Yorkville city, Illinois",18222,Illinois,Yorkville,False
