# Generating Lookup Tables for Geolocation
This notebook details the steps to generate the lookup tables for geolocation.

In [315]:
import pandas as pd
pd.set_option('display.max_rows', 300)

# International Cities and Counties
World cities with populations >100K, Canadian cities, Canadian counties, UK cities

In [377]:
# Only need first column of data
un_cities = pd.read_csv('../public_data/uncities.csv').iloc[:,0].dropna()
un_cities.head()

0    Population des capitales et des villes de 100 ...
1    Continent, country or area, date, code and cit...
5                                     AFRICA - AFRIQUE
6                                    Algeria - Algérie
7                           16 IV 2008 (CDJC)         
Name: 8. Population of capital cities and cities of 100 000 or more inhabitants: latest available year,  2000 - 2019, dtype: object

In [367]:
uk_areas = pd.read_csv('../public_data/ukcensusareas.csv', skiprows=1, 
                        usecols=['Unnamed: 2']).set_axis(['area_name'], axis=1).iloc[:,0]
uk_areas.head()

0             England
1    Northern Ireland
2            Scotland
3               Wales
4          North East
Name: area_name, dtype: object

In [361]:
load_cols = ['GEO_NAME', 'DIM: Profile of Census Metropolitan Areas/Census Agglomerations (2247)',
            'Dim: Sex (3): Member ID: [1]: Total - Sex']
canada_areas = pd.read_csv('../public_data/canadacensusareas.csv', low_memory=False, usecols=load_cols)\
                  .set_axis(['area_name', 'variable', 'population'], axis=1)
canada_areas = canada_areas[canada_areas['variable'] == 'Population, 2016'].iloc[:,0]
canada_areas.head()

0                St. John's
2247            Bay Roberts
4494    Grand Falls-Windsor
6741                 Gander
8988           Corner Brook
Name: area_name, dtype: object

In [359]:
un_countries = pd.read_csv('../public_data/uncountries.csv', header=None).iloc[:,0]
un_countries.head()

0       Afghanistan
1     Åland Islands
2           Albania
3           Algeria
4    American Samoa
Name: 0, dtype: object

In [389]:
foreign_entities = set(un_cities).union(uk_areas).union(canada_areas).union(un_countries)
foreign_entities = {s.lower() for s in foreign_entities}
s = "New York City"
print ([t for t in foreign_entities if t in s.lower()])

['york', 'city']


In [392]:
'city' in foreign_entities

True

# US City Data

In [208]:
us_cities_df = pd.read_csv('../public_data/uscities.csv', usecols=['cityName', 'cityState'])
us_cities_df.head()

Unnamed: 0,cityState,cityName
0,Alabama,"Alabaster city, Alabama"
1,Alabama,"Albertville city, Alabama"
2,Alabama,"Alexander City city, Alabama"
3,Alabama,"Andalusia city, Alabama"
4,Alabama,"Anniston city, Alabama"


In [314]:
us_cities_df[us_cities_df['cityName'].str.contains('borough')]

Unnamed: 0,cityState,cityName,cityNameFormatted
100,Alaska,"Juneau city and borough, Alaska",Juneau
107,Alaska,"Sitka city and borough, Alaska",Sitka
465,California,"Hillsborough town, California",Hillsborough
905,Colorado,"Roxborough Park CDP, Colorado",Roxborough Park
943,Connecticut,"Naugatuck borough, Connecticut",Naugatuck
2176,Kentucky,"Middlesborough city, Kentucky",Middlesborough
2544,Massachusetts,"Marlborough city, Massachusetts",Marlborough
2549,Massachusetts,"Middleborough Center CDP, Massachusetts",Middleborough Center
2560,Massachusetts,"Northborough CDP, Massachusetts",Northborough
3154,New Jersey,"Allendale borough, New Jersey",Allendale


## Format the City Name from the list of cities to include only the city name

In [230]:
re.split('\W+', 'St. George')

['St', 'George']

In [273]:
us_cities_df[us_cities_df['cityName'].str.contains(' and')]

Unnamed: 0,cityState,cityName,cityNameFormatted
100,Alaska,"Juneau city and borough, Alaska",Juneau and
107,Alaska,"Sitka city and borough, Alaska",Sitka and
2842,Minnesota,St. Anthony city (Hennepin and Ramsey Counti,St. Anthony (Hennepin and Ramsey Counti
3040,Missouri,"Town and Country city, Missouri",Town and Country


In [295]:
import re
def tokenize_cityname(row):
    state = row['cityState']
    city = row['cityName']
    # Remove State from city name
    city = city.split(',')[0]
        
    # These strings were appended by the researchers; 'and' is for multiple area labels
    area_names = ['city', 'village', 'borough', 'town', 'CDP', 'municipality', 'and']
    city_and_state = city[:]
    for area_name in area_names:
        city_and_state = city_and_state.replace(' '+area_name, '', 1)
    #stripped = city_and_state.replace(state, '', 1).strip(' ,')
    stripped = city_and_state.strip(' ,')
    # Ensures strings like "Arizona City, Arizona" are changed to "Arizona City"
    # and not "City, Arizona" after strip.
    return state + ' City' if state in stripped else stripped
us_cities_df['cityNameFormatted'] = us_cities_df.apply(tokenize_cityname, axis=1)
us_cities_df[us_cities_df['cityNameFormatted'] == 'Arlington']

Unnamed: 0,cityState,cityName,cityNameFormatted
2499,Massachusetts,"Arlington CDP, Massachusetts",Arlington
4563,Tennessee,"Arlington town, Tennessee",Arlington
4670,Texas,"Arlington city, Texas",Arlington
5069,Virginia,"Arlington CDP, Virginia",Arlington
5238,Washington,"Arlington city, Washington",Arlington


In [245]:
print(len(us_cities_df[['cityState', 'cityNameFormatted']].drop_duplicates()))
us_cities_df[us_cities_df.duplicated(subset=['cityState', 'cityNameFormatted'])]

5564


Unnamed: 0,cityState,cityName,cityNameFormatted
525,California,"Live Oak city, California",Live Oak
3687,New York,"Tonawanda CDP, New York",Tonawanda
5507,Wisconsin,"Pewaukee village, Wisconsin",Pewaukee


So only 3 townships were dropped.

## Add population data

In [246]:
us_pops = pd.read_csv('../public_data/uscitypops.csv', usecols=['city', 'state_name', 'population'])
print(len(us_pops))
us_pops.head()

28338


Unnamed: 0,city,state_name,population
0,New York,New York,18713220
1,Los Angeles,California,12750807
2,Chicago,Illinois,8604203
3,Miami,Florida,6445545
4,Dallas,Texas,5743938


In [313]:
us_pops[us_pops['city']=='Darien']

Unnamed: 0,city,state_name,population
2220,Darien,Illinois,21628
8072,Darien,Georgia,3580
12030,Darien,Wisconsin,1590


In [280]:
us_pops[us_pops.duplicated(subset=['city', 'state_name'])]

Unnamed: 0,city,state_name,population
4553,Woodbury,New York,8852
5301,Middletown,Pennsylvania,7212
8018,Oakwood,Ohio,3624
8908,Midway,Florida,3000
11916,San Antonio,Puerto Rico,1625
12023,Midway,Florida,1592
12633,Oakland,Pennsylvania,1422
12648,San Antonio,Puerto Rico,1419
13691,Georgetown,Pennsylvania,1182
15813,Chula Vista,Texas,829


In [308]:
city_state_pops = pd.merge(us_cities_df.rename(columns={'cityState':'state_name', 'cityNameFormatted':'city'})[['state_name', 'city']],
                           us_pops, 
                           how='left', 
                           on=['city', 'state_name'])\
                    .sort_values('population', ascending=False)\
                    .reset_index()

In [309]:
city_state_pops.head()

Unnamed: 0,index,state_name,city,population
0,537,California,Los Angeles,12750807.0
1,1658,Illinois,Chicago,8604203.0
2,1197,Florida,Miami,6445545.0
3,4735,Texas,Dallas,5743938.0
4,4401,Pennsylvania,Philadelphia,5649300.0


In [311]:
len(city_state_pops[city_state_pops['population'].isna()].head(300))

194

In [310]:
city_state_pops[city_state_pops['population'].isna()].head(300)

Unnamed: 0,index,state_name,city,population
5378,105,Alaska,Lakes,
5379,187,Arizona,Village of Oak Creek (Big Park),
5380,299,California,Bonadelle Ranchos-Madera Ranchos,
5381,402,California,El Paso de Robles (Paso Robles),
5382,405,California,El Sobrante (Riverside County),
5383,695,California,San Buenaventura (Ventura),
5384,754,California,Spring Valley (San Diego County),
5385,916,Colorado,Twin Lakes (Adams County),
5386,924,Connecticut,Bethel,
5387,930,Connecticut,Darien,
