# Geocoding Utility Demo

In [1]:
import ast
import os

import numpy as np
import pandas as pd

from delphi_utils import GeoMapper

# Switch the working directory to the packaged 2020 data folder so that the
# crosswalk CSVs used later in the notebook can be opened by bare filename.
# NOTE(review): the path is relative, so this presumably has to be run exactly once
# per kernel, starting from the repository root — confirm before re-running.
os.chdir("_delphi_utils_python/delphi_utils/data/2020/")

# Basic Utility Usage
Two functions: `add_geocode` and `replace_geocode`.

In [15]:
# Toy county-level input: four FIPS codes (given as integers), one shared date,
# and two data columns. The third row is missing in both data columns.
fips_data = pd.DataFrame(
    {
        "fips": [1123, 48253, 72003, 18181],
        "date": [pd.Timestamp("2018-01-01")] * 4,
        "count": [2, 1, np.nan, 10021],
        "total": [4, 1, np.nan, 100001],
    }
)

# add_geocode: the "fips" column is kept and the matching "zip" codes
# (plus a "weight" column) are attached next to it.
gmpr = GeoMapper()
df = gmpr.add_geocode(fips_data, "fips", "zip")
print(df.head())

# replace_geocode: the "fips" column is swapped for "zip" and the data
# columns are re-aggregated onto the new code.
gmpr = GeoMapper()
df = gmpr.replace_geocode(fips_data, "fips", "zip")
print(df.head())

fips       date  count  total    zip    weight
0  01123 2018-01-01    2.0    4.0  35010  0.461001
1  01123 2018-01-01    2.0    4.0  35072  0.013264
2  01123 2018-01-01    2.0    4.0  35089  0.017661
3  01123 2018-01-01    2.0    4.0  36078  0.113826
4  01123 2018-01-01    2.0    4.0  36255  0.000433
        date    zip     count     total
0 2018-01-01  00602  0.000000  0.000000
1 2018-01-01  00610  0.000000  0.000000
2 2018-01-01  00676  0.000000  0.000000
3 2018-01-01  00677  0.000000  0.000000
4 2018-01-01  35010  0.922001  1.844002


In [16]:
# Same replacement as in the previous cell, but targeting the "hrr" geocode
# instead of "zip".
gmpr = GeoMapper()
df = gmpr.replace_geocode(fips_data, "fips", "hrr")
# Bare last expression so the notebook renders the resulting frame.
df

Unnamed: 0,date,hrr,count,total
0,2018-01-01,1,1.772347,3.544694
1,2018-01-01,183,7157.392404,71424.648014
2,2018-01-01,184,2863.607596,28576.351986
3,2018-01-01,382,1.0,1.0
4,2018-01-01,7,0.227653,0.455306


In [19]:
# Consistency check: converting fips -> hrr in one step should give the same data
# values as going fips -> zip first and then zip -> hrr.
df = gmpr.replace_geocode(fips_data, "fips", "hrr")
df2 = gmpr.replace_geocode(
    gmpr.replace_geocode(fips_data, "fips", "zip"), "zip", "hrr"
)
np.allclose(df[["count", "total"]].to_numpy(), df2[["count", "total"]].to_numpy())

True

# Utility Inner Workings

## Deriving a crosswalk
Given two crosswalks, we create a derived crosswalk by merging on the common code. This is the method used in `geo_data_proc.py`.

In [21]:
# Read both source crosswalks with every code kept as a string (leading zeros matter).
state_df = pd.read_csv(
    "state_codes_table.csv",
    dtype={"state_code": str, "state_id": str, "state_name": str},
)
zip_fips_df = pd.read_csv("zip_fips_table.csv", dtype={"zip": str, "fips": str})

# The common code: the first two characters of each FIPS string are used as the state code.
zip_fips_df["state_code"] = zip_fips_df["fips"].str.slice(stop=2)

# Merge on the common code, then keep only the zip -> state_code mapping (and its weight).
zip_state_code_df = (
    zip_fips_df
    .merge(state_df, on="state_code", how="left")
    .drop(columns=["fips", "state_id", "state_name"])
)

# Sanity check on the number of distinct state codes in the derived crosswalk.
assert zip_state_code_df["state_code"].nunique(dropna=False) == 52
zip_state_code_df

Unnamed: 0,zip,weight,state_code
0,00601,0.994346,72
1,00601,0.005654,72
2,00602,1.000000,72
3,00603,1.000000,72
4,00606,0.948753,72
...,...,...,...
44405,99923,1.000000,02
44406,99925,1.000000,02
44407,99926,1.000000,02
44408,99927,1.000000,02


A weighted crosswalk requires a summation.

In [25]:
FIPS_ZIP_OUT_FILENAME = "fips_zip_table.csv"
ZIP_HRR_OUT_FILENAME = "zip_hrr_table.csv"
from os.path import join, isfile

# The two crosswalks to chain together: fips -> zip (weighted) and zip -> hrr.
fz_df = pd.read_csv(FIPS_ZIP_OUT_FILENAME, dtype={"fips": str, "zip": str, "weight": float})
zh_df = pd.read_csv(ZIP_HRR_OUT_FILENAME, dtype={"zip": str, "hrr": str})

# Join on the shared "zip" code and discard it; what remains is one row per
# (fips, zip-derived hrr) with the fips -> zip weight.
fips_zip_hrr = fz_df.merge(zh_df, on="zip", how="left").drop(columns="zip")

# Several zips can link the same (fips, hrr) pair, so their weights are summed.
fips_hrr_weights = fips_zip_hrr.groupby(["fips", "hrr"]).sum()
df = fips_hrr_weights.reset_index()
df

Unnamed: 0,fips,hrr,weight
0,01001,1,0.039105
1,01001,7,0.960895
2,01003,134,0.031998
3,01003,6,0.968002
4,01005,2,0.974360
...,...,...,...
5178,56039,274,0.003804
5179,56039,423,0.996196
5180,56041,423,1.000000
5181,56043,274,1.000000


## Adding a geocode column
Adding a new geocode column is a merge using a matching geocode (left or inner joins, depending on whether we wish to keep NAs or not). Here we translate from zip to fips on some faux data. Since this is a left merge, invalid ZIP values that are present in the data but not in the crosswalk simply get NaN entries in the new columns. If the crosswalk is weighted, a "weight" column is added as well.

In [27]:
# Faux ZIP-level data: five zips on five consecutive days, with one missing
# value in each data column.
zip_data = pd.DataFrame(
    {
        "zip": ["45140", "45147", "00500", "95616", "95618"],
        "date": pd.date_range("2018-01-01", periods=5),
        "count": [2, np.nan, 20, 100, 21],
        "total": [2, 20, 40, np.nan, 20],
    }
)
zip_fips_df = pd.read_csv("zip_fips_table.csv", dtype={"zip": str, "fips": str})

# Left join on the shared "zip" column: every input row is kept, and a zip that is
# split across several counties yields one output row per county.
data_df = zip_data.merge(zip_fips_df, on="zip", how="left")
data_df

Unnamed: 0,zip,date,count,total,fips,weight
0,45140,2018-01-01,2.0,2.0,39025.0,0.52357
1,45140,2018-01-01,2.0,2.0,39061.0,0.288115
2,45140,2018-01-01,2.0,2.0,39165.0,0.188315
3,45147,2018-01-02,,20.0,39025.0,0.938776
4,45147,2018-01-02,,20.0,39061.0,0.061224
5,500,2018-01-03,20.0,40.0,,
6,95616,2018-01-04,100.0,,6113.0,1.0
7,95618,2018-01-05,21.0,20.0,6095.0,0.003372
8,95618,2018-01-05,21.0,20.0,6113.0,0.996628


## Replacing a column
If there are no weights, we just drop the old column and we're done. If there are weights, we multiply the data by the weights and sum over the old codes. A helpful way to think of the operation is a multiplication of the data matrix (row vectors are columns of the dataframe) $D$ by the weights matrix $W$, resulting in $D*W$. The weights matrix is row-stochastic (i.e. rows sum to 1). 

Note that the aggregation step (i.e. linear combination of source code values) requires a decision for how to handle NA values. We choose to zero-fill them to avoid propagating NAs.

In [28]:
# The source code column is not needed once the target "fips" column is attached.
data_df = data_df.drop(columns="zip")

# Multiply and aggregate: scale every data column row-wise by the crosswalk weight,
# then add up the contributions that land on the same (date, fips) pair.
value_cols = ["count", "total"]
data_df[value_cols] = data_df[value_cols].mul(data_df["weight"], axis="index")

weighted = data_df.drop(columns="weight")
data_df = weighted.groupby(["date", "fips"]).sum().reset_index()
data_df

Unnamed: 0,date,fips,count,total
0,2018-01-01,39025,1.04714,1.04714
1,2018-01-01,39061,0.576229,0.576229
2,2018-01-01,39165,0.376631,0.376631
3,2018-01-02,39025,0.0,18.77551
4,2018-01-02,39061,0.0,1.22449
5,2018-01-04,6113,100.0,0.0
6,2018-01-05,6095,0.070819,0.067446
7,2018-01-05,6113,20.929181,19.932554


## Building population weights for FIPS <-> ZIP

In [29]:
FIPS_BY_ZIP_POP_URL = (
    "https://www2.census.gov/geo/docs/maps-data/data/rel/zcta_county_rel_10.txt?#"
)

pop_df = (
    pd.read_csv(FIPS_BY_ZIP_POP_URL)
    .assign(
        # FIPS = zero-padded 2-character state code + zero-padded 3-character county code
        fips=lambda raw: raw["STATE"].astype(str).str.zfill(2)
        + raw["COUNTY"].astype(str).str.zfill(3),
        # ZIP = the ZCTA5 value zero-padded to 5 characters
        zip=lambda raw: raw["ZCTA5"].astype(str).str.zfill(5),
    )
    # Pare down to the relevant columns: zip, fips, and population
    .loc[:, ["zip", "fips", "POPPT"]]
    .rename(columns={"POPPT": "pop"})
    # can we do without this and resetting index below?
    .set_index(["fips", "zip"])
)
pop_df

Unnamed: 0_level_0,Unnamed: 1_level_0,pop
fips,zip,Unnamed: 2_level_1
72001,00601,18465
72141,00601,105
72003,00602,41520
72005,00603,54689
72093,00606,6276
...,...,...
02198,99923,87
02198,99925,819
02198,99926,1460
02198,99927,94


In [31]:
# 2010 Census, corresponds to 308 million population figure
# NOTE(review): the total shown in this cell's output (312,462,997) is larger than
# 308 million — presumably because the table also contains rows whose FIPS starts
# with 72 (see the frame displayed above); confirm which territories are included.
pop_df["pop"].sum()

312462997

## US Census FIPS <-> ZIP crosswalk versus simplemaps.com
We're switching to the US Census table for safety. The FIPS to ZIP weights in the two are essentially the same.

In [110]:
# The zip -> fips crosswalk as served by GeoMapper (per the section text above,
# this is the US Census-derived table).
df_census = GeoMapper().load_crosswalk("zip", "fips")
# The simplemaps.com zip table, read from the data_proc folder relative to the
# current working directory.
df_simplemaps = pd.read_csv("../../data_proc/geomap/uszips.csv")
# Total of the simplemaps "population" column, for comparison with the Census total.
print(df_simplemaps["population"].sum())

326256148


In [111]:
# Each "county_weights" cell holds the text of a dict literal keyed by county FIPS.
# Parse it with ast.literal_eval rather than eval, so text read from the CSV is only
# ever interpreted as data and can never be executed as code.
df_simplemaps["county_weights"] = df_simplemaps["county_weights"].transform(
    lambda x: list(ast.literal_eval(x).items())
)

# One row per (zip, county) pair; each "county_weights" cell is now a (key, value) tuple.
df_simplemaps = df_simplemaps.explode("county_weights")

# Split the tuple: the key becomes the county FIPS, and the value is divided by 100
# (presumably converting a percentage into a fraction — confirm against the data).
# The FIPS must be extracted first, before "county_weights" is overwritten.
df_simplemaps["county_fips"] = df_simplemaps["county_weights"].apply(lambda x: x[0])
df_simplemaps["county_weights"] = df_simplemaps["county_weights"].apply(lambda x: x[1]/100)
df_simplemaps = df_simplemaps.rename(columns={"county_fips": "fips"})

# Normalise both codes to zero-padded 5-character strings so they match the merge keys.
df_simplemaps["zip"] = df_simplemaps["zip"].astype(str).str.zfill(5)
df_simplemaps["fips"] = df_simplemaps["fips"].astype(str).str.zfill(5)

# Left merge: every Census (zip, fips) pair is kept; pairs missing from simplemaps get NaN.
df = df_census.merge(df_simplemaps, on=["zip", "fips"], how="left")

In [62]:
# Mean absolute difference between the Census weight and the simplemaps weight.
(df["weight"] - df["county_weights"]).abs().mean()

1.1494991956541422e-05

In [68]:
# One minus the correlation (Series.corr with its default method) between the two
# weight columns; a value near zero means the columns are almost perfectly correlated.
1 - df["weight"].corr(df["county_weights"])

1.307895680646709e-09

In [120]:
# Keep only the rows that carry a simplemaps population (left-merge misses are NaN).
df = df.dropna(subset=["population"])

# A zip appears once per county it overlaps, with the same population repeated on each
# of those rows, so the total must count each zip exactly once. Make that invariant
# explicit instead of relying on element-wise addition of per-zip `unique()` arrays,
# which silently gives a wrong total if a zip ever carries two different values.
pop_per_zip = df.groupby("zip")["population"]
assert (pop_per_zip.nunique() == 1).all(), "a zip has conflicting population values"
total_population = pop_per_zip.first().sum()

# Population left unallocated when each zip's population is split across counties
# using the simplemaps weights (first number) and the Census weights (second number).
print(total_population - df["population"].multiply(df["county_weights"]).sum(),
      total_population - df["population"].multiply(df["weight"]).sum())

113.4559999704361 147.0


## We have updated the FIPS to HRR tables since the last version (James' version)
And they haven't changed by very much. 
Note: Since JHU is now deactivated, this code may not work.

In [None]:
# Current fips -> hrr crosswalk as served by GeoMapper.
df_new = GeoMapper().load_crosswalk("fips", "hrr")

# Previous version of the crosswalk, fetched from the jhu_fix_0824 branch.
# The access token that used to be embedded in this URL has been removed: credentials
# must not be committed to a notebook. If the download needs authentication, export
# the token as GITHUB_RAW_TOKEN and it is appended here at run time.
old_crosswalk_url = (
    "https://raw.githubusercontent.com/cmu-delphi/covidcast-indicators/jhu_fix_0824/"
    "_delphi_utils_python/delphi_utils/data/fips_hrr_cross.csv"
)
github_token = os.environ.get("GITHUB_RAW_TOKEN")
if github_token:
    old_crosswalk_url = old_crosswalk_url + "?token=" + github_token

df_old = pd.read_csv(old_crosswalk_url, dtype={"fips": str, "hrr": str, "weight": float})
# Restore leading zeros so the old FIPS strings match the new 5-character ones.
df_old["fips"] = df_old["fips"].str.zfill(5)

# Collapse the new table to one weight per (hrr, fips), line it up with the old table,
# and report the mean absolute difference between new (weight_x) and old (weight_y).
df = df_new.groupby(["hrr", "fips"]).sum().reset_index().merge(df_old, on=["fips", "hrr"], how="left")
df.weight_x.sub(df.weight_y).abs().mean()

## Adding HHS codes
These are the Department of Health and Human Services (HHS) region codes. They aggregate states into larger regions. I couldn't find a crosswalk file on the web, so I built one manually below.

In [None]:
def _drop_leading_and(name):
    """Return `name` without a leading "and " (the conjunction before the last list item).

    Only the literal prefix is removed. `str.lstrip(' and')` is not equivalent: it strips
    any leading run of the characters ' ', 'a', 'n' and 'd', so it could also eat the
    start of a name that begins with one of those lowercase letters.
    """
    name = name.lstrip()
    if name.startswith("and "):
        return name[len("and "):].lstrip()
    return name


with open("../../data_proc/geomap/hhs.txt") as f:
    s = f.readlines()

# Process text from https://www.hhs.gov/about/agencies/iea/regional-offices/index.html
# Lines containing "Region" are reduced to the integer parsed from characters 7-8;
# every other line is split on ", " into a list of names.
s = [int(st[7:9]) if "Region" in st else st for st in s]
s = [st.strip().split(", ") if isinstance(st, str) else st for st in s]
# Entries alternate region number / name list, so pair each even entry with the next one.
d = {s[i]:s[i+1] for i in range(0, len(s), 2)}
d = {key: [_drop_leading_and(name) for name in names] for key, names in d.items()}

# Flatten into a list of (region number, name) tuples
d = [(key, name) for key, names in d.items() for name in names]

# Make naming adjustments (presumably to match the spellings in state_codes_table.csv
# so the merge below finds them — confirm)
d.remove((2, "the Virgin Islands"))
d.append((2, "U.S. Virgin Islands"))
d.remove((9, "Commonwealth of the Northern Mariana Islands"))
d.append((9, "Northern Mariana Islands"))

# Make dataframe, with the region code stored as a string like the other geocodes
hhs = pd.DataFrame(d, columns=["hhs", "state_name"])
hhs['hhs'] = hhs['hhs'].astype(str)

ss_df = pd.read_csv("state_codes_table.csv",
    dtype={"state_code": str, "state_name": str, "state_id": str},
)

# Attach the region to every state by name; rows with any missing value
# (e.g. no region match) are dropped.
ss_df = ss_df.merge(hhs, on="state_name", how="left").dropna()
