This notebook creates a conversion table that maps 2019 census tracts to 2020 census tracts. It also prints accuracy metrics for the interpolation.

In [24]:
# import libraries

import pandas as pd
import geopandas
import numpy as np
import seaborn as sns
import folium
from scipy import stats
from io import BytesIO
import requests

In [25]:
# The geometry column is dropped from both tables because of a type mismatch.

# Ground-truth values for the 2020 census tracts
tract_truths = pd.read_csv("tracts2020.csv").drop(columns=["geometry"])

# Interpolated values for the census tracts (population used as weights)
tract_interpolated = pd.read_csv("interpolated.csv").drop(columns=["geometry"])

# Alternative input: interpolated values with houses used as weights
#tract_interpolated = pd.read_csv("interpolated_house.csv")
#tract_interpolated = tract_interpolated.drop(columns="geometry")

Create conversion table

In [32]:
# Build the conversion table: one [2020 tract GEOID, [related 2019 tract GEOIDs]]
# entry per row of the interpolated data.
conversion_table = []

# Census tract GEOIDs are 11 characters long (2 state + 3 county + 6 tract digits).
geoid_width = 11

# Column names corresponding to 2019 tracts: the first column is skipped and only
# names containing 'GEOID' are kept.
tract2019_cols = [col for col in tract_interpolated.columns.tolist()[1:] if 'GEOID' in col]

# Iterate over interpolated data
for index, tract in tract_interpolated.iterrows():
    # A 2019 tract is related when its column holds a positive value for this row.
    # t[6:] drops the first six characters of the column name
    # (presumably the 'GEOID' prefix plus a separator -- TODO confirm).
    related = [t[6:] for t in tract2019_cols if tract[t] > 0]

    # GEOID is handled as a number here, so a leading zero (state FIPS codes 01-09)
    # is lost by int(); zero-pad back to the full width so the 2020 key has the same
    # format as the 2019 IDs taken from the column names.
    geoid_2020 = str(int(tract["GEOID"])).zfill(geoid_width)
    conversion_table.append([geoid_2020, related])

[['10001040100', ['10001040100']],
 ['10001040201', ['10001040201']],
 ['10001040203', ['10001040203']],
 ['10001040204', ['10001040202']],
 ['10001040205', ['10001040202']],
 ['10001040206', ['10001040202']],
 ['10001040501', ['10001040501']],
 ['10001040502', ['10001040502', '10001043202']],
 ['10001040700', ['10001040700']],
 ['10001040900', ['10001040900']],
 ['10001041000', ['10001041000']],
 ['10001041101', ['10001041100']],
 ['10001041200', ['10001041200']],
 ['10001041300', ['10001041300']],
 ['10001041400', ['10001041400']],
 ['10001041500', ['10001041500', '10001041600']],
 ['10001041600', ['10001041600']],
 ['10001041701', ['10001041701', '10001042100']],
 ['10001041702', ['10001041702']],
 ['10001041802', ['10001041701', '10001041802']],
 ['10001041803', ['10001041801']],
 ['10001041804', ['10001041801']],
 ['10001041900', ['10001041900']],
 ['10001042000', ['10001042000']],
 ['10001042100', ['10001042100']],
 ['10001042203', ['10001042201']],
 ['10001042204', ['10001042201

In [35]:
# Save the conversion table as a CSV file without the index column.
# Each Tract_2019 cell holds a Python list, which is written out in its string form.
conversion_frame = pd.DataFrame(conversion_table, columns=["Tract_2020", "Tract_2019"])
conversion_frame.to_csv("conversion_table.csv", index=False)

Calculate the difference between the interpolated values and the ground truth data

In [5]:
# Both dataframes should have matching sizes
print(tract_truths.shape, tract_interpolated.shape)

# Interpolated tracts that have at least one missing value (3 rows)
tract_interpolated[tract_interpolated.isna().any(axis="columns")]

# Same check for the ground truth tracts (0 rows)
#tract_truths.loc[tract_truths.isna().any(axis=1)]

(262, 4) (262, 4)


Unnamed: 0,GEOID,pop,housing,white
41,10001990000,,,
186,10003990100,,,
261,10005990000,,,


In [9]:
# Ground truth tracts where at least one column equals 0 (6 rows)
tract_truths[tract_truths.eq(0).any(axis="columns")]

Unnamed: 0,GEOID,pop,housing,white
40,10001980000,263,0,0
41,10001990000,0,0,0
185,10003980100,0,0,0
186,10003990100,0,0,0
260,10005980000,0,0,0
261,10005990000,0,0,0


In [10]:
# Show the interpolated rows at the index positions where the ground truth has a 0
tract_interpolated.loc[tract_truths.eq(0).any(axis="columns")]

Unnamed: 0,GEOID,pop,housing,white
40,10001980000,144.688995,46.138072,37.722488
41,10001990000,0.0,0.0,0.0
185,10003980100,0.0,0.0,0.0
186,10003990100,0.0,0.0,0.0
260,10005980000,0.961744,0.470161,0.461362
261,10005990000,0.0,0.0,0.0


In [17]:
# Exclude GEOIDs 10001980000 and 10005980000: the interpolation assigned values to
# tracts whose ground truth contains zero counts (noted for the houses-as-weights run).

# TODO: describe() cannot handle division by zero when computing metrics.

excluded_geoids = [10001980000, 10005980000]
tract_truths = tract_truths[~tract_truths["GEOID"].isin(excluded_geoids)]
tract_interpolated = tract_interpolated[~tract_interpolated["GEOID"].isin(excluded_geoids)]
print(tract_truths.shape)

(260, 4)


In [18]:
# Treat missing interpolated values as a count of 0
tract_interpolated = tract_interpolated.fillna(value=0)

In [19]:
# Difference between ground truth and interpolated data, expressed as a percentage
# of the ground truth. The two frames are matched on their row index (not on GEOID),
# so they must share the same row order.
value_cols = ['pop', 'housing', 'white']
differences = (tract_truths[value_cols] - tract_interpolated[value_cols]) / tract_truths[value_cols] * 100

# 0 / 0 produces NaN: truth and interpolation are both zero, so count it as 0% error
# TODO: GEOID 10005980000 has interpolated values despite having a ground truth value of zero
differences = differences.fillna(0)

# nonzero / 0 produces +/-inf, which fillna() leaves in place and which turns the
# mean/std reported by describe() into inf/NaN. The percentage error is undefined
# there, so mark it as NaN; describe() then leaves those entries out of its statistics.
differences = differences.replace([np.inf, -np.inf], np.nan)

In [20]:
# Summary statistics of the absolute percentage differences
differences.abs().describe()

Unnamed: 0,pop,housing,white
count,260.0,260.0,260.0
mean,6.653993,7.705943,10.41687
std,5.733442,6.771925,8.916894
min,0.0,0.0,0.0
25%,2.790148,2.632112,3.819253
50%,5.173204,6.058293,8.361969
75%,9.020482,10.676568,15.536756
max,31.164241,41.32781,57.8125
