This notebook pulls 2010 census data and attempts to generate a large set of points that approximates a smooth surface while propogating error

In [313]:
# Declare static variables

n=500 # The number of points to assign to each census block
BINOMIAL_TRIALS = 40 # The number of trials in the binomial distribution used for weighting points in blocks. The higher the value, the more evenly distributed the population points will be through the census block
BINOMIAL_SUCCESS = 0.5 # The probability of success for each trial in he weight assignment. Must be <=1. Use 1 for a uniform distribution

# Potential other methods for weight generation:
# Apply a transformation to make the outcome more normal
# Apply a normal distribution and use min/max normalization
# Weighted Poisson binomial distribution


# Binomial distribution is nearly normal if np(1-p) >= 10
print(BINOMIAL_TRIALS * BINOMIAL_SUCCESS * (1 - BINOMIAL_SUCCESS) >= 10)

True


In [314]:
# Import libraries

import pandas as pd
import geopandas
import numpy as np
import requests
from io import BytesIO
import folium
from IPython.display import clear_output
from itertools import chain

In [315]:
# Request shapefile data for 2010 census tracts and convert to geopandas dataframe

# Shapefile url
data_url = 'https://www2.census.gov/geo/tiger/GENZ2010/gz_2010_10_140_00_500k.zip'


# Request data
data = requests.get(data_url)
# convert to pandas dataframe
tract_data = geopandas.read_file(BytesIO(data.content))

In [316]:
# Request shapefile data for 2010 census blocks and convert to geopandas dataframe

# Shapefile url
data_url = 'https://www2.census.gov/geo/tiger/TIGER2010/TABBLOCK/2010/tl_2010_10_tabblock10.zip'


# Request data
data = requests.get(data_url)
# convert to pandas dataframe
block_data = geopandas.read_file(BytesIO(data.content))

In [317]:
# For each census block, create a bounding box
block_bounds = block_data["geometry"].bounds

# Attch GEOID to boundaries
block_bounds = block_data[["GEOID10","geometry"]].merge(block_bounds, left_index=True, right_index=True)

In [318]:
# Fit a 2D Binomial distribution over the bounding boxes

# Takes in a row of 'block_bounds' and outputs a 2D Gaussian distribution of 'n' points over the bounding box, as well as the GEOID
def get_points(row,n):
    print(f"Processing Block {row['GEOID10']}...")
    # 'i' is the total number of points left to assign
    i=n
    # 'points_return' is the list of all points for the block
    # TODO: CRS is hardcoded
    points_return = geopandas.GeoSeries(crs="EPSG:4269")
    # Allocate points until n have been assigned
    while i > 0:
        # Generates a uniform distribution for the y-axis located at the center of the box
        pointsy = np.random.uniform(low=row["miny"], high=row["maxy"], size=i)
        # Generates a uniform distribution for the x-axis located at the center of the box
        pointsx = np.random.uniform(low=row["minx"], high=row["maxx"], size=i)
        # Convert the points to Shapely points
        points = geopandas.GeoSeries(geopandas.points_from_xy(pointsx, pointsy, crs="EPSG:4269"))
        # Check if the points are inside the block
        point_checks = points.within(row["geometry"])
        # Add found points to our list
        points_return = geopandas.GeoSeries(pd.concat([points_return, points[point_checks]], ignore_index=True), crs=points_return.crs)
        # Set 'i' equal to the number of missed points
        i = n - points_return.size
    
    # Generates a binomial distribution of weights
    weights = np.random.binomial(n=BINOMIAL_TRIALS, p=BINOMIAL_SUCCESS, size=n)
    # Normalize weights so that they sum to 1
    weights = weights / np.sum(weights)
    # Sort the weights based on distance from the mean
    weights = weights[np.argsort(np.abs(weights - np.mean(weights)))]
    # Generate a series containing the distance from each point to the centroid
    distances = points_return.distance(row["geometry"].centroid)
    # Create a column for the index of the point and sort by distance
    distances = distances.reset_index(name="distance").sort_values(by="distance")
    # Assign a weight to each point
    distances["weight"] = weights
    # Merge weights onto points
    points_return = pd.merge(left=points_return.rename("geometry"), right=distances, how="left", right_on="index", left_index=True)[["geometry", "weight"]]
    # Clear warnings from notebook output to prevent crash
    clear_output()
    # Return an array with every point in the cloud, the weights for each point and the GEOID
    return list(chain(points_return["geometry"].values, points_return["weight"].values, [row["GEOID10"]]))
    

In [319]:
# Fit a Gaussian distribution to each block
point_cloud = block_bounds.apply(get_points, axis=1, args=(n,), result_type='expand')

# Rename columns of the pointcloud
point_cloud.columns = ['point_' + str(x) if x<n else 'weight_' + str(x-n) if x<2*n else 'GEOID' for x in point_cloud.columns]

In [320]:
# Pull population data for 2010 Census blocks
# Define request parameters

year = '2010' # Year of interest
datasource = 'dec' # Survey name
subsource = 'pl' # Subsurvey name
GET = 'P001001' # Variables to query
FOR = 'block:*' # for predicate
IN = 'state:10&in=county:*&in=tract:*'

# Filepath to your Census API key
keyfile = 'CensusAPIKey.txt'

# Formatted API call
data_url = f'https://api.census.gov/data/{year}/{datasource}/{subsource}?get={GET}&for={FOR}&in={IN}'

# Read Census key into 'api_key'
with open(keyfile) as key:
    api_key = key.read().strip()

# Add key to url
data_url = f'{data_url}&key={api_key}'

# Request data and convert from json
data = requests.get(data_url).json()
# First entry in list is a list of variable names
data = pd.DataFrame(data[1:], columns = data[0])

# Rename columns to match shapefile pull
data.rename(columns = {"state":"STATEFP10", "county":"COUNTYFP10", "tract":"TRACTCE10", "block":"BLOCKCE10"}, inplace=True)

# Attach to block shapes
block_data = block_data.merge(data, on=["STATEFP10","COUNTYFP10","TRACTCE10","BLOCKCE10"])

In [321]:
# Pull population data for 2010 Census tracts
# This notebook pulls population, white population, and female population
# Define request parameters

year = '2010' # Year of interest
datasource = 'acs' # Survey name
subsource = 'acs5' # Subsurvey name
GET = 'B01003_001E,B01003_001EA,B01003_001M,B01003_001MA,B02001_002E,B02001_002EA,B02001_002M,B02001_002MA,B01001_026E,B01001_026EA,B01001_026M,B01001_026MA' # Variables to query
FOR = 'tract:*' # for predicate
IN = 'state:10' # in predicate


# Filepath to your Census API key
keyfile = 'CensusAPIKey.txt'

# Formatted API call
data_url = f'https://api.census.gov/data/{year}/{datasource}/{subsource}?get={GET}&for={FOR}&in={IN}'

# Read Census key into 'api_key'
with open(keyfile) as key:
    api_key = key.read().strip()

# Add key to url
data_url = f'{data_url}&key={api_key}'

# Request data and convert from json
data = requests.get(data_url).json()
# First entry in list is a list of variable names
data = pd.DataFrame(data[1:], columns = data[0])

# Rename columns to match shapefile pull
data.rename(columns = {"state":"STATE", "county":"COUNTY", "tract":"TRACT"}, inplace=True)

# Attach to tract shapes
tract_data = tract_data.merge(data, on=["STATE","COUNTY","TRACT"])

In [322]:
# Check data anotations for errors
print(tract_data.loc[~tract_data["B01003_001EA"].isnull()])
print(tract_data.loc[~tract_data["B01003_001MA"].isnull()])
print(tract_data.loc[~tract_data["B02001_002EA"].isnull()])
print(tract_data.loc[~tract_data["B02001_002MA"].isnull()])
print(tract_data.loc[~tract_data["B01001_026EA"].isnull()])
print(tract_data.loc[~tract_data["B01001_026MA"].isnull()])

Empty GeoDataFrame
Columns: [GEO_ID, STATE, COUNTY, TRACT, NAME, LSAD, CENSUSAREA, geometry, B01003_001E, B01003_001EA, B01003_001M, B01003_001MA, B02001_002E, B02001_002EA, B02001_002M, B02001_002MA, B01001_026E, B01001_026EA, B01001_026M, B01001_026MA]
Index: []
Empty GeoDataFrame
Columns: [GEO_ID, STATE, COUNTY, TRACT, NAME, LSAD, CENSUSAREA, geometry, B01003_001E, B01003_001EA, B01003_001M, B01003_001MA, B02001_002E, B02001_002EA, B02001_002M, B02001_002MA, B01001_026E, B01001_026EA, B01001_026M, B01001_026MA]
Index: []
Empty GeoDataFrame
Columns: [GEO_ID, STATE, COUNTY, TRACT, NAME, LSAD, CENSUSAREA, geometry, B01003_001E, B01003_001EA, B01003_001M, B01003_001MA, B02001_002E, B02001_002EA, B02001_002M, B02001_002MA, B01001_026E, B01001_026EA, B01001_026M, B01001_026MA]
Index: []
Empty GeoDataFrame
Columns: [GEO_ID, STATE, COUNTY, TRACT, NAME, LSAD, CENSUSAREA, geometry, B01003_001E, B01003_001EA, B01003_001M, B01003_001MA, B02001_002E, B02001_002EA, B02001_002M, B02001_002MA, B010

In [323]:
# Look for missing tract data

tract_data.loc[pd.isna(tract_data[['B01003_001E','B01003_001M','B02001_002E','B02001_002M','B01001_026E','B01001_026M']]).any(axis=1)][['B01003_001E','B01003_001M','B02001_002E','B02001_002M','B01001_026E','B01001_026M']]


Unnamed: 0,B01003_001E,B01003_001M,B02001_002E,B02001_002M,B01001_026E,B01001_026M


In [324]:
# Assign a fraction of the population of each block as a value to each point

# Merge each point to the 2010 census block containing it
population_per_point = point_cloud.merge(block_data, how="left", left_on="GEOID", right_on="GEOID10")

# Multiply each weight by the block population to get the block population per point
population_per_point[[x for x in population_per_point.columns if 'weight' in x]] = population_per_point[[x for x in population_per_point.columns if 'weight' in x]].mul(population_per_point["P001001"].astype(int), axis=0)

In [325]:
# Flatten to a GeoSeries where each row is a point and its weight
weights = np.array([[row["weight_" + str(i)] for i in range(n)] for _, row in population_per_point.iterrows()]).flatten()
points = np.array([[row["point_" + str(i)] for i in range(n)] for _, row in population_per_point.iterrows()]).flatten()
points_list = geopandas.GeoDataFrame({"population_per_point":weights,"geometry":points}, crs="EPSG:4269")


# Determine the number of points in the point cloud. This should be n * the number of census blocks
print(points_list.shape[0] / n == block_data.shape[0])

  exec(code_obj, self.user_global_ns, self.user_ns)
  points = np.array([[row["point_" + str(i)] for i in range(n)] for _, row in population_per_point.iterrows()]).flatten()
  points = np.array([[row["point_" + str(i)] for i in range(n)] for _, row in population_per_point.iterrows()]).flatten()


True


In [326]:
# Spatially join each point to the 2010 census tract containing it
variables_per_point = geopandas.sjoin(points_list, tract_data, how="left", op='within')

In [327]:
# Look for missing data: ~750 points with missed in the join
variables_per_point.loc[pd.isna(variables_per_point[['B01003_001E','B01003_001M','B02001_002E','B02001_002M','B01001_026E','B01001_026M',"index_right"]]).any(axis=1)][['B01003_001E','B01003_001M','B02001_002E','B02001_002M','B01001_026E','B01001_026M']]

Unnamed: 0,B01003_001E,B01003_001M,B02001_002E,B02001_002M,B01001_026E,B01001_026M
32508,,,,,,
32526,,,,,,
32527,,,,,,
32567,,,,,,
32589,,,,,,
...,...,...,...,...,...,...
11999459,,,,,,
11999472,,,,,,
11999480,,,,,,
11999482,,,,,,


In [328]:
# WARNING: Plot is large and should only be rendered if necessary
# TODO: Points around the edge of the state are being lost

"""
# Find and plot all missed points 
missed_points = variables_per_point.loc[variables_per_point["index_right"].isna()]

# initialize the map and store it in a folium map object
us_map = folium.Map(location=[38.9108, -75.5277], zoom_start=8, tiles=None)

# Add background tiles
folium.TileLayer('CartoDB positron',name="Light Map",control=False).add_to(us_map)

# Style and highlight functions map population values to color values
style_function = lambda x: {"weight":0.5, 
                            'color':'black',
                            'fillColor':'red', 
                            'fillOpacity':0.75}

# Add a map over the tiles with the given colors and a tooltip
NIL=folium.features.GeoJson(
        missed_points, # Full geopandas data
        style_function=style_function, # function for base colors
        control=False
    )

# Add elements to map
us_map.add_child(NIL)"""

'\n# Find and plot all missed points \nmissed_points = variables_per_point.loc[variables_per_point["index_right"].isna()]\n\n# initialize the map and store it in a folium map object\nus_map = folium.Map(location=[38.9108, -75.5277], zoom_start=8, tiles=None)\n\n# Add background tiles\nfolium.TileLayer(\'CartoDB positron\',name="Light Map",control=False).add_to(us_map)\n\n# Style and highlight functions map population values to color values\nstyle_function = lambda x: {"weight":0.5, \n                            \'color\':\'black\',\n                            \'fillColor\':\'red\', \n                            \'fillOpacity\':0.75}\n\n# Add a map over the tiles with the given colors and a tooltip\nNIL=folium.features.GeoJson(\n        missed_points, # Full geopandas data\n        style_function=style_function, # function for base colors\n        control=False\n    )\n\n# Add elements to map\nus_map.add_child(NIL)'

In [329]:
# ~175 points with 0 population
variables_per_point.loc[variables_per_point["B01003_001E"].astype(float) <= 0]

Unnamed: 0,population_per_point,geometry,index_right,GEO_ID,STATE,COUNTY,TRACT,NAME,LSAD,CENSUSAREA,...,B01003_001M,B01003_001MA,B02001_002E,B02001_002EA,B02001_002M,B02001_002MA,B01001_026E,B01001_026EA,B01001_026M,B01001_026MA
835526,0.0,POINT (-75.43200 39.28843),5.0,1400000US10001990000,10,001,990000,9900,Tract,0.0,...,119,,0,,119,,0,,119,
835567,0.0,POINT (-75.44993 39.32030),5.0,1400000US10001990000,10,001,990000,9900,Tract,0.0,...,119,,0,,119,,0,,119,
835618,0.0,POINT (-75.42928 39.28597),5.0,1400000US10001990000,10,001,990000,9900,Tract,0.0,...,119,,0,,119,,0,,119,
835634,0.0,POINT (-75.43670 39.30613),5.0,1400000US10001990000,10,001,990000,9900,Tract,0.0,...,119,,0,,119,,0,,119,
835641,0.0,POINT (-75.44570 39.31728),5.0,1400000US10001990000,10,001,990000,9900,Tract,0.0,...,119,,0,,119,,0,,119,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7576996,0.0,POINT (-75.44037 39.78709),181.0,1400000US10003990100,10,003,990100,9901,Tract,0.0,...,119,,0,,119,,0,,119,
7576997,0.0,POINT (-75.45121 39.78432),181.0,1400000US10003990100,10,003,990100,9901,Tract,0.0,...,119,,0,,119,,0,,119,
7576998,0.0,POINT (-75.44240 39.78510),181.0,1400000US10003990100,10,003,990100,9901,Tract,0.0,...,119,,0,,119,,0,,119,
7576999,0.0,POINT (-75.44222 39.78716),181.0,1400000US10003990100,10,003,990100,9901,Tract,0.0,...,119,,0,,119,,0,,119,


In [330]:
# Exclude missed points from the list
variables_per_point = variables_per_point.dropna(subset = ["B01003_001E", "B02001_002E", "B01001_026E","B01003_001M", "B02001_002M", "B01001_026M"])
# Exclude points corresponding to a population of 0
variables_per_point = variables_per_point.loc[variables_per_point["B01003_001E"].astype(float) > 0]


# Divide variables of intersest by tract population and multiply by the portion of the population represented by each point
variables_per_point[["B01003_001E", "B02001_002E", "B01001_026E"]] = variables_per_point[["B01003_001E", "B02001_002E", "B01001_026E"]].astype(int).div(variables_per_point["B01003_001E"].astype(int), axis=0).mul(variables_per_point["population_per_point"].astype(float), axis=0)

# Propogate Error through ratios: dg = g(a,b) sqrt([da/a]^2 + [db/b]^2)
variables_per_point["B02001_002T"] = variables_per_point["B02001_002E"].astype(int).div(variables_per_point["B01003_001E"].astype(int), axis=0).mul((variables_per_point["B02001_002M"].astype(float).div(variables_per_point["B02001_002E"].astype(float)).apply(np.square) + variables_per_point["B01003_001M"].astype(float).div(variables_per_point["B01003_001E"].astype(float)).apply(np.square)).apply(np.sqrt), axis=0)
# Propogate error for multiplication with weights: df_n = w_n * x_n * dg
variables_per_point["B02001_002T"] = variables_per_point["B02001_002T"].mul(variables_per_point["population_per_point"], axis=0)

# Repeat for other variables
# Propogate Error through ratios: dg = g(a,b) sqrt([da/a]^2 + [db/b]^2)
variables_per_point["B01001_026T"] = variables_per_point["B01001_026E"].astype(int).div(variables_per_point["B01003_001E"].astype(int), axis=0).mul((variables_per_point["B01001_026M"].astype(float).div(variables_per_point["B01001_026E"].astype(float)).apply(np.square) + variables_per_point["B01003_001M"].astype(float).div(variables_per_point["B01003_001E"].astype(float)).apply(np.square)).apply(np.sqrt), axis=0)
# Propogate error for multiplication with weights: df_n = w_n * x_n * dg
variables_per_point["B01001_026T"] = variables_per_point["B01001_026T"].mul(variables_per_point["population_per_point"], axis=0)

# For population, we do not have a way to propogate error since we are summing block data

# Reset index
variables_per_point = variables_per_point.reset_index()

In [331]:
variables_per_point

Unnamed: 0,index,population_per_point,geometry,index_right,GEO_ID,STATE,COUNTY,TRACT,NAME,LSAD,...,B02001_002E,B02001_002EA,B02001_002M,B02001_002MA,B01001_026E,B01001_026EA,B01001_026M,B01001_026MA,B02001_002T,B01001_026T
0,0,0.512451,POINT (-75.48513 39.12139),87.0,1400000US10001041100,10,001,041100,411,Tract,...,0.408471,,392,,0.268270,,282,,,
1,1,0.536854,POINT (-75.48523 39.12085),87.0,1400000US10001041100,10,001,041100,411,Tract,...,0.427922,,392,,0.281045,,282,,,
2,2,0.585659,POINT (-75.48637 39.12054),87.0,1400000US10001041100,10,001,041100,411,Tract,...,0.466824,,392,,0.306594,,282,,,
3,3,0.658866,POINT (-75.48428 39.12180),87.0,1400000US10001041100,10,001,041100,411,Tract,...,0.525176,,392,,0.344919,,282,,,
4,4,0.463646,POINT (-75.48579 39.12102),87.0,1400000US10001041100,10,001,041100,411,Tract,...,0.369569,,392,,0.242721,,282,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11963535,12057495,0.000000,POINT (-75.46311 38.52237),176.0,1400000US10005051702,10,005,051702,517.02,Tract,...,0.000000,,518,,0.000000,,305,,,
11963536,12057496,0.000000,POINT (-75.46728 38.52285),176.0,1400000US10005051702,10,005,051702,517.02,Tract,...,0.000000,,518,,0.000000,,305,,,
11963537,12057497,0.000000,POINT (-75.46613 38.52173),176.0,1400000US10005051702,10,005,051702,517.02,Tract,...,0.000000,,518,,0.000000,,305,,,
11963538,12057498,0.000000,POINT (-75.46912 38.52775),176.0,1400000US10005051702,10,005,051702,517.02,Tract,...,0.000000,,518,,0.000000,,305,,,


In [332]:
# Print the number of points missed in the transfer of data from tracts to points
print(points_list.shape[0] -  variables_per_point.shape[0])

93960


In [333]:
# Request shapefile data for 2020 census tracts and convert to geopandas dataframe

# Shapefile url
data_url = 'https://www2.census.gov/geo/tiger/GENZ2020/shp/cb_2020_10_tract_500k.zip'


# Request data
data = requests.get(data_url)
# convert to pandas dataframe
tract2020 = geopandas.read_file(BytesIO(data.content))

In [334]:
# Spatially join points to 2020 census tracts
interpolated_values = geopandas.sjoin(variables_per_point[["geometry","B01003_001E", "B02001_002E", "B01001_026E","B01003_001M", "B02001_002T", "B01001_026T"]], tract2020, how="right", op='within')

# Assign 0 value to tracts with no points
interpolated_values = interpolated_values.fillna(value=0)

# Propogate error through the summation of points: dh_m = sqrt( sum_{i=1}^n delta_i (dg^2) : where delta_i=1 if the it point is in the mth 2020 census tract and delta_i=0 otherwise
interpolated_errors = interpolated_values[["B01003_001M", "B02001_002T", "B01001_026T"]].astype(float).apply(np.square)#.div(np.log(n^2))
interpolated_errors["GEOID"] = interpolated_values["GEOID"]
interpolated_errors = (interpolated_errors.groupby("GEOID").sum()).apply(np.sqrt)

# Sum the values for each 2020 tract
interpolated_values = interpolated_values[["GEOID", "B01003_001E", "B02001_002E", "B01001_026E"]].groupby("GEOID").sum()

# Join values and errors into a completed dataframe
final_df = interpolated_values.join(interpolated_errors, on="GEOID")

# Reset GEOID from index to column
final_df = final_df.reset_index()

In [335]:
interpolated_errors

Unnamed: 0_level_0,B01003_001M,B02001_002T,B01001_026T
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10001040100,113860.976133,2069.023096,0.0
10001040201,121328.671121,0.000000,0.0
10001040203,102433.916903,0.000000,0.0
10001040204,222418.632293,0.000000,0.0
10001040205,158928.315680,6798.789785,0.0
...,...,...,...
10005051702,208433.537990,0.000000,0.0
10005051801,134264.793639,0.000000,0.0
10005051802,97192.839052,0.000000,0.0
10005051900,126579.263855,0.000000,0.0


In [336]:
# Pull population data for 2020 Census tracts
# Define request parameters

year = '2020' # Year of interest
datasource = 'acs' # Survey name
subsource = 'acs5' # Subsurvey name
GET = 'B01003_001E,B01003_001M,B02001_002E,B02001_002M,B01001_026E,B01001_026M' # Variables to query
FOR = 'tract:*' # for predicate
IN = 'state:10' # in predicate


# Filepath to your Census API key
keyfile = 'CensusAPIKey.txt'

# Formatted API call
data_url = f'https://api.census.gov/data/{year}/{datasource}/{subsource}?get={GET}&for={FOR}&in={IN}'

# Read Census key into 'api_key'
with open(keyfile) as key:
    api_key = key.read().strip()

# Add key to url
data_url = f'{data_url}&key={api_key}'

# Request data and convert from json
data = requests.get(data_url).json()
# First entry in list is a list of variable names
tract2020_data = pd.DataFrame(data[1:], columns = data[0])

# Add a GEOID column to the data
tract2020_data["GEOID"] = tract2020_data["state"].astype(str) + tract2020_data["county"].astype(str) +tract2020_data["tract"].astype(str)

In [337]:
final_df.merge(tract2020_data, left_on="GEOID", right_on="GEOID", how="left")

Unnamed: 0,GEOID,B01003_001E_x,B02001_002E_x,B01001_026E_x,B01003_001M_x,B02001_002T,B01001_026T,B01003_001E_y,B01003_001M_y,B02001_002E_y,B02001_002M,B01001_026E_y,B01001_026M,state,county,tract
0,10001040100,6542.063480,5970.195966,3250.807969,113860.976133,2069.023096,0.0,7531,759,6181,793,3658,433,10,001,040100
1,10001040201,5052.014601,3566.500754,2716.862840,121328.671121,0.000000,0.0,4766,485,3103,383,2176,259,10,001,040201
2,10001040203,5008.045787,3145.658888,2599.997290,102433.916903,0.000000,0.0,5310,644,2999,502,2691,404,10,001,040203
3,10001040204,4644.794217,3343.300562,2444.193920,222418.632293,0.000000,0.0,6108,908,3867,523,3447,591,10,001,040204
4,10001040205,2878.770294,2071.707018,1514.830687,158928.315680,6798.789785,0.0,3934,558,2448,439,2189,331,10,001,040205
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254,10005051702,5621.588864,5062.646271,2926.594918,208433.537990,0.000000,0.0,5976,804,4835,666,3208,616,10,005,051702
255,10005051801,4887.829562,4017.058855,2665.354985,134264.793639,0.000000,0.0,5240,692,4052,649,2626,415,10,005,051801
256,10005051802,4181.810033,2325.536289,2208.138509,97192.839052,0.000000,0.0,4140,562,2339,405,2325,393,10,005,051802
257,10005051900,4557.209723,3312.847341,2243.302952,126579.263855,0.000000,0.0,4359,623,3665,552,2322,380,10,005,051900


In [338]:
# Write combined dataframe of 2020 ground truth and estimated values to a csv
final_df.merge(tract2020_data, left_on="GEOID", right_on="GEOID").to_csv("estimates" + str(n) + ".csv", index=False)