This notebook pulls 2010 census data and attempts to generate a large set of points that approximates a smooth surface

In [59]:
# Declare static variables

n=50 # The number of points to assign to each census block
BINOMIAL_TRIALS = 40 # The number of trials in the binomial distribution used for weighting points in blocks. The higher the value, the more evenly distributed the population points will be through the census block
BINOMIAL_SUCCESS = 0.5 # The probability of success for each trial in he weight assignment. Must be <=1. Use 1 for a uniform distribution

# Apply a transformation to make the outcome more normal
# Apply a normal distribution and use min/max normalization
# Weighted Poisson binomial distribution


# Binomial distribution is nearly normal if np(1-p) >= 10
print(BINOMIAL_TRIALS * BINOMIAL_SUCCESS * (1 - BINOMIAL_SUCCESS) >= 10)

True


In [60]:
# Import libraries

import pandas as pd
import geopandas
import numpy as np
import requests
from io import BytesIO
import folium
from IPython.display import clear_output
from itertools import chain

In [61]:
# Request shapefile data for 2010 census tracts and convert to geopandas dataframe

# Shapefile url
data_url = 'https://www2.census.gov/geo/tiger/GENZ2010/gz_2010_10_140_00_500k.zip'


# Request data
data = requests.get(data_url)
# convert to pandas dataframe
tract_data = geopandas.read_file(BytesIO(data.content))

In [62]:
# Request shapefile data for 2010 census tracts and convert to geopandas dataframe

# Shapefile url
data_url = 'https://www2.census.gov/geo/tiger/TIGER2010/TABBLOCK/2010/tl_2010_10_tabblock10.zip'


# Request data
data = requests.get(data_url)
# convert to pandas dataframe
block_data = geopandas.read_file(BytesIO(data.content))

In [63]:
# For each census block, create a bounding box
block_bounds = block_data["geometry"].bounds

# Attch GEOID to boundaries
block_bounds = block_data[["GEOID10","geometry"]].merge(block_bounds, left_index=True, right_index=True)

In [64]:
# Fit a 2D Gaussian distribution over the bounding boxes

# Takes in a row of 'block_bounds' and outputs a 2D Gaussian distribution of 'n' points over the bounding box, as well as the GEOID
def get_points(row,n):
    print(f"Processing Block {row['GEOID10']}...")
    # 'i' is the total number of points left to assign
    i=n
    # 'points_return' is the list of all points for the block
    # TODO: CRS is hardcoded
    points_return = geopandas.GeoSeries(crs="EPSG:4269")
    # Allocate points until n have been assigned
    while i > 0:
        # Generates a uniform distribution for the y-axis located at the center of the box
        pointsy = np.random.uniform(low=row["miny"], high=row["maxy"], size=i)
        # Generates a uniform distribution for the x-axis located at the center of the box
        pointsx = np.random.uniform(low=row["minx"], high=row["maxx"], size=i)
        # Convert the points to Shapely points
        points = geopandas.GeoSeries(geopandas.points_from_xy(pointsx, pointsy, crs="EPSG:4269"))
        # Check if the points are inside the block
        point_checks = points.within(row["geometry"])
        # Add found points to our list
        points_return = geopandas.GeoSeries(pd.concat([points_return, points[point_checks]], ignore_index=True), crs=points_return.crs)
        # Set 'i' equal to the number of missed points
        i = n - points_return.size
    
    # Generates a binomial distribution of weights
    weights = np.random.binomial(n=BINOMIAL_TRIALS, p=BINOMIAL_SUCCESS, size=n)
    # Normalize weights so that they sum to 1
    weights = weights / np.sum(weights)
    # Sort the weights based on distance from the mean
    weights = weights[np.argsort(np.abs(weights - np.mean(weights)))]
    # Generate a series containing the distance from each point to the centroid
    distances = points_return.distance(row["geometry"].centroid)
    # Create a column for the index of the point and sort by distance
    distances = distances.reset_index(name="distance").sort_values(by="distance")
    # Assign a weight to each point
    distances["weight"] = weights
    # Merge weights onto points
    points_return = pd.merge(left=points_return.rename("geometry"), right=distances, how="left", right_on="index", left_index=True)[["geometry", "weight"]]
    # Clear warnings from notebook output to prevent crash
    clear_output()
    # Return an array with every point in the cloud, the weights for each point and the GEOID
    return list(chain(points_return["geometry"].values, points_return["weight"].values, [row["GEOID10"]]))
    

In [65]:
# Fit a Gaussian distribution to each block
point_cloud = block_bounds.apply(get_points, axis=1, args=(n,), result_type='expand')

# Rename columns of the pointcloud
point_cloud.columns = ['point_' + str(x) if x<n else 'weight_' + str(x-n) if x<2*n else 'GEOID' for x in point_cloud.columns]

point_cloud

Unnamed: 0,point_0,point_1,point_2,point_3,point_4,point_5,point_6,point_7,point_8,point_9,...,weight_41,weight_42,weight_43,weight_44,weight_45,weight_46,weight_47,weight_48,weight_49,GEOID
0,POINT (-75.48540428805879 39.122463175901345),POINT (-75.48516145865977 39.12222474654306),POINT (-75.48640795724907 39.121513691181285),POINT (-75.48613968344127 39.122176719084486),POINT (-75.48671302142901 39.12112379290057),POINT (-75.48738984283504 39.121306717626254),POINT (-75.48616210894822 39.1212910808486),POINT (-75.4866616479266 39.12180252588641),POINT (-75.48659501186899 39.12117484871072),POINT (-75.48605295204759 39.120961345544515),...,0.023739,0.015826,0.019782,0.015826,0.022750,0.019782,0.016815,0.023739,0.017804,100010411001014
1,POINT (-75.49036307708161 39.12193742578074),POINT (-75.4903644294427 39.12257001326125),POINT (-75.48964051318981 39.12275347073757),POINT (-75.48904168140598 39.12219303715132),POINT (-75.49001384457601 39.1221244808423),POINT (-75.48986717367794 39.1217279654576),POINT (-75.49063186371008 39.12309042589996),POINT (-75.49029824480434 39.12250656602376),POINT (-75.49051374029368 39.1218728666554),POINT (-75.49039890404671 39.1209949147895),...,0.025484,0.019368,0.021407,0.015291,0.021407,0.024465,0.021407,0.020387,0.018349,100010411001007
2,POINT (-75.48285945537195 39.11874059708786),POINT (-75.48194947176447 39.1195639218825),POINT (-75.48187145433769 39.119809792000815),POINT (-75.48352824464902 39.118817527697324),POINT (-75.48202627519287 39.11975806219204),POINT (-75.48305492090158 39.11990247356092),POINT (-75.48215512501811 39.12022253830098),POINT (-75.48202981362863 39.12044665534576),POINT (-75.48278358446893 39.11985854855589),POINT (-75.48300431953966 39.120081603664374),...,0.020768,0.016615,0.020768,0.021807,0.023884,0.022845,0.016615,0.015576,0.020768,100010411001018
3,POINT (-75.4372842092825 39.106808268719774),POINT (-75.4395658766594 39.10336961301425),POINT (-75.43687238671595 39.106822252241756),POINT (-75.43903720754643 39.10581609111123),POINT (-75.43849922871563 39.10638706874362),POINT (-75.43741872421933 39.106446411836096),POINT (-75.44006963978322 39.106961810482396),POINT (-75.43940923830856 39.10584967007986),POINT (-75.43831223331084 39.10437799489425),POINT (-75.44065708884423 39.10679058360428),...,0.022772,0.022772,0.015842,0.017822,0.019802,0.021782,0.021782,0.023762,0.018812,100010432021103
4,POINT (-75.5283973316182 39.165280047475875),POINT (-75.52881217726846 39.166665413855156),POINT (-75.52856222127984 39.166621925154985),POINT (-75.52809186789374 39.16587903492034),POINT (-75.52829036517322 39.165561284990105),POINT (-75.52884652551012 39.167114603938),POINT (-75.52878804227588 39.166835781560714),POINT (-75.52839703476427 39.165715986400066),POINT (-75.52828301364525 39.16563564476295),POINT (-75.52890956761132 39.16689021798563),...,0.017875,0.019861,0.021847,0.021847,0.013903,0.021847,0.020854,0.019861,0.027805,100010409001039
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24110,POINT (-75.4083526978792 38.545176716219004),POINT (-75.3947313273384 38.54031084788462),POINT (-75.39541214593702 38.541822310514604),POINT (-75.40402025456999 38.54810911517315),POINT (-75.39560808624711 38.54051677571343),POINT (-75.40274493446539 38.538200851740676),POINT (-75.39301139648725 38.53968935271884),POINT (-75.4100458390299 38.54412982156674),POINT (-75.39021609563731 38.53677740213403),POINT (-75.40590029792718 38.54085479443655),...,0.019527,0.018499,0.014388,0.012333,0.019527,0.016444,0.020555,0.023638,0.015416,100050517011022
24111,POINT (-75.46105049494129 38.5391698445983),POINT (-75.45633322612197 38.53636404582049),POINT (-75.45474287185685 38.54432261646419),POINT (-75.45534421586086 38.54315808864868),POINT (-75.46006826278641 38.53649414447544),POINT (-75.45376411621874 38.545128009187515),POINT (-75.45781434196303 38.5404029534779),POINT (-75.45650582568419 38.5448759520301),POINT (-75.45588427899668 38.535870786575316),POINT (-75.45858401435459 38.53984780724432),...,0.022290,0.016211,0.018237,0.018237,0.019250,0.021277,0.024316,0.024316,0.022290,100050517011047
24112,POINT (-75.45369269284264 38.53448447390266),POINT (-75.45205831892802 38.53349367689627),POINT (-75.45376510898963 38.53442289161952),POINT (-75.45157219486018 38.533605636545296),POINT (-75.45564989184096 38.53498068320163),POINT (-75.45213254245154 38.53320285492029),POINT (-75.45323041585489 38.53411564184433),POINT (-75.45290229504646 38.534106735885274),POINT (-75.45607369948797 38.534179561349056),POINT (-75.45356507228557 38.53536339037269),...,0.017078,0.019924,0.017078,0.023719,0.018975,0.022770,0.016129,0.021822,0.019924,100050517011049
24113,POINT (-75.45975316197126 38.52891738933228),POINT (-75.45980208397275 38.530212372402396),POINT (-75.45893621925376 38.52898996665965),POINT (-75.46045388179216 38.531812147564814),POINT (-75.46061095774382 38.53173495465444),POINT (-75.45793841207654 38.531035615383495),POINT (-75.45790122340894 38.53125320430659),POINT (-75.45996269109686 38.530827585436676),POINT (-75.45913308678837 38.53043610994763),POINT (-75.45757467428527 38.53011543239535),...,0.017734,0.016749,0.020690,0.021675,0.017734,0.020690,0.018719,0.017734,0.016749,100050517021120


In [66]:
# Pull population data for 2010 Census blocks
# Define request parameters

year = '2010' # Year of interest
datasource = 'dec' # Survey name
subsource = 'pl' # Subsurvey name
GET = 'P001001,H001001,P001003' # Variables to query
FOR = 'block:*' # for predicate
IN = 'state:10&in=county:*&in=tract:*'

# Filepath to your Census API key
keyfile = 'CensusAPIKey.txt'

# Formatted API call
data_url = f'https://api.census.gov/data/{year}/{datasource}/{subsource}?get={GET}&for={FOR}&in={IN}'

# Read Census key into 'api_key'
with open(keyfile) as key:
    api_key = key.read().strip()

# Add key to url
data_url = f'{data_url}&key={api_key}'

# Request data and convert from json
data = requests.get(data_url).json()
# First entry in list is a list of variable names
data = pd.DataFrame(data[1:], columns = data[0])

# Rename columns to match shapefile pull
data.rename(columns = {"state":"STATEFP10", "county":"COUNTYFP10", "tract":"TRACTCE10", "block":"BLOCKCE10"}, inplace=True)

# Attach to block shapes
block_data = block_data.merge(data, on=["STATEFP10","COUNTYFP10","TRACTCE10","BLOCKCE10"])

In [67]:
# Pull population data for 2010 Census tracts
# Define request parameters

year = '2010' # Year of interest
datasource = 'dec' # Survey name
subsource = 'pl' # Subsurvey name
GET = 'P001001,H001001,P001003' # Variables to query
FOR = 'tract:*' # for predicate
IN = 'state:10' # in predicate


# Filepath to your Census API key
keyfile = 'CensusAPIKey.txt'

# Formatted API call
data_url = f'https://api.census.gov/data/{year}/{datasource}/{subsource}?get={GET}&for={FOR}&in={IN}'

# Read Census key into 'api_key'
with open(keyfile) as key:
    api_key = key.read().strip()

# Add key to url
data_url = f'{data_url}&key={api_key}'

# Request data and convert from json
data = requests.get(data_url).json()
# First entry in list is a list of variable names
data = pd.DataFrame(data[1:], columns = data[0])

# Rename columns to match shapefile pull
data.rename(columns = {"state":"STATE", "county":"COUNTY", "tract":"TRACT"}, inplace=True)

# Attach to tract shapes
tract_data = tract_data.merge(data, on=["STATE","COUNTY","TRACT"])

In [68]:
block_data

Unnamed: 0,STATEFP10,COUNTYFP10,TRACTCE10,BLOCKCE10,GEOID10,NAME10,MTFCC10,UR10,UACE10,UATYP10,FUNCSTAT10,ALAND10,AWATER10,INTPTLAT10,INTPTLON10,geometry,P001001,H001001,P001003
0,10,001,041100,1014,100010411001014,Block 1014,G5040,U,24580,U,S,50816,0,+39.1216358,-075.4858233,"POLYGON ((-75.48486 39.12239, -75.48481 39.122...",244,77,187
1,10,001,041100,1007,100010411001007,Block 1007,G5040,U,24580,U,S,48931,0,+39.1219647,-075.4904073,"POLYGON ((-75.49019 39.12340, -75.49005 39.123...",167,50,135
2,10,001,041100,1018,100010411001018,Block 1018,G5040,U,24580,U,S,28485,0,+39.1196396,-075.4826429,"POLYGON ((-75.48313 39.11849, -75.48333 39.118...",33,10,21
3,10,001,043202,1103,100010432021103,Block 1103,G5040,R,,,S,160376,0,+39.1054024,-075.4393957,"POLYGON ((-75.44219 39.10734, -75.43890 39.107...",14,6,12
4,10,001,040900,1039,100010409001039,Block 1039,G5040,U,24580,U,S,12254,0,+39.1662742,-075.5285219,"POLYGON ((-75.52849 39.16525, -75.52862 39.165...",53,27,43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24110,10,005,051701,1022,100050517011022,Block 1022,G5040,R,,,S,2614704,0,+38.5418205,-075.4026864,"POLYGON ((-75.41186 38.54192, -75.41169 38.542...",1,1,1
24111,10,005,051701,1047,100050517011047,Block 1047,G5040,R,,,S,1331210,0,+38.5406585,-075.4582756,"POLYGON ((-75.45237 38.54473, -75.45222 38.544...",72,29,72
24112,10,005,051701,1049,100050517011049,Block 1049,G5040,R,,,S,130691,0,+38.5342417,-075.4535089,"POLYGON ((-75.45807 38.53492, -75.45771 38.535...",35,12,29
24113,10,005,051702,1120,100050517021120,Block 1120,G5040,R,,,S,155272,0,+38.5304973,-075.4586287,"POLYGON ((-75.45939 38.52753, -75.45941 38.527...",0,0,0


In [69]:
# Assign a fraction of the population of each block as a value to each point

# Merge each point to the 2010 census block containing it
population_per_point = point_cloud.merge(block_data, how="left", left_on="GEOID", right_on="GEOID10")

# Multiply each weight by the block population to get the block population per point
population_per_point[[x for x in population_per_point.columns if 'weight' in x]] = population_per_point[[x for x in population_per_point.columns if 'weight' in x]].mul(population_per_point["P001001"].astype(int), axis=0)

In [70]:
# Flatten to a GeoSeries where each row is a point and its weight
weights = np.array([[row["weight_" + str(i)] for i in range(n)] for _, row in population_per_point.iterrows()]).flatten()
points = np.array([[row["point_" + str(i)] for i in range(n)] for _, row in population_per_point.iterrows()]).flatten()
points_list = geopandas.GeoDataFrame({"population_per_point":weights,"geometry":points}, crs="EPSG:4269")


# Determine the number of points in the point cloud. This should be n * the number of census blocks
print(points_list.shape[0] / n == block_data.shape[0])

  exec(code_obj, self.user_global_ns, self.user_ns)
  points = np.array([[row["point_" + str(i)] for i in range(n)] for _, row in population_per_point.iterrows()]).flatten()
  points = np.array([[row["point_" + str(i)] for i in range(n)] for _, row in population_per_point.iterrows()]).flatten()


True


In [71]:
# Spatially join each point to the 2010 census tract containing it
variables_per_point = geopandas.sjoin(points_list, tract_data, how="left", op='within')

In [None]:
# WARNING: Plot is large and should only be rendered if necessary
# TODO: Points around the edge of the state are being lost

"""
# Find and plot all missed points 
missed_points = variables_per_point.loc[variables_per_point["index_right"].isna()]

# initialize the map and store it in a folium map object
us_map = folium.Map(location=[38.9108, -75.5277], zoom_start=8, tiles=None)

# Add background tiles
folium.TileLayer('CartoDB positron',name="Light Map",control=False).add_to(us_map)

# Style and highlight functions map population values to color values
style_function = lambda x: {"weight":0.5, 
                            'color':'black',
                            'fillColor':'red', 
                            'fillOpacity':0.75}

# Add a map over the tiles with the given colors and a tooltip
NIL=folium.features.GeoJson(
        missed_points, # Full geopandas data
        style_function=style_function, # function for base colors
        control=False
    )

# Add elements to map
us_map.add_child(NIL)"""

In [73]:
# Exclude missed points from the list
variables_per_point = variables_per_point.loc[~variables_per_point["index_right"].isna()]

# Divide variables of intersest by tract population and multiply by the portion of the population represented by each point
variables_per_point[["P001001", "H001001", "P001003"]] = variables_per_point[["P001001", "H001001", "P001003"]].astype(int).div(variables_per_point["P001001"].astype(int), axis=0).mul(variables_per_point["population_per_point"], axis=0)
# Reset index
variables_per_point = variables_per_point.reset_index()

In [74]:
variables_per_point

Unnamed: 0,index,population_per_point,geometry,index_right,GEO_ID,STATE,COUNTY,TRACT,NAME,LSAD,CENSUSAREA,P001001,H001001,P001003
0,0,5.309594,POINT (-75.48540 39.12246),87.0,1400000US10001041100,10,001,041100,411,Tract,6.190,5.309594,1.628370,3.851586
1,1,5.309594,POINT (-75.48516 39.12222),87.0,1400000US10001041100,10,001,041100,411,Tract,6.190,5.309594,1.628370,3.851586
2,2,4.826904,POINT (-75.48641 39.12151),87.0,1400000US10001041100,10,001,041100,411,Tract,6.190,4.826904,1.480337,3.501441
3,3,5.068249,POINT (-75.48614 39.12218),87.0,1400000US10001041100,10,001,041100,411,Tract,6.190,5.068249,1.554353,3.676513
4,4,5.550940,POINT (-75.48671 39.12112),87.0,1400000US10001041100,10,001,041100,411,Tract,6.190,5.550940,1.702387,4.026658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1198094,1205745,0.000000,POINT (-75.46536 38.52694),176.0,1400000US10005051702,10,005,051702,517.02,Tract,68.279,0.000000,0.000000,0.000000
1198095,1205746,0.000000,POINT (-75.46064 38.52211),176.0,1400000US10005051702,10,005,051702,517.02,Tract,68.279,0.000000,0.000000,0.000000
1198096,1205747,0.000000,POINT (-75.46064 38.52865),176.0,1400000US10005051702,10,005,051702,517.02,Tract,68.279,0.000000,0.000000,0.000000
1198097,1205748,0.000000,POINT (-75.46857 38.53064),176.0,1400000US10005051702,10,005,051702,517.02,Tract,68.279,0.000000,0.000000,0.000000


In [75]:
# Print the number of points missed in the transfer of data from tracts to points
print(points_list.shape[0] -  variables_per_point.shape[0])

7651


In [76]:
# Request shapefile data for 2020 census tracts and convert to geopandas dataframe

# Shapefile url
data_url = 'https://www2.census.gov/geo/tiger/GENZ2020/shp/cb_2020_10_tract_500k.zip'


# Request data
data = requests.get(data_url)
# convert to pandas dataframe
tract2020 = geopandas.read_file(BytesIO(data.content))

In [77]:
# Spatially join points to 2020 census tracts
interpolated_values = geopandas.sjoin(variables_per_point[["GEO_ID","geometry","P001001","H001001","P001003"]], tract2020, how="left", op='within')

# Sum the values for each 2020 tract
interpolated_values = interpolated_values[["GEOID", "P001001", "H001001", "P001003"]].groupby("GEOID").sum()

interpolated_values


#tract2020

Unnamed: 0_level_0,P001001,H001001,P001003
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10001040100,6548.242397,2471.838965,5708.405583
10001040201,5050.798198,2026.548975,3673.109374
10001040203,5009.913146,2019.119618,3118.632860
10001040204,4648.483726,1730.501139,3156.040402
10001040205,2879.554648,1072.025340,1954.759832
...,...,...,...
10005051702,5629.248436,2309.575020,5027.464670
10005051801,4884.315443,2116.940281,3854.111221
10005051802,4179.184964,1750.575143,2495.311666
10005051900,4554.121751,1828.635053,3623.310217


In [78]:
# Pull population data for 2020 Census tracts
# Define request parameters

year = '2020' # Year of interest
datasource = 'dec' # Survey name
subsource = 'pl' # Subsurvey name
GET = 'P1_001N,H1_001N,P1_003N' # Variables to query
FOR = 'tract:*' # for predicate
IN = 'state:10' # in predicate


# Filepath to your Census API key
keyfile = 'CensusAPIKey.txt'

# Formatted API call
data_url = f'https://api.census.gov/data/{year}/{datasource}/{subsource}?get={GET}&for={FOR}&in={IN}'

# Read Census key into 'api_key'
with open(keyfile) as key:
    api_key = key.read().strip()

# Add key to url
data_url = f'{data_url}&key={api_key}'

# Request data and convert from json
data = requests.get(data_url).json()
# First entry in list is a list of variable names
tract2020_data = pd.DataFrame(data[1:], columns = data[0])

# Add a GEOID column to the data
tract2020_data["GEOID"] = tract2020_data["state"].astype(str) + tract2020_data["county"].astype(str) +tract2020_data["tract"].astype(str)

In [79]:
# Write combined dataframe of 2020 ground truth and estimated values to a csv
interpolated_values.merge(tract2020_data, left_index=True, right_on="GEOID").to_csv("estimates.csv", index=False)

In [80]:
interpolated_values.merge(tract2020_data, left_index=True, right_on="GEOID")

Unnamed: 0,P001001,H001001,P001003,P1_001N,H1_001N,P1_003N,state,county,tract,GEOID
220,6548.242397,2471.838965,5708.405583,7315,2740,5980,10,001,040100,10001040100
221,5050.798198,2026.548975,3673.109374,5446,2123,3424,10,001,040201,10001040201
222,5009.913146,2019.119618,3118.632860,5182,2157,2808,10,001,040203,10001040203
223,4648.483726,1730.501139,3156.040402,6451,2269,3613,10,001,040204,10001040204
224,2879.554648,1072.025340,1954.759832,4699,1985,2430,10,001,040205,10001040205
...,...,...,...,...,...,...,...,...,...,...
214,5629.248436,2309.575020,5027.464670,6577,2590,5286,10,005,051702,10005051702
215,4884.315443,2116.940281,3854.111221,5359,2154,3636,10,005,051801,10005051801
216,4179.184964,1750.575143,2495.311666,4354,1740,2256,10,005,051802,10005051802
217,4554.121751,1828.635053,3623.310217,4760,1949,3566,10,005,051900,10005051900
