This notebook pulls 2010 census data and attempts to generate a large set of points that approximates a smooth surface

In [1]:
# Declare static variables

n=10 # The number of points to assign to each census block

In [2]:
# Import libraries

import pandas as pd
import geopandas
import numpy as np
import requests
from io import BytesIO
import folium
from IPython.display import clear_output

In [3]:
# Request shapefile data for 2010 census tracts and convert to geopandas dataframe

# Shapefile url
data_url = 'https://www2.census.gov/geo/tiger/GENZ2010/gz_2010_10_140_00_500k.zip'


# Request data
data = requests.get(data_url)
# convert to pandas dataframe
tract_data = geopandas.read_file(BytesIO(data.content))

In [4]:
# Request shapefile data for 2010 census tracts and convert to geopandas dataframe

# Shapefile url
data_url = 'https://www2.census.gov/geo/tiger/TIGER2010/TABBLOCK/2010/tl_2010_10_tabblock10.zip'


# Request data
data = requests.get(data_url)
# convert to pandas dataframe
block_data = geopandas.read_file(BytesIO(data.content))

In [5]:
# For each census block, create a bounding box
block_bounds = block_data["geometry"].bounds

# Attch GEOID to boundaries
block_bounds = block_data[["GEOID10","geometry"]].merge(block_bounds, left_index=True, right_index=True)

In [6]:
# Fit a 2D Gaussian distribution over the bounding boxes

# Takes in a row of 'block_bounds' and outputs a 2D Gaussian distribution of 'n' points over the bounding box, as well as the GEOID
def get_points(row,n):
    print(f"Processing Block {row['GEOID10']}...")
    # 'i' is the total number of points left to assign
    i=n
    # 'points_return' is the list of all points for the block
    # TODO: CRS is hardcoded
    points_return = geopandas.GeoSeries(crs="EPSG:4269")
    # Allocate points until n have been assigned
    while i > 0:
        # Generates a Gaussian distribution for the y-axis, located at the center of the box with standard deviation such that 95% of the distribution lies in the bounding box
        pointsy = np.random.normal(loc=(row["maxy"] + row["miny"])/2, scale=np.abs(row["maxy"] - row["miny"])/1.96, size=i)
        # Generates a Gaussian distribution for the x-axis, located at the center of the box with standard deviation such that 95% of the distribution lies in the bounding box
        pointsx = np.random.normal(loc=(row["maxx"] + row["minx"])/2, scale=np.abs(row["maxx"] - row["minx"])/1.96, size=i)
        # Convert the points to Shapely points
        points = geopandas.GeoSeries(geopandas.points_from_xy(pointsx, pointsy, crs="EPSG:4269"))
        # Check if the points are inside the block
        point_checks = points.within(row["geometry"])
        # Add found points to our list
        points_return = geopandas.GeoSeries(pd.concat([points_return, points[point_checks]], ignore_index=True), crs=points_return.crs)
        # Set 'i' equal to the number of missed points
        i = n - points_return.size
    
    # Clear warnings from notebook output to prevent crash
    clear_output()
    # Return an array with every point in the cloud and the GEOID
    return np.append(points_return.values, row["GEOID10"])
    

In [7]:
# Fit a Gaussian distribution to each block
point_cloud = block_bounds.apply(get_points, axis=1, args=(n,), result_type='expand')

# Rename columns of the pointcloud
point_cloud.columns = ['GEOID' if str(x)==str(n) else 'point_' + str(x) for x in point_cloud.columns]

In [8]:
# Pull population data for 2010 Census blocks
# Define request parameters

year = '2010' # Year of interest
datasource = 'dec' # Survey name
subsource = 'pl' # Subsurvey name
GET = 'P001001,H001001,P001003' # Variables to query
FOR = 'block:*' # for predicate
IN = 'state:10&in=county:*&in=tract:*'

# Filepath to your Census API key
keyfile = 'CensusAPIKey.txt'

# Formatted API call
data_url = f'https://api.census.gov/data/{year}/{datasource}/{subsource}?get={GET}&for={FOR}&in={IN}'

# Read Census key into 'api_key'
with open(keyfile) as key:
    api_key = key.read().strip()

# Add key to url
data_url = f'{data_url}&key={api_key}'

# Request data and convert from json
data = requests.get(data_url).json()
# First entry in list is a list of variable names
data = pd.DataFrame(data[1:], columns = data[0])

# Rename columns to match shapefile pull
data.rename(columns = {"state":"STATEFP10", "county":"COUNTYFP10", "tract":"TRACTCE10", "block":"BLOCKCE10"}, inplace=True)

# Attach to block shapes
block_data = block_data.merge(data, on=["STATEFP10","COUNTYFP10","TRACTCE10","BLOCKCE10"])

In [9]:
# Pull population data for 2010 Census tracts
# Define request parameters

year = '2010' # Year of interest
datasource = 'dec' # Survey name
subsource = 'pl' # Subsurvey name
GET = 'P001001,H001001,P001003' # Variables to query
FOR = 'tract:*' # for predicate
IN = 'state:10' # in predicate


# Filepath to your Census API key
keyfile = 'CensusAPIKey.txt'

# Formatted API call
data_url = f'https://api.census.gov/data/{year}/{datasource}/{subsource}?get={GET}&for={FOR}&in={IN}'

# Read Census key into 'api_key'
with open(keyfile) as key:
    api_key = key.read().strip()

# Add key to url
data_url = f'{data_url}&key={api_key}'

# Request data and convert from json
data = requests.get(data_url).json()
# First entry in list is a list of variable names
data = pd.DataFrame(data[1:], columns = data[0])

# Rename columns to match shapefile pull
data.rename(columns = {"state":"STATE", "county":"COUNTY", "tract":"TRACT"}, inplace=True)

# Attach to tract shapes
tract_data = tract_data.merge(data, on=["STATE","COUNTY","TRACT"])

In [10]:
# Assign a fraction of the population of each block as a value to each point
point_cloud["population_per_point"] = block_data["P001001"].astype(int) / n

In [11]:
# Flatten to a GeoSeries where each row is a point and its weight
weights = np.array([[row["population_per_point"] for _ in range(n)] for i, row in point_cloud.iterrows()]).flatten()
points = np.array([[row["point_" + str(i)] for i in range(n)] for _, row in point_cloud.iterrows()]).flatten()
points_list = geopandas.GeoDataFrame({"population_per_point":weights,"geometry":points}, crs="EPSG:4269")

  exec(code_obj, self.user_global_ns, self.user_ns)
  points = np.array([[row["point_" + str(i)] for i in range(n)] for _, row in point_cloud.iterrows()]).flatten()
  points = np.array([[row["point_" + str(i)] for i in range(n)] for _, row in point_cloud.iterrows()]).flatten()


In [12]:
# For each point, multiplies variables of interest in the containing census tract by population weight and returns results
def assign_to_points(row, tracts):
    checks = tracts["geometry"].contains(row["geometry"])
    if checks.any():
        containing_tract = tracts.loc[checks]
        output = containing_tract[["P001001","H001001","P001003"]].astype(int) * row["population_per_point"]
        output["population_per_point"] = row["population_per_point"]
        output["geometry"] = row["geometry"]
        return output.values[0]
    else:
        print("Missed a point!")
        return [0,0,0,0,row["geometry"]]

In [13]:
# Transfer data from 2010 census tracts to smooth surface
variables_per_point = points_list.apply(assign_to_points, axis=1, args=(tract_data,), result_type='expand')
clear_output()
# Add descriptive column names
variables_per_point.columns = ["P001001","H001001","P001003","population_per_point","geometry"]
# Cast to a GeoDataFrame
variables_per_point = geopandas.GeoDataFrame(variables_per_point)

In [14]:
# Request shapefile data for 2020 census tracts and convert to geopandas dataframe

# Shapefile url
data_url = 'https://www2.census.gov/geo/tiger/GENZ2020/shp/cb_2020_10_tract_500k.zip'


# Request data
data = requests.get(data_url)
# convert to pandas dataframe
tract2020 = geopandas.read_file(BytesIO(data.content))

In [23]:
# For each 2020 census tract, find all points within the tract and sum their values. Divide each variable of interest by the summed population to get estimates
def sum_to_tract(row, points):
    # Test each point for containment in the tract
    checks = points["geometry"].within(row["geometry"])
    # If there are any points in the tract, sum the points
    if checks.any():
        # Locate all points in the tract
        points_in_tract = points.loc[checks]
        # Sum all points in the tract
        summed_var = points_in_tract[["P001001","H001001","P001003","population_per_point"]].sum()
        # Divide variables of interest by summed population
        summed_var = summed_var[["P001001","H001001","P001003"]] / summed_var["population_per_point"]
        # Add the GEOID of the tract as a column
        summed_var["GEOID"] = row["GEOID"]
        return summed_var.values
    else:
        print("Tract contains no points!")
        return [0,0,0, row["GEOID"]]
    
    

In [24]:
# Sum data from points to 2020 census tracts
interpolated_values = tract2020.apply(sum_to_tract, axis=1, args=(variables_per_point,), result_type='expand')

In [25]:
# Pull population data for 2020 Census tracts
# Define request parameters

year = '2020' # Year of interest
datasource = 'dec' # Survey name
subsource = 'pl' # Subsurvey name
GET = 'P1_001N,H1_001N,P1_003N' # Variables to query
FOR = 'tract:*' # for predicate
IN = 'state:10' # in predicate


# Filepath to your Census API key
keyfile = 'CensusAPIKey.txt'

# Formatted API call
data_url = f'https://api.census.gov/data/{year}/{datasource}/{subsource}?get={GET}&for={FOR}&in={IN}'

# Read Census key into 'api_key'
with open(keyfile) as key:
    api_key = key.read().strip()

# Add key to url
data_url = f'{data_url}&key={api_key}'

# Request data and convert from json
data = requests.get(data_url).json()
# First entry in list is a list of variable names
tract2020_data = pd.DataFrame(data[1:], columns = data[0])

# Add a GEOID column to the data
tract2020_data["GEOID"] = tract2020_data["state"].astype(str) + tract2020_data["county"].astype(str) +tract2020_data["tract"].astype(str)

In [31]:
# Write combined dataframe of 2020 ground truth and estimated values to a csv
interpolated_values.merge(tract2020_data, left_on=3, right_on="GEOID").to_csv("estimates.csv", index=False)

In [27]:
tract2020_data

Unnamed: 0,P1_001N,H1_001N,P1_003N,state,county,tract,GEOID
0,5298,2363,857,10,003,000200,10003000200
1,3012,1304,414,10,003,000300,10003000300
2,2957,1611,1191,10,003,000400,10003000400
3,3453,1554,318,10,003,000500,10003000500
4,2773,1029,107,10,003,000601,10003000601
...,...,...,...,...,...,...,...
257,3722,1850,2944,10,001,043202,10001043202
258,6553,1824,2590,10,001,043300,10001043300
259,5648,2348,3869,10,001,043400,10001043400
260,108,25,72,10,001,980000,10001980000


In [28]:
tract2020

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,AFFGEOID,GEOID,NAME,NAMELSAD,STUSPS,NAMELSADCO,STATE_NAME,LSAD,ALAND,AWATER,geometry
0,10,003,010300,1400000US10003010300,10003010300,103,Census Tract 103,DE,New Castle County,Delaware,CT,1836196,0,"POLYGON ((-75.48088 39.80435, -75.47630 39.808..."
1,10,003,011300,1400000US10003011300,10003011300,113,Census Tract 113,DE,New Castle County,Delaware,CT,1502593,0,"POLYGON ((-75.51524 39.80824, -75.51187 39.811..."
2,10,003,013903,1400000US10003013903,10003013903,139.03,Census Tract 139.03,DE,New Castle County,Delaware,CT,2747066,0,"POLYGON ((-75.70552 39.63554, -75.70252 39.645..."
3,10,003,015400,1400000US10003015400,10003015400,154,Census Tract 154,DE,New Castle County,Delaware,CT,2435271,0,"POLYGON ((-75.56422 39.70979, -75.56117 39.713..."
4,10,003,013608,1400000US10003013608,10003013608,136.08,Census Tract 136.08,DE,New Castle County,Delaware,CT,976164,0,"POLYGON ((-75.69565 39.70322, -75.69488 39.708..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254,10,001,040204,1400000US10001040204,10001040204,402.04,Census Tract 402.04,DE,Kent County,Delaware,CT,6926052,194346,"POLYGON ((-75.63464 39.27787, -75.63404 39.279..."
255,10,005,050808,1400000US10005050808,10005050808,508.08,Census Tract 508.08,DE,Sussex County,Delaware,CT,5767786,393311,"POLYGON ((-75.22637 38.74773, -75.22247 38.750..."
256,10,001,040100,1400000US10001040100,10001040100,401,Census Tract 401,DE,Kent County,Delaware,CT,124745857,0,"POLYGON ((-75.76010 39.29682, -75.75000 39.297..."
257,10,005,051307,1400000US10005051307,10005051307,513.07,Census Tract 513.07,DE,Sussex County,Delaware,CT,7122418,705505,"POLYGON ((-75.12455 38.56941, -75.12366 38.573..."
