This notebook downloads 2010 Census shapefiles and generates a large set of random points inside each census block, with the goal of approximating a smooth surface.

In [None]:
# Static configuration

# Number of points to generate inside each census block
n = 100

In [1]:
# Import libraries

import pandas as pd
import geopandas
import numpy as np
import requests
from io import BytesIO

In [88]:
# Request shapefile data for 2010 census tracts and convert to a geopandas GeoDataFrame

# Shapefile url (zipped cartographic boundary file for 2010 census tracts)
data_url = 'https://www2.census.gov/geo/tiger/GENZ2010/gz_2010_10_140_00_500k.zip'


# Request data; the timeout keeps the cell from hanging forever on a stalled connection
data = requests.get(data_url, timeout=60)
# Fail loudly on an HTTP error instead of handing an error page to the shapefile reader
data.raise_for_status()
# Read the zipped shapefile from memory into a GeoDataFrame
tract_data = geopandas.read_file(BytesIO(data.content))

In [89]:
# Request shapefile data for 2010 census blocks and convert to a geopandas GeoDataFrame

# Shapefile url (zipped TIGER/Line tabulation block file, 2010)
data_url = 'https://www2.census.gov/geo/tiger/TIGER2010/TABBLOCK/2010/tl_2010_10_tabblock10.zip'


# Request data; the timeout keeps the cell from hanging forever on a stalled connection
data = requests.get(data_url, timeout=60)
# Fail loudly on an HTTP error instead of handing an error page to the shapefile reader
data.raise_for_status()
# Read the zipped shapefile from memory into a GeoDataFrame
block_data = geopandas.read_file(BytesIO(data.content))

In [90]:
# Show the tract GeoDataFrame as this cell's output to inspect its columns and geometry
tract_data

Unnamed: 0,GEO_ID,STATE,COUNTY,TRACT,NAME,LSAD,CENSUSAREA,geometry
0,1400000US10001042800,10,001,042800,428,Tract,57.104,"POLYGON ((-75.73286 38.95737, -75.73845 39.027..."
1,1400000US10001042900,10,001,042900,429,Tract,26.924,"POLYGON ((-75.48935 38.90583, -75.48939 38.905..."
2,1400000US10001043100,10,001,043100,431,Tract,59.523,"POLYGON ((-75.72400 38.84672, -75.72406 38.847..."
3,1400000US10001043300,10,001,043300,433,Tract,2.967,"POLYGON ((-75.52838 39.16143, -75.52827 39.161..."
4,1400000US10001043400,10,001,043400,434,Tract,25.532,"POLYGON ((-75.48949 38.90550, -75.48939 38.905..."
...,...,...,...,...,...,...,...,...
213,1400000US10005051202,10,005,051202,512.02,Tract,1.487,"POLYGON ((-75.05625 38.56308, -75.05486 38.546..."
214,1400000US10005051204,10,005,051204,512.04,Tract,1.002,"POLYGON ((-75.05363 38.53216, -75.05359 38.531..."
215,1400000US10005051205,10,005,051205,512.05,Tract,1.412,"POLYGON ((-75.04931 38.46747, -75.04906 38.456..."
216,1400000US10005051301,10,005,051301,513.01,Tract,13.239,"POLYGON ((-75.15542 38.59404, -75.14132 38.597..."


In [91]:
# Show the block GeoDataFrame as this cell's output to inspect its columns and geometry
block_data

Unnamed: 0,STATEFP10,COUNTYFP10,TRACTCE10,BLOCKCE10,GEOID10,NAME10,MTFCC10,UR10,UACE10,UATYP10,FUNCSTAT10,ALAND10,AWATER10,INTPTLAT10,INTPTLON10,geometry
0,10,001,041100,1014,100010411001014,Block 1014,G5040,U,24580,U,S,50816,0,+39.1216358,-075.4858233,"POLYGON ((-75.48486 39.12239, -75.48481 39.122..."
1,10,001,041100,1007,100010411001007,Block 1007,G5040,U,24580,U,S,48931,0,+39.1219647,-075.4904073,"POLYGON ((-75.49019 39.12340, -75.49005 39.123..."
2,10,001,041100,1018,100010411001018,Block 1018,G5040,U,24580,U,S,28485,0,+39.1196396,-075.4826429,"POLYGON ((-75.48313 39.11849, -75.48333 39.118..."
3,10,001,043202,1103,100010432021103,Block 1103,G5040,R,,,S,160376,0,+39.1054024,-075.4393957,"POLYGON ((-75.44219 39.10734, -75.43890 39.107..."
4,10,001,040900,1039,100010409001039,Block 1039,G5040,U,24580,U,S,12254,0,+39.1662742,-075.5285219,"POLYGON ((-75.52849 39.16525, -75.52862 39.165..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24110,10,005,051701,1022,100050517011022,Block 1022,G5040,R,,,S,2614704,0,+38.5418205,-075.4026864,"POLYGON ((-75.41186 38.54192, -75.41169 38.542..."
24111,10,005,051701,1047,100050517011047,Block 1047,G5040,R,,,S,1331210,0,+38.5406585,-075.4582756,"POLYGON ((-75.45237 38.54473, -75.45222 38.544..."
24112,10,005,051701,1049,100050517011049,Block 1049,G5040,R,,,S,130691,0,+38.5342417,-075.4535089,"POLYGON ((-75.45807 38.53492, -75.45771 38.535..."
24113,10,005,051702,1120,100050517021120,Block 1120,G5040,R,,,S,155272,0,+38.5304973,-075.4586287,"POLYGON ((-75.45939 38.52753, -75.45941 38.527..."


In [92]:
# Build one row per census block holding its GEOID, its polygon, and the
# polygon's bounding box (minx, miny, maxx, maxy).
# The bounding boxes share block_data's index, so they are attached by index.
block_bounds = block_data[["GEOID10", "geometry"]].merge(
    block_data["geometry"].bounds,
    left_index=True,
    right_index=True,
)
block_bounds

Unnamed: 0,GEOID10,geometry,minx,miny,maxx,maxy
0,100010411001014,"POLYGON ((-75.48486 39.12239, -75.48481 39.122...",-75.487649,39.120168,-75.484047,39.123216
1,100010411001007,"POLYGON ((-75.49019 39.12340, -75.49005 39.123...",-75.491827,39.120589,-75.488532,39.123399
2,100010411001018,"POLYGON ((-75.48313 39.11849, -75.48333 39.118...",-75.484172,39.118489,-75.481151,39.120792
3,100010432021103,"POLYGON ((-75.44219 39.10734, -75.43890 39.107...",-75.442189,39.101999,-75.436480,39.107579
4,100010409001039,"POLYGON ((-75.52849 39.16525, -75.52862 39.165...",-75.529189,39.165251,-75.527861,39.167298
...,...,...,...,...,...,...
24110,100050517011022,"POLYGON ((-75.41186 38.54192, -75.41169 38.542...",-75.415960,38.534188,-75.384837,38.551595
24111,100050517011047,"POLYGON ((-75.45237 38.54473, -75.45222 38.544...",-75.470003,38.534917,-75.451289,38.547926
24112,100050517011049,"POLYGON ((-75.45807 38.53492, -75.45771 38.535...",-75.458071,38.532153,-75.450400,38.535985
24113,100050517021120,"POLYGON ((-75.45939 38.52753, -75.45941 38.527...",-75.461433,38.527526,-75.455986,38.532930


In [107]:
# Fit a 2D Gaussian distribution over the bounding boxes

# Takes in a row of 'block_bounds' and outputs a 2D Gaussian distribution of 'n' points over the bounding box, as well as the GEOID
def get_points(row, n, crs="EPSG:4269", max_attempts=10_000):
    """Sample 'n' Gaussian-distributed points that fall inside one census block.

    Candidate points are drawn from independent normal distributions on the
    x and y axes, each centred on the bounding box and scaled so that the box
    edges sit at +/-1.96 standard deviations (about 95% of draws per axis land
    inside the box). Candidates that are not within the block geometry are
    discarded and redrawn until 'n' points have been accepted.

    Parameters
    ----------
    row : mapping
        A row of 'block_bounds'; must provide "GEOID10", "geometry",
        "minx", "miny", "maxx" and "maxy".
    n : int
        Number of points to generate for the block.
    crs : str, optional
        Coordinate reference system assigned to the generated points.
        Defaults to "EPSG:4269", the value that was previously hardcoded.
    max_attempts : int, optional
        Upper bound on the number of sampling rounds. Guards against looping
        forever when no candidate can ever fall within the geometry.

    Returns
    -------
    numpy.ndarray
        The accepted points followed by the block's GEOID as the last element.

    Raises
    ------
    RuntimeError
        If fewer than 'n' points were accepted after 'max_attempts' rounds.
    """
    print(f"Processing Block {row['GEOID10']}...")

    # Centre of the bounding box on each axis
    center_x = (row["maxx"] + row["minx"]) / 2
    center_y = (row["maxy"] + row["miny"]) / 2
    # Standard deviation such that 95% of each axis' distribution lies in the
    # bounding box: the half-width of the box must equal 1.96 sigma.
    sigma_x = np.abs(row["maxx"] - row["minx"]) / (2 * 1.96)
    sigma_y = np.abs(row["maxy"] - row["miny"]) / (2 * 1.96)

    # Batches of accepted points; concatenated once after the loop
    accepted = []
    # Number of points still left to assign
    remaining = n
    attempts = 0
    # Allocate points until n have been assigned
    while remaining > 0:
        if attempts >= max_attempts:
            raise RuntimeError(
                f"Block {row['GEOID10']}: only {n - remaining} of {n} points "
                f"accepted after {max_attempts} sampling rounds"
            )
        attempts += 1
        # Draw exactly as many candidates as are still missing, so the total
        # number of accepted points can never exceed n
        pointsy = np.random.normal(loc=center_y, scale=sigma_y, size=remaining)
        pointsx = np.random.normal(loc=center_x, scale=sigma_x, size=remaining)
        # Convert the candidates to Shapely points
        candidates = geopandas.GeoSeries(geopandas.points_from_xy(pointsx, pointsy, crs=crs))
        # Keep only the candidates that are inside the block
        inside = candidates[candidates.within(row["geometry"])]
        if not inside.empty:
            accepted.append(inside)
            remaining -= inside.size

    if accepted:
        points_return = geopandas.GeoSeries(pd.concat(accepted, ignore_index=True), crs=crs)
    else:
        # n <= 0: nothing was requested, so return only the GEOID
        points_return = geopandas.GeoSeries(crs=crs)

    print("Done")
    # Return an array with every point in the cloud and the GEOID
    return np.append(points_return.values, row["GEOID10"])
    

In [None]:
# Generate the point cloud: one row per census block, one column per returned element
point_cloud = block_bounds.apply(
    get_points,
    axis=1,
    result_type='expand',
    args=(n,),
)

# Relabel the expanded columns: the column whose label equals n holds the GEOID,
# every other column holds a sampled point
point_cloud.columns = [
    'point_' + str(col) if str(col) != str(n) else 'GEOID'
    for col in point_cloud.columns
]