In [1]:
# !pip install censusgeocode

In [2]:
import glob
import json
import requests
import pandas as pd
from pprint import pprint

# Census Examples 

This notebook uses the US Census' official Geocoder API to get census geographies for a list of addresses or lat/longs. The `censusgeocode` package in Python is simply a wrapper around that same API; the code below calls the API directly.

- https://pypi.org/project/censusgeocode/

### Step 1 | Grab your data at the address level

In [3]:
# Work on a random sample of 1000 rows to keep the geocoding step quick.
# The fixed seed makes the sample identical on every Restart & Run All, which
# also means repeat runs send the same requests to the geocoder.
df = pd.read_parquet('example-data.parquet').sample(1000, random_state=42)
df

Unnamed: 0,year,borough,zip,incident_address,lat,long,num_complaints
34606,2021,BRONX,10467,2723 MATTHEWS AVENUE,40.866045,-73.862573,1
95311,2022,MANHATTAN,10034,48 POST AVENUE,40.863209,-73.923706,1
70243,2022,BRONX,10462,653 MORRIS PARK AVENUE,40.844925,-73.867597,2
88515,2022,MANHATTAN,10005,30 WALL STREET,40.706937,-74.010377,1
82911,2022,BROOKLYN,11225,1101 UNION STREET,40.669475,-73.956750,3
...,...,...,...,...,...,...,...
75697,2022,BROOKLYN,11206,57 VERNON AVENUE,40.694537,-73.950015,1
13292,2020,BROOKLYN,11226,1060 FLATBUSH AVENUE,40.644997,-73.958027,1
55857,2021,MANHATTAN,10040,620 WEST 190 STREET,40.855135,-73.930208,1
48192,2021,BROOKLYN,11239,1149 ELTON STREET,40.656420,-73.873986,38


### Step 2 | Geocode Lat/Long if they're not already present

Lat/longs already exist in this dataset, so there is nothing to do for this step here. Census geocode has a function to go from address --> lat/long, but I haven't had time to implement it here. Message me if you're struggling with this step.

### Step 3 | Get Census Geographies

In [6]:
# Code adapted from:
# https://gis.stackexchange.com/questions/363830/applying-the-censusgeocode-package-to-an-entire-dataframe-of-geocoded-data
# Defines a geocode function that accepts lat/long and spits out geographies
# The code then runs that function in parallel (for speed).

import pandas as pd
import censusgeocode as cg
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm

import requests_cache
# HTTP session that caches responses under the name "geocode_cache" using
# requests_cache's filesystem backend. geocode() sends its requests through it.
cache = requests_cache.CachedSession("geocode_cache", backend="filesystem")

def geocode(lat, lng, timeout=30):
    """
    Look up the 2020 Census block that contains a single coordinate.

    Sends one request to the Census Geocoder ``geographies/coordinates``
    endpoint through the module-level ``cache`` session and returns the first
    entry of the ``Census Blocks`` list in the response.

    Parameters
    ----------
    lat, lng : float
        Latitude and longitude of the point. The API takes longitude as ``x``
        and latitude as ``y``.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that one stalled
        request cannot block a worker thread indefinitely.

    Returns
    -------
    dict or None
        The Census block record, or ``None`` when a coordinate is missing,
        the request fails, or the response contains no block. Failures are
        printed rather than raised so a bulk run keeps going.
    """
    # A missing coordinate can never match; skip the network round trip.
    if pd.isna(lat) or pd.isna(lng):
        print(f"Error geocoding ({lat}, {lng}): missing coordinate")
        return None

    try:
        url = "https://geocoding.geo.census.gov/geocoder/geographies/coordinates"
        params = {
            "x": lng,
            "y": lat,
            "benchmark": "Public_AR_Census2020",
            "vintage": "Census2020_Census2020",
            "format": "json"
        }
        response = cache.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        # Report "no match" explicitly instead of as a bare KeyError/IndexError.
        blocks = data['result']['geographies'].get('Census Blocks') or []
        if not blocks:
            print(f"Error geocoding ({lat}, {lng}): no Census block found")
            return None
        return blocks[0]
    except Exception as e:
        # Deliberately broad: this is a best-effort lookup, and one bad point
        # should not abort the whole parallel run.
        print(f"Error geocoding ({lat}, {lng}): {e}")
        return None

def bulk_geocode(latitudes, longitudes):
    """
    Geocode a list of latitudes and longitudes in parallel (for speed).

    Parameters
    ----------
    latitudes, longitudes : iterable of float
        Coordinates to look up, paired by position. Both must have the same
        number of elements.

    Returns
    -------
    pandas.DataFrame
        One row per input coordinate, in input order. A coordinate whose
        lookup failed (``geocode`` returned ``None``) becomes a row of missing
        values, so the result can still be joined back to the inputs by
        position.

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """
    # Materialise the inputs so any iterable can be measured and paired.
    # The original body ignored its arguments and read the global `df`.
    latitudes = list(latitudes)
    longitudes = list(longitudes)
    if len(latitudes) != len(longitudes):
        raise ValueError(
            f"latitudes and longitudes differ in length: "
            f"{len(latitudes)} vs {len(longitudes)}"
        )

    with ThreadPoolExecutor() as tpe:
        # Executor.map yields results in input order.
        mapped_results = tpe.map(geocode, latitudes, longitudes)
        data = list(tqdm(mapped_results, total=len(latitudes)))

    # Replace failed lookups (None) with an empty record so every input keeps
    # its own row instead of breaking or misaligning the DataFrame.
    records = [row if row is not None else {} for row in data]
    return pd.DataFrame(records)

# Geocode every sampled row, then preview the first few results.
census_geos_df = bulk_geocode(latitudes=df['lat'], longitudes=df['long'])
census_geos_df.head()


  0%|          | 0/100 [00:00<?, ?it/s]

Unnamed: 0,SUFFIX,POP100,GEOID,CENTLAT,BLOCK,AREAWATER,STATE,BASENAME,OID,LSADC,...,TRACT,CENTLON,BLKGRP,AREALAND,HU100,INTPTLON,MTFCC,LWBLKTYP,UR,COUNTY
0,,594,360050340004003,40.8666653,4003,0,36,4003,210701006028146,BK,...,34000,-73.8630129,4,20254,239,-73.8630129,G5040,L,U,5
1,,1518,360610291005000,40.8634936,5000,0,36,5000,210701008618787,BK,...,29100,-73.9245734,5,25695,587,-73.9245734,G5040,L,U,61
2,,382,360050236001001,40.8460171,1001,0,36,1001,210701006027682,BK,...,23600,-73.8676634,1,17313,149,-73.8676634,G5040,L,U,5
3,,24,360610007001005,40.7070345,1005,0,36,1005,210701008618484,BK,...,700,-74.0098451,1,9769,2,-74.0098451,G5040,L,U,61
4,,675,360470325001001,40.6698386,1001,0,36,1001,210701004648418,BK,...,32500,-73.9568075,1,18347,314,-73.9568075,G5040,L,U,47


In [None]:
# pd.concat(axis=1) aligns on the index, so both frames are re-indexed 0..n-1
# to pair each address with its geocode result by position. Check the row
# counts first: a mismatch would otherwise be padded with NaN silently.
assert len(df) == len(census_geos_df), (
    f"row count mismatch: {len(df)} addresses vs "
    f"{len(census_geos_df)} geocode results"
)

df_with_geos = pd.concat(
    [
        df.reset_index(drop=True),
        census_geos_df.reset_index(drop=True)
    ],
    axis=1)

df_with_geos.head()

# Step 4 | Pick a geographical level and get Census data
Do you want Census data at the state level? county? tract? block?

1. Pick a geographical level.
2. See `census-example.ipynb` if you want to learn how to get Census data at your desired level

# Hope that helps!