In [1]:
# !pip install censusgeocode

In [2]:
import glob
import json
import requests
import pandas as pd
from pprint import pprint

# Census Examples 

This notebook uses the `censusgeocode` package in Python (which is simply a wrapper around the US Census Bureau's official Geocoder API) to get census geographies for a list of addresses or lat/longs.

- https://pypi.org/project/censusgeocode/

### Step 1 | Grab your data at the address level

In [3]:
# Load the address-level data and keep a 100-row sample to geocode.
# The seed is fixed so the same rows are drawn on every Restart & Run All;
# without it each run geocodes a different sample.
df = pd.read_parquet('example-data.parquet').sample(100, random_state=42)
df

Unnamed: 0,year,borough,zip,incident_address,lat,long,num_complaints
22848,2020,QUEENS,11105,20-38 31 STREET,40.778923,-73.907309,4
1893,2020,BRONX,10456,770 EAST 166 STREET,40.826268,-73.901733,2
79023,2022,BROOKLYN,11213,1545 ST JOHN PLACE,40.670351,-73.926591,13
83593,2022,BROOKLYN,11226,2011 NEWKIRK AVENUE,40.637877,-73.958464,8
72418,2022,BRONX,10468,2471 DAVIDSON AVENUE,40.863392,-73.901794,9
...,...,...,...,...,...,...,...
95062,2022,MANHATTAN,10033,60 PINEHURST AVENUE,40.850340,-73.939586,80
56026,2021,MANHATTAN,10075,1493 YORK AVENUE,40.771497,-73.950365,3
35962,2021,BRONX,10469,2962 WICKHAM AVENUE,40.870127,-73.837884,5
34945,2021,BRONX,10467,3555 CARLISLE PLACE,40.877933,-73.863300,1


### Step 2 | Geocode Lat/Long if they're not already present

This dataset already has lat/longs, so no geocoding is needed here. `censusgeocode` has a function to go from address --> lat/long, but I haven't had time to implement it here. Message me if you're struggling with this step.

### Step 3 | Get Census Geographies

In [4]:
# Code adapted from:
# https://gis.stackexchange.com/questions/363830/applying-the-censusgeocode-package-to-an-entire-dataframe-of-geocoded-data
# Defines a geocode function that accepts a lat/long and returns its census geographies.
# The code then runs that function in parallel (for speed).

import pandas as pd
import censusgeocode as cg
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm

def geocode(lat, lng):
    """
    Look up the 2020 Census block for one latitude/longitude pair.

    Returns a dict with the keys 'geoid', 'state', 'county', 'tract' and
    'block', copied from the first '2020 Census Blocks' entry of the
    geocoder response.
    """
    # Note the argument order: longitude is passed first, then latitude.
    block_record = cg.coordinates(lng, lat)['2020 Census Blocks'][0]

    # Each output key is the lower-case form of the response field it copies.
    wanted_fields = ('geoid', 'state', 'county', 'tract', 'block')
    return {field: block_record[field.upper()] for field in wanted_fields}

def bulk_geocode(latitudes, longitudes):
    """
    Geocode paired latitudes and longitudes in parallel (for speed).

    Parameters
    ----------
    latitudes : sized iterable (e.g. list or pandas Series)
        Latitude of each point.
    longitudes : sized iterable (e.g. list or pandas Series)
        Longitude of each point, paired element-wise with `latitudes`.

    Returns
    -------
    pandas.DataFrame
        One row per input point, in input order, with one column per key
        of the dict returned by `geocode`.

    Raises
    ------
    ValueError
        If `latitudes` and `longitudes` have different lengths.
    """
    n_points = len(latitudes)
    if n_points != len(longitudes):
        # Executor.map stops at the shortest input, which would silently
        # drop the unpaired points.
        raise ValueError(
            "latitudes and longitudes must have the same length "
            f"(got {n_points} and {len(longitudes)})"
        )

    with ThreadPoolExecutor() as tpe:
        # Geocode the points passed in as arguments rather than reading
        # columns from a notebook-level frame, so the function works for
        # any input. Executor.map yields results in input order.
        mapped_results = tpe.map(geocode, latitudes, longitudes)
        data = list(tqdm(mapped_results, total=n_points))

    return pd.DataFrame(data)

# Geocode the sampled rows' coordinates and preview the resulting geographies.
census_geos_df = bulk_geocode(latitudes=df['lat'], longitudes=df['long'])
census_geos_df.head()


  0%|          | 0/100 [00:00<?, ?it/s]

Unnamed: 0,geoid,state,county,tract,block
0,360810111001002,36,81,11100,1002
1,360050135001000,36,5,13500,1000
2,360470357011000,36,47,35701,1000
3,360470516011000,36,47,51601,1000
4,360050265003000,36,5,26500,3000


In [5]:
# Attach the census geographies to the source rows. Both frames get a fresh
# 0..n-1 index first, so rows are paired by position rather than by the
# original (sampled) index labels.
df_with_geos = pd.concat(
    [frame.reset_index(drop=True) for frame in (df, census_geos_df)],
    axis=1,
)

df_with_geos.head()

Unnamed: 0,year,borough,zip,incident_address,lat,long,num_complaints,geoid,state,county,tract,block
0,2020,QUEENS,11105,20-38 31 STREET,40.778923,-73.907309,4,360810111001002,36,81,11100,1002
1,2020,BRONX,10456,770 EAST 166 STREET,40.826268,-73.901733,2,360050135001000,36,5,13500,1000
2,2022,BROOKLYN,11213,1545 ST JOHN PLACE,40.670351,-73.926591,13,360470357011000,36,47,35701,1000
3,2022,BROOKLYN,11226,2011 NEWKIRK AVENUE,40.637877,-73.958464,8,360470516011000,36,47,51601,1000
4,2022,BRONX,10468,2471 DAVIDSON AVENUE,40.863392,-73.901794,9,360050265003000,36,5,26500,3000


### Step 4 | Pick a geographical level and get Census data
Do you want Census data at the state level? county? tract? block?

1. Pick a geographical level.
2. See `census-example.ipynb` if you want to learn how to get Census data at your desired level

# Hope that helps!