In [1]:
import argparse
import boto3
import pandas as pd
import s3fs
import numpy as np 
import io 
import pyarrow as pa 
import pyarrow.parquet as pq
import duckdb
import os
import requests

In [8]:
# Define the URL of the pipe-delimited file
url = "https://www2.census.gov/geo/docs/reference/state.txt"

# Read the file into a pandas DataFrame
df = pd.read_csv(url, sep="|", header=0, names=["fips", "usps", "state_name", "gnisid"], dtype=str)

# Display the first few rows
df.fips.tolist()


['01',
 '02',
 '04',
 '05',
 '06',
 '08',
 '09',
 '10',
 '11',
 '12',
 '13',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '51',
 '53',
 '54',
 '55',
 '56',
 '60',
 '66',
 '69',
 '72',
 '74',
 '78']

In [None]:
con = duckdb.connect('../hmda.duckdb')

In [None]:
# Create duckdb table
text = '''
create or replace table state_fips_xref
as
select *
from df
'''
con.execute(text)

In [None]:
con.execute("select count(*) from state_fips_xref").fetchall()

In [4]:
state_fips_df = df.copy()

In [None]:
import zipfile

def download_and_extract_tract_centroids(resolution="500k", year="2020"):
    # URL for cartographic boundary shapefiles
    url = f"https://www2.census.gov/geo/tiger/GENZ{year}/shp/cb_{year}_us_tract_{resolution}.zip"
    print(f"🔽 Downloading from {url}...")

    # Download and unzip in memory
    response = requests.get(url, verify=False)
    with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
        zf.extractall("cb_tract_shapefiles")

    # Load shapefile
    shp_path = "cb_tract_shapefiles/cb_2020_us_tract_500k.shp"
    gdf = gpd.read_file(shp_path)

    # Ensure geometry is in lat/lon (EPSG:4326)
    gdf = gdf.to_crs("EPSG:4326")

    # Extract centroid coordinates
    gdf["centroid_lat"] = gdf.geometry.centroid.y
    gdf["centroid_lon"] = gdf.geometry.centroid.x

    # Clean columns
    gdf["state_fips"] = gdf["STATEFP"]
    gdf["county_fips"] = gdf["COUNTYFP"]
    gdf["tract_code"] = gdf["TRACTCE"]
    gdf["tract_fips"] = gdf["GEOID"]

    df = gdf[["tract_fips", "state_fips", "county_fips", "tract_code", "centroid_lat", "centroid_lon"]].copy()
    return df

# Example usage
tract_centroids = download_and_extract_tract_centroids()
tract_centroids.head()


In [None]:
text = '''
create or replace table tract_centroids_raw
as 
select *
from tract_centroids
'''
con.execute(text)

In [None]:
state_fips_df.head()

In [None]:
con.close()

In [7]:
census_api_key = '170bf83ddd021687d0e321361a226c36a6d45a12'
state_fips_list = state_fips_df.fips.tolist()
acs_variable_dict = {
    "B03002_001E": "total_population", 
    "B03002_003E": "white_population", 
    "B03002_004E": "black_population", 
    "B03002_012E": "latinx_population", 
    "B19013_001E": "median_household_income", 
    "B25077_001E": "median_home_value", 
    "B15003_022E": "bachelors_degree_population", 
    "B15003_023E": "graduate_degree_population", 
    "B15003_017E": "high_school_diploma_population", 
    "B17001_002E": "total_population_below_poverty_level", 
    "B22010_002E": "total_population_receiving_federal_assistance", 
    "B19083_001E": "gini_index_of_income_inequality"
}
keys_string = ','.join(acs_variable_dict.keys())
all_data = []
params = {
    "get": "NAME,B03002_001E,B03002_003E,B03002_004E,B03002_012E,B19013_001E,"
           "B25077_001E,B15003_022E,B15003_023E,B15003_017E,B17001_002E,"
           "B22010_002E,B19083_001E",
    "for": "tract:*",
    "in": "state:01"  # Alabama
}
headers = {
    "User-Agent": "Mozilla/5.0 (compatible; MyCensusClient/1.0; +https://example.com)"
}
base_url = "https://api.census.gov/data/2021/acs/acs5"
for state in state_fips_list[:10]: 
#     url = f"{base_url}get=NAME,{keys_string}&for=tract:*&in=state:{state}&key={census_api_key}"
#     print(url)
    params['in'] = f"state:{state}"
    response = requests.get(base_url, params=params, headers=headers, timeout=30)
    response.raise_for_status()
    data = response.json()
    print(data)
#     if response.status_code == 200: 
#         rows = response.json()
#         headers, data = rows[0], rows[1:]
#         all_rows.extend(data)
#     else: 
#         print(f"Didnt get data for state {state} - {response.text}")
# acs_df = pd.DataFrame(all_data, columns=headers)
# acs_df.head()

[['NAME', 'B03002_001E', 'B03002_003E', 'B03002_004E', 'B03002_012E', 'B19013_001E', 'B25077_001E', 'B15003_022E', 'B15003_023E', 'B15003_017E', 'B17001_002E', 'B22010_002E', 'B19083_001E', 'state', 'county', 'tract'], ['Census Tract 201, Autauga County, Alabama', '1791', '1474', '171', '28', '57399', '138700', '164', '68', '336', '275', '129', '0.4607', '01', '001', '020100'], ['Census Tract 202, Autauga County, Alabama', '2010', '775', '1088', '10', '52176', '106800', '185', '82', '410', '117', '121', '0.3975', '01', '001', '020200'], ['Census Tract 203, Autauga County, Alabama', '3577', '2433', '983', '13', '63704', '110400', '263', '127', '603', '614', '166', '0.3673', '01', '001', '020300'], ['Census Tract 204, Autauga County, Alabama', '3802', '3398', '251', '28', '70000', '156500', '599', '366', '599', '307', '39', '0.5569', '01', '001', '020400'], ['Census Tract 205.01, Autauga County, Alabama', '4381', '3253', '976', '95', '60917', '166200', '725', '372', '744', '790', '160', 

[['NAME', 'B03002_001E', 'B03002_003E', 'B03002_004E', 'B03002_012E', 'B19013_001E', 'B25077_001E', 'B15003_022E', 'B15003_023E', 'B15003_017E', 'B17001_002E', 'B22010_002E', 'B19083_001E', 'state', 'county', 'tract'], ['Census Tract 1, Aleutians East Borough, Alaska', '3409', '436', '60', '359', '72258', '144100', '275', '127', '1007', '454', '108', '0.3956', '02', '013', '000100'], ['Census Tract 1, Aleutians West Census Area, Alaska', '912', '280', '38', '46', '58750', '-666666666', '32', '9', '273', '135', '23', '0.3959', '02', '016', '000100'], ['Census Tract 2, Aleutians West Census Area, Alaska', '4339', '920', '144', '465', '94271', '412100', '522', '63', '1151', '280', '39', '0.3245', '02', '016', '000200'], ['Census Tract 1.01, Anchorage Municipality, Alaska', '5600', '4467', '60', '381', '138706', '378100', '899', '541', '638', '284', '42', '0.3023', '02', '020', '000101'], ['Census Tract 1.02, Anchorage Municipality, Alaska', '4452', '3796', '1', '37', '80909', '344300', '5

[['NAME', 'B03002_001E', 'B03002_003E', 'B03002_004E', 'B03002_012E', 'B19013_001E', 'B25077_001E', 'B15003_022E', 'B15003_023E', 'B15003_017E', 'B17001_002E', 'B22010_002E', 'B19083_001E', 'state', 'county', 'tract'], ['Census Tract 9426, Apache County, Arizona', '1737', '0', '0', '47', '22222', '37500', '41', '13', '496', '625', '186', '0.5791', '04', '001', '942600'], ['Census Tract 9427, Apache County, Arizona', '5511', '77', '9', '26', '29844', '48800', '131', '99', '1128', '2052', '383', '0.5115', '04', '001', '942700'], ['Census Tract 9440, Apache County, Arizona', '5966', '89', '30', '127', '42105', '34200', '212', '181', '1166', '1794', '514', '0.5243', '04', '001', '944000'], ['Census Tract 9441, Apache County, Arizona', '5968', '106', '141', '94', '21740', '42900', '188', '59', '784', '2638', '565', '0.5588', '04', '001', '944100'], ['Census Tract 9442.01, Apache County, Arizona', '4185', '155', '11', '49', '27088', '25000', '128', '163', '476', '2056', '443', '0.5095', '04'

KeyboardInterrupt: 

In [10]:
hud_api_token = ' eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJhdWQiOiI2IiwianRpIjoiMGQ5ZTc4ZjA2NjAzMzU4NzNiNGU1ZDliYzQyZmNmOTAxZGIyZTNjMGE0NmZjNzI4NjhlMmQyNTA5ZTZjNzFiNGVkMzQzMWIyOWZmYWZhOTEiLCJpYXQiOjE3NDc4Nzc4ODEuODAxOTU4LCJuYmYiOjE3NDc4Nzc4ODEuODAxOTYsImV4cCI6MjA2MzQxMDY4MS43OTgxMzcsInN1YiI6Ijk4ODkzIiwic2NvcGVzIjpbXX0.V-VSRSRq9Rf-udiaWlByRVaxEw2UhueskHxWhCmLRvxS2AJmhVgqGMcx_0n32DVAbV0WnI31NchNX3LUdKEDnQ'
url = "https://www.huduser.gov/hudapi/public/usps?type=6&query=All&year=2023"
headers = {"Authorization": "Bearer {0}".format(hud_api_token)}

response = requests.get(url, headers=headers)

if response.status_code != 200: 
    print(f"failed - {response.status_code}")
else: 
    hud_df = pd.DataFrame(response.json()['data']['results'])

In [14]:
hud_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188846 entries, 0 to 188845
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   tract      188846 non-null  object 
 1   geoid      188846 non-null  object 
 2   res_ratio  188846 non-null  float64
 3   bus_ratio  188846 non-null  float64
 4   oth_ratio  188846 non-null  float64
 5   tot_ratio  188846 non-null  float64
 6   city       188846 non-null  object 
 7   state      188846 non-null  object 
dtypes: float64(4), object(4)
memory usage: 11.5+ MB
