In [16]:
!pip install requests pandas
import requests
import pandas as pd
from google.colab import files
import numpy as np



In [44]:


# 1. Census API key
API_KEY = "afb0fcad5614b182f62c60618a0295b272fa9355"
states = ["11", "51", "24"]  # DC, VA, MD

# 2. Variables to fetch (2023 ACS 5-year)
variables = [
    "B19013_001E",  # Median household income
    "B01003_001E",  # Total population
    "B02001_002E",  # White alone
    "B02001_003E",  # Black alone
    "B03003_003E",  # Hispanic or Latino
    "B02001_005E",  # Asian alone
    "B08201_002E",  # No vehicle
    "B18101_002E",  # Population with disability
    # Elderly 65+ male
    "B01001_020E","B01001_021E","B01001_022E","B01001_023E","B01001_024E","B01001_025E",
    # Elderly 65+ female
    "B01001_044E","B01001_045E","B01001_046E","B01001_047E","B01001_048E","B01001_049E"
]

# 3. Build the API URL
var_str = ",".join(variables)
all_states = []

#Loop and append each state's data
for st in states:
    url = f"https://api.census.gov/data/2023/acs/acs5?get={var_str}&for=tract:*&in=state:{st}&in=county:*&key={API_KEY}"
    r = requests.get(url)
    data = r.json()
    df = pd.DataFrame(data[1:], columns=data[0])
    all_states.append(df)

# ðŸ”¥ Combine DC + MD + VA
df_full = pd.concat(all_states, ignore_index=True)
print(df_full.shape)

(3879, 23)


In [45]:

numeric_cols = variables
df_full[numeric_cols] = df_full[numeric_cols].apply(pd.to_numeric, errors='coerce')

df_full['pct_white'] = df_full['B02001_002E'] / df_full['B01003_001E'] * 100
df_full['pct_black'] = df_full['B02001_003E'] / df_full['B01003_001E'] * 100
df_full['pct_hispanic'] = df_full['B03003_003E'] / df_full['B01003_001E'] * 100
df_full['pct_asian'] = df_full['B02001_005E'] / df_full['B01003_001E'] * 100
df_full['pct_no_vehicle'] = df_full['B08201_002E'] / df_full['B01003_001E'] * 100
df_full['pct_disability'] = df_full['B18101_002E'] / df_full['B01003_001E'] * 100

elderly_cols = [
    "B01001_020E","B01001_021E","B01001_022E","B01001_023E","B01001_024E","B01001_025E",
    "B01001_044E","B01001_045E","B01001_046E","B01001_047E","B01001_048E","B01001_049E"
]
df_full['pct_elderly_65plus'] = df_full[elderly_cols].sum(axis=1) / df_full['B01003_001E'] * 100

In [47]:
df_full['state'] = df_full['state'].astype(str).str.zfill(2)
df_full['county'] = df_full['county'].astype(str).str.zfill(3)
df_full['tract'] = df_full['tract'].astype(str).str.zfill(6)

# Create GEOID
df_full['GEOID'] = df_full['state'] + df_full['county'] + df_full['tract']

In [48]:
df_final = df_full[[
    'state', 'county', 'tract', 'GEOID',
    'B19013_001E','B01003_001E',
    'pct_white','pct_black','pct_hispanic','pct_asian',
    'pct_no_vehicle','pct_disability','pct_elderly_65plus'
]].copy()

df_final.columns = [
    'state', 'county', 'tract_id', 'GEOID',
    'median_household_income', 'total_population',
    'pct_white','pct_black','pct_hispanic','pct_asian',
    'pct_no_vehicle','pct_disability','pct_elderly_65plus'
]
df_final.head()

Unnamed: 0,state,county,tract_id,GEOID,median_household_income,total_population,pct_white,pct_black,pct_hispanic,pct_asian,pct_no_vehicle,pct_disability,pct_elderly_65plus
0,11,1,101,11001000101,135708,1181,75.190517,4.911092,7.62066,3.132938,26.333616,42.591025,31.583404
1,11,1,102,11001000102,159583,3056,84.751309,0.490838,13.252618,2.715969,16.917539,47.054974,31.11911
2,11,1,201,11001000201,-666666666,2835,59.435626,7.936508,13.439153,18.659612,0.0,39.858907,0.0
3,11,1,202,11001000202,152059,3964,79.919273,2.447023,9.687185,6.458123,15.716448,46.442987,18.440969
4,11,1,300,11001000300,174470,5669,71.141295,5.732933,14.394073,4.33939,4.251191,47.521609,11.977421


In [49]:


# 1. Fix missing income markers
df_final.loc[:, 'median_household_income'] = (
    df_final['median_household_income']
        .replace([-666666666, 250001], np.nan)
)

# Replace zero population (if any) with NaN
df_final.loc[df_final['total_population'] == 0, 'total_population'] = np.nan

# OPTIONAL: drop rows where population is missing
df_final = df_final[df_final['total_population'].notna()].copy()




  df_final.loc[:, 'median_household_income'] = (


In [50]:
df_final.to_csv("census_data.csv", index=False)
files.download("census_data.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [51]:
def load_centroids(url):
    print("Loading:", url)

    # Read all tables
    tables = pd.read_html(url)

    # Find the table that contains GEOID
    target = None
    for t in tables:
        # Clean columns â€” flatten multi-index
        t.columns = [str(c).replace(" ", "").upper() for c in t.columns]
        if "GEOID" in t.columns:
            target = t
            break

    if target is None:
        raise ValueError("GEOID column not found in any table.")

    # Keep only relevant columns
    keep_cols = ["GEOID", "CENTLAT", "CENTLON", "INTPTLAT", "INTPTLON"]
    keep_cols = [c for c in keep_cols if c in target.columns]

    df = target[keep_cols].copy()

    # Convert GEOID â†’ last 6 digits = tract ID
    df["tract_id"] = df["GEOID"].astype(str).str[-6:]

    return df


# URLs with state abbreviations
dc_url = "https://tigerweb.geo.census.gov/tigerwebmain/Files/acs25/tigerweb_acs25_tract_2024_acs24_dc.html"
md_url = "https://tigerweb.geo.census.gov/tigerwebmain/Files/acs25/tigerweb_acs25_tract_2024_acs24_md.html"
va_url = "https://tigerweb.geo.census.gov/tigerwebmain/Files/acs25/tigerweb_acs25_tract_2024_acs24_va.html"

dc_cent = load_centroids(dc_url)
md_cent = load_centroids(md_url)
va_cent = load_centroids(va_url)

# Combine
centroids = pd.concat([dc_cent, md_cent, va_cent], ignore_index=True)

centroids.head()

Loading: https://tigerweb.geo.census.gov/tigerwebmain/Files/acs25/tigerweb_acs25_tract_2024_acs24_dc.html
Loading: https://tigerweb.geo.census.gov/tigerwebmain/Files/acs25/tigerweb_acs25_tract_2024_acs24_md.html
Loading: https://tigerweb.geo.census.gov/tigerwebmain/Files/acs25/tigerweb_acs25_tract_2024_acs24_va.html


Unnamed: 0,GEOID,CENTLAT,CENTLON,INTPTLAT,INTPTLON,tract_id
0,11001000101,38.908632,-77.05459,38.907699,-77.054777,101
1,11001000102,38.905546,-77.061269,38.905422,-77.062005,102
2,11001000201,38.909217,-77.074342,38.909217,-77.074342,201
3,11001000202,38.906167,-77.069458,38.906305,-77.069636,202
4,11001000300,38.917549,-77.075766,38.91791,-77.074873,300


In [53]:
df_final['GEOID'] = df_final['GEOID'].astype(str)
centroids['GEOID'] = centroids['GEOID'].astype(str)

merged = df_final.merge(centroids, on="GEOID", how="left")
merged.head()

Unnamed: 0,state,county,tract_id_x,GEOID,median_household_income,total_population,pct_white,pct_black,pct_hispanic,pct_asian,pct_no_vehicle,pct_disability,pct_elderly_65plus,CENTLAT,CENTLON,INTPTLAT,INTPTLON,tract_id_y
0,11,1,101,11001000101,135708.0,1181.0,75.190517,4.911092,7.62066,3.132938,26.333616,42.591025,31.583404,38.908632,-77.05459,38.907699,-77.054777,101
1,11,1,102,11001000102,159583.0,3056.0,84.751309,0.490838,13.252618,2.715969,16.917539,47.054974,31.11911,38.905546,-77.061269,38.905422,-77.062005,102
2,11,1,201,11001000201,,2835.0,59.435626,7.936508,13.439153,18.659612,0.0,39.858907,0.0,38.909217,-77.074342,38.909217,-77.074342,201
3,11,1,202,11001000202,152059.0,3964.0,79.919273,2.447023,9.687185,6.458123,15.716448,46.442987,18.440969,38.906167,-77.069458,38.906305,-77.069636,202
4,11,1,300,11001000300,174470.0,5669.0,71.141295,5.732933,14.394073,4.33939,4.251191,47.521609,11.977421,38.917549,-77.075766,38.91791,-77.074873,300


In [54]:
merged.to_csv("final_merged_dataset.csv", index=False)
files.download("final_merged_dataset.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>