# Daniels Geo Analysis: Dataset + City Centroids

This notebook loads the core dataset, joins city centroids using `utils/city_centroids.py`, and reports match coverage.

In [1]:
# Set working directory to project root (one level up from this notebook)
import os
from pathlib import Path

nb_dir = Path.cwd()
proj_root = nb_dir.parent if nb_dir.name == 'jupyter_notebooks' else nb_dir
os.chdir(proj_root)
print('Working directory:', Path.cwd())

Working directory: c:\Users\Daniel\vsc-projects\codeinstitute\projects\us-pollution-data


In [2]:
# Imports
import sys
import pandas as pd
from pathlib import Path
from IPython.display import display


In [3]:
# Configure paths
DATASET_PATH = Path('data/archive.zip')
CENTROIDS_JSON_PATH = Path('data/processed/city_centroids.json')
print('Dataset:', DATASET_PATH.resolve())
print('Centroids JSON:', CENTROIDS_JSON_PATH.resolve())

Dataset: C:\Users\Daniel\vsc-projects\codeinstitute\projects\us-pollution-data\data\archive.zip
Centroids JSON: C:\Users\Daniel\vsc-projects\codeinstitute\projects\us-pollution-data\data\processed\city_centroids.json


In [4]:
# Load dataset
import pandas as pd

df = pd.read_csv(DATASET_PATH, compression='zip')
print(f"Loaded df with {len(df):,} rows and {df.shape[1]} columns")
# Ensure key columns are strings
for c in ['State','County','City']:
    if c in df.columns:
        df[c] = df[c].astype(str)
display(df.head(3)[[c for c in ['State','County','City'] if c in df.columns]])

Loaded df with 1,746,661 rows and 29 columns


Unnamed: 0,State,County,City
0,Arizona,Maricopa,Phoenix
1,Arizona,Maricopa,Phoenix
2,Arizona,Maricopa,Phoenix


In [5]:
# Import helper and load centroids JSON
# Ensure utils is importable
if str(Path('')).strip() not in sys.path:
    sys.path.insert(0, str(Path('')))

from utils.city_centroids import load_centroids_json, apply_city_centroids

centroids = load_centroids_json(CENTROIDS_JSON_PATH)
print('Centroids states:', len(centroids))

Centroids states: 36


In [6]:
# Apply city centroids to df
expected = ['State','County','City']
missing = [c for c in expected if c not in df.columns]
if missing:
    raise KeyError(f"Missing expected columns for centroids join: {missing}")

_ = apply_city_centroids(
    df,
    centroids,
    state_col='State',
    county_col='County',
    city_col='City',
    lat_col='lat_city',
    lon_col='lon_city',
    source_col='coord_source_city',
)

mask_hit = df[['lat_city','lon_city']].notna().all(axis=1)
print(f"Matched rows: {int(mask_hit.sum()):,} / {len(df):,} ({mask_hit.mean():.2%})")
print("Sample with city coords:")
cols = [c for c in ['State','County','City','lat_city','lon_city','coord_source_city'] if c in df.columns]
display(df.loc[mask_hit, cols].head(10))

Matched rows: 1,517,339 / 1,746,661 (86.87%)
Sample with city coords:


Unnamed: 0,State,County,City,lat_city,lon_city,coord_source_city
0,Arizona,Maricopa,Phoenix,33.448437,-112.074141,city_county_state
1,Arizona,Maricopa,Phoenix,33.448437,-112.074141,city_county_state
2,Arizona,Maricopa,Phoenix,33.448437,-112.074141,city_county_state
3,Arizona,Maricopa,Phoenix,33.448437,-112.074141,city_county_state
4,Arizona,Maricopa,Phoenix,33.448437,-112.074141,city_county_state
5,Arizona,Maricopa,Phoenix,33.448437,-112.074141,city_county_state
6,Arizona,Maricopa,Phoenix,33.448437,-112.074141,city_county_state
7,Arizona,Maricopa,Phoenix,33.448437,-112.074141,city_county_state
8,Arizona,Maricopa,Phoenix,33.448437,-112.074141,city_county_state
9,Arizona,Maricopa,Phoenix,33.448437,-112.074141,city_county_state


## County Population Data (ACS)

We fetch US county populations from the Census Bureau ACS 5-year (table `B01003_001E` = Total Population) and build a `pop_county` DataFrame keyed by state and county FIPS.

In [7]:
# Build pop_county from Census ACS 5-year (Total Population)
import requests
import pandas as pd

ACS_YEAR = 2022
VAR = 'B01003_001E'  # Total population
url = f"https://api.census.gov/data/{ACS_YEAR}/acs/acs5?get=NAME,{VAR}&for=county:*&in=state:*"
resp = requests.get(url, timeout=60)
resp.raise_for_status()
rows = resp.json()
cols = rows[0]
rec = rows[1:]
raw = pd.DataFrame(rec, columns=cols)

# Rename and types
pop_county = raw.rename(columns={
    'NAME': 'name',
    VAR: 'population',
    'state': 'state_fips',
    'county': 'county_fips',
})
pop_county['population'] = pd.to_numeric(pop_county['population'], errors='coerce').astype('Int64')
pop_county['state_fips'] = pop_county['state_fips'].astype(str).str.zfill(2)
pop_county['county_fips'] = pop_county['county_fips'].astype(str).str.zfill(3)

print(f"pop_county shape: {pop_county.shape}")
display(pop_county.head())

pop_county shape: (3222, 4)


Unnamed: 0,name,population,state_fips,county_fips
0,"Autauga County, Alabama",58761,1,1
1,"Baldwin County, Alabama",233420,1,3
2,"Barbour County, Alabama",24877,1,5
3,"Bibb County, Alabama",22251,1,7
4,"Blount County, Alabama",59077,1,9


In [8]:
# (Optional) Prepare df keys to enable a future county join
# This does not join yet; it only creates padded FIPS strings if present.
if {'State Code','County Code'} <= set(df.columns):
    df['state_fips'] = df['State Code'].astype(str).str.zfill(2)
    df['county_fips'] = df['County Code'].astype(str).str.zfill(3)
    print('Prepared df.state_fips and df.county_fips for potential join.')
    display(df[['State','County','state_fips','county_fips']].head(3))
else:
    print("Columns 'State Code' and/or 'County Code' not found; skipping key preparation.")

Prepared df.state_fips and df.county_fips for potential join.


Unnamed: 0,State,County,state_fips,county_fips
0,Arizona,Maricopa,4,13
1,Arizona,Maricopa,4,13
2,Arizona,Maricopa,4,13


## Place (City) Population Data (ACS)
We fetch ACS 5-year place-level populations for the states present in the dataset, normalize place names to city names, and build `pop_place`. Then we attempt a state+city name join to add `population_place`.

In [10]:
# Fetch ACS place populations for states present in df (with simple retry/backoff)
import re
import time
import requests
import pandas as pd

ACS_YEAR = 2022
VAR = 'B01003_001E'  # Total population

if 'state_fips' not in df.columns and 'State Code' in df.columns:
    df['state_fips'] = df['State Code'].astype(str).str.zfill(2)

state_fips_list = sorted(df['state_fips'].dropna().astype(str).str.zfill(2).unique())
print('States to query (FIPS):', state_fips_list[:10], '... total', len(state_fips_list))

all_rows = []
for sf in state_fips_list:
    url = f"https://api.census.gov/data/{ACS_YEAR}/acs/acs5?get=NAME,{VAR}&for=place:*&in=state:{sf}"
    attempts = 0
    while attempts < 3:
        try:
            r = requests.get(url, timeout=60)
            r.raise_for_status()
            j = r.json()
            cols = j[0]
            for row in j[1:]:
                all_rows.append(dict(zip(cols, row)))
            break
        except Exception as e:
            attempts += 1
            if attempts >= 3:
                print(f"Failed for state {sf}: {e}")
            time.sleep(0.5 * attempts)
    time.sleep(0.2)

pop_place_raw = pd.DataFrame(all_rows)
if pop_place_raw.empty:
    print('No place population rows retrieved.')
else:
    pop_place = pop_place_raw.rename(columns={
        'NAME': 'name',
        VAR: 'population',
        'state': 'state_fips',
        'place': 'place_fips',
    })
    pop_place['population'] = pd.to_numeric(pop_place['population'], errors='coerce').astype('Int64')
    pop_place['state_fips'] = pop_place['state_fips'].astype(str).str.zfill(2)
    pop_place['place_fips'] = pop_place['place_fips'].astype(str).str.zfill(5)

    # Normalize NAME to a city-like token (strip suffixes like 'city', 'town', 'CDP', etc.)
    def norm_city_name(s: str) -> str:
        s = str(s)
        s0 = s.split(',')[0].strip()
        s0 = re.sub(r"\s+(city|town|village|borough|municipality|CDP|urban county|balance|metro government|consolidated government)$",
                    "", s0, flags=re.IGNORECASE)
        return s0.strip().lower()

    pop_place['city_norm'] = pop_place['name'].map(norm_city_name)

    # Prepare df city norm
    def df_norm_city(x):
        return str(x).strip().lower()

    df['city_norm'] = df['City'].map(df_norm_city)

    # Join by (state_fips, city_norm)
    place_keys = pop_place[['state_fips','city_norm','population']]
    place_keys = (place_keys
                  .sort_values(['state_fips','city_norm','population'], ascending=[True, True, False])
                  .drop_duplicates(['state_fips','city_norm'], keep='first'))

    df = df.merge(place_keys, how='left', on=['state_fips','city_norm'])
    df = df.rename(columns={'population': 'population_place'})

    hit = df['population_place'].notna().sum()
    print(f"Matched place population for {hit:,} rows out of {len(df):,} ({hit/len(df):.2%}).")
    display(df[['State','City','state_fips','city_norm','population_place']].head(10))

States to query (FIPS): ['01', '02', '04', '05', '06', '08', '09', '10', '11', '12'] ... total 47
Failed for state 80: Expecting value: line 1 column 1 (char 0)
Matched place population for 1,472,410 rows out of 1,746,661 (84.30%).


Unnamed: 0,State,City,state_fips,city_norm,population_place
0,Arizona,Phoenix,4,phoenix,1609456
1,Arizona,Phoenix,4,phoenix,1609456
2,Arizona,Phoenix,4,phoenix,1609456
3,Arizona,Phoenix,4,phoenix,1609456
4,Arizona,Phoenix,4,phoenix,1609456
5,Arizona,Phoenix,4,phoenix,1609456
6,Arizona,Phoenix,4,phoenix,1609456
7,Arizona,Phoenix,4,phoenix,1609456
8,Arizona,Phoenix,4,phoenix,1609456
9,Arizona,Phoenix,4,phoenix,1609456
