In [1]:
# Imports and path setup (ensure project root is importable)
import sys, os
from pathlib import Path
import pandas as pd

ROOT = Path.cwd().resolve().parent  # parent of jupyter_notebooks
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from utils.population_join import (
    enrich_with_centroids,
    enrich_with_state_population,
    enrich_with_city_population,
)

# Use ../data/cleaned_pollution_data.zip relative to notebook
BASE_PATH = Path('../data/cleaned_pollution_data.zip').resolve()
print('Looking for base data at:', BASE_PATH)
assert BASE_PATH.exists(), f'Base data not found: {BASE_PATH}'

Looking for base data at: C:\Users\Daniel\vsc-projects\codeinstitute\projects\us-pollution-data\data\cleaned_pollution_data.zip


In [2]:
# Load base cleaned dataset
df = pd.read_csv(BASE_PATH, compression='zip', index_col=0)
# Reset index to avoid duplicate index errors
df = df.reset_index(drop=True)
print(df.shape)
df.head(2)

(412856, 17)


Unnamed: 0,State,County,City,Date Local,NO2 Mean,NO2 1st Max Value,NO2 1st Max Hour,NO2 AQI,O3 Mean,O3 1st Max Value,O3 1st Max Hour,O3 AQI,SO2 Mean,CO Mean,CO 1st Max Value,CO 1st Max Hour,CO AQI
0,Virginia,Fairfax,Seven Corners,2000-01-01,31.0,41.0,23,39,0.00275,0.006,11,5,11.666667,1.247368,1.6,0,16.0
1,Virginia,Fairfax,Seven Corners,2000-01-02,19.041667,36.0,0,34,0.020208,0.027,13,23,6.583333,1.070833,1.3,0,14.0


In [3]:
# Apply enrichment: city centroids + state and city populations
# Always start from the base DataFrame to avoid stale columns

centroids_path = Path('../data/processed/city_centroids.json').resolve()
pop_state_path = Path('../data/processed/pop_state_year_2000_2016_partial.csv').resolve()
pop_city_path = Path('../data/processed/pop_city_year_2000_2016_partial.csv').resolve()

print('Centroids path:', centroids_path, centroids_path.exists())
print('State pop path:', pop_state_path, pop_state_path.exists())
print('City pop path:', pop_city_path, pop_city_path.exists())

# --- Map state names to FIPS codes ---
state_name_to_fips = {
    'Alabama': '01', 'Alaska': '02', 'Arizona': '04', 'Arkansas': '05', 'California': '06',
    'Colorado': '08', 'Connecticut': '09', 'Delaware': '10', 'District of Columbia': '11',
    'Florida': '12', 'Georgia': '13', 'Hawaii': '15', 'Idaho': '16', 'Illinois': '17',
    'Indiana': '18', 'Iowa': '19', 'Kansas': '20', 'Kentucky': '21', 'Louisiana': '22',
    'Maine': '23', 'Maryland': '24', 'Massachusetts': '25', 'Michigan': '26', 'Minnesota': '27',
    'Mississippi': '28', 'Missouri': '29', 'Montana': '30', 'Nebraska': '31', 'Nevada': '32',
    'New Hampshire': '33', 'New Jersey': '34', 'New Mexico': '35', 'New York': '36', 'North Carolina': '37',
    'North Dakota': '38', 'Ohio': '39', 'Oklahoma': '40', 'Oregon': '41', 'Pennsylvania': '42',
    'Rhode Island': '44', 'South Carolina': '45', 'South Dakota': '46', 'Tennessee': '47', 'Texas': '48',
    'Utah': '49', 'Vermont': '50', 'Virginia': '51', 'Washington': '53', 'West Virginia': '54',
    'Wisconsin': '55', 'Wyoming': '56'
}
df['state_fips'] = df['State'].map(state_name_to_fips)

# Check mapping worked
print('State FIPS sample:', df[['State', 'state_fips']].drop_duplicates().head())

# Fix state_fips type in population CSVs
import pandas as pd
pop_state_df = pd.read_csv(pop_state_path)
pop_state_df['state_fips'] = pop_state_df['state_fips'].astype(str).str.zfill(2)
fixed_pop_state_path = pop_state_path.parent / 'pop_state_year_2000_2016_fixed.csv'
pop_state_df.to_csv(fixed_pop_state_path, index=False)

pop_city_df = pd.read_csv(pop_city_path)
pop_city_df['state_fips'] = pop_city_df['state_fips'].astype(str).str.zfill(2)
fixed_pop_city_path = pop_city_path.parent / 'pop_city_year_2000_2016_fixed.csv'
pop_city_df.to_csv(fixed_pop_city_path, index=False)

enriched = enrich_with_centroids(df, centroids_path=centroids_path)
if 'state_fips' in enriched.columns:
    enriched['state_fips'] = enriched['state_fips'].astype(str).str.zfill(2)
# Debug: print merge keys before city enrichment
print('enriched __state_fips dtype:', enriched["state_fips"].dtype if "state_fips" in enriched.columns else 'N/A')
print('enriched __state_fips sample:', enriched["state_fips"].drop_duplicates().head() if "state_fips" in enriched.columns else 'N/A')
pop_city_df = pd.read_csv(fixed_pop_city_path, dtype={'state_fips': str})
print('pop_city_df state_fips dtype:', pop_city_df['state_fips'].dtype)
print('pop_city_df state_fips sample:', pop_city_df['state_fips'].drop_duplicates().head())
enriched = enrich_with_state_population(enriched, pop_path=fixed_pop_state_path, state_col='state_fips')
enriched = enrich_with_city_population(enriched, pop_path=fixed_pop_city_path)

print('Enriched columns:', enriched.columns.tolist())
print('Sample enriched row:')
display(enriched.head(2))

Centroids path: C:\Users\Daniel\vsc-projects\codeinstitute\projects\us-pollution-data\data\processed\city_centroids.json True
State pop path: C:\Users\Daniel\vsc-projects\codeinstitute\projects\us-pollution-data\data\processed\pop_state_year_2000_2016_partial.csv True
City pop path: C:\Users\Daniel\vsc-projects\codeinstitute\projects\us-pollution-data\data\processed\pop_city_year_2000_2016_partial.csv True
State FIPS sample:               State state_fips
0          Virginia         51
362      California         06
7313         Nevada         32
8125       Missouri         29
14456  Pennsylvania         42
enriched __state_fips dtype: object
enriched __state_fips sample: 0        51
362      06
7313     32
8125     29
14456    42
Name: state_fips, dtype: object
pop_city_df state_fips dtype: object
pop_city_df state_fips sample: 0       10
474     11
482     12
4247    13
8609    15
Name: state_fips, dtype: object
enriched __state_fips dtype: object
enriched __state_fips sample: 0     

Unnamed: 0,State,County,City,Date Local,NO2 Mean,NO2 1st Max Value,NO2 1st Max Hour,NO2 AQI,O3 Mean,O3 1st Max Value,...,CO 1st Max Value,CO 1st Max Hour,CO AQI,state_fips_x,lat_city,lon_city,coord_source_city,state_fips_y,population_state,population_city
0,Virginia,Fairfax,Seven Corners,2000-01-01,31.0,41.0,23,39,0.00275,0.006,...,1.6,0,16.0,51,38.86592,-77.144114,city_county_state,,,8701.0
1,Virginia,Fairfax,Seven Corners,2000-01-02,19.041667,36.0,0,34,0.020208,0.027,...,1.3,0,14.0,51,38.86592,-77.144114,city_county_state,,,8701.0


In [4]:
# Save enriched dataset (Parquet preferred; CSV zip fallback)
OUT_PARQUET = Path('../data/cleaned_enriched.parquet').resolve()
OUT_CSV_ZIP = Path('../data/cleaned_enriched.csv.zip').resolve()
saved = []
try:
    enriched.to_parquet(OUT_PARQUET, index=False)
    saved.append(str(OUT_PARQUET))
except Exception as e:
    print('Parquet save failed, will rely on CSV zip fallback:', e)

try:
    enriched.to_csv(OUT_CSV_ZIP, index=False, compression='zip')
    saved.append(str(OUT_CSV_ZIP))
except Exception as e:
    print('CSV zip save failed:', e)

print('Saved:', saved)
len(enriched), enriched.columns.tolist()[:10]

Saved: ['C:\\Users\\Daniel\\vsc-projects\\codeinstitute\\projects\\us-pollution-data\\data\\cleaned_enriched.parquet', 'C:\\Users\\Daniel\\vsc-projects\\codeinstitute\\projects\\us-pollution-data\\data\\cleaned_enriched.csv.zip']


(412856,
 ['State',
  'County',
  'City',
  'Date Local',
  'NO2 Mean',
  'NO2 1st Max Value',
  'NO2 1st Max Hour',
  'NO2 AQI',
  'O3 Mean',
  'O3 1st Max Value'])

In [5]:
# Quick verification: reload Parquet if available
if Path('data/cleaned_enriched.parquet').exists():
    test = pd.read_parquet('data/cleaned_enriched.parquet')
    print('Reloaded parquet:', test.shape)
    display(test.sample(3, random_state=0))
else:
    print('Parquet not present; verify CSV zip if needed.')

Parquet not present; verify CSV zip if needed.
