Air quality measurements are made at a handful of stations in Allegheny County.

Overdose deaths are reported by zipcode.

We want to estimate the air quality at the centroid of each zipcode in Allegheny County for every day with air quality data. (TODO: eventually use hourly data)

This is something of a fool's errand, because zipcodes are very difficult to tie to a location in space: they are based on mail carrier routes, and the USPS does not share those routes. The zipcode centroids used here are therefore only an approximation.

In [6]:
import os

import pandas as pd
import geopandas as gpd

from air_brain.data.get_data import DATA_DIR
from air_brain.util.air import PM25, SO2
from air_brain.util.loc import distance

# CSV with one row per zipcode; the cells below read its `zipcode`,
# `latitude` and `longitude` columns.
ZIPCODE_FILE = os.path.join(DATA_DIR, "zip2latlon.csv")

In [2]:
# Load the zipcode table and attach a point geometry to every row,
# built from its longitude/latitude columns (EPSG:4326).
zip_df = pd.read_csv(ZIPCODE_FILE)
zip_points = gpd.points_from_xy(zip_df["longitude"], zip_df["latitude"])
zip_gdf = gpd.GeoDataFrame(zip_df, geometry=zip_points, crs="EPSG:4326")

def idw(zip_gdf, air_gdf, power=1):
    """
    Use inverse distance weighting (IDW) to estimate the AQI at each zipcode
    for each date.

    For a given zipcode and date the estimate is the weighted mean of the
    ``index_value`` of every site that has a measurement on that date, with
    weight ``1 / distance ** power``. Sites without a measurement on a date
    are left out of both the numerator and the denominator.

    Parameters
    ----------
    zip_gdf : GeoDataFrame
        One row per zipcode. Must have a ``zipcode`` column and a geometry
        column with a CRS set (it is reprojected for the distance step).
    air_gdf : GeoDataFrame
        Long-format measurements. Must have ``site``, ``date`` and
        ``index_value`` columns and a geometry column with a CRS set.
        Each (date, site) pair must appear at most once (required by pivot).
    power : float, default 1
        Exponent applied to the distance when computing the weights. The
        default keeps the linear ``1 / distance`` weighting this function has
        always used; pass ``power=2`` for inverse-square (quadratic) weights.

    Returns
    -------
    DataFrame
        One row per (zipcode, date) with columns ``date``, ``zipcode``,
        ``geometry`` (the zipcode geometry as passed in) and ``idw``.
        ``idw`` is NaN for dates on which no site has a measurement.
    """
    # Projected CRS used for distances (EPSG:2272, NAD83 / Pennsylvania South,
    # US survey feet) so that distances are planar rather than in degrees.
    projected_crs = "epsg:2272"
    # Floor for distances: a zipcode point that coincides with a site would
    # otherwise get weight 1/0 = inf, and inf/inf turns the estimate into NaN.
    # With the floor, the coinciding site simply dominates the weighted mean.
    min_dist = 1e-6

    # location of each measurement site, projected once
    air_sites = air_gdf[["site", "geometry"]].drop_duplicates()
    site_points = air_sites.geometry.to_crs(projected_crs)

    # distance from each measurement site to each zipcode
    idw_df = zip_gdf.copy()
    # Reproject the zipcode points once instead of once per site.
    zip_points = idw_df.geometry.to_crs(projected_crs)
    dist_cols = []
    for site, point in zip(air_sites["site"], site_points):
        col = "{}_dist".format(site)
        idw_df[col] = zip_points.distance(point).clip(lower=min_dist)
        dist_cols.append(col)
    idw_df = idw_df[["zipcode", "geometry"] + dist_cols]

    # generate the dataframe of zipcode x date for each measurement:
    # one column per site, one row per date, then cross-joined with zipcodes
    by_site = air_gdf.pivot(index="date", columns="site", values="index_value").reset_index()
    idw_df = idw_df.merge(by_site, how="cross")

    # compute the IDW; weights are 1 / distance ** power
    idw_df["num"] = 0.0
    idw_df["denom"] = 0.0
    for site in air_sites["site"]:
        weight = 1 / idw_df["{}_dist".format(site)] ** power
        # Missing measurements contribute 0 to the numerator and, through the
        # notna() mask, 0 to the denominator as well.
        idw_df["num"] += idw_df[site].fillna(0) * weight
        idw_df["denom"] += idw_df[site].notna() * weight
    idw_df["idw"] = idw_df.num / idw_df.denom

    return idw_df[["date", "zipcode", "geometry", "idw"]]

In [4]:
# PM 2.5: interpolate the daily readings onto the zipcode points and save them
pm25_daily = PM25().daily_air_gdf()
pm25 = idw(zip_gdf, pm25_daily)
pm25_path = os.path.join(DATA_DIR, "pm25_zipcode.csv")
pm25.to_csv(pm25_path, index=False)

In [7]:
# SO2: interpolate the daily readings onto the zipcode points and save them
so2_daily = SO2().daily_air_gdf()
so2 = idw(zip_gdf, so2_daily)
so2_path = os.path.join(DATA_DIR, "so2_zipcode.csv")
so2.to_csv(so2_path, index=False)