Two large files are required for the HPMS vehicle emissions data processing. Both are hosted on EDF's cloud storage, in the national_hpms_data folder of the cdhi bucket, and this notebook reads them from a local ./national_hpms_data directory:
        national_hpms_emissions_all_vehicles.csv contains the emission rates per vehicle class and pollutant for every road segment in the US.
        hpms_all_states_geometry.gdb is a geodatabase containing all 2018 HPMS roadway segments in the nation that were used for the vehicle emissions calculations.
    
In addition, a directory containing census tract shapefiles for all states of interest must be supplied. 
These can be downloaded from the census website: https://www.census.gov/cgi-bin/geo/shapefiles/index.php?year=2015&layergroup=Census+Tracts

The census tract geometries and population data relate to the year 2015 to allow for comparison to the Climate Vulnerability Index: https://climatevulnerabilityindex.org/

In [18]:
# Import packages
import geopandas as gpd
import pandas as pd
from census import Census
from pathlib import Path
# Have geopandas perform all file reads/writes through the pyogrio engine
gpd.options.io_engine = "pyogrio"

# ---- Input locations ----
# Folder (searched recursively) holding the census tract shapefiles
census_shapes_dir = Path('./2015_census_tracts')

# ---- Output locations ----
# Per-state shapefiles are written here
shapefile_dir = Path('./2015_tract_emissions_shapefiles')
# Per-state csv summaries are written here
csv_dir = Path('./2015_tract_emissions_csvs')

# Census API client. Paste your API key between the quotes;
# keys can be requested at https://api.census.gov/data/key_signup.html
key = Census("")

# Emission rates for every roadway segment (one large csv)
all_vehicle_emission_rates = pd.read_csv('./national_hpms_data/national_hpms_emissions_all_vehicles.csv')

# National 2018 HPMS roadway segments used in the UVM paper,
# loaded first and then reprojected to EPSG:5070
hpms_roads = gpd.read_file('./national_hpms_data/hpms_all_states_geometry.gdb')
hpms_roads = hpms_roads.to_crs(5070)

In [26]:
# Process emissions data for every census tract in the US.
# For each tract shapefile found in census_shapes_dir (one per state) this:
#   1. pulls 2015 ACS 5-year child population counts from the Census API,
#   2. clips the national roadway segments to the tracts,
#   3. totals the roadway emissions per tract, and
#   4. writes a shapefile and a csv for that state.

# Make sure the output directories exist before anything is written to them
shapefile_dir.mkdir(parents=True, exist_ok=True)
csv_dir.mkdir(parents=True, exist_ok=True)

# Merge roadways and emission rates once, up front: the result is the same
# for every state, so there is no need to recompute it on each iteration
roads_emissions = pd.merge(hpms_roads, all_vehicle_emission_rates, on='FID_Link_Cnty_Intxn')

# Output total-emissions column -> emission-rate column in the input csv
emission_rate_columns = {
    'PM10Total': 'ER_PM10',
    'PM25Total': 'ER_PM25',
    'NOxTotal': 'ER_3_NOX',
    'NO2Total': 'ER_33_NO2',
}
total_columns = list(emission_rate_columns)
per_sqmi_columns = ['PM10SqMi', 'PM25SqMi', 'NOxSqMi', 'NO2SqMi']

# glob all the shapefiles in the folder containing census tract geometries
census_shapes = census_shapes_dir.rglob("*.shp")

for file in census_shapes:
    # Reproject to EPSG:5070 so tracts share a CRS with hpms_roads
    tracts = gpd.read_file(file).to_crs(5070)

    # Get census population data via the API.
    # Each shapefile is expected to hold a single state, so the state FIPS
    # code is taken from the first row.
    census_data = key.acs5.state_county_tract(
        fields=('NAME', 'B09001_001E', 'B09001_003E', 'B09001_004E', 'B09001_005E'),
        state_fips=tracts['STATEFP'].iloc[0],
        county_fips='*',
        tract='*',
        year=2015)
    census_data = pd.DataFrame(census_data)

    # Create 5 and under column by adding together age 5, 3-4, and under 3
    census_data['Pop5Under'] = census_data[['B09001_003E', 'B09001_004E', 'B09001_005E']].sum(axis=1)
    # Build the tract GEOID (state + county + tract codes) to join to the
    # census spatial data. Explicit string concatenation is used instead of
    # a row-wise sum so the intent does not depend on how pandas sums strings.
    census_data['GEOID'] = census_data['state'] + census_data['county'] + census_data['tract']
    # Subset data to make output less cluttered
    census_data = census_data[['B09001_001E', 'Pop5Under', 'GEOID']].rename(
        columns={'B09001_001E': 'Pop18Under'})

    # Join census population data and spatial data
    tracts_pop_2015 = pd.merge(tracts, census_data, on='GEOID')
    # Convert area from square meters to square miles
    tracts_pop_2015['area_sqmi'] = tracts_pop_2015.area / 2589988
    # Calculate population per square mile
    tracts_pop_2015['Pop5SqMi'] = (tracts_pop_2015['Pop5Under'] / tracts_pop_2015['area_sqmi']).round(2)

    # Overlay returns roadways clipped to each census tract with identifying information
    tract_emissions = gpd.overlay(roads_emissions, tracts)

    # Get length of each clipped roadway segment and multiply by emission rates.
    # Total emissions are then converted to tons per year (x 365, / 907185).
    # NOTE(review): presumably the rates are grams per meter per day - confirm
    # against the emissions dataset documentation.
    tract_emissions['clipped_length'] = tract_emissions.length
    for total_column, rate_column in emission_rate_columns.items():
        tract_emissions[total_column] = (
            tract_emissions['clipped_length'] * tract_emissions[rate_column] * 365 / 907185
        ).round(4)

    # Aggregate road segment emissions by tract
    tract_emissions = tract_emissions.groupby('GEOID').agg(
        {total_column: 'sum' for total_column in total_columns}).reset_index()

    # Join population data to emissions data.
    # This is an inner join, so tracts with no clipped roadway segments do
    # not appear in the outputs.
    tract_emissions = pd.merge(tracts_pop_2015, tract_emissions, on='GEOID')
    # Create columns for emissions per square mile and round to 3 decimal places
    tract_emissions[per_sqmi_columns] = tract_emissions[total_columns].div(
        tract_emissions['area_sqmi'], axis=0).round(3)

    # Create directory to store each state's shapefile in if one doesn't exist
    state_shapefile_dir = shapefile_dir / file.stem
    state_shapefile_dir.mkdir(parents=True, exist_ok=True)
    # Write shapefile
    tract_emissions.to_file(state_shapefile_dir / f'{file.stem}_vehicle_emissions.shp')
    # Subset columns and write csv
    csv_columns = ['GEOID', 'NAMELSAD', 'area_sqmi',
                   'Pop18Under', 'Pop5Under', 'Pop5SqMi',
                   *total_columns, *per_sqmi_columns]
    tract_emissions[csv_columns].to_csv(csv_dir / f'{file.stem}_vehicle_emissions.csv')

    print(f'{file.stem} processing finished!')