In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import numpy as np
from pathlib import Path

# ================================
# Process the FF10_POINT file for InMAP compatibility
# ================================

# Path to your FF10_POINT format file
file_path = "../data/raw/point/2022hc_cb6_22m/inputs/ptegu/egu_cems_2022_POINT_20240615_2022cems_stackfix2_23jul2024_v0.csv"

print(f"Reading data from {file_path}...")

# Create output directory if it doesn't exist
output_dir = Path("../data/processed")
output_dir.mkdir(parents=True, exist_ok=True)

# Count number of header lines to skip
with open(file_path, 'r') as f:
    header_lines = 0
    for line in f:
        if line.startswith('#'):
            header_lines += 1
        else:
            break

# Read the FF10_POINT file, skipping header comments
df = pd.read_csv(file_path, skiprows=header_lines, low_memory=False)

print(f"Data loaded. Shape: {df.shape}")

# Initialize lists to hold the processed data
VOC, NOx, NH3, SOx, PM2_5 = [], [], [], [], []
height, diam, temp, velocity = [], [], [], []
coords = []

# Define pollutant classifications based on the provided list
voc_pollutants = [
    'VOC', 
    # Volatile organic compounds
    '75092', '71432', '50000', '110543', '85018', '91203', '108883',
    '86737', '91576', '206440', '129000', '107028', '75070', '75569',
    '1330207', '106990', '100414', '108952', '106467', '60344', '67663',
    '78591', '80626', '74839', '74873', '75003', '75150', '75252',
    '77781', '92524', '98828', '98862', '100425', '100447', '106934',
    '107062', '108054', '108907', '117817', '121142', '123386', '127184',
    '1634044', '107131', '74884', '92875', '108101', '131113', '132649',
    '56235', '71556', '75014', '78875', '87865', '95476', '75354', '79005',
    '84742', '85449', '91225', '106423', '106445', '95487', '108383',
    '108394', '75343', '120821', '88062', '91587', '57147', '107051',
    '67561', '86748', '59892', '1319773', '189640', '91941', '77474',
    '95807', 'HOURACT', '79016', '542756', '118741', '1336363', '192972'
]

nox_pollutants = [
    'NOX', 'NO3', 'N590'
]

sox_pollutants = [
    'SO2', 'SO4', '7783064'
]

nh3_pollutants = [
    'NH3'
]

pm25_pollutants = [
    'PM25-PRI', 'PM25-FIL', 'PMFINE', 'PM-CON', 'EC', 'OC'
]

# Create a dictionary for quick lookup
pollutant_map = {}
for poll in voc_pollutants:
    pollutant_map[poll] = 'VOC'
for poll in nox_pollutants:
    pollutant_map[poll] = 'NOx'
for poll in sox_pollutants:
    pollutant_map[poll] = 'SOx'
for poll in nh3_pollutants:
    pollutant_map[poll] = 'NH3'
for poll in pm25_pollutants:
    pollutant_map[poll] = 'PM2_5'

# Process each row of the FF10_POINT file
for _, row in df.iterrows():
    try:
        # Get pollutant code and value
        poll = str(row['poll']).upper() if 'poll' in row else ""
        emis_value = row['ann_value'] if 'ann_value' in row else 0
        
        # Skip if no emissions value
        if pd.isna(emis_value) or emis_value == '':
            continue
            
        # Convert emissions value to float
        emis_value = float(emis_value)
        
        # Get units and convert to short tons if needed
        emis_units = row['emissions uom'] if 'emissions uom' in row else 'TON'
        if emis_units == 'LB':
            emis_value = emis_value / 2000  # Convert pounds to short tons
            
        # Determine pollutant category
        poll_category = pollutant_map.get(poll)
        
        # Skip pollutants not in our mapping
        if poll_category is None:
            continue
            
        # Initialize emissions values
        voc_val, nox_val, nh3_val, sox_val, pm25_val = 0, 0, 0, 0, 0
        
        # Set the appropriate value based on pollutant category
        if poll_category == 'VOC':
            voc_val = emis_value
        elif poll_category == 'NOx':
            nox_val = emis_value
        elif poll_category == 'NH3':
            nh3_val = emis_value
        elif poll_category == 'SOx':
            sox_val = emis_value
        elif poll_category == 'PM2_5':
            pm25_val = emis_value
            
        # Append emissions values to respective lists
        VOC.append(voc_val)
        NOx.append(nox_val)
        NH3.append(nh3_val)
        SOx.append(sox_val)
        PM2_5.append(pm25_val)
            
        # Process stack parameters with unit conversions
        # Height (convert to meters)
        h = row['stkhgt'] if 'stkhgt' in row else ''
        if h != '' and not pd.isna(h):
            # Assuming height is in feet in the FF10_POINT file
            height.append(float(h) * 0.3048)  # Convert feet to meters
        else:
            height.append(0)
            
        # Diameter (convert to meters)
        d = row['stkdiam'] if 'stkdiam' in row else ''
        if d != '' and not pd.isna(d):
            # Assuming diameter is in feet in the FF10_POINT file
            diam.append(float(d) * 0.3048)  # Convert feet to meters
        else:
            diam.append(0)
            
        # Temperature (convert to Kelvin)
        t = row['stktemp'] if 'stktemp' in row else ''
        if t != '' and not pd.isna(t):
            # Assuming temperature is in Fahrenheit in the FF10_POINT file
            temp.append((float(t) - 32) * 5.0/9.0 + 273.15)  # Convert F to K
        else:
            temp.append(0)
            
        # Velocity (convert to m/s)
        v = row['stkvel'] if 'stkvel' in row else ''
        if v != '' and not pd.isna(v):
            # Assuming velocity is in feet/sec in the FF10_POINT file
            velocity.append(float(v) * 0.3048)  # Convert ft/s to m/s
        else:
            velocity.append(0)
            
        # Get coordinates
        lon = row['longitude'] if 'longitude' in row else None
        lat = row['latitude'] if 'latitude' in row else None
        
        if lon is not None and lat is not None and not pd.isna(lon) and not pd.isna(lat):
            coords.append(Point(float(lon), float(lat)))
        else:
            # Skip this record if coordinates are missing
            VOC.pop()
            NOx.pop()
            NH3.pop()
            SOx.pop()
            PM2_5.pop()
            height.pop()
            diam.pop()
            temp.pop()
            velocity.pop()
            
    except Exception as e:
        print(f"Error processing row: {e}")
        continue

print(f"Processed {len(coords)} emission points")

# Create the emissions GeoDataFrame in the exact format needed for InMAP
emis = gpd.GeoDataFrame({
    "VOC": VOC, 
    "NOx": NOx, 
    "NH3": NH3, 
    "SOx": SOx, 
    "PM2_5": PM2_5,
    "height": height, 
    "diam": diam, 
    "temp": temp, 
    "velocity": velocity
}, geometry=coords, crs='epsg:4269')

# Filter out any rows with all zeros for emissions
emis = emis[(emis['VOC'] > 0) | (emis['NOx'] > 0) | (emis['NH3'] > 0) | 
            (emis['SOx'] > 0) | (emis['PM2_5'] > 0)]

# Filter for power plants only using NAICS code (if available in the original data)
if 'naics' in df.columns:
    # Create a mapping from coordinates to NAICS codes
    coord_to_naics = {}
    for _, row in df.iterrows():
        if not pd.isna(row['longitude']) and not pd.isna(row['latitude']):
            coord_key = (float(row['longitude']), float(row['latitude']))
            if 'naics' in row and not pd.isna(row['naics']):
                naics = str(row['naics'])
                # If this is a power plant NAICS code
                if naics.startswith('2211'):
                    coord_to_naics[coord_key] = True
    
    # Filter emissions to only include power plants
    power_plant_indices = []
    for i, point in enumerate(coords):
        coord_key = (point.x, point.y)
        if coord_key in coord_to_naics:
            power_plant_indices.append(i)
    
    # Create a new GeoDataFrame with only power plants
    if power_plant_indices:
        egu_emis = emis.iloc[power_plant_indices].copy()
        print(f"Filtered to {len(egu_emis)} power plant emission points")
    else:
        egu_emis = emis.copy()
        print("No power plants identified. Using all emission points.")
else:
    egu_emis = emis.copy()
    print("NAICS codes not found. Using all emission points.")

# ================================
# Inspect and validate the emissions data
# ================================

# Display summary statistics
print("\nEmissions Summary (in short tons/year):")
emission_sums = egu_emis.sum(axis=0)[["VOC", "NOx", "NH3", "SOx", "PM2_5"]]
print(emission_sums)

print("\nStack Parameter Statistics:")
stack_stats = egu_emis[["height", "diam", "temp", "velocity"]].describe()
print(stack_stats)

# Save the processed emissions data
output_file = f"{output_dir}/processed_emissions_for_inmap.gpkg"
egu_emis.to_file(output_file, driver="GPKG")
print(f"\nSaved processed emissions data to {output_file}")

# Return the processed emissions for use with the run_sr function
print("\nReady to use with run_sr function!")
print("Example: resultsISRM = run_sr(egu_emis, model='isrm', emis_units='tons/year')")

# The emissions GeoDataFrame is now in the exact format needed for InMAP
egu_gdf = egu_emis

Reading data from ../data/raw/point/2022hc_cb6_22m/inputs/ptegu/egu_cems_2022_POINT_20240615_2022cems_stackfix2_23jul2024_v0.csv...
Data loaded. Shape: (126465, 77)
Processed 90811 emission points


IndexError: positional indexers are out-of-bounds

In [3]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import numpy as np
from pathlib import Path

# ================================
# Process the FF10_POINT file for InMAP compatibility
# ================================

# Path to your specific FF10_POINT format file
file_path = "../data/raw/point/2022hc_cb6_22m/inputs/ptegu/egu_cems_2022_POINT_20240615_2022cems_stackfix2_23jul2024_v0.csv"

print(f"Reading data from {file_path}...")

# Create output directory if it doesn't exist
output_dir = Path("../data/processed")
output_dir.mkdir(parents=True, exist_ok=True)

# Count number of header lines to skip
with open(file_path, 'r') as f:
    header_lines = 0
    for line in f:
        if line.startswith('#'):
            header_lines += 1
        else:
            break

# Read the FF10_POINT file, skipping header comments
df = pd.read_csv(file_path, skiprows=header_lines, low_memory=False)

print(f"Data loaded. Shape: {df.shape}")

# Initialize lists to hold the processed data
VOC, NOx, NH3, SOx, PM2_5 = [], [], [], [], []
height, diam, temp, velocity = [], [], [], []
coords = []
naics_codes = []  # Track NAICS codes for each emission point

# Define pollutant classifications based on the provided list
voc_pollutants = [
    'VOC', 
    # Volatile organic compounds
    '75092', '71432', '50000', '110543', '85018', '91203', '108883',
    '86737', '91576', '206440', '129000', '107028', '75070', '75569',
    '1330207', '106990', '100414', '108952', '106467', '60344', '67663',
    '78591', '80626', '74839', '74873', '75003', '75150', '75252',
    '77781', '92524', '98828', '98862', '100425', '100447', '106934',
    '107062', '108054', '108907', '117817', '121142', '123386', '127184',
    '1634044', '107131', '74884', '92875', '108101', '131113', '132649',
    '56235', '71556', '75014', '78875', '87865', '95476', '75354', '79005',
    '84742', '85449', '91225', '106423', '106445', '95487', '108383',
    '108394', '75343', '120821', '88062', '91587', '57147', '107051',
    '67561', '86748', '59892', '1319773', '189640', '91941', '77474',
    '95807', 'HOURACT', '79016', '542756', '118741', '1336363', '192972'
]

nox_pollutants = [
    'NOX', 'NO3', 'N590'
]

sox_pollutants = [
    'SO2', 'SO4', '7783064'
]

nh3_pollutants = [
    'NH3'
]

pm25_pollutants = [
    'PM25-PRI', 'PM25-FIL', 'PMFINE', 'PM-CON', 'EC', 'OC'
]

# Create a dictionary for quick lookup
pollutant_map = {}
for poll in voc_pollutants:
    pollutant_map[poll] = 'VOC'
for poll in nox_pollutants:
    pollutant_map[poll] = 'NOx'
for poll in sox_pollutants:
    pollutant_map[poll] = 'SOx'
for poll in nh3_pollutants:
    pollutant_map[poll] = 'NH3'
for poll in pm25_pollutants:
    pollutant_map[poll] = 'PM2_5'

# Process each row of the FF10_POINT file
for _, row in df.iterrows():
    try:
        # Get pollutant code and value
        poll = str(row['poll']).upper() if 'poll' in row else ""
        emis_value = row['ann_value'] if 'ann_value' in row else 0
        
        # Skip if no emissions value
        if pd.isna(emis_value) or emis_value == '':
            continue
            
        # Convert emissions value to float
        emis_value = float(emis_value)
        
        # Get units and convert to short tons if needed
        emis_units = row['emissions uom'] if 'emissions uom' in row else 'TON'
        if emis_units == 'LB':
            emis_value = emis_value / 2000  # Convert pounds to short tons
            
        # Determine pollutant category
        poll_category = pollutant_map.get(poll)
        
        # Skip pollutants not in our mapping
        if poll_category is None:
            continue
            
        # Initialize emissions values
        voc_val, nox_val, nh3_val, sox_val, pm25_val = 0, 0, 0, 0, 0
        
        # Set the appropriate value based on pollutant category
        if poll_category == 'VOC':
            voc_val = emis_value
        elif poll_category == 'NOx':
            nox_val = emis_value
        elif poll_category == 'NH3':
            nh3_val = emis_value
        elif poll_category == 'SOx':
            sox_val = emis_value
        elif poll_category == 'PM2_5':
            pm25_val = emis_value
        
        # Get coordinates
        lon = row['longitude'] if 'longitude' in row else None
        lat = row['latitude'] if 'latitude' in row else None
        
        if lon is None or lat is None or pd.isna(lon) or pd.isna(lat):
            continue  # Skip records without valid coordinates
            
        # Process stack parameters with unit conversions
        # Height (convert to meters)
        h = row['stkhgt'] if 'stkhgt' in row else ''
        if h != '' and not pd.isna(h):
            # Assuming height is in feet in the FF10_POINT file
            h_val = float(h) * 0.3048  # Convert feet to meters
        else:
            h_val = 0
            
        # Diameter (convert to meters)
        d = row['stkdiam'] if 'stkdiam' in row else ''
        if d != '' and not pd.isna(d):
            # Assuming diameter is in feet in the FF10_POINT file
            d_val = float(d) * 0.3048  # Convert feet to meters
        else:
            d_val = 0
            
        # Temperature (convert to Kelvin)
        t = row['stktemp'] if 'stktemp' in row else ''
        if t != '' and not pd.isna(t):
            # Assuming temperature is in Fahrenheit in the FF10_POINT file
            t_val = (float(t) - 32) * 5.0/9.0 + 273.15  # Convert F to K
        else:
            t_val = 0
            
        # Velocity (convert to m/s)
        v = row['stkvel'] if 'stkvel' in row else ''
        if v != '' and not pd.isna(v):
            # Assuming velocity is in feet/sec in the FF10_POINT file
            v_val = float(v) * 0.3048  # Convert ft/s to m/s
        else:
            v_val = 0
        
        # Get NAICS code if available
        naics = row['naics'] if 'naics' in row else None
        
        # Add to our data lists
        coords.append(Point(float(lon), float(lat)))
        VOC.append(voc_val)
        NOx.append(nox_val)
        NH3.append(nh3_val)
        SOx.append(sox_val)
        PM2_5.append(pm25_val)
        height.append(h_val)
        diam.append(d_val)
        temp.append(t_val)
        velocity.append(v_val)
        naics_codes.append(str(naics) if naics is not None and not pd.isna(naics) else '')
            
    except Exception as e:
        print(f"Error processing row: {e}")
        continue

print(f"Processed {len(coords)} emission points")

# Create the emissions GeoDataFrame in the exact format needed for InMAP
data_dict = {
    "VOC": VOC, 
    "NOx": NOx, 
    "NH3": NH3, 
    "SOx": SOx, 
    "PM2_5": PM2_5,
    "height": height, 
    "diam": diam, 
    "temp": temp, 
    "velocity": velocity,
    "naics_code": naics_codes
}

emis = gpd.GeoDataFrame(data_dict, geometry=coords, crs='epsg:4269')

# Filter out any rows with all zeros for emissions
emis = emis[(emis['VOC'] > 0) | (emis['NOx'] > 0) | (emis['NH3'] > 0) | 
            (emis['SOx'] > 0) | (emis['PM2_5'] > 0)]

print(f"Filtered to {len(emis)} emission points with non-zero emissions")

# ================================
# Filter for power plants
# ================================

# This is already EGU CEMS data (Electricity Generating Units Continuous Emissions Monitoring System)
# So all emissions should be from power plants, but we'll check NAICS codes if available
egu_naics_prefixes = ['2211', '221111', '221112', '221113', '221114', '221115', 
                      '221116', '221117', '221118', '221121', '221122']

# Create a mask for power plants
is_power_plant = emis['naics_code'].apply(
    lambda x: any(str(x).startswith(prefix) for prefix in egu_naics_prefixes) 
              if x else False
)

# Apply the mask to filter for power plants if NAICS codes are available and identified
if is_power_plant.any():
    egu_emis = emis[is_power_plant].copy()
    print(f"Filtered to {len(egu_emis)} power plant emission points using NAICS codes")
else:
    # Since this is EGU CEMS data, we can use all points even without NAICS codes
    print("Using all emission points (dataset is already for power plants - EGU CEMS)")
    egu_emis = emis.copy()

# ================================
# Clean the data for InMAP compatibility
# ================================

# Drop the NAICS column as it's not needed for InMAP
if 'naics_code' in egu_emis.columns:
    egu_emis = egu_emis.drop(columns=['naics_code'])

# ================================
# Inspect and validate the emissions data
# ================================

# Display summary statistics - properly handle the geometry column
print("\nEmissions Summary (in short tons/year):")
# Convert to DataFrame to exclude geometry column for sum operation
emissions_df = pd.DataFrame(egu_emis.drop(columns=['geometry']))
emission_sums = emissions_df[["VOC", "NOx", "NH3", "SOx", "PM2_5"]].sum()
print(emission_sums)

print("\nStack Parameter Statistics:")
stack_stats = emissions_df[["height", "diam", "temp", "velocity"]].describe()
print(stack_stats)

# Save the processed emissions data
output_file = f"{output_dir}/processed_emissions_for_inmap.gpkg"
egu_emis.to_file(output_file, driver="GPKG")
print(f"\nSaved processed emissions data to {output_file}")

# Return the processed emissions for use with the run_sr function
print("\nReady to use with run_sr function!")
print("Example: resultsISRM = run_sr(egu_emis, model='isrm', emis_units='tons/year')")

# The emissions GeoDataFrame is now in the exact format needed for InMAP
egu_gdf = egu_emis

Reading data from ../data/raw/point/2022hc_cb6_22m/inputs/ptegu/egu_cems_2022_POINT_20240615_2022cems_stackfix2_23jul2024_v0.csv...
Data loaded. Shape: (126465, 77)
Processed 90811 emission points
Filtered to 86671 emission points with non-zero emissions
Filtered to 84806 power plant emission points using NAICS codes

Emissions Summary (in short tons/year):
VOC      3.322371e+10
NOx      7.256522e+05
NH3      1.373875e+04
SOx      8.506148e+05
PM2_5    2.303993e+05
dtype: float64

Stack Parameter Statistics:
             height          diam          temp      velocity
count  84806.000000  84806.000000  84806.000000  84806.000000
mean      62.795411      4.974696    517.902599     25.233935
std       61.025807      2.183049    213.628689     15.306950
min        0.000000      0.000000      0.000000      0.000000
25%       19.812000      3.657600    360.372222     17.007840
50%       43.281600      5.181600    422.038889     21.336000
75%       76.200000      6.096000    727.594444     

In [4]:
emissions_cems = pd.read_csv('../data/2023_annual_emissions_CEMS.csv')
plants = emissions_cems['Facility ID'].unique()

In [6]:
plants = df['Facility ID'].unique()

array([     3,      9,     10, ..., 880108, 880109, 880110], shape=(1343,))