## Functions for calculation

In [56]:
""" Functions for calculating differential exposure """

import pandas as pd
import numpy as np
import math

import progressbar

def calc_numerator(df, pollutant_col, group, same_state=False):
    """
    Sum pollutant * population * group share over the rows of ``df``.

    Each row contributes
    ``df[pollutant_col] * df['Total Population'] * df[group] / 100``.
    Rows whose contribution is NaN are left out of the sum.

    Parameters:
    df (DataFrame): Rows to aggregate. Must contain ``pollutant_col``, ``group``
        and 'Total Population'. 'State' is read when any contribution is NaN;
        'State' and 'Project State' are read when ``same_state`` is True.
    pollutant_col (str): Name of the pollutant column.
    group (str): Name of the group column; its values are divided by 100.
    same_state (bool): If True, keep only rows where 'State' equals 'Project State'.

    Returns:
    The summed contribution, or the integer 0 when no row contributes.
    """
    if same_state:
        df = df[df['State'] == df['Project State']]

    # Column arithmetic replaces the former per-row ``df.iloc[ci]`` loop, which
    # built a new Series for every row.
    county_prod = df[pollutant_col] * df['Total Population'] * df[group] / 100

    missing = county_prod.isna()
    if missing.any():
        # Same diagnostic as the row loop: one "NAN" line per dropped row,
        # except for rows whose 'State' is Connecticut.
        unexpected = int((df.loc[missing, 'State'] != "Connecticut").sum())
        for _ in range(unexpected):
            print("NAN")

    valid = county_prod[~missing]
    # Keep the original "nothing summed" result (plain 0) so callers see the
    # same value as before for empty or all-NaN input.
    return valid.sum() if len(valid) else 0

def calc_denom(df, group, same_state=False):
    """
    Sum population * group share over the rows of ``df``.

    Each row contributes ``df['Total Population'] * df[group] / 100``.
    Rows whose contribution is NaN are left out of the sum.

    Parameters:
    df (DataFrame): Rows to aggregate. Must contain ``group`` and
        'Total Population'. 'State' is read when any contribution is NaN;
        'State' and 'Project State' are read when ``same_state`` is True.
    group (str): Name of the group column; its values are divided by 100.
    same_state (bool): If True, keep only rows where 'State' equals 'Project State'.

    Returns:
    The summed contribution, or the integer 0 when no row contributes.
    """
    if same_state:
        df = df[df['State'] == df['Project State']]

    # Column arithmetic replaces the former per-row ``df.iloc[ci]`` loop, which
    # built a new Series for every row.
    county_prod = df['Total Population'] * df[group] / 100

    missing = county_prod.isna()
    if missing.any():
        # Same diagnostic as the row loop: one "NAN" line per dropped row,
        # except for rows whose 'State' is Connecticut.
        unexpected = int((df.loc[missing, 'State'] != "Connecticut").sum())
        for _ in range(unexpected):
            print("NAN")

    valid = county_prod[~missing]
    # Keep the original "nothing summed" result (plain 0) so callers see the
    # same value as before for empty or all-NaN input.
    return valid.sum() if len(valid) else 0

def pop_weighted_exposure(df, pollutant_col, group, same_state=False):
    """
    Return calc_numerator(...) divided by calc_denom(...) for one pollutant and group.

    ``df``, ``group`` and ``same_state`` are forwarded unchanged to both
    helpers; ``pollutant_col`` is forwarded to the numerator only.
    """
    weighted_total = calc_numerator(df, pollutant_col, group, same_state)
    group_population = calc_denom(df, group, same_state)
    return weighted_total / group_population

def calculate_population_weighted_exposure(df0, acs_data, projects, pollutant_cols,
                                           calc_groups, scope):
    """
    Calculate population-weighted exposure by group for each pollutant, optionally per project.

    Parameters:
    df0 (DataFrame): The input dataframe of COBRA results. Must be pre-filtered to a single year.
        Needs a 'Destination FIPS' column, and a 'Project' column when `projects` is non-empty.
    acs_data (DataFrame): The dataframe of ACS data. It is left-merged onto df0 by matching
        its 'FIPS' column against df0's 'Destination FIPS'.
    projects (list): List of projects. If empty, all rows are pooled and the result has no
        'Project' column.
    pollutant_cols (list): List of pollutant columns.
    calc_groups (list): List of calculation groups (group column names).
    scope (str): 'National' uses every row; any other value restricts the calculation to rows
        whose 'State' equals 'Project State'.

    Returns:
    DataFrame: One row per (project, pollutant, group) with columns 'Project' (only when
        `projects` is non-empty), 'Pollutant', 'Group' and 'Exposure'.
    """

    df = df0.merge(acs_data, left_on='Destination FIPS', right_on='FIPS', how='left')
    # Connecticut issue... Counties to Planning Regions??
    # print(np.unique(df['Destination FIPS'][~df['Destination FIPS'].isin(acs_data['FIPS'])]))

    same_state = scope != 'National'

    project_labels = []
    pollutant_labels = []
    group_labels = []
    exposures = []

    # len() rather than truthiness: `projects` may be a numpy array.
    if len(projects) == 0:
        for pollutant in pollutant_cols:
            for group in calc_groups:
                pollutant_labels.append(pollutant)
                group_labels.append(group)
                exposures.append(
                    pop_weighted_exposure(df, pollutant, group, same_state=same_state))

        return pd.DataFrame({'Pollutant': pollutant_labels,
                             'Group': group_labels,
                             'Exposure': exposures})

    for i in progressbar.progressbar(range(len(projects))):
        project = projects[i]
        # Filter once per project; the subset does not depend on pollutant or group.
        df_project = df[df['Project'] == project]
        for pollutant in pollutant_cols:
            for group in calc_groups:
                project_labels.append(project)
                pollutant_labels.append(pollutant)
                group_labels.append(group)
                exposures.append(
                    pop_weighted_exposure(df_project, pollutant, group, same_state=same_state))

    return pd.DataFrame({'Project': project_labels,
                         'Pollutant': pollutant_labels,
                         'Group': group_labels,
                         'Exposure': exposures})

# Two-letter postal abbreviation -> full name, for US states, the federal
# district and the inhabited territories.
abbreviation_to_name = {
    # https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#States.
    "AK": "Alaska",
    "AL": "Alabama",
    "AR": "Arkansas",
    "AZ": "Arizona",
    "CA": "California",
    "CO": "Colorado",
    "CT": "Connecticut",
    "DE": "Delaware",
    "FL": "Florida",
    "GA": "Georgia",
    "HI": "Hawaii",
    "IA": "Iowa",
    "ID": "Idaho",
    "IL": "Illinois",
    "IN": "Indiana",
    "KS": "Kansas",
    "KY": "Kentucky",
    "LA": "Louisiana",
    "MA": "Massachusetts",
    "MD": "Maryland",
    "ME": "Maine",
    "MI": "Michigan",
    "MN": "Minnesota",
    "MO": "Missouri",
    "MS": "Mississippi",
    "MT": "Montana",
    "NC": "North Carolina",
    "ND": "North Dakota",
    "NE": "Nebraska",
    "NH": "New Hampshire",
    "NJ": "New Jersey",
    "NM": "New Mexico",
    "NV": "Nevada",
    "NY": "New York",
    "OH": "Ohio",
    "OK": "Oklahoma",
    "OR": "Oregon",
    "PA": "Pennsylvania",
    "RI": "Rhode Island",
    "SC": "South Carolina",
    "SD": "South Dakota",
    "TN": "Tennessee",
    "TX": "Texas",
    "UT": "Utah",
    "VA": "Virginia",
    "VT": "Vermont",
    "WA": "Washington",
    "WI": "Wisconsin",
    "WV": "West Virginia",
    "WY": "Wyoming",
    # https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#Federal_district.
    "DC": "District of Columbia",
    # https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#Inhabited_territories.
    "AS": "American Samoa",
    # Was "Guam GU": the trailing abbreviation was not part of the name.
    "GU": "Guam",
    "MP": "Northern Mariana Islands",
    # Was "Puerto Rico PR": the trailing abbreviation was not part of the name.
    "PR": "Puerto Rico",
    "VI": "U.S. Virgin Islands",
}


## Read in and prepare data

In [46]:
"""
Read in combined timeseries and project-level results
"""

# Settings ===================================================================

# Each settings dict is keyed by dataset label ("a" or "b").
analysis_yr = dict(a=2030, b=2030)
year_cols = dict(a="ID", b="Analysis Year")
fips_cols = dict(a="FIPS", b="Destination FIPS")
state_cols = dict(a="State", b="Destination State")
colset0 = ["Delta PM 2.5", "Delta O3"]
a_scenario = "a.finalData.01"

results_dir0 = "/content/drive/MyDrive/gpDept-ResearchDept/LNG Air Pollution/LNG Health - COBRA project/Version 5 analysis"
a_f0 = f"{results_dir0}/a.finalData.results/a.finalData.01-03.combined_results.csv"
b_f0 = f"{results_dir0}/b.finalData.results/b.finalData.01-03.combined_results.csv"
meta_f0 = "/content/drive/MyDrive/gpDept-ResearchDept/LNG Air Pollution/LNG Health - COBRA project/COBRA_LNGHEALTH_Data.xlsx"

# Script ======================================================================
meta = pd.read_excel(meta_f0, sheet_name="LNG Project Data")
a_df0 = pd.read_csv(a_f0)
b_df0 = pd.read_csv(b_f0)

# Columns to keep per dataset: year, FIPS, state, then the pollutant deltas.
colsets = {
    key: [year_cols[key], fips_cols[key], state_cols[key], *colset0]
    for key in year_cols
}

# Filter to just the essentials ===============================================
# "a": rows for the analysis year and the chosen scenario.
a_df = a_df0.loc[
    (a_df0[year_cols["a"]] == analysis_yr["a"]) & (a_df0['config_id'] == a_scenario),
    colsets["a"],
]
# "b": rows for the analysis year, keeping the project and population columns too.
b_df = b_df0.loc[
    b_df0[year_cols["b"]] == analysis_yr["b"],
    ["Project", "Total Population", *colsets["b"]],
]

# Pull population data from b_df and merge into a_df ==========================
pop_df = b_df.loc[:, ['Destination FIPS', 'Total Population']].drop_duplicates()
a_df = a_df.merge(pop_df, how='left', left_on='FIPS', right_on='Destination FIPS')

# Merge project df with project state info ====================================
# The metadata 'State' column holds abbreviations; keep it as 'state_abbrev' and
# derive the full name from abbreviation_to_name.
b_df2 = (
    b_df.merge(meta[['Project', 'State']], on='Project', how='left')
    .rename(columns={"State": "state_abbrev"})
)
b_df2['Project State'] = b_df2['state_abbrev'].apply(
    lambda abbrev: abbreviation_to_name[abbrev]
)

# Rename columns for function compatibility
renamer_a = {state_cols["a"]: "State", year_cols["a"]: "Analysis Year"}
renamer_b = {state_cols["b"]: "State", year_cols["b"]: "Analysis Year"}
a_df = a_df.rename(columns=renamer_a)
b_df2 = b_df2.rename(columns=renamer_b)


In [48]:
# Spot-check: the a_df row(s) for Louisiana with FIPS 22001.
a_df[(a_df['FIPS'] == 22001) & (a_df['State'] == 'Louisiana')]

Unnamed: 0,Analysis Year,FIPS,State,Delta PM 2.5,Delta O3,Destination FIPS,Total Population
1079,2030,22001,Louisiana,-0.079729,-0.868372,22001,66836.4232


In [53]:
# Show every a_df row whose State is Louisiana.
a_df[a_df['State'] == 'Louisiana']

Unnamed: 0,Analysis Year,FIPS,State,Delta PM 2.5,Delta O3,Destination FIPS,Total Population
1079,2030,22001,Louisiana,-0.079729,-0.868372,22001,66836.4232
1080,2030,22003,Louisiana,-0.108402,-0.887430,22003,27105.7039
1081,2030,22005,Louisiana,-0.020918,-0.180466,22005,161970.3366
1082,2030,22007,Louisiana,-0.020939,-0.175489,22007,24947.8905
1083,2030,22009,Louisiana,-0.029998,-0.355274,22009,42614.2559
...,...,...,...,...,...,...,...
1138,2030,22119,Louisiana,-0.018042,-0.177499,22119,41270.2589
1139,2030,22121,Louisiana,-0.021367,-0.240764,22121,29245.7591
1140,2030,22123,Louisiana,-0.012835,-0.161813,22123,11500.1367
1141,2030,22125,Louisiana,-0.019912,-0.262172,22125,16636.8312


In [50]:
"""
Read in census data and give better names
"""

# Info to read-in ACS demographic data and give it better column names =========
f_acs = dict(
    race='/content/drive/MyDrive/gpDept-ResearchDept/LNG Air Pollution/LNG Health - COBRA project/Version 5 analysis/ACSDP5Y2022.DP05_2024-05-09T163456/ACSDP5Y2022.DP05-Data - FIPS Edit.csv',
    poverty='/content/drive/MyDrive/gpDept-ResearchDept/LNG Air Pollution/LNG Health - COBRA project/Version 5 analysis/ACSST5Y2022.S1701_2024-07-03T153513/ACSST5Y2022.S1701-Data-EditFIPS.csv',
)

# Source column -> friendly name. In "race", order matters: after FIPS the
# entries alternate estimate / margin-of-error, which calc_groups relies on.
acs_cols = {
    "race": {
        'FIPS_ACS': 'FIPS',
        # 'DP05_0066PE': 'pWhite', 'DP05_0066PM': 'pWhiteMarginError',
        'DP05_0079PE': 'pWhite',
        'DP05_0079PM': 'pWhiteMarginError',
        'DP05_0067PE': 'pBlackAA',
        'DP05_0067PM': 'pBlackAAMarginError',
        'DP05_0068PE': 'pAmerIndianAN',
        'DP05_0068PM': 'pAmerIndianANMarginError',
        'DP05_0069PE': 'pAsian',
        'DP05_0069PM': 'pAsianMarginError',
        'DP05_0070PE': 'pNativeHawaiianPI',
        'DP05_0070PM': 'pNativeHawaiianPIMarginError',
        'DP05_0071PE': 'pOther',
        'DP05_0071PM': 'pOtherMarginError',
        'DP05_0073PE': 'pHispanicLatino',
        'DP05_0073PM': 'pHispanicLatinoMarginError',  # Fill with NA ("*****" bc sum?)
    },
    "poverty": {
        'FIPS_ACS': 'FIPS',
        'S1701_C03_001E': 'pBelowPovertyLevel',
    },
}

# Calculation groups associated with ACS data (% of population in a binned category)
calc_groups = {
    # Positions 1, 3, ..., 13 of the race names: the estimate columns, skipping
    # FIPS at position 0 and every margin-of-error column.
    "race": list(acs_cols["race"].values())[1:14:2],
    "poverty": ["pBelowPovertyLevel", "pAbovePovertyLevel"],
}

# Read the ACS data into a dictionary of dataframes
acs_data = {}

for key, value in f_acs.items():
    print(f"Reading {value}")
    # Two header rows are read; only the mapped columns are kept, and the
    # second header level is then dropped.
    a = pd.read_csv(value, header=[0, 1])[acs_cols[key].keys()]
    a.columns = a.columns.droplevel(1)
    a = a.rename(columns=acs_cols[key])
    acs_data[key] = a

# Create column in poverty data for pAbovePovertyLevel
acs_data['poverty']['pAbovePovertyLevel'] = (
    acs_data['poverty']['pBelowPovertyLevel'].apply(lambda pct_below: 100 - pct_below)
)

# Create additional lists to loop through for calculating pop-weighted exposure
projects = np.unique(b_df2['Project'])
pollutant_cols = ['Delta PM 2.5', 'Delta O3']

Reading /content/drive/MyDrive/gpDept-ResearchDept/LNG Air Pollution/LNG Health - COBRA project/Version 5 analysis/ACSDP5Y2022.DP05_2024-05-09T163456/ACSDP5Y2022.DP05-Data - FIPS Edit.csv
Reading /content/drive/MyDrive/gpDept-ResearchDept/LNG Air Pollution/LNG Health - COBRA project/Version 5 analysis/ACSST5Y2022.S1701_2024-07-03T153513/ACSST5Y2022.S1701-Data-EditFIPS.csv


  a = pd.read_csv(value, header = [0,1])[acs_cols[key].keys()]


## Calculate population-weighted exposure levels

In [36]:
""" Crunch numbers for population-weighted exposure by race/ethnicity """

# Per-project runs, currently disabled:
# by_project_instate_race = calculate_population_weighted_exposure(
#     b_df2, acs_data['race'], projects, pollutant_cols, calc_groups["race"], "State"
# )
# by_project_national_race = calculate_population_weighted_exposure(
#     b_df2, acs_data['race'], projects, pollutant_cols, calc_groups["race"], "National"
# )

# An empty project list is passed, so each call pools every row it is given.
all_projects_race = calculate_population_weighted_exposure(
    a_df, acs_data['race'], [], pollutant_cols, calc_groups["race"], "National"
)

# Same calculation, restricted to rows whose State is Louisiana.
all_projects_LA_race = calculate_population_weighted_exposure(
    a_df[a_df["State"] == "Louisiana"],
    acs_data['race'], [], pollutant_cols, calc_groups["race"], "National"
)

# Same calculation, restricted to rows whose State is Texas.
all_projects_TX_race = calculate_population_weighted_exposure(
    a_df[a_df["State"] == "Texas"],
    acs_data['race'], [], pollutant_cols, calc_groups["race"], "National"
)

In [57]:
# Re-run the Louisiana-only race/ethnicity calculation and display the result.
all_projects_LA_race = calculate_population_weighted_exposure(
    a_df[a_df["State"] == "Louisiana"],
    acs_data['race'], [], pollutant_cols, calc_groups["race"], "National"
)

all_projects_LA_race

pWhite
-125588.57780313925
3002846.586177406
pBlackAA
-64532.44522872463
1686141.7337963823
pAmerIndianAN
-3397.6631901895266
84192.39315619938
pAsian
-4724.700198519031
114877.09058414686
pNativeHawaiianPI
-249.01999093215738
6306.342654503999
pOther
-8743.454400957813
210506.48780646746
pHispanicLatino
-12007.92189288041
282034.95092367416
pWhite
-979823.2160380771
3002846.586177406
pBlackAA
-475116.9308629791
1686141.7337963823
pAmerIndianAN
-24632.606996788414
84192.39315619938
pAsian
-31188.198587968614
114877.09058414686
pNativeHawaiianPI
-1813.8249307640876
6306.342654503999
pOther
-57900.68703422377
210506.48780646746
pHispanicLatino
-77624.04467028583
282034.95092367416


Unnamed: 0,Pollutant,Group,Exposure
0,Delta PM 2.5,pWhite,-0.041823
1,Delta PM 2.5,pBlackAA,-0.038272
2,Delta PM 2.5,pAmerIndianAN,-0.040356
3,Delta PM 2.5,pAsian,-0.041128
4,Delta PM 2.5,pNativeHawaiianPI,-0.039487
5,Delta PM 2.5,pOther,-0.041535
6,Delta PM 2.5,pHispanicLatino,-0.042576
7,Delta O3,pWhite,-0.326298
8,Delta O3,pBlackAA,-0.281778
9,Delta O3,pAmerIndianAN,-0.292575


In [38]:
from pandas import ExcelWriter

# names = ['by_project_instate_race', 'by_project_national_race', 'all_projects_race', 'all_projects_LA_race', 'all_projects_TX_race']
names = ['all_projects_race', 'all_projects_LA_race', 'all_projects_TX_race']

def save_xls(list_dfs, xls_path):
    """
    Write each dataframe in `list_dfs` to its own sheet of the workbook at `xls_path`.

    Sheet titles are taken from the module-level `names` list, matched by
    position. The dataframe index is not written.

    Raises:
    ValueError: if `list_dfs` has more entries than `names`.
    """
    if len(list_dfs) > len(names):
        raise ValueError(
            f"{len(list_dfs)} dataframes given but only {len(names)} sheet names available")

    with ExcelWriter(xls_path) as writer:
        for n, df in enumerate(list_dfs):
            # sheet_name goes by keyword: positional use is deprecated since pandas 2.1.
            df.to_excel(writer, sheet_name=names[n], index=False)

# save_xls([by_project_instate_race, by_project_national_race, all_projects_race, all_projects_LA_race, all_projects_TX_race],
save_xls([all_projects_race, all_projects_LA_race, all_projects_TX_race],
         '/content/drive/MyDrive/gpDept-ResearchDept/LNG Air Pollution/LNG Health - COBRA project/Version 5 analysis/pop_weighted_exposure.results/pop_weighted_exposure.results.race_ethnicity2.b2030-a2030.xlsx')

In [None]:
""" Crunch numbers for population-weighted exposure for population below poverty level """

# Per-project results, restricted to rows whose State equals the project's state.
by_project_instate_pov = calculate_population_weighted_exposure(
    b_df2, acs_data['poverty'], projects, pollutant_cols, calc_groups["poverty"], "State"
)

# Per-project results over every row.
by_project_national_pov = calculate_population_weighted_exposure(
    b_df2, acs_data['poverty'], projects, pollutant_cols, calc_groups["poverty"], "National"
)

# An empty project list is passed below, so each call pools every row it is given.
all_projects_pov = calculate_population_weighted_exposure(
    a_df, acs_data['poverty'], [], pollutant_cols, calc_groups["poverty"], "National"
)

# Same calculation, restricted to rows whose State is Louisiana.
all_projects_LA_pov = calculate_population_weighted_exposure(
    a_df[a_df["State"] == "Louisiana"],
    acs_data['poverty'], [], pollutant_cols, calc_groups["poverty"], "National"
)

# Same calculation, restricted to rows whose State is Texas.
all_projects_TX_pov = calculate_population_weighted_exposure(
    a_df[a_df["State"] == "Texas"],
    acs_data['poverty'], [], pollutant_cols, calc_groups["poverty"], "National"
)

100% (32 of 32) |########################| Elapsed Time: 0:00:05 Time:  0:00:05
100% (32 of 32) |########################| Elapsed Time: 0:01:16 Time:  0:01:16


In [None]:
from pandas import ExcelWriter

names = ['by_project_instate_pov', 'by_project_national_pov', 'all_projects_pov', 'all_projects_LA_pov', 'all_projects_TX_pov']

def save_xls(list_dfs, xls_path):
    """
    Write each dataframe in `list_dfs` to its own sheet of the workbook at `xls_path`.

    Sheet titles are taken from the module-level `names` list, matched by
    position.

    Raises:
    ValueError: if `list_dfs` has more entries than `names`.
    """
    if len(list_dfs) > len(names):
        raise ValueError(
            f"{len(list_dfs)} dataframes given but only {len(names)} sheet names available")

    with ExcelWriter(xls_path) as writer:
        for n, df in enumerate(list_dfs):
            # sheet_name goes by keyword: positional use is deprecated since pandas 2.1.
            # NOTE(review): the index column is written here, while the race/ethnicity
            # writer passes index=False - confirm which is intended.
            df.to_excel(writer, sheet_name=names[n])

# NOTE(review): the file name is tagged "b2023-a2028", but analysis_yr is set to 2030
# for both datasets and the race/ethnicity workbook is tagged "b2030-a2030" - confirm
# the tag before relying on this file name.
save_xls([by_project_instate_pov, by_project_national_pov, all_projects_pov, all_projects_LA_pov, all_projects_TX_pov],
         '/content/drive/MyDrive/gpDept-ResearchDept/LNG Air Pollution/LNG Health - COBRA project/Version 5 analysis/pop_weighted_exposure.results/pop_weighted_exposure.results.poverty.b2023-a2028.xlsx')

In [None]:
# List the column names of the two raw result tables.
print(list(a_df0.columns))
print(list(b_df0.columns))

['ID', 'destindx', 'FIPS', 'State', 'County', 'Base PM 2.5', 'Control PM 2.5', 'Delta PM 2.5', 'Base O3', 'Control O3', 'Delta O3', '$ Total Health Benefits(low estimate)', '$ Total Health Benefits(high estimate)', 'Total Mortality(low estimate)', '$ Total Mortality(low estimate)', 'Total Mortality(high estimate)', '$ Total Mortality(high estimate)', 'PM Mortality, All Cause (low)', '$ PM Mortality, All Cause (low)', 'PM Mortality, All Cause (high)', '$ PM Mortality, All Cause (high)', 'PM Infant Mortality', '$ PM Infant Mortality', 'Total O3 Mortality', '$ Total O3 Mortality', 'O3 Mortality (Short-term exposure)', '$ O3 Mortality (Short term exposure)', 'O3 Mortality (Long-term exposure)', '$ O3 Mortality (Long-term exposure)', 'Total Asthma Symptoms', '$ Total Asthma Symptoms', 'PM Asthma Symptoms, Albuterol use', '$ PM Asthma Symptoms, Albuterol use', 'O3 Asthma Symptoms, Chest Tightness', '$ O3 Asthma Symptoms, Chest Tightness', 'O3 Asthma Symptoms, Cough', '$ O3 Asthma Symptoms, C