# analysis.final.b.step1

This script loads project-specific CSV files from one or more directories, tags each
row with metadata identifying its COBRA batch run and project, merges in metadata
about the source LNG project and the destination county, and calculates a few new
fields for data analysis.

In [1]:
from google.colab import drive
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def load_excel_data(file_path, sheet_name=None):
    """
    Read a worksheet of an Excel workbook into a pandas DataFrame.

    Parameters:
    - file_path (str): Location of the Excel workbook.
    - sheet_name (str or int, default None): Worksheet name or position to read.
      When None, the argument is not forwarded and pandas reads its default
      (first) sheet.

    Returns:
    - DataFrame: The worksheet contents, or None when the workbook could not be
      read. Failures are reported with a printed message, not an exception.
    """
    # Forward sheet_name only when the caller supplied one; passing None on to
    # pandas would mean "all sheets", which is not what None means here.
    read_kwargs = {} if sheet_name is None else {"sheet_name": sheet_name}
    try:
        return pd.read_excel(file_path, **read_kwargs)
    except FileNotFoundError:
        print("File not found. Please provide a valid file path.")
    except Exception as err:
        print("An error occurred:", err)
    return None

def aggregate_csvs_prj(folder_path):
    """
    Combine every per-project results CSV found in a folder into one DataFrame.

    A file is picked up when its name starts with "results" and ends with
    ".csv". The fifth dot-separated piece of the file name is written to an
    "ID" column on every row read from that file.

    Parameters:
    - folder_path (str): Directory to scan (sub-directories are not searched).

    Returns:
    - DataFrame: Rows of all matching files stacked in file-name order. Each
      file keeps its own row index, so index labels repeat across files.

    Raises:
    - ValueError: If the folder holds no matching CSV file.
    - IndexError: If a matching file name has fewer than five dot-separated pieces.
    """
    from warnings import simplefilter

    # NOTE: this alters the process-wide warning filter, not just this call.
    simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

    # os.listdir returns names in arbitrary order; sort them so the combined
    # rows come out in the same order on every run and machine.
    result_files = sorted(
        name
        for name in os.listdir(folder_path)
        if name.startswith("results") and name.endswith(".csv")
    )
    if not result_files:
        raise ValueError(
            f"No 'results*.csv' files found in {folder_path!r}; nothing to aggregate."
        )

    dataframes = []
    for filename in result_files:
        project_id = filename.split(".")[4]  # Extract name piece from filename
        # Skip the line right after the header and the final line of the file
        # (presumably non-data rows in the exported CSVs -- confirm against a
        # sample file). skipfooter is only supported by the python engine.
        df = pd.read_csv(
            os.path.join(folder_path, filename),
            skiprows=[1],
            skipfooter=1,
            engine="python",
        )
        df["ID"] = project_id
        dataframes.append(df)

    # Concatenate dataframes into a single grouped dataframe
    return pd.concat(dataframes)


def load_dirs(input_dirs, dir_meta_cols, FIPS_crosswalk_file):
    """
    Aggregate the results CSVs of several directories into a single DataFrame.

    Parameters:
    - input_dirs (list): Directories to aggregate.
    - dir_meta_cols (list): One dictionary per directory, in the same order as
      input_dirs; every key/value pair becomes a constant column on the rows
      loaded from that directory.
    - FIPS_crosswalk_file (str): Kept in the signature for callers; this
      function does not read the file.

    Returns:
    - agg_df0 (DataFrame): Rows of all directories stacked together, with the
      metadata columns added and underscores in "ID" replaced by spaces.
    """
    per_dir_frames = []

    for position, directory in enumerate(input_dirs):
        frame = aggregate_csvs_prj(directory)
        batch_meta = dir_meta_cols[position]
        # Stamp every row of this directory with its batch-level metadata.
        for column in batch_meta:
            frame[column] = batch_meta[column]
        per_dir_frames.append(frame)

    agg_df0 = pd.concat(per_dir_frames)
    # IDs come from file names, where underscores are used in place of spaces.
    agg_df0['ID'] = agg_df0['ID'].str.replace("_", " ")

    return agg_df0

def load_summarized_demographic_data(path_dict, summary_col):
    """
    Read the CSVs listed in path_dict and collapse their per-age columns into a total.

    Parameters:
    - path_dict (dict): Holds two parallel sequences: 'fname' (CSV paths) and
      'year' (the value written to "Baseline Year" for the matching file).
    - summary_col (str): Name of the column that receives the row-wise sum of
      the columns Age0 through Age99.

    Returns:
    - DataFrame: All files stacked together, with the Age0..Age99 columns
      removed and the summary_col and "Baseline Year" columns appended.
    """
    age_columns = [f"Age{age}" for age in range(100)]

    summarized = []
    for position, csv_path in enumerate(path_dict['fname']):
        table = pd.read_csv(csv_path)
        # Total across all single-year age bins for each row.
        table[summary_col] = table[age_columns].sum(axis=1)
        table['Baseline Year'] = path_dict['year'][position]
        summarized.append(table.drop(columns=age_columns))

    return pd.concat(summarized)


# Result columns for which a per-million-population rate is derived by the
# script. Each entry is used verbatim as a DataFrame column label, so the
# spelling (including punctuation and spacing) must match the column headers
# of the aggregated results exactly.
incidence_vars = [
    'Total Mortality(low estimate)',
    'Total Mortality(high estimate)',
    'PM Mortality, All Cause (low)',
    'PM Mortality, All Cause (high)',
    'PM Infant Mortality',
    'Total O3 Mortality',
    'O3 Mortality (Short-term exposure)',
    'O3 Mortality (Long-term exposure)',
    'Total Asthma Symptoms',
    'PM Asthma Symptoms, Albuterol use',
    'O3 Asthma Symptoms, Chest Tightness',
    'O3 Asthma Symptoms, Cough',
    'O3 Asthma Symptoms, Shortness of Breath',
    'O3 Asthma Symptoms, Wheeze',
    'Total Incidence, Asthma',
    'PM Incidence, Asthma',
    'O3 Incidence, Asthma',
    'Total Incidence, Hay Fever/Rhinitis',
    'PM Incidence, Hay Fever/Rhinitis',
    'O3 Incidence, Hay Fever/Rhinitis',
    'Total ER Visits, Respiratory',
    'PM ER Visits, Respiratory',
    'O3 ER Visits, Respiratory',
    'Total Hospital Admits, All Respiratory',
    'PM Hospital Admits, All Respiratory',
    'O3 Hospital Admits, All Respiratory',
    'PM Nonfatal Heart Attacks',
    'PM Minor Restricted Activity Days',
    'PM Work Loss Days',
    'PM Incidence Lung Cancer',
    'PM HA Cardio Cerebro and Peripheral Vascular Disease',
    'PM HA Alzheimers Disease',
    'PM HA Parkinsons Disease',
    'PM Incidence Stroke',
    'PM Incidence Out of Hospital Cardiac Arrest',
    'PM ER visits All Cardiac Outcomes',
    'O3 ER Visits, Asthma',
    'O3 School Loss Days, All Cause'
]


In [2]:
"""
This script loads project-specific csvs from one or more directories, adds identifying
information to each row marking metadata for each COBRA batch run and specific project,
merges with metadata about the source LNG project and destination county, and
calculates a few new fields for data analysis.

"""

# Method parameters ===========================================================

# Batch-specific metadata to add to each row; "config_id" doubles as the name
# of the batch's results sub-directory under results_dir0.
dir_meta_cols = [{"Analysis Year": 2023, "discount rate": 2, "config_id": "b.finalData.01"},
                 {"Analysis Year": 2030, "discount rate": 2, "config_id": "b.finalData.02"},
                 {"Analysis Year": 2050, "discount rate": 2, "config_id": "b.finalData.03"}
                 ]
# Project-specific metadata to add to each row
prj_meta_cols = ["Project", "Terminal", "Project Status", "DOE NFTA Authorization Status"]

# Manually assigned directories and file paths ================================
# Project data spreadsheet
f0 = '/content/drive/MyDrive/gpDept-ResearchDept/LNG Air Pollution/LNG Health - COBRA project/Version 5 analysis/240610-FinalData-AllProjects.xlsx'

# Metadata spreadsheet
f1 = '/content/drive/MyDrive/gpDept-ResearchDept/LNG Air Pollution/LNG Health - COBRA project/COBRA_LNGHEALTH_Data.xlsx'

# Input spreadsheets and directories
results_dir0 = "/content/drive/MyDrive/gpDept-ResearchDept/LNG Air Pollution/LNG Health - COBRA project/Version 5 analysis"
# "YYYY" is a placeholder that is replaced by each batch's analysis year below
pop_f0 = "/content/drive/MyDrive/gpDept-ResearchDept/LNG Air Pollution/LNG Health - COBRA project/COBRA (from desktop version) - v5.1/default data/default_YYYY_population_data.csv"

# FIPS crosswalk
FIPS_crosswalk_file = "/content/drive/MyDrive/gpDept-ResearchDept/LNG Air Pollution/LNG Health - COBRA project/COBRA (from desktop version) - v5.1/data dictionary/SOURCEINDX to FIPS crosswalk.csv"

# Script ======================================================================

# 0. Automatically assign some fpaths and directories =========================
# One population file and one results directory per batch in dir_meta_cols.
pop_file_info = {'fname': [pop_f0.replace("YYYY", str(meta['Analysis Year'])) for meta in dir_meta_cols],
            'year': [meta['Analysis Year'] for meta in dir_meta_cols]}
input_dirs = [(results_dir0 + '/' + meta["config_id"]) for meta in dir_meta_cols]

# 1. Aggregate CSVs ===========================================================
agg_df0 = load_dirs(input_dirs, dir_meta_cols, FIPS_crosswalk_file)

# 2.1 Merge with project-level metadata =======================================
# Load project-level metadata. load_excel_data only prints and returns None on
# failure, so stop here with a clear message instead of a confusing
# "'NoneType' object is not subscriptable" on the next line.
prj_meta_sheet = load_excel_data(f1, sheet_name = 'LNG Project Data')
if prj_meta_sheet is None:
    raise RuntimeError("Could not load the 'LNG Project Data' sheet from " + f1)
prj_meta = prj_meta_sheet[prj_meta_cols]

# Merge based on ID and shared SC Project Title.
# NOTE: this is an inner join, so result rows whose ID has no matching
# "Project" entry in the metadata sheet are dropped.
agg_df1 = agg_df0.merge(prj_meta, left_on="ID", right_on ="Project")
agg_df1.drop(['ID'], axis=1, inplace=True)

# 2.2 Merge with population and incidence data ================================
pop_data = load_summarized_demographic_data(pop_file_info, 'Total Population')

# Inner join again: rows without a population record for their FIPS code and
# analysis year are dropped.
agg_df2 = (agg_df1
           .merge(
               (pop_data[['FIPS', 'Baseline Year', 'Total Population']]),
               left_on = ['FIPS', 'Analysis Year'],
               right_on = ['FIPS', 'Baseline Year']
))

agg_df2.rename(columns = {'County': 'Destination County', 'State': 'Destination State', 'FIPS': 'Destination FIPS'}, inplace=True)

# 3. Calculate cases per million people
# Build all "<var> PER MILLION" columns in one vectorised step and attach them
# with a single concat. This avoids inserting dozens of columns one at a time
# (which fragments the frame) and leaves agg_df2 unmodified instead of
# aliasing it.
population_millions = agg_df2['Total Population'] / 1000000
per_million = (agg_df2[incidence_vars]
               .div(population_millions, axis=0)
               .add_suffix(' PER MILLION'))
agg_df3 = pd.concat([agg_df2, per_million], axis=1)

# This is helpful for plotting later: fixes the display order of the statuses.
# Any status not listed in `categories` becomes NaN.
agg_df3['Project Status'] = pd.Categorical(agg_df3['Project Status'], categories=['Operating', 'Under Construction', 'Planned'])

In [3]:
"""
Export combined results to CSV
"""
agg_df_f0 = results_dir0 + "/b.finalData.results/b.finalData.01-03.combined_results.csv"
# to_csv does not create missing folders; make sure the output directory
# exists so the export cannot fail after all the processing is done
# (no-op when the folder is already there).
os.makedirs(os.path.dirname(agg_df_f0), exist_ok=True)
# index=False leaves the DataFrame's row index out of the file.
agg_df3.to_csv(agg_df_f0, index=False)