In [1]:
from google.colab import drive
import pandas as pd
import numpy as np

# Pollutant columns used by the helpers below whenever the caller does not
# pass a `pollutants` keyword option.
pollutants_default = ['NOx', 'SO2', 'PM25', 'VOC']
# Crosswalk CSV read by fips_to_sourceindx when no `crosswalk_file` option is
# given. This is a relative path, so it is resolved against the current
# working directory.
crosswalk_file_default = "SOURCEINDX to FIPS crosswalk.csv"

def load_excel_data(file_path, sheet_name=None):
    """
    Read one sheet of an Excel workbook into a pandas DataFrame.

    Parameters:
    - file_path (str): Location of the Excel workbook.
    - sheet_name (str or int, default None): Sheet to read, by name or by
      position. When None, pandas' own default (the first sheet) applies.

    Returns:
    - DataFrame or None: The sheet contents, or None when the workbook is
      missing or cannot be read. A message is printed in both failure cases.
    """
    # Forward sheet_name only when the caller supplied one, so pandas keeps
    # its default behaviour otherwise.
    read_kwargs = {} if sheet_name is None else {'sheet_name': sheet_name}

    try:
        return pd.read_excel(file_path, **read_kwargs)
    except FileNotFoundError:
        print("File not found. Please provide a valid file path.")
    except Exception as err:
        # Deliberately best-effort: report the problem and fall through to None.
        print("An error occurred:", err)
    return None

def fips_to_sourceindx(fips_code, **options):
    """
    Look up the SOURCEINDX value paired with a FIPS code in a crosswalk CSV.

    The crosswalk file is read on every call and must contain 'FIPS' and
    'SOURCEINDX' columns. `fips_code` is compared to the 'FIPS' column with
    `==`, so it has to be of the same type pandas parsed that column as.

    Parameters:
    - fips_code: FIPS code to look up.
    - crosswalk_file (str, optional keyword): Path to the crosswalk CSV.
      Falls back to the module-level `crosswalk_file_default`.

    Returns:
    - The 'SOURCEINDX' value of the first matching row, or None (after
      printing a message) when no row matches.
    """
    crosswalk_path = (
        options['crosswalk_file'] if 'crosswalk_file' in options
        else crosswalk_file_default
    )
    crosswalk = pd.read_csv(crosswalk_path)

    matches = crosswalk.loc[crosswalk['FIPS'] == fips_code]
    if matches.empty:
        print(f"No sourceindx found for FIPS code {fips_code}.")
        return None

    # Several rows may share a FIPS code; the first one wins.
    return matches.iloc[0]['SOURCEINDX']


def create_mask(df, FIPS, **options):
    """
    Build a boolean row selector for the emissions rows that belong to one
    county, one TIER1/TIER2/TIER3 combination and one stack-height index.

    Parameters:
    - df (DataFrame): Emissions data with 'TIER1', 'TIER2', 'TIER3',
      'sourceindx' and 'typeindx' columns.
    - FIPS: County FIPS code; translated to a source index through
      fips_to_sourceindx.
    - tiers (list, optional keyword): [TIER1, TIER2, TIER3] values to match.
      Default is [2, 3, 1].
    - typeindx (int, optional keyword): Stack-height index to match.
      Default is 2.
    - All keyword options (for example crosswalk_file) are also forwarded to
      fips_to_sourceindx.

    Returns:
    - Series: A boolean mask that is True on rows meeting every condition.
      When the FIPS code has no source index the mask is all False.
    """
    typeindx = options.get('typeindx', 2)
    tiers = options.get('tiers', [2, 3, 1])

    sector_match = (
        (df['TIER1'] == tiers[0])
        & (df['TIER2'] == tiers[1])
        & (df['TIER3'] == tiers[2])
    )
    county_match = df['sourceindx'] == fips_to_sourceindx(FIPS, **options)
    stack_match = df['typeindx'] == typeindx

    return sector_match & county_match & stack_match

def apply_modifications(df0, df_mods, **options):
    """
    Return a copy of df0 with the per-project emissions in df_mods added on.

    For every row of df_mods, the baseline rows selected by create_mask (the
    project's county plus the tier and stack-height options) have each listed
    pollutant increased by that row's value for the pollutant. df0 itself is
    never modified.

    Parameters:
    - df0 (DataFrame): Baseline emissions data.
    - df_mods (DataFrame): One row per adjustment, with a 'FIPS' column and
      one column per pollutant.
    - pollutants (list of str, optional keyword): Pollutant columns to adjust.
      Falls back to the module-level `pollutants_default`.
    - All keyword options are also forwarded to create_mask.

    Returns:
    - DataFrame: The adjusted copy of df0.
    """
    pollutants = options['pollutants'] if 'pollutants' in options else pollutants_default

    # Work on a copy so the caller's baseline stays untouched.
    adjusted = df0.copy()

    for mod_index, adjustment in df_mods.iterrows():
        # Rows of the baseline that this project's emissions are added to.
        target_rows = create_mask(adjusted, adjustment['FIPS'], **options)

        if target_rows.any():
            for pollutant in pollutants:
                adjusted.loc[target_rows, pollutant] += adjustment[pollutant]
        else:
            print(f"No matching rows found for adjustment at index {mod_index}. Skipping...")

    return adjusted


def load_baseline_data(baseline_info, year, mode='simple'):
    """
    Pick a baseline emissions file for an analysis year and load it.

    Parameters:
    - baseline_info (DataFrame): Baseline catalogue with columns 'fname'
      (CSV path) and 'year'.
    - year (int): The year for which baseline data is required.
    - mode (str): How the baseline row is chosen:
        'simple'   - always the first row of baseline_info;
        'align'    - the first row whose year equals `year`;
        'stepwise' - the row with the latest year that is <= `year`,
                     regardless of the row order in baseline_info.

    Returns:
    - list: [DataFrame, year] - the loaded baseline emissions and the year of
      the baseline row that was selected.

    Raises:
    - ValueError: If `mode` is unknown, or if no row qualifies in 'align' or
      'stepwise' mode.
    """
    if mode == "simple":
        selected = baseline_info.iloc[0]
    elif mode == "align":
        candidates = baseline_info[baseline_info['year'] == year]
        if candidates.empty:
            raise ValueError("No baseline data found for the input year.")
        selected = candidates.iloc[0]
    elif mode == "stepwise":
        candidates = baseline_info[baseline_info['year'] <= year]
        if candidates.empty:
            raise ValueError("No baseline data found for the input year or earlier.")
        # Order by year before taking the last row so the latest eligible
        # baseline is chosen even when baseline_info is not sorted. The stable
        # sort keeps the original pick for already-sorted input.
        selected = candidates.sort_values('year', kind='stable').iloc[-1]
    else:
        raise ValueError("Invalid mode. Mode must be one of 'simple', 'align', or 'stepwise'.")

    return [pd.read_csv(selected['fname']), selected['year']]


def filter_lng_data_by_year(lng_timeseries, year):
    """
    Select the rows of a project time series whose 'Year' equals `year`.

    Parameters:
    - lng_timeseries (DataFrame): LNG project time series with a 'Year' column.
    - year (int): The year to keep.

    Returns:
    - DataFrame: An independent copy of the matching rows (possibly empty).
    """
    in_year = lng_timeseries['Year'] == year
    return lng_timeseries.loc[in_year].copy()


def build_timeseries0(lng_data, project_lifetime=30, **options):
    """
    Build a time series DataFrame with one row per year per project over the
    project's operating lifetime (without any attenuation applied).

    Each project contributes the years start_year .. start_year +
    project_lifetime - 1, and every one of those rows carries the project's
    pollutant values unchanged.

    Parameters:
    - lng_data (DataFrame): LNG project data with 'start_year',
      'SC Project Title' and one column per pollutant.
    - project_lifetime (int, optional): The duration of the project's
      operating lifetime in years. Default is 30.
    - pollutants (list of str, optional keyword): Pollutant columns to carry
      over. Falls back to the module-level `pollutants_default`.

    Returns:
    - DataFrame: Indexed by 'ProjectID' (the row label of the project in
      lng_data) with columns 'Year', 'SC Project Title' and the pollutants.
      An empty frame with that same layout is returned when lng_data has no
      rows.
    """
    pollutants = options['pollutants'] if 'pollutants' in options else pollutants_default

    yearly_frames = []
    for project_id, project in lng_data.iterrows():
        # Operating years run from start_year up to, but not including,
        # start_year + project_lifetime.
        first_year = project['start_year']
        end_year = first_year + project_lifetime

        frame = pd.DataFrame({'Year': range(int(first_year), int(end_year))})
        frame['ProjectID'] = project_id
        frame['SC Project Title'] = project['SC Project Title']
        frame.set_index('ProjectID', inplace=True)

        # The same pollutant value is repeated for every operating year.
        for pollutant in pollutants:
            frame[pollutant] = project[pollutant]

        yearly_frames.append(frame)

    if not yearly_frames:
        # pd.concat raises ValueError on an empty list; hand back an empty
        # frame with the expected layout instead.
        return pd.DataFrame(
            columns=['Year', 'SC Project Title', *pollutants],
            index=pd.Index([], name='ProjectID'),
        )

    return pd.concat(yearly_frames)



def attenuate_data(lng_timeseries, attenuations, **options):
    """
    Attenuate the first n rows of data for each project in lng_timeseries.

    Rows are grouped by 'ProjectID'. Within each project the i-th row (in the
    order the rows appear) has its pollutant columns multiplied by
    attenuations[i]; rows beyond len(attenuations) are left unchanged. The
    rows of each project are therefore assumed to be in chronological order.

    Parameters:
    - lng_timeseries (DataFrame): LNG project time series, groupable by
      'ProjectID'.
    - attenuations (list of floats): Factors applied to the first n rows of
      each project, where n is the length of attenuations.
    - pollutants (list of str, optional keyword): Columns to scale. Falls back
      to the module-level `pollutants_default`.

    Returns:
    - DataFrame: A new frame with the attenuated values, projects concatenated
      in group order. For input without any rows an empty copy is returned.
    """
    pollutants = options['pollutants'] if 'pollutants' in options else pollutants_default
    num_attenuations = len(attenuations)

    modified_data = []
    for _, project_data in lng_timeseries.groupby('ProjectID'):
        project_copy = project_data.copy()
        # Boolean column selector, computed once per project.
        pollutant_columns = project_copy.columns.isin(pollutants)

        # A project may have fewer rows than there are factors.
        for i in range(min(len(project_copy), num_attenuations)):
            project_copy.iloc[i, pollutant_columns] *= attenuations[i]
        modified_data.append(project_copy)

    if not modified_data:
        # pd.concat raises ValueError on an empty list; nothing to attenuate.
        return lng_timeseries.copy()

    return pd.concat(modified_data)


def build_timeseries(lng_data, prj_lifetime=30, attenuations=None, **kwargs):
    """
    Build a timeseries of LNG project pollutant emissions data with possible attenuation

    Parameters:
    - lng_data (DataFrame): DataFrame containing LNG project data. Must include
      'SC Project Title', 'FIPS', 'start_year' and the pollutant columns.
    - prj_lifetime (int, optional): The duration of the project's operating lifetime in years. Default is 30.
    - attenuations (list of floats, optional): List of attenuation factors to apply to the first n years of data
                                      for each project, where n is the length of attenuations.
                                      Default (None) means [1.0], i.e. no attenuation.
    - pollutants (list of strings, optional): List of pollutants to include as columns in output dataframe (must also exist in lng_data)

    Returns:
    - DataFrame: Time series DataFrame with one row per year per project over the project's operating lifetime
      (with possible attenuation), plus the project's 'FIPS'. The merge
      discards the 'ProjectID' index and yields a fresh integer index.
    """
    if attenuations is None:
        attenuations = [1.0]

    # Use the lng_data argument throughout; the function must not depend on a
    # module-level variable happening to hold the same frame.
    timeseries = build_timeseries0(lng_data, prj_lifetime, **kwargs)
    attenuated = attenuate_data(timeseries, attenuations, **kwargs)

    # Merge this back with FIPS data (joins on the shared 'SC Project Title' column)
    return attenuated.merge(lng_data[['SC Project Title', 'FIPS']])



def generate_command_string(fid, baseline_emissions_year, analysis_year, discount_rate,
                            configID, pathlist):
    """
    Assemble the command line for one COBRA console run.

    Parameters:
    - fid: Identifier embedded in the control-scenario and results file names.
    - baseline_emissions_year (int): Year substituted for 'YYYY' in the
      emissions baseline path.
    - analysis_year (int): Year substituted for 'YYYY' in the population,
      incidence and valuation paths.
    - discount_rate (int or float): Value passed to --discountrate as-is.
    - configID (str): Configuration ID; used as the sub-folder name and in the
      control-scenario and results file names.
    - pathlist (list of str): Five entries, in this order:
        [0] emissions baseline file template (with 'YYYY'),
        [1] root folder holding the per-configuration sub-folders,
        [2] population file template (with 'YYYY'),
        [3] baseline incidence file template (with 'YYYY'),
        [4] valuation file template (with 'YYYY').

    Returns:
    - str: Command string for COBRA.
    """
    def with_year(template, year):
        # Fill the 'YYYY' placeholder of a path template.
        return template.replace('YYYY', str(year))

    # Control scenario and results both live in <root>\<configID>\.
    scenario_dir = f"{pathlist[1]}\\{configID}"

    flagged_paths = [
        ('-b', with_year(pathlist[0], baseline_emissions_year)),
        ('-c', f"{scenario_dir}\\emissions.{configID}.{fid}.csv"),
        ('-p', with_year(pathlist[2], analysis_year)),
        ('-i', with_year(pathlist[3], analysis_year)),
        ('-v', with_year(pathlist[4], analysis_year)),
        ('-o', f"{scenario_dir}\\results.{configID}.{fid}.csv"),
    ]

    pieces = [
        '"C:\\Program Files\\COBRA\\cobra_console.exe"',
        '-d "C:\\Program Files\\COBRA\\data\\cobra.db"',
    ]
    pieces.extend(f'{flag} "{path}"' for flag, path in flagged_paths)
    pieces.append(f'--discountrate {str(discount_rate)}')

    return ' '.join(pieces)



In [2]:
from google.colab import drive
import re
# drive.mount('/content/drive')

# Method parameters ===========================================================
# Scenario settings
experiment_timeframe = np.arange(2023, 2051)   # Analysis years 2023 through 2050 inclusive (np.arange excludes the stop value)
prj_lifetime = 35            # Assumed project lifetime in years
baseline_setting = 'stepwise'       # Mode passed to load_baseline_data: "simple", "stepwise", or "align"
discount_rate = 2          # Passed to the --discountrate flag of the COBRA command; for 2%, type 2

# Pollutant modification settings
pollutants = ['NOx', 'SO2', 'PM25', 'VOC']   # Pollutant columns added to the baseline
pollutant_tiers = [2, 3, 1]  # [TIER1, TIER2, TIER3] values of the baseline rows that receive the emissions
stackheight_index = 2        # 2 = LOW stack height
attenuation = [1.0]          # Factors applied to the first n years of data for each project ([1.0] = no change)

# ID settings
config_id = 'a.finalData.03'   # experiment (a or b), data type, variant

# Test mode: when True, no CSV or .bat files are written
test_mode = False

# Input spreadsheets and directories ==========================================
# LNG projects and pollutants
f0 = '/content/drive/MyDrive/gpDept-ResearchDept/LNG Air Pollution/LNG Health - COBRA project/Version 5 analysis/240610-FinalData-Operating.xlsx'

# Baseline file info and corresponding year (one row per available baseline file)
baseline_info = pd.DataFrame({'fname': ['/content/drive/MyDrive/gpDept-ResearchDept/LNG Air Pollution/LNG Health - COBRA project/COBRA (from desktop version) - v5.1/emissions/Emissions_2023.csv',
                                        '/content/drive/MyDrive/gpDept-ResearchDept/LNG Air Pollution/LNG Health - COBRA project/COBRA (from desktop version) - v5.1/emissions/Emissions_2028.csv'
                                        ],
                              'year': [2023, 2028,
                                       ]})

# Crosswalk CSV used to translate county FIPS codes to source indices
FIPS_crosswalk_file = "/content/drive/MyDrive/gpDept-ResearchDept/LNG Air Pollution/LNG Health - COBRA project/COBRA (from desktop version) - v5.1/data dictionary/SOURCEINDX to FIPS crosswalk.csv"

# Cloud output folder (one sub-folder per config_id)
cloud = "/content/drive/MyDrive/gpDept-ResearchDept/LNG Air Pollution/LNG Health - COBRA project/Version 5 analysis/" + config_id + '/'

# Local Windows filepaths used when generating the batch (.bat) script.
# The order matters: generate_command_string reads them by position as
# [0] emissions baseline, [1] root folder for scenario inputs and results,
# [2] population, [3] baseline incidence, [4] valuation.
# 'YYYY' is a placeholder that is replaced by a year.
local_paths = ["C:\\Program Files\\COBRA\\input files\\emissions\\Emissions_YYYY.csv",
               "C:\\Users\\cobrauser\\Documents",
               "C:\\Program Files\\COBRA\\input files\\default data\\default_YYYY_population_data.csv",
               "C:\\Program Files\\COBRA\\input files\\default data\\default_YYYY_incidence_data.csv",
               "C:\\Program Files\\COBRA\\input files\\default data\\default_YYYY_valuation_data.csv"
               ]

# Script ======================================================================
# For every analysis year: load a baseline, add that year's LNG project
# emissions to it, save the result as the control scenario, and collect the
# matching COBRA command line. The commands are finally written to a .bat file.

# Load in LNG project data from Excel (sheet 'input')
prj_data = load_excel_data(f0, 'input')

# Create LNG pollutant emissions timeseries
prj_timeseries = build_timeseries(
    prj_data, prj_lifetime, attenuation, pollutants=pollutants
)

# Initialize a list to hold all control scenarios (one per year)
modified_dfs = []
# One COBRA command line per year
bash_commands = []
# NOTE(review): fnames is never filled in the code below
fnames = []

# For each year in the experiment timeframe, create a control scenario
for i in experiment_timeframe:
    print('\n Creating control scenario for year {} (as compared to baseline below)'.format(i))

    # Load baseline data according to setting; returns [DataFrame, baseline year]
    baseline_data = load_baseline_data(baseline_info, i, baseline_setting)
    # print(baseline_data[0].iloc[121721])
    b_em_year = baseline_data[1]

    # Get pollutant emissions from LNG terminals for year=i
    mods = filter_lng_data_by_year(prj_timeseries, year=i)

    # Add these emissions to the baseline data file according to tier and stackheight for the specified pollutants at the county for each project
    modified_data = apply_modifications(baseline_data[0],
                                        mods,
                                        crosswalk_file=FIPS_crosswalk_file,
                                        pollutants = pollutants,
                                        tiers = pollutant_tiers,
                                        typeindx = stackheight_index)

    # Print how the control scenario differs from the baseline scenario
    # ("self" columns are the control scenario, "other" columns the baseline)
    print(modified_data.compare(baseline_data[0], keep_equal=True))

    # Append to list
    modified_dfs.append(modified_data)

    # If not test mode, save as csv named emissions.<config_id>.<year>.csv
    if not test_mode:
      modified_data.to_csv(cloud + '.'.join(['emissions', config_id, str(i), 'csv']))

    # Latest of the listed years that does not exceed i (None if i is before 2023);
    # it selects the population, incidence and valuation files for the run
    analysis_year = max([year for year in [2023, 2028, 2030, 2035, 2040, 2045, 2050] if year <= i], default=None)
    # Generate the Windows command line for this year's COBRA run
    bash_cmd = generate_command_string(i, b_em_year, analysis_year, discount_rate,
                            config_id, local_paths)

    # Append to list
    bash_commands.append(bash_cmd)

# One command per line
bash_out = '\n'.join(bash_commands)
print(bash_out)

# If not test mode, write the commands to execute.<config_id>.bat in the cloud folder
if not test_mode:
  with open (cloud + '.'.join(['execute', config_id, 'bat']), 'w', encoding='utf-8') as ebat:
    ebat.write(bash_out)



 Creating control scenario for year 2023 (as compared to baseline below)
                NOx                      SO2                   PM25  \
               self        other        self      other        self   
39392    326.624823   287.394823   36.691458   2.451458   22.817776   
113768  8766.448541    26.381099  154.868905   0.330749  636.834297   
121721   297.463628    18.163628    0.100042   0.100042  125.978310   
256461  2966.595883  2927.043639   74.183852  54.960775  320.593310   
277287  3575.318444    33.918444   49.893776   0.503776   92.108597   

                           VOC              
             other        self       other  
39392    13.287776   46.244125   12.734125  
113768    2.040380  623.786787   12.816741  
121721    1.778310   52.563380    1.663380  
256461  259.193887  214.944255  188.626947  
277287    6.808597  353.971475    0.861475  

 Creating control scenario for year 2024 (as compared to baseline below)
                NOx                    

In [None]:
# Write the collected COBRA commands to execute.<config_id>.bat in the cloud
# folder, unless running in test mode.
if not test_mode:
    with open(
        cloud + '.'.join(('execute', config_id, 'bat')),
        mode='w',
        encoding='utf-8',
    ) as ebat:
        ebat.write(bash_out)