# Definitions for functions:
* Requirements: numpy, matplotlib, pandas, scipy (only needed when `savgol=True`), plus an Excel reader engine for pandas (e.g. openpyxl)

In [1]:
from typing import Tuple
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

def _calculate_target(
        path: str, column: str, method: str, verbose: bool = False,
        adjacency_slope: float = 0.1,
        duration: float = 10,
        temp_threshold: float = 3.5,
        init_tos_buffer: float = 1.0,
        savgol: bool = True
) -> float:
    """
    General version of target value calculator.

    Args:
        path (str): Path to individual GC Excel file.
        column (str): Column name to calculate the target values for.
        method (str): Method to calculate the target value. Options are 'delta', 'initial value', 'final value',
            'initial slope', 'final slope', 'overall slope', 'AUC'.
        verbose (bool): If True, additionally print the calculated target value.
        adjacency_slope (float): Slope threshold for initial and final slope calculations.
        duration (float): Duration to calculate the final index.
        temp_threshold (float): Temperature threshold for initial index calculation.
        init_tos_buffer (float): Initial time-on-stream buffer for index calculation.
        savgol: Whether to apply Savitzky-Golay filter to the data when calculating initial and final slopes. Defaults to True.

    Returns:
        float: Calculated target value. It is returned regardless of `verbose`.

    Raises:
        ValueError: If `method` is not one of the supported options. A missing `column` is reported
            (also as ValueError) by `_extract_indices_target`, which reads and validates the workbook.
    """
    supported_methods = (
        'delta', 'initial value', 'final value',
        'initial slope', 'final slope', 'overall slope', 'AUC',
    )
    # Fail fast, before any file access, instead of hitting an unbound `target` further down.
    if method not in supported_methods:
        raise ValueError(f"Method '{method}' is not supported; choose one of {list(supported_methods)}")

    # Get proper indices and values according to 'method'.
    # The helper reads the 'Data' sheet and checks that `column` exists, so the workbook is not read again here.
    tos, temp, col_val, initial_index, final_index, selected_index = \
        _extract_indices_target(path, column, duration, temp_threshold, init_tos_buffer, method, adjacency_slope)

    if method == 'delta':
        target = col_val[final_index] - col_val[initial_index]
    elif method == 'initial value':
        target = col_val[initial_index]
    elif method == 'final value':
        target = col_val[final_index]
    elif method in ('initial slope', 'final slope'):
        # Both slopes use the same linear fit; they differ only in the index pair
        # that `_extract_indices_target` selected for the given method.
        target = _plot_linear_line_fitting(tos[initial_index], tos[final_index], col_val[initial_index],
                                           col_val[final_index], tos, col_val, savgol, plot=False, show=False)
    elif method == 'overall slope':
        target = (col_val[final_index] - col_val[initial_index]) / (tos[final_index] - tos[initial_index])
    else:  # method == 'AUC'
        # Area under the curve by the trapezoidal rule. NumPy 2.0 renamed `trapz` to `trapezoid`;
        # prefer the new name and fall back to the old one on earlier NumPy releases.
        trapezoid = getattr(np, 'trapezoid', None) or np.trapz
        target = trapezoid(col_val[selected_index], tos[selected_index])

    if verbose:
        print(f"{column}->{method}: {target:.4f}")

    return target
    
def _extract_indices_target(
        path: str, column: str, duration: float = 10.0, temp_threshold: float = 3.5, init_tos_buffer: float = 1.0,
        method: str = 'delta', adjacency_slope: float = 0.1
) -> Tuple[pd.Series, pd.Series, pd.Series, int, int, np.ndarray]:
    """
    Read one workbook, clean its time/temperature/target columns and locate the evaluation window.

    Args:
        path (str): The path to the Excel file.
        column (str): The column name to be processed.
        duration (float): The duration to calculate the final index.
        temp_threshold (float): Temperature threshold for initial index calculation. Must be positive.
        init_tos_buffer (float): Initial time-on-stream buffer for index calculation.
        method (str): Method to calculate the target value.
        adjacency_slope (float): Slope threshold for initial and final slope calculations.

    Returns:
        Tuple[pd.Series, pd.Series, pd.Series, int, int, np.ndarray]:
            - Time on stream (tos) series (sorted, de-duplicated, index reset).
            - Temperature series.
            - Column values series.
            - Initial index.
            - Final index.
            - Selected index array (initial..final inclusive, computed before any slope adjustment).

    Raises:
        ValueError: If `temp_threshold` is not positive, or `column` is not in the 'Data' sheet.
        IndexError: If no data point satisfies one of the window conditions.
    """
    # Validate arguments before touching the file; a real exception (not `assert`) survives `python -O`.
    if not temp_threshold > 0:
        raise ValueError("temp_threshold should be positive.")

    # Read the Excel file and extract the reference temperature
    constants = pd.read_excel(path, sheet_name='Constants')
    temp_ref = float(constants[constants['Variable'] == 'Reaction Temperature']['Value'].values[0])

    # Read the Excel file and fill NaN values with 0
    data = pd.read_excel(path, sheet_name='Data').fillna(0)

    # Check if the specified column exists in the DataFrame
    if column not in data.columns:
        raise ValueError(f"Keyword '{column}' is not included in {data.columns.tolist()}")

    # Sort by the first column whose name contains 'Time', then pick the time and temperature columns.
    data_sorted = data.sort_values(by=data.filter(like='Time').columns[0])
    tos = data_sorted.filter(like='Time').iloc[:, 0]
    temp = data_sorted.filter(like='Temperature').iloc[:, 0]
    # Exact lookup: a substring match could return another column whose name merely contains `column`.
    col_val = data_sorted[column]

    # Detect and remove duplicates based on 'tos' and 'temp'; sometimes identical pair of data points are included.
    unique_rows = pd.DataFrame({
            'Time': tos,
            'Temperature': temp,
            column: col_val
        }).drop_duplicates(subset=['Time', 'Temperature']).reset_index(drop=True)
    tos = unique_rows['Time']
    temp = unique_rows['Temperature']
    col_val = unique_rows[column]

    # Find the initial index where the temperature is close to the reference temperature and time is non-negative
    near_reference = np.abs(temp - temp_ref) <= temp_threshold
    non_negative_time = tos >= 0
    initial_index = _edge_position(
        near_reference & non_negative_time,
        f"temperature is within {temp_threshold} of the reference {temp_ref} at non-negative time"
    )

    # Adjust the initial index to account for the initial TOS buffer
    initial_index = _edge_position(
        tos >= tos[initial_index] + init_tos_buffer,
        f"time is at least {init_tos_buffer} after reaching the reference temperature"
    )

    # Find the final index based on the duration from the initial index
    final_index = _edge_position(
        tos >= tos[initial_index] + duration,
        f"time is at least {duration} after the initial point"
    )

    # Find the selected indices within the initial and final index range
    selected_index = np.arange(initial_index, final_index + 1)

    # Modify indices according to the given `method` argument for 'plot_slope'.
    # These adjustments are best-effort: when no suitable point exists the index is left as computed above.
    if method == 'initial slope':
        # use the same initial_index; choose the final index which is close, in tos, to the initial index
        try:
            final_index = _edge_position(
                tos >= tos[initial_index] + adjacency_slope,
                f"time is at least {adjacency_slope} after the initial point"
            )
        except IndexError as e:
            print(f'{e} `final_index` is left unchanged for {method}.')
    elif method == 'final slope':
        # use the same final_index; choose the initial index which is close, in tos, to the final index
        try:
            initial_index = _edge_position(
                tos <= tos[final_index] - adjacency_slope,
                f"time is at least {adjacency_slope} before the final point",
                last=True
            )
        except IndexError as e:
            print(f'{e} `initial_index` is left unchanged for {method}.')

    return tos, temp, col_val, initial_index, final_index, selected_index


def _edge_position(mask, description: str, last: bool = False) -> int:
    """Return the first (or, with `last=True`, the last) position where boolean `mask` is True.

    Raises:
        IndexError: If `mask` has no True entry; the message names the unmet condition via `description`.
    """
    positions = np.flatnonzero(np.asarray(mask))
    if positions.size == 0:
        raise IndexError(f"No data point found where {description}.")
    return int(positions[-1 if last else 0])

def _plot_linear_line_fitting(
        t_init, t_final, y_init, y_final, tos, col_val, savgol: bool = True, plot: bool = True, show: bool = False,
        ax: "plt.Axes | None" = None
) -> float:
    """
    Fit a straight line to the data points between t_init and t_final and return its slope.

    The fit uses every sample from the one closest to `t_init` through the one closest to
    `t_final` (inclusive), located by position in `tos`.

    Args:
        t_init (float): Initial time.
        t_final (float): Final time.
        y_init (float): Initial value (only used to mark the point when plotting).
        y_final (float): Final value (only used to mark the point and place the slope label when plotting).
        tos (array-like): Time on stream values, e.g. a pd.Series.
        col_val (array-like): Column values, same length as `tos`.
        savgol: Whether to apply Savitzky-Golay filter to the data when calculating initial and final slopes. Defaults to True.
        plot (bool): Whether to plot the fitted line.
        show (bool): Whether to display the plot.
        ax (plt.Axes): Matplotlib Axes object to plot on. When None, the pyplot module-level functions are used.

    Returns:
        float: Slope of the fitted line
    """
    # Work on plain float arrays so that slicing is positional for any array-like input.
    times = np.asarray(tos, dtype=float)
    values = np.asarray(col_val, dtype=float)

    if savgol:
        # Apply Savitzky-Golay filter to smooth the data
        from scipy.signal import savgol_filter
        # NOTE(review): the window is min(len, 10); some SciPy releases may reject an even window — confirm.
        values = savgol_filter(values, window_length=min(len(values), 10), polyorder=2)

    # Positions of the samples closest to t_init and t_final; argmin is always a valid position,
    # so no clamping to the array bounds is needed.
    first = int(np.argmin(np.abs(times - t_init)))
    last = int(np.argmin(np.abs(times - t_final)))
    t_fit = times[first:last + 1]
    y_fit = values[first:last + 1]

    # Only resolve a drawing target when something is actually drawn.
    canvas = None
    if plot:
        canvas = plt if ax is None else ax
        # Plot data points used for fitting
        canvas.scatter(t_fit, y_fit, c='blue', s=5, label='for fitting')

    # Perform linear fitting
    coeffs = np.polyfit(t_fit, y_fit, 1)

    if plot:
        # Generate points for plotting the fitted line, extended by one time unit on each side
        x_plot = np.linspace(t_init - 1, t_final + 1, 100)
        y_plot = np.poly1d(coeffs)(x_plot)

        # Plot the fitted line
        canvas.plot(x_plot, y_plot, c='b', alpha=0.5, label='linear fit')
        # Plot the initial and final points that define data range for fitting
        canvas.scatter([t_init, t_final], [y_init, y_final], color='orange', edgecolors='gray')
        # Annotate the slope value
        canvas.text(t_final, y_final, f'slope={coeffs[0]:.2f}')

    if show:
        plt.show()
    return coeffs[0]

# Example code to calculate a target metric for a given excel file

In [4]:
# Excel file of the individual GC run to evaluate.
path = '0p1_RhWI_0p02mg_500C_20250317_UCSBRR.xlsx'

# Options for the column (second argument):
#   'CO2 Conversion (%)', 'CH4 Net Production Rate (mol/molRh/s)',
#   'CO Net Production Rate (mol/molRh/s)', 'Selectivity to CO (%)'
# Options for the method (third argument):
#   'delta', 'initial value', 'final value', 'initial slope', 'final slope', 'overall slope', 'AUC'
# Changing any of the numeric settings or `savgol` below is not recommended, for consistency.
_calculate_target(
    path,
    'CO Net Production Rate (mol/molRh/s)',
    'AUC',
    verbose=True,
    savgol=False,
    init_tos_buffer=0.5,
    temp_threshold=3.5,
    duration=10,
    adjacency_slope=1.0,
)

CO Net Production Rate (mol/molRh/s)->AUC: 50.5502
