In [None]:
from appgeopy import *
from my_packages import *
from signaltools import *


# ______________________________________________________________________
def analyze_time_series(
    series, prominence=None, smoothing_window=None, polyorder=3, min_distance=None, detrend_degree=3
):
    """
    Detect peaks and troughs in a time series after detrending and smoothing it.

    The series is detrended, the detrended signal is smoothed, peaks and troughs
    are detected on the smoothed signal and then refined. The detected positions
    are finally mapped back onto the original (unsmoothed) values.

    Parameters:
        series (pd.Series): Time-series data with a datetime index.
        prominence (float): Prominence value for peak detection. If None, computed
            dynamically from `series`. An explicit 0 is passed through unchanged.
        smoothing_window (int): Window length for Savitzky-Golay smoothing. If None
            (or 0), computed dynamically from the length of the detrended series.
        polyorder (int): Polynomial order for Savitzky-Golay smoothing.
        min_distance (int): Minimum distance between alternating peaks and troughs.
            If None (or 0), computed dynamically from `series`.
        detrend_degree (int): Degree of polynomial for detrending the series.

    Returns:
        tuple: A 7-element tuple, in this order:
            - refined_peaks: peak positions returned by `refine_peaks_troughs`
              (used here as positional indexers into the data and the index).
            - refined_troughs: trough positions, same convention as above.
            - final_peaks (pd.Series): original values at the peak positions,
              indexed by the matching dates.
            - final_troughs (pd.Series): original values at the trough positions,
              indexed by the matching dates.
            - smoothed_series: smoothed detrended signal with the trend added back.
            - original_data: `series.values`.
            - dates: `series.index`.
    """
    original_data = series.values
    dates = series.index

    # Step 1: Detrend the series
    trend, detrended_series = get_seasonal_and_trend_data(series, detrend_degree)

    # Step 2: Smooth the detrended data
    # Any falsy window (None or 0) falls back to the dynamically computed one.
    smoothing_window = smoothing_window or compute_smoothing_window(len(detrended_series))
    smoothed_series = smooth_time_series(detrended_series, smoothing_window, polyorder)

    # Step 3: Detect peaks and troughs
    # Any falsy distance (None or 0) falls back to the dynamically computed one.
    min_distance = min_distance or compute_dynamic_distance(series)
    # Only None triggers the dynamic default here: `prominence or ...` would
    # silently discard an explicitly requested prominence of 0.
    # NOTE(review): presumably forwarded to a peak finder where 0 is a legal
    # threshold (e.g. scipy.signal.find_peaks) -- confirm in detect_peaks_troughs.
    if prominence is None:
        prominence = compute_dynamic_prominence(series)
    peaks, troughs = detect_peaks_troughs(smoothed_series, prominence, distance=min_distance)

    # Step 4: Refine peaks and troughs
    refined_peaks, refined_troughs = refine_peaks_troughs(peaks, troughs, smoothed_series, min_distance)

    # Final output: report the extrema on the original (unsmoothed) values
    final_peaks = pd.Series(original_data[refined_peaks], index=dates[refined_peaks])
    final_troughs = pd.Series(original_data[refined_troughs], index=dates[refined_troughs])

    # Add the trend back so the smoothed curve is comparable with the original data
    smoothed_series = smoothed_series + trend

    return refined_peaks, refined_troughs, final_peaks, final_troughs, smoothed_series, original_data, dates


# ______________________________________________________________________
def plot_results(dates, original_data, smoothed_data, peaks, troughs, title):
    """
    Draw the original and smoothed series and mark the detected peaks and troughs.

    Each extremum is drawn as a coloured square with a small black dot on top and
    is annotated with its index (above the marker for peaks, below for troughs).

    Parameters:
        dates (pd.DatetimeIndex): Dates corresponding to the data.
        original_data (np.array): Original time-series data.
        smoothed_data (np.array): Smoothed time-series data.
        peaks (list): Detected peaks indices.
        troughs (list): Detected troughs indices.
        title (str): Plot title.

    Returns:
        matplotlib.figure.Figure: The figure object for saving or further processing.
    """
    # Base figure size and the font scaling derived from it
    base_width, base_height = visualize.BASE_SIZE
    scale = visualize.calculate_scaling_factor(base_width, base_height)

    # The figure is 1.5x wider than the base size
    fig, ax = plt.subplots(figsize=(base_width * 1.5, base_height))

    # Curves: original data and smoothed data
    ax.plot(dates, original_data, label="Original Data", color="grey", alpha=1, zorder=1)
    ax.plot(dates, smoothed_data, label="Smoothed Data", color="blue", linewidth=1.5, alpha=0.8, zorder=1)

    # Markers: a labelled coloured square plus an unlabelled black dot per extremum
    marker_specs = ((peaks, "green", "Peaks"), (troughs, "red", "Troughs"))
    for positions, square_color, legend_label in marker_specs:
        ax.scatter(
            dates[positions],
            original_data[positions],
            color=square_color,
            marker="s",
            label=legend_label,
            s=70,
            alpha=0.5,
            zorder=2,
        )
        ax.scatter(
            dates[positions],
            original_data[positions],
            color="black",
            marker="o",
            s=10,
            linewidth=0,
            alpha=0.5,
            zorder=2,
        )

    # Index annotations: 0.5 above peaks (anchored at the bottom of the text),
    # 0.5 below troughs (anchored at the top of the text)
    text_style = {"ha": "center", "rotation": "vertical", "fontsize": 10 * scale}
    annotation_specs = ((peaks, 0.5, "bottom"), (troughs, -0.5, "top"))
    for positions, y_shift, anchor in annotation_specs:
        for position in positions:
            ax.text(dates[position], original_data[position] + y_shift, str(position), va=anchor, **text_style)

    # Axis labels, title and font scaling are delegated to visualize.py
    visualize.configure_axis(
        ax=ax, xlabel="Date", ylabel="Groundwater Levels (m)", title=title, scaling_factor=scale
    )

    # Legend
    visualize.configure_legend(ax=ax, scaling_factor=0.5, frameon=False)

    # Datetime ticks on the x-axis
    visualize.configure_datetime_ticks(ax=ax, axis="x", major_interval=12, minor_interval=6, date_format="%Y")

    # Layout first, then rotate the x tick labels
    fig.tight_layout(rect=[0, 0, 1, 0.95])
    plt.setp(ax.get_xticklabels(), rotation=90, ha="center")

    return fig

In [None]:
# ______________________________________________________________________
# Step 1: Load the HDF5 File and Extract Dataset Information
#
# Names produced by this cell:
# - existing_data_dict: result of h5pytools.hdf5_to_data_dict for the file
#   (presumably a nested dictionary holding every dataset -- confirm in h5pytools).
# - available_datasets: result of h5pytools.list_datasets (dataset paths in the file).
# - datetime_array: the "date" entry parsed as datetimes in YYYYMMDD format.
hdf5_fpath = r"20240903_GWL_CRFP.h5"

with h5py.File(hdf5_fpath, mode="r") as hdf5_file:
    existing_data_dict = h5pytools.hdf5_to_data_dict(hdf5_file)
    available_datasets = h5pytools.list_datasets(hdf5_file)

    # Parse the 'date' array into datetimes while the file is still open
    raw_dates = existing_data_dict["date"]
    datetime_array = pd.to_datetime(raw_dates, format="%Y%m%d")

In [None]:
cache = pd.DataFrame(data=None, index=None)

# ======================================================================
#                        Main Program: Data Preprocessing
# ======================================================================

# Output folders: Excel tables go to "temp4", figures go to "temp3".
# Create them up front so the first write does not fail on a missing folder.
excel_dir = "temp4"
figure_dir = "temp3"
os.makedirs(excel_dir, exist_ok=True)
os.makedirs(figure_dir, exist_ok=True)

# ______________________________________________________________________
# Step 2: Identify the Stations and Wellcodes in the Dataset
#
# - stations: unique first path components of the dataset paths, skipping any
#   path that contains "date".
# - wellcode_byStation: keys of a station's dictionary whose value is itself a
#   dictionary with exactly 3 items.
stations = sorted({elem.split("/")[0] for elem in available_datasets if "date" not in elem})

# Loop over each station in the dataset
for select_station in tqdm(stations, desc="Processing Stations"):

    # Extract the station's data from the dictionary
    station_data = existing_data_dict[select_station]

    # Filter wellcodes that contain exactly 3 items (indicating wellcode structure)
    wellcode_byStation = [elem for elem, val in station_data.items() if isinstance(val, dict) and len(val) == 3]

    # Loop over each wellcode in the station's data
    for select_wellcode in tqdm(wellcode_byStation, leave=False):

        output_savename = f"{select_station}_{select_wellcode}"

        # ======================================================================
        #                        Main Program: Time Series Analysis
        # ======================================================================

        # ______________________________________________________________________
        # Step 3: Extract Model GWL Data and Convert to Time Series
        #
        # - model_gwl_arr: the "model" entry under "measure" for this wellcode.
        # - model_gwl_series: the same values indexed by datetime_array.
        model_gwl_arr = station_data[select_wellcode]["measure"]["model"]
        model_gwl_series = pd.Series(data=model_gwl_arr, index=datetime_array)

        # Only analyze series whose index spans more than two calendar years.
        # NOTE(review): this is evaluated on the full index (datetime_array), so
        # the result is the same for every wellcode; if the intent is "more than
        # two years of valid data", it should test valid_gwl_series -- confirm.
        if np.unique(model_gwl_series.index.year).size <= 2:
            continue

        valid_idx_start = model_gwl_series.first_valid_index()
        valid_idx_end = model_gwl_series.last_valid_index()

        # first_valid_index() returns None when every value is missing; slicing
        # with None bounds would then select the whole (empty) series, so skip it.
        if valid_idx_start is None:
            continue

        # Trim leading and trailing missing values
        valid_gwl_series = model_gwl_series[valid_idx_start:valid_idx_end]

        # ______________________________________________________________________
        # Step 4: Perform Time Series Analysis for Peak and Trough Detection
        #
        # analyze_time_series detrends, smooths and then detects peaks/troughs.
        (
            refined_peaks,
            refined_troughs,
            final_peaks,
            final_troughs,
            smoothed_data,
            original_data,
            dates,
        ) = analyze_time_series(
            valid_gwl_series,
            # smoothing_window=int(len(valid_gwl_series) * 0.01),  # Dynamic smoothing window
            smoothing_window=61,  # Fixed smoothing window
            detrend_degree=3,
            polyorder=1,
        )

        # ______________________________________________________________________
        # Tabulate the detected extrema: position, date and original value
        output_peak_table = pd.DataFrame(
            {"indexes": refined_peaks, "dates": dates[refined_peaks], "values": original_data[refined_peaks]}
        )

        output_trough_table = pd.DataFrame(
            {"indexes": refined_troughs, "dates": dates[refined_troughs], "values": original_data[refined_troughs]}
        )

        excel_savepath = os.path.join(excel_dir, f"{output_savename}.xlsx")

        # Write the "peaks" sheet first, then hand the troughs to the project helper.
        # NOTE(review): save_df_to_excel presumably adds the "troughs" sheet to the
        # workbook just written instead of overwriting it -- confirm in data_io.
        output_peak_table.to_excel(excel_savepath, sheet_name="peaks", index=False)
        data_io.save_df_to_excel(
            df_to_save=output_trough_table,
            filepath=excel_savepath,
            sheet_name="troughs",
            verbose=False,
            index=False,
        )

        # ======================================================================
        #                        Main Program: Visualization
        # ======================================================================

        # ______________________________________________________________________
        # Step 5: Generate and Save Plot for Peaks and Troughs
        #
        # Plot the original and smoothed series with the detected extrema.
        fig = plot_results(
            dates,
            original_data,
            smoothed_data,
            refined_peaks,
            refined_troughs,
            f"{select_station} {select_wellcode}",  # Plot title based on station and wellcode
        )

        # Build the path with os.path.join so the separator is right on every OS
        # (a hard-coded backslash only works as a separator on Windows).
        savepath = os.path.join(figure_dir, f"{output_savename}.png")
        visualize.save_figure(fig, savepath)

        # Close this specific figure to free memory (especially important in loops)
        plt.close(fig)