In [1]:
from my_packages import *

from appgeopy import *

In [2]:
# _______________________________________________________________________
def detect_peaks_troughs(signal, time_index, prominence=True, dist=True):
    """
    Detect peaks and troughs in a given signal.

    Troughs are located by running the same peak search on the negated signal.

    Parameters:
    - signal (array-like): The signal data where peaks and troughs are to be detected.
      Lists and tuples are accepted; the data is converted to a NumPy array first.
    - time_index (array-like): Corresponding time index for the signal. It is indexed with the
      integer positions of the detected extremes, so it must have the same length as `signal`.
    - prominence (number, pair or array): Forwarded unchanged as the `prominence` argument of
      `scipy.signal.find_peaks`. This is a threshold, not an on/off switch: the default `True`
      is numerically 1, i.e. a minimum prominence of 1 signal unit. Passing a value also makes
      `find_peaks` report 'prominences', 'left_bases' and 'right_bases' in the properties.
    - dist (number): Forwarded unchanged as the `distance` argument of `scipy.signal.find_peaks`
      (minimum number of samples between neighbouring extremes). The default `True` is
      numerically 1.

    Returns:
    - peaks, properties_peaks, peak_times: Detected peaks and their properties.
    - troughs, properties_troughs, trough_times: Detected troughs and their properties.
    """
    # Plain Python sequences support neither unary minus nor indexing with an index array,
    # so normalise them up front. Array-like objects such as pandas indexes are left as they
    # are so that the returned times keep their original type.
    signal = np.asarray(signal)
    if isinstance(time_index, (list, tuple)):
        time_index = np.asarray(time_index)

    # Detect peaks in the signal
    peaks, properties_peaks = scipy.signal.find_peaks(signal, prominence=prominence, distance=dist)
    peak_times = time_index[peaks]

    # Detect troughs by inverting the signal
    troughs, properties_troughs = scipy.signal.find_peaks(-signal, prominence=prominence, distance=dist)
    trough_times = time_index[troughs]

    return peaks, properties_peaks, peak_times, troughs, properties_troughs, trough_times


# ______________________________________________________________________
def get_seasonal_and_trend_data(series, detrend_degree=2):
    """
    Fit a polynomial trend to a time series and subtract it.

    Parameters:
        series (pd.Series): Time-series data.
        detrend_degree (int): Order of the polynomial passed to the trend fit.

    Returns:
        tuple: (trend, detrended_series), where the trend is re-indexed onto
        `series.index` and the detrended series is `series - trend`.
    """
    # Regression inputs: the numeric time axis from the project helper and
    # the non-null observations of the series.
    time_axis = datetime_handle.numeric_time_index(series)
    observed = series[series.notnull()].values

    # Evaluate the fitted polynomial at one position per sample of the series
    evaluation_points = np.arange(len(series))
    trend, _unused = analysis.get_polynomial_trend(
        x=time_axis,
        y=observed,
        order=detrend_degree,
        x_estimate=evaluation_points,
    )

    # Align the trend with the original timestamps so the subtraction is index-aligned
    trend.index = series.index

    return trend, series - trend


# _______________________________________________________________________
def get_threshold(dict_prop, prom_proportion=1, dist_proportion=1, min_left_base=10):
    """
    Calculate thresholds for prominence and distance based on a proportion of the minimum values.

    Parameters:
    - dict_prop (dict): Dictionary of properties containing 'prominences' and 'left_bases'
      (e.g. the properties returned by `scipy.signal.find_peaks`). Values may be arrays or lists.
    - prom_proportion (float): Factor applied to the smallest strictly positive prominence (default=1).
    - dist_proportion (float): Factor applied to the smallest 'left_bases' value that exceeds
      `min_left_base` (default=1).
    - min_left_base (number): Only 'left_bases' values strictly greater than this are considered
      (default=10, the value that used to be hard-coded).

    Returns:
    - list: Threshold values for prominence and distance.

    Raises:
    - ValueError: If no prominence is positive or no left base exceeds `min_left_base`.

    NOTE(review): in scipy, 'left_bases' holds the sample index of each peak's left base, not a
    spacing between peaks, so the "distance" threshold is derived from an index - confirm that
    this is the intended quantity.
    """
    prominences = np.asarray(dict_prop["prominences"])
    left_bases = np.asarray(dict_prop["left_bases"])

    positive_prominences = prominences[prominences > 0]
    if positive_prominences.size == 0:
        raise ValueError("No positive prominence available to derive a prominence threshold.")
    prominence_thresh = np.min(positive_prominences) * prom_proportion

    usable_bases = left_bases[left_bases > min_left_base]
    if usable_bases.size == 0:
        raise ValueError(f"No 'left_bases' value above {min_left_base} available to derive a distance threshold.")
    dist_thresh = np.min(usable_bases) * dist_proportion

    return [prominence_thresh, dist_thresh]


# _______________________________________________________________________
def filter_extremes_per_year(input_df, date_col, value_col, extreme_type="peak"):
    """
    Filters the extremes (peaks or troughs) by year, retaining only the highest peak or lowest trough per year.

    Parameters:
    - input_df (pd.DataFrame): DataFrame containing the time-series data (peaks or troughs). Not modified.
    - date_col (str): Column name for the date (datetime format).
    - value_col (str): Column name for the values (peaks or troughs).
    - extreme_type (str): Type of extreme ('peak' or 'trough'). Determines whether to retain highest or lowest value.

    Returns:
    - pd.DataFrame: A DataFrame with the highest peak or lowest trough retained for each year.

    Raises:
    - ValueError: If `extreme_type` is neither 'peak' nor 'trough'.
    """
    # Validate first: previously an unknown type fell through both branches and failed
    # later with an UnboundLocalError.
    if extreme_type not in ("peak", "trough"):
        raise ValueError(f"extreme_type must be 'peak' or 'trough', got {extreme_type!r}")

    df = input_df.copy()
    # Temporary grouping key; removed again before returning
    df["Year"] = pd.DatetimeIndex(df[date_col]).year

    values_by_year = df.groupby("Year")[value_col]
    # idxmax/idxmin give the index label of the extreme row of each year
    if extreme_type == "peak":
        selected_labels = values_by_year.idxmax()
    else:
        selected_labels = values_by_year.idxmin()

    return df.loc[selected_labels].drop(columns=["Year"])


# _______________________________________________________________________
def filter_extremes_by_range(input_df, date_col, value_col, extreme_type="peak", months_range=2):
    """
    Filters the extremes (peaks or troughs) within a given range of months around each year.
    Retains the two highest peaks or two lowest troughs within the search window, and ensures that no duplicate
    peaks or troughs are selected in overlapping windows.

    Parameters:
    - input_df (pd.DataFrame): DataFrame containing the time-series data (peaks or troughs). Not modified.
    - date_col (str): Column name for the date (anything `pd.to_datetime` can parse).
    - value_col (str): Column name for the values (peaks or troughs).
    - extreme_type (str): Type of extreme ('peak' or 'trough'). Determines whether to retain the highest or lowest values.
    - months_range (int): Number of months before 1 January and after 31 December of each year to include
      in the search window.

    Returns:
    - pd.DataFrame: A DataFrame with the selected peaks or troughs for each window, in window order.
      It is empty (same columns as the input) when there is nothing to select.

    Raises:
    - ValueError: If `extreme_type` is neither 'peak' nor 'trough'.
    """
    # Validate first: previously an unknown type failed inside the loop with an UnboundLocalError.
    if extreme_type not in ("peak", "trough"):
        raise ValueError(f"extreme_type must be 'peak' or 'trough', got {extreme_type!r}")

    df = input_df.copy()
    df[date_col] = pd.to_datetime(df[date_col])

    # Calendar years present in the data. They are kept outside the frame so that no helper
    # columns have to be added and removed again; missing dates are ignored.
    years = sorted({int(year) for year in pd.DatetimeIndex(df[date_col]).year.dropna()})

    # Initialize list to hold selected extremes
    selected_extremes = []

    # Iterate over each year in the dataset
    for year in years:
        # Define the search window: year +/- x months
        start_date = pd.Timestamp(year=year, month=1, day=1) - pd.DateOffset(months=months_range)
        end_date = pd.Timestamp(year=year, month=12, day=31) + pd.DateOffset(months=months_range)

        # Subset the data within the search window
        window_df = df[(df[date_col] >= start_date) & (df[date_col] <= end_date)]

        # Rows may already have been consumed by an earlier, overlapping window
        if window_df.empty:
            continue

        # Select either the top two peaks or the lowest two troughs
        if extreme_type == "peak":
            top_two_extremes = window_df.nlargest(2, value_col)
        else:
            top_two_extremes = window_df.nsmallest(2, value_col)

        selected_extremes.append(top_two_extremes)

        # Remove the selected extremes from the working frame to avoid selecting them again in the next window
        df = df.drop(top_two_extremes.index)

    # pd.concat raises on an empty list, so return an empty frame with the same columns instead
    if not selected_extremes:
        return df.iloc[0:0]

    # Combine the selected extremes from all windows
    return pd.concat(selected_extremes)


# _______________________________________________________________________
def filter_consecutive(input_peaks, input_troughs, time_peak, value_peak, time_trough, value_trough):
    """
    Filters out consecutive peaks or troughs, ensuring alternating patterns of peaks and troughs.

    Peaks and troughs are merged and ordered by time. Within every run of the same type only one
    entry survives: the highest value for a run of peaks, the lowest value for a run of troughs
    (the earliest one when values tie).

    Parameters:
    - input_peaks (pd.DataFrame): DataFrame containing peak data. Not modified.
    - input_troughs (pd.DataFrame): DataFrame containing trough data. Not modified.
    - time_peak (str): Column name for peak timestamps.
    - value_peak (str): Column name for peak values.
    - time_trough (str): Column name for trough timestamps.
    - value_trough (str): Column name for trough values.

    Returns:
    - filtered_peaks, filtered_troughs (pd.DataFrame): Filtered DataFrames of peaks and troughs, returned
      as a two-element list. Each holds only its time and value column and has a fresh 0..n-1 index.
      Both are empty when both inputs are empty.
    """
    # Combine peaks and troughs into one DataFrame, assign 'Type' column for identification
    combined = pd.concat(
        [
            input_peaks[[time_peak, value_peak]]
            .rename(columns={time_peak: "Time", value_peak: "Value"})
            .assign(Type="Peak"),
            input_troughs[[time_trough, value_trough]]
            .rename(columns={time_trough: "Time", value_trough: "Value"})
            .assign(Type="Trough"),
        ],
        ignore_index=True,
    )

    # A stable sort keeps the result deterministic when a peak and a trough share a timestamp
    combined = combined.sort_values(by="Time", kind="mergesort").reset_index(drop=True)

    kinds = combined["Type"].tolist()
    values = combined["Value"].tolist()

    # Row positions of the survivors. Selecting rows by position afterwards keeps the column
    # dtypes intact and also works when there are no rows at all.
    kept = []
    for pos, (kind, value) in enumerate(zip(kinds, values)):
        if not kept or kind != kinds[kept[-1]]:
            # The type alternates (or this is the first row): start a new run
            kept.append(pos)
        elif kind == "Peak":
            if value > values[kept[-1]]:
                kept[-1] = pos
        elif value < values[kept[-1]]:
            kept[-1] = pos

    filtered_df = combined.iloc[kept]

    # Separate back into peaks and troughs and restore the caller's column names
    filtered_peaks = filtered_df[filtered_df["Type"] == "Peak"].rename(columns={"Time": time_peak, "Value": value_peak})
    filtered_troughs = filtered_df[filtered_df["Type"] == "Trough"].rename(
        columns={"Time": time_trough, "Value": value_trough}
    )

    return [
        filtered_peaks.drop("Type", axis=1).reset_index(drop=True),
        filtered_troughs.drop("Type", axis=1).reset_index(drop=True),
    ]

In [3]:
# _______________________________________________________________________
# Load HDF5 File and Extract Dataset Information
hdf5_fpath = r"20240903_GWL_CRFP.h5"

with h5py.File(hdf5_fpath, "r") as hdf5_file:
    # Read the file content and the dataset listing through the project helpers
    existing_data_dict = h5pytools.hdf5_to_data_dict(hdf5_file)
    available_datasets = h5pytools.list_datasets(hdf5_file)

    # The 'date' entries are UTF-8 byte strings formatted as YYYYMMDD;
    # decode them and build the datetime index shared by all series
    date_strings = [raw_date.decode("utf-8") for raw_date in existing_data_dict["date"]]
    datetime_array = pd.to_datetime(date_strings, format="%Y%m%d")

# _______________________________________________________________________
# Station names: the first path component of every dataset path that does
# not contain "date", de-duplicated and sorted
stations = sorted(
    {dataset_path.split("/")[0] for dataset_path in available_datasets if "date" not in dataset_path}
)

In [None]:
# Labels ("<station>_<wellcode>", or just "<station>") of everything that failed
error_station = []

save_Excel = True
save_Figure = True

# Use the first "Attemp###" folder name that does not exist yet, so each run writes into a
# fresh output directory. The folder itself is created on demand by os.makedirs below.
for i in range(1, 10000):
    savefolder = f"Attemp{i:03}"
    if not os.path.exists(savefolder):
        break


for station in tqdm(stations):
    # _______________________________________________________________________
    # Extract station data. A failure here is reported under the station name alone, because
    # no well has been selected yet at this point.
    try:
        station_data = existing_data_dict[station]
        # Wells are the entries that are dictionaries with exactly three items
        wellcodes = [elem for elem, val in station_data.items() if isinstance(val, dict) and len(val) == 3]
    except Exception as e:
        print(station, e)
        error_station.append(station)
        continue

    for wellcode in wellcodes:
        well_label = f"{station}_{wellcode}"
        fig = None

        # One try block per well: a failing well is recorded and the remaining wells of the
        # station are still processed.
        try:
            # _______________________________________________________________________
            # Extract time-series data and process the valid data range
            model_gwl_arr = station_data[wellcode]["measure"]["model"]
            model_gwl_series = pd.Series(data=model_gwl_arr, index=datetime_array)
            valid_gwl_series = model_gwl_series.loc[
                model_gwl_series.first_valid_index() : model_gwl_series.last_valid_index()
            ]

            # Only series that span more than two calendar years are analysed
            if valid_gwl_series.index.year.unique().size <= 2:
                continue

            trend, detrended_series = get_seasonal_and_trend_data(series=valid_gwl_series, detrend_degree=3)

            # _______________________________________________________________________
            # Apply smoothing on the detrended time series
            smoothed_series = smoothing.simple_moving_average(num_arr=detrended_series, window_size=11)

            # _______________________________________________________________________
            # First pass: detect peaks and troughs with the default settings to obtain their properties
            peaks, properties_peaks, peak_times, troughs, properties_troughs, trough_times = detect_peaks_troughs(
                signal=smoothed_series.values, time_index=smoothed_series.index
            )

            # _______________________________________________________________________
            # Calculate thresholds for peaks and troughs
            peak_prom, peak_dist = get_threshold(properties_peaks, prom_proportion=0.8, dist_proportion=0.8)
            trough_prom, trough_dist = get_threshold(properties_troughs, prom_proportion=0.5, dist_proportion=0.5)

            # find_peaks rejects a distance below 1
            peak_dist = 2 if peak_dist < 1 else peak_dist
            trough_dist = 2 if trough_dist < 1 else trough_dist

            # _______________________________________________________________________
            # Second pass: redetect peaks and troughs based on the thresholds
            peaks, _ = scipy.signal.find_peaks(smoothed_series.values, prominence=peak_prom, distance=peak_dist)
            peak_times = smoothed_series.index[peaks]
            troughs, _ = scipy.signal.find_peaks(
                -smoothed_series.values, prominence=trough_prom, distance=trough_dist
            )
            trough_times = smoothed_series.index[troughs]

            # _______________________________________________________________________
            # Convert to DataFrames. The values are read from the original (not detrended, not
            # smoothed) series at the detected timestamps.
            signal_peaks = pd.DataFrame(data={"time": peak_times, "value": valid_gwl_series[peak_times].values})
            signal_troughs = pd.DataFrame(
                data={"time": trough_times, "value": valid_gwl_series[trough_times].values}
            )

            # _______________________________________________________________________
            # Keep the two strongest extremes per yearly window (year +/- 3 months).
            # filter_extremes_per_year is the stricter one-per-year alternative.
            filtered_peaks = filter_extremes_by_range(
                signal_peaks, date_col="time", value_col="value", extreme_type="peak", months_range=3
            )
            filtered_troughs = filter_extremes_by_range(
                signal_troughs, date_col="time", value_col="value", extreme_type="trough", months_range=3
            )

            # _______________________________________________________________________
            # Apply filtering for consecutive peaks and troughs
            final_peaks, final_troughs = filter_consecutive(
                input_peaks=filtered_peaks,
                input_troughs=filtered_troughs,
                time_peak="time",
                value_peak="value",
                time_trough="time",
                value_trough="value",
            )

            if save_Excel:
                fld2saveExcel = os.path.join(savefolder, "Peaks_Troughs")
                os.makedirs(fld2saveExcel, exist_ok=True)

                savepath = os.path.join(fld2saveExcel, f"{station}_{wellcode}.xlsx")

                # An existing workbook marks the well as already processed: skip it entirely,
                # including its figure.
                if os.path.isfile(savepath):
                    print("Target file has already existed!")
                    continue

                final_peaks.to_excel(savepath, index=False, sheet_name="peaks")
                # presumably adds a second sheet to the workbook written above - see data_io
                data_io.save_df_to_excel(
                    df_to_save=final_troughs, filepath=savepath, sheet_name="troughs", verbose=False
                )

            # _______________________________________________________________________

            fig_width, fig_height = (11.7 * 3 / 2, 8.3 * 2 / 3)
            fig = plt.figure(figsize=(fig_width, fig_height))

            ax = fig.add_subplot(111)

            # _______________________________________________________________________
            # Plot the valid groundwater level series
            ax.plot(valid_gwl_series, color="black", zorder=1)

            # _______________________________________________________________________
            # Marker layers, drawn in this order: peaks first, then troughs. For each, the three
            # stages are all detected extremes (hollow circles), the window-filtered extremes
            # (faint squares) and the final alternating extremes (triangles joined by a dotted
            # line, the only layers that appear in the legend).
            marker_layers = [
                (
                    signal_peaks,
                    dict(
                        marker="o",
                        linestyle=" ",
                        markersize=14,
                        zorder=1,
                        color="none",
                        markeredgecolor="black",
                        alpha=0.5,
                    ),
                ),
                (
                    filtered_peaks,
                    dict(marker="s", linestyle=" ", markersize=12, zorder=2, color="blue", alpha=0.2),
                ),
                (
                    final_peaks,
                    dict(
                        marker="^",
                        linestyle=(0, (1, 2)),
                        markersize=10,
                        zorder=3,
                        color="lime",
                        markeredgecolor="black",
                        alpha=1,
                        label="Peaks",
                    ),
                ),
                (
                    signal_troughs,
                    dict(
                        marker="o",
                        linestyle=" ",
                        markersize=14,
                        color="none",
                        markeredgecolor="black",
                        zorder=1,
                        alpha=0.5,
                    ),
                ),
                (
                    filtered_troughs,
                    dict(marker="s", linestyle=" ", markersize=12, color="darkorange", zorder=2, alpha=0.2),
                ),
                (
                    final_troughs,
                    dict(
                        marker="^",
                        linestyle=(0, (1, 2)),
                        markersize=10,
                        color="magenta",
                        markeredgecolor="black",
                        zorder=3,
                        alpha=1,
                        label="Troughs",
                    ),
                ),
            ]
            for layer_df, layer_style in marker_layers:
                ax.plot(layer_df.set_index("time"), **layer_style)

            # _______________________________________________________________________
            # Configure labels, title, datetime ticks and legend
            visualize.configure_axis(
                ax=ax,
                xlabel="",
                ylabel="Groundwater Levels (m)",
                scaling_factor=1.2,
                title=f"{station} - {wellcode}",
            )
            visualize.configure_datetime_ticks(ax=ax, axis="x")
            visualize.configure_legend(ax=ax, scaling_factor=1, frameon=False, fontsize_base=12)

            # _______________________________________________________________________
            # Add grid and use the same time span for every figure
            ax.grid(axis="x", which="major", linestyle="-", linewidth=1, color="grey")
            ax.grid(axis="x", which="minor", linestyle="--", linewidth=1, color="lightgrey")
            ax.set_axisbelow(True)
            ax.set_xlim(datetime(2000, 1, 1), datetime(2025, 1, 1))

            # _______________________________________________________________________
            # Optimize layout, then rotate the x-tick labels
            fig.tight_layout()
            plt.setp(ax.get_xticklabels(), rotation=90, ha="center")

            # _______________________________________________________________________
            if save_Figure:
                fld2savefig = os.path.join(savefolder, "Figures")
                os.makedirs(fld2savefig, exist_ok=True)
                output_fig_path = os.path.join(fld2savefig, f"{station}_{wellcode}.png")
                visualize.save_figure(fig, output_fig_path)
        except Exception as e:
            print(well_label, e)
            error_station.append(well_label)
        finally:
            # Always release the figure, also when plotting or saving failed; otherwise open
            # figures pile up over the run.
            if fig is not None:
                plt.close(fig)

 35%|████████████████████████████                                                     | 36/104 [01:40<03:09,  2.79s/it]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\FAFALAB\AppData\Roaming\Python\Python38\site-packages\IPython\core\interactiveshell.py", line 3417, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-4-fb7b04a6c3d4>", line 231, in <module>
    visualize.save_figure(fig, output_fig_path)
  File "C:\Users\FAFALAB\miniconda3\envs\davidncu\lib\site-packages\appgeopy\visualize.py", line 421, in save_figure
    fig.savefig(
  File "C:\Users\FAFALAB\AppData\Roaming\Python\Python38\site-packages\matplotlib\figure.py", line 3046, in savefig
    self.canvas.print_figure(fname, **kwargs)
  File "C:\Users\FAFALAB\AppData\Roaming\Python\Python38\site-packages\matplotlib\backend_bases.py", line 2299, in print_figure
    bbox_inches = self.figure.get_tightbbox(
  File "C:\Users\FAFALAB\AppData\Roaming\Python\Python38\site-packages\matplotlib\figure.py", line 1684, in get_tightbbox
    bbox = a.get_tightbbox(renderer)
  File "C:\Users\FAFALAB\AppData\Roaming\Py

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\FAFALAB\AppData\Roaming\Python\Python38\site-packages\IPython\core\interactiveshell.py", line 3417, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-4-fb7b04a6c3d4>", line 231, in <module>
    visualize.save_figure(fig, output_fig_path)
  File "C:\Users\FAFALAB\miniconda3\envs\davidncu\lib\site-packages\appgeopy\visualize.py", line 421, in save_figure
    fig.savefig(
  File "C:\Users\FAFALAB\AppData\Roaming\Python\Python38\site-packages\matplotlib\figure.py", line 3046, in savefig
    self.canvas.print_figure(fname, **kwargs)
  File "C:\Users\FAFALAB\AppData\Roaming\Python\Python38\site-packages\matplotlib\backend_bases.py", line 2299, in print_figure
    bbox_inches = self.figure.get_tightbbox(
  File "C:\Users\FAFALAB\AppData\Roaming\Python\Python38\site-packages\matplotlib\figure.py", line 1684, in get_tightbbox
    bbox = a.get_tightbbox(renderer)
  File "C:\Users\FAFALAB\AppData\Roaming\Py

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Error in callback <function flush_figures at 0x0000014E92C1D940> (for post_execute):
Traceback (most recent call last):
  File "C:\Users\FAFALAB\AppData\Roaming\Python\Python38\site-packages\IPython\core\interactiveshell.py", line 3417, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-4-fb7b04a6c3d4>", line 231, in <module>
    visualize.save_figure(fig, output_fig_path)
  File "C:\Users\FAFALAB\miniconda3\envs\davidncu\lib\site-packages\appgeopy\visualize.py", line 421, in save_figure
    fig.savefig(
  File "C:\Users\FAFALAB\AppData\Roaming\Python\Python38\site-packages\matplotlib\figure.py", line 3046, in savefig
    self.canvas.print_figure(fname, **kwargs)
  File "C:\Users\FAFALAB\AppData\Roaming\Python\Python38\site-packages\matplotlib\backend_bases.py", line 2299, in print_figure
    bbox_inches = self.figure.get_tightbbox(
  File "C:\Users\FAFALAB\AppData\Roaming\Python\Python38\site-packages\matplotlib\figure.py", line 1684, in get_tight

#### SINGLE STATION MODIFICATION