In [1]:
from appgeopy import *
from my_packages import *

In [2]:
# ------------------------------------------------------------------------------
# Define Functions for Repeated Operations
# ------------------------------------------------------------------------------
def get_seasonal_and_trend_data(series):
    """Extract trend and seasonal components from a series."""
    numeric_time_idx = datetime_handle.numeric_time_index(series)
    finite_values = series[~series.isnull()].values

    # Polynomial trend
    trend, _ = analysis.get_polynomial_trend(
        x=numeric_time_idx,
        y=finite_values,
        order=2,
        x_estimate=np.arange(len(series)),
    )
    trend.index = series.index

    # Detrend Data
    detrended_series = series - trend

    # Seasonality Analysis
    seasonality_info = analysis.find_seasonality(
        time_series_data=detrended_series
    )
    seasonality_info = seasonality_info[seasonality_info["Period (days)"] > 7]
    seasonality_info = seasonality_info.nlargest(n=50, columns="Amplitude")

    return trend, detrended_series, seasonality_info

In [3]:
def process_well_data(gwl_hdf5_file, select_dataset):
    """
    Process data for a single well, removing outliers, extracting trends,
    fitting a sinusoidal model, and correcting phase shifts.

    Parameters:
    - ename (str): The name of the station or entity.
    - wellcode (str): The code identifying the specific well.
    - hdf5_file (str): The path to the HDF5 file containing the data.

    Returns:
    - wellcode (str): The well code.
    - df_fromHDF5 (pd.DataFrame): The DataFrame containing the processed data.
    """
    try:
        # Open the HDF5 file in read mode to retrieve existing data for the well
        with h5py.File(gwl_hdf5_file, "r") as hdf5_file:
            # Read the existing dataset associated with the well code
            value_arr = hdf5_file[select_dataset][...]

        # Create a DataFrame from the retrieved HDF5 data with datetime index
        df_fromHDF5 = pd.DataFrame(
            {"time": datetime_idx, "daily_value": value_arr}
        )
        df_fromHDF5 = df_fromHDF5.set_index("time")

        # Calculate mean and standard deviation
        series_average = np.nanmean(df_fromHDF5)
        series_stdev = np.nanstd(df_fromHDF5)

        # Remove outliers beyond 3 standard deviations
        condition = (df_fromHDF5 >= (series_average - 3 * series_stdev)) & (
            df_fromHDF5 <= (series_average + 3 * series_stdev)
        )

        df_fromHDF5 = df_fromHDF5.where(condition, np.nan)

        # Trim the DataFrame to the first and last valid indices
        df_trimmed = df_fromHDF5.loc[
            df_fromHDF5.first_valid_index() : df_fromHDF5.last_valid_index()
        ]

        series = df_trimmed.iloc[:, 0]

        # Extract trend and seasonality
        trend, detrended_series, seasonality_info = get_seasonal_and_trend_data(
            series
        )

        # Prepare Sinusoidal Model Inputs
        (
            time_values,
            observed_values,
            amplitudes,
            periods,
            phase_shifts,
            baseline,
        ) = modeling.prepare_sinusoidal_model_inputs(
            time_series_data=detrended_series,
            seasonality_info=seasonality_info.query("Frequency != 0"),
        )

        # Fit Sinusoidal Model and Correct Phase Shift
        fitted_signal = modeling.fit_sinusoidal_model(
            time_values=time_values,
            observed_values=observed_values,
            amplitudes=amplitudes,
            periods=periods,
            phase_shifts=phase_shifts,
            baseline=baseline,
            predict_time=np.arange(len(df_trimmed)),
        )

        # Correct phase shift in the fitted signal
        corrected_signal_series = pd.Series(
            analysis.correct_phase_shift(detrended_series, fitted_signal),
            index=df_trimmed.index,
        )

        # Combine trend and corrected signal to get the modeled series
        df_trimmed["model"] = trend + corrected_signal_series
        df_fromHDF5["model"] = df_fromHDF5.index.map(df_trimmed["model"])

    except Exception as e:
        print(f"Error processing data for {select_dataset}: {e}")
        df_fromHDF5["model"] = df_fromHDF5["daily_value"]
        # Handle error appropriately (e.g., logging, returning None, etc.)

    return df_fromHDF5

In [4]:
def generate_dict_from_list(elements, value):
    if len(elements) == 1:
        return {
            elements[0]: value
        }  # Base case: return the NumPy array for the last element
    return {
        elements[0]: generate_dict_from_list(elements[1:], value)
    }  # Recursively build the dictionary

In [5]:
gwl_hdf5_file = "20240828_GWL_CRFP.h5"
# List available datasets in the HDF5 file for reference
with h5py.File(gwl_hdf5_file, "r") as hdf5_file:
    available_datasets = gwatertools.h5pytools.list_datasets(hdf5_file)
    datetime_idx = pd.to_datetime(hdf5_file["date"][...], format="%Y%m%d")

available_datasets[:5]

['ANHE/10070111/measure/daily_value',
 'ANHE/10070121/measure/daily_value',
 'ANHE/10070131/measure/daily_value',
 'ANHE/10070141/measure/daily_value',
 'ANNAN/09140111/measure/daily_value']

In [6]:
all_stations_measurement_data = {}
all_stations_metadata = {}

for select_dataset in tqdm(available_datasets[:-1]):

    # Open the HDF5 file in read mode to retrieve existing data for the well
    with h5py.File(gwl_hdf5_file, "r") as hdf5_file:
        # Read the existing dataset associated with the well code
        value_arr = hdf5_file[select_dataset][...]

    df_fromHDF5 = process_well_data(gwl_hdf5_file, select_dataset)
    # _____________________________________________
    station_name, well_code, data_group, data_type = select_dataset.split("/")
    # _____________________________________________

    if station_name not in all_stations_measurement_data.keys():
        # _____________________________________________
        all_stations_measurement_data[station_name] = {
            well_code: {data_group: {"model": df_fromHDF5["model"].values}}
        }
        # _____________________________________________
        all_stations_metadata[station_name] = {
            "UpdatedTime": datetime.now().strftime("%Y/%m/%d, %H:%M:%S")
        }

        # _____________________________________________
    else:
        all_stations_measurement_data[station_name].update(
            {well_code: {data_group: {"model": df_fromHDF5["model"].values}}}
        )

    all_stations_metadata[station_name][well_code] = {
        data_group: {
            "model": {
                "Description": "Denoise and fill missing values in the GWL data using second-order polynomial trend & fourier transform analysis",
                "FIRST_OBS": df_fromHDF5["model"]
                .first_valid_index()
                .strftime("%Y%m%d"),
                "LAST_OBS": df_fromHDF5["model"]
                .last_valid_index()
                .strftime("%Y%m%d"),
            }
        }
    }

 17%|█████████████▊                                                                 | 55/316 [12:42<1:30:47, 20.87s/it]

Error processing data for DOULIU/090111M1/measure/daily_value: RANSAC could not find a valid consensus set. All `max_trials` iterations were skipped because each randomly chosen sub-sample failed the passing criteria. See estimator attributes for diagnostics (n_skips*).


100%|█████████████████████████████████████████████████████████████████████████████▊| 315/316 [1:05:46<00:12, 12.53s/it]


Error processing data for date: ufunc 'true_divide' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''


ValueError: not enough values to unpack (expected 4, got 1)

In [10]:
# shutil.copy2(src=gwl_hdf5_file, dst=gwl_hdf5_file.replace(".h5", "_secure.h5"))

# gwl_hdf5_file = 
# # Extract existing data and metadata
# with h5py.File(gwl_hdf5_file, "r") as hdf5_file:
#     existing_data_dict = gwatertools.h5pytools.hdf5_to_data_dict(hdf5_file)
#     existing_metadata_dict = gwatertools.h5pytools.hdf5_to_metadata_dict(
#         hdf5_file
#     )

# # Update dictionaries
# updated_data_dict = gwatertools.h5pytools.update_data_dict(
#     existing_data_dict, all_stations_measurement_data
# )
# updated_metadata_dict = gwatertools.h5pytools.update_metadata_dict(
#     existing_metadata_dict, all_stations_metadata
# )

# Write updated data and metadata back to the HDF5 file
with h5py.File("20240828_GWL_CRFP_v2.h5", "w") as hdf5_file:
    metadata_to_hdf5(hdf5_file, updated_metadata_dict)
    data_to_hdf5(hdf5_file, updated_data_dict)