In [1]:
import dask
import dask.dataframe as dd
from appgeopy import *
from my_packages import *
from tqdm.notebook import tqdm  # Use tqdm for notebooks

# Make the multiprocessing scheduler the default for the Dask computations run later in this notebook
dask.config.set(scheduler="processes")

<dask.config.set at 0x2856b329070>

In [2]:
def extract_data_from_hdf5(file_path):
    """
    Load the contents of an HDF5 file and parse its InSAR acquisition dates.

    Parameters:
    - file_path (str): Path to the HDF5 file; it is opened read-only.

    Returns:
    - tuple: (data_dict, insar_datetime), where data_dict is the object built
      by gwatertools.h5pytools.hdf5_to_data_dict and insar_datetime is its
      "InSAR_datetime" entry parsed with the "%Y%m%d" format.
    """
    with h5py.File(file_path, "r") as h5_handle:
        contents = gwatertools.h5pytools.hdf5_to_data_dict(h5_handle)
        # Parse the dates before the handle is closed, in case the loader
        # hands back values that still read from the open file
        acquisition_dates = pd.to_datetime(contents["InSAR_datetime"], format="%Y%m%d")
    return contents, acquisition_dates


# ________________________________________________________________________
def create_numeric_timerange(insar_datetime):
    """
    Look up the numeric time index of each InSAR date within a full time range.

    Parameters:
    - insar_datetime (datetime-like index or Series): InSAR acquisition dates.

    Returns:
    - pd.Series: The entries of the numeric full-range series that are
      labelled with the dates in insar_datetime.
    """
    # Full range derived from the InSAR dates by the project helper
    # (presumably a gap-free calendar range -- confirm in datetime_handle.get_fulltime)
    all_dates = datetime_handle.get_fulltime(insar_datetime)
    numeric_by_date = pd.Series(
        data=datetime_handle.numeric_time_index(all_dates),
        index=all_dates,
    )
    # Keep only the entries that correspond to actual InSAR dates
    return numeric_by_date[insar_datetime]


# ________________________________________________________________________
def peaktrough_byWellCode(well_data):
    """
    Build a table of peak and trough dates and values for one well.

    Parameters:
    - well_data (dict): Well record; its optional "peaks" and "troughs"
      entries are dicts with optional "date" and "value" sequences.
      Missing entries are treated as empty.

    Returns:
    - pd.DataFrame: Columns "peak_time", "peaks", "trough_time", "troughs",
      with the date strings parsed using the "%Y%m%d" format.
    """

    def _as_dates(raw_dates):
        # Date strings are stored as YYYYMMDD
        return pd.to_datetime(raw_dates, format="%Y%m%d")

    # Pull both groups first, falling back to empty containers
    peak_group = well_data.get("peaks", {})
    trough_group = well_data.get("troughs", {})

    peak_dates, peak_values = peak_group.get("date", []), peak_group.get("value", [])
    trough_dates, trough_values = trough_group.get("date", []), trough_group.get("value", [])

    # Column order matters to readers of the frame: peak pair, then trough pair
    frame_columns = {
        "peak_time": _as_dates(peak_dates),
        "peaks": peak_values,
        "trough_time": _as_dates(trough_dates),
        "troughs": trough_values,
    }
    return pd.DataFrame(frame_columns)


# ________________________________________________________________________
def get_average_velocity(x, y):
    """
    Estimate an average velocity from a degree-1 polynomial trend of y against x.

    Parameters:
    - x (array-like): Array of x values (time indices).
    - y (array-like): Array of y values (displacement data).

    Returns:
    - float: The last trend coefficient multiplied by 365.25, or NaN when
      the fit raises ValueError.
    """
    # Presumably converts a per-day rate to a per-year rate -- confirm the units of x
    days_per_year = 365.25
    try:
        _trend, coefficients = analysis.get_polynomial_trend(x, y, 1)
        # NOTE(review): treats the last coefficient as the slope; confirm the
        # coefficient ordering returned by analysis.get_polynomial_trend
        return coefficients[-1] * days_per_year
    except ValueError as fit_error:
        # A failed fit is reported and mapped to NaN so callers can keep going
        print(f"Error computing velocity for y={y}: {fit_error}")
        return np.nan

In [3]:
# Main script execution
# Input HDF5 file; a relative path, so it must be reachable from the kernel's working directory
gwl_hdf5_file = "20240903_GWL_CRFP.h5"

# Load the file contents together with the parsed InSAR dates; both names are
# read by the station-processing cell below
existing_data_dict, insar_datetime = extract_data_from_hdf5(gwl_hdf5_file)

In [7]:
# _____________________ SECTION: Initialize Processed Data _____________________________

# Results for every station that was processed AND saved successfully
processed_data = {}

# Names of the stations whose per-station HDF5 save failed
error_stations = []

# Extract the available GWL stations from the dictionary keys (excluding 'date' keys)
available_stations = [elem for elem in existing_data_dict.keys() if "date" not in elem]

# Folder for the per-station files. It is created up front so that a missing
# folder is not silently reported as a failure of every single station.
output_dir = "temp"
os.makedirs(output_dir, exist_ok=True)

# Output description handed to Dask for the row-wise velocity result; it never
# changes, so it is built once instead of once per peak-trough pair
meta = pd.Series(dtype="float64")

# _____________________ SECTION: Main Loop - Processing Stations _____________________________


# Iterate over each station to process data
for select_station in tqdm(available_stations, desc="Processing Stations", leave=False):

    # Extract station data from the existing data dictionary
    station_data = existing_data_dict[select_station]
    processed_data_byStation = {}

    # Extract wellcodes that contain dictionaries and have exactly 3 items
    wellcode_byStation = [elem for elem, val in station_data.items() if isinstance(val, dict) and len(val) == 3]

    # The CDISP table depends only on the station, so build it once here rather
    # than once per wellcode. Columns are labelled with the InSAR dates.
    # (presumably one row per InSAR point -- confirm against the file layout)
    cdisp_array = station_data.get("CDISP", [])
    cdisp_df = pd.DataFrame(data=cdisp_array, columns=insar_datetime)

    # _____________________ SECTION: Inner Loop - Processing Wellcodes _____________________________

    # Iterate over each wellcode within the station
    for select_wellcode in tqdm(
        wellcode_byStation,
        desc=f"Processing Wellcodes in {select_station}",
        leave=False,
    ):
        # Extract well data
        well_data = station_data[select_wellcode]

        # Create DataFrame using extracted peak and trough data
        peaktrough_df = peaktrough_byWellCode(well_data)

        # Initialize dictionary to store velocity data
        velocity_dict = {
            "date": [],
            "abs_difference": [],
            "linear_velocity": [],
        }

        # One (1, n_rows) array per peak-trough pair; joined once after the loop
        # instead of re-copying the whole matrix with np.append on every pair
        velocity_rows = []

        # ____________________ SECTION: Processing Peak-Trough Pairs ____________________

        # Calculate velocity for each peak-trough pair
        for row in peaktrough_df.itertuples(index=False):
            peak_time, peak, trough_time, trough = (
                row.peak_time,
                row.peaks,
                row.trough_time,
                row.troughs,
            )

            # Define time range for displacement data extraction
            start_time, end_time = min(peak_time, trough_time), max(peak_time, trough_time)
            temp = cdisp_df.loc[:, start_time:end_time]

            if not temp.empty:
                # Split the rows across Dask partitions and fit every row
                # against the numeric time axis of the selected columns
                temp_dask = dd.from_pandas(temp, npartitions=20)
                cdisp_numeric_timerange = create_numeric_timerange(temp.columns)

                velocities_dask = temp_dask.map_partitions(
                    lambda df: df.apply(
                        lambda y: get_average_velocity(cdisp_numeric_timerange.values, y),
                        axis=1,
                    ),
                    meta=meta,
                ).compute()

                # Ensure the computed velocity data is not empty
                if not velocities_dask.empty:
                    # Store dates and differences in the dictionary
                    velocity_dict["date"].append(f"{start_time.strftime('%Y%m%d')}_{end_time.strftime('%Y%m%d')}")
                    velocity_dict["abs_difference"].append(abs(peak - trough))
                    velocity_rows.append(velocities_dask.to_numpy().reshape(1, -1))

        # Stack the per-pair rows into a (n_pairs, n_rows) matrix; stays None
        # when no pair produced velocities, as before
        velocity_dict["linear_velocity"] = np.concatenate(velocity_rows, axis=0) if velocity_rows else None
        processed_data_byStation[select_wellcode] = {"peaks_troughs": velocity_dict}

    try:
        # ___________________ SECTION: Save Processed Data ______________________
        # Write this station's results to its own dated HDF5 file
        savename = f"{datetime.now().strftime('%Y%m%d')}_{select_station}.h5"
        savepath = os.path.join(output_dir, savename)
        with h5py.File(savepath, "w") as hdf5_file:
            gwatertools.h5pytools.data_to_hdf5(hdf5_file, processed_data_byStation)
        # Only stations that were written successfully are kept in memory
        processed_data.update({select_station: processed_data_byStation})
    except Exception as e:
        # Best effort: remember the station and keep going, but say why it failed
        error_stations.append(select_station)
        print(f"Failed to save station {select_station}: {type(e).__name__}: {e}")

# Processed data is now stored in processed_data dictionary for further use or analysis

Processing Stations:   0%|          | 0/1 [00:00<?, ?it/s]

Processing Wellcodes in ANHE:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# ________________________ SECTION: Final Output Write __________________________

# File whose contents are merged with the freshly computed results.
# This cell uses its own names (target_*) instead of rebinding
# `gwl_hdf5_file` / `existing_data_dict`: the loading and station-processing
# cells read those names, and overwriting them here would make a re-run of
# the processing cell silently operate on this file's data instead.
target_hdf5_file = "20240904_GWL_Drop_and_Velocity.h5"

# Extract existing data and metadata
with h5py.File(target_hdf5_file, "r") as hdf5_file:
    target_data_dict = gwatertools.h5pytools.hdf5_to_data_dict(hdf5_file)
    existing_metadata_dict = gwatertools.h5pytools.hdf5_to_metadata_dict(hdf5_file)

# Update dictionaries with new measurement data
updated_data_dict = gwatertools.h5pytools.update_data_dict(target_data_dict, processed_data)

# Write updated data and metadata to a file named after today's date.
# NOTE(review): opened in "w" mode, so a file of the same name already written
# today (or the input file, if the dates coincide) is replaced.
output_file_name = f"{datetime.now().strftime('%Y%m%d')}_GWL_Drop_and_Velocity.h5"
with h5py.File(output_file_name, "w") as hdf5_file:
    gwatertools.h5pytools.metadata_to_hdf5(hdf5_file, existing_metadata_dict)
    gwatertools.h5pytools.data_to_hdf5(hdf5_file, updated_data_dict)