In [None]:
from joblib import Parallel, delayed
from my_packages import *

from appgeopy import *

2024/09/19 - **Part 2**

1. Extract the dataframe from part 1

2. Calculate the linear velocities of cumulative displacements corresponding to peaks and troughs

3. Save to HDF5 file

In [None]:
# Element-wise UTF-8 decoder: turns an array of byte strings (e.g. stored
# date stamps) into an array of Python text strings of the same shape.
def _decode_utf8(raw_bytes):
    return raw_bytes.decode("utf-8")


string_decode_func = np.vectorize(_decode_utf8)

# ________________________________________________________________________________


def transform_to_dataframe(dict_byWellCode, extreme_type):
    """
    Build a DataFrame from the peak or trough records stored for one well.

    Parameters:
    - dict_byWellCode (dict): Mapping that holds, under the key given by
      `extreme_type`, a sub-mapping with "date" and "value" entries.
      The dates are byte strings that decode (UTF-8) to "YYYYMMDD".
    - extreme_type (str): Either "peaks" or "troughs".

    Returns:
    - pd.DataFrame: Columns "time" (parsed datetimes), "value" and "Type"
      (the `extreme_type` label repeated on every row).

    Raises:
    - ValueError: If `extreme_type` is neither "peaks" nor "troughs".
    """
    # Reject anything other than the two supported categories up front
    if extreme_type not in ("peaks", "troughs"):
        raise ValueError("extreme_type must be either 'peaks' or 'troughs'")

    records = dict_byWellCode[extreme_type]

    # Byte-string dates -> text -> pandas datetimes
    decoded_dates = string_decode_func(records["date"])
    timestamps = pd.to_datetime(decoded_dates, format="%Y%m%d")

    frame = pd.DataFrame({"time": timestamps, "value": records["value"]})
    frame["Type"] = extreme_type
    return frame


# ________________________________________________________________________________


def create_numeric_timerange(insar_datetime, freq="D"):
    """
    Map InSAR acquisition dates to their position on a regular calendar.

    A regular date range is built from the earliest to the latest acquisition
    at the requested frequency; each acquisition is then looked up in that
    range, so the result is the number of `freq` steps elapsed since the
    first acquisition.

    Parameters:
    - insar_datetime (pd.Series or pd.DatetimeIndex): InSAR datetime values.
      Missing values are dropped and the remaining dates are sorted.
    - freq (str): Step of the regular calendar, default is daily ('D').

    Returns:
    - pd.Series: Integer positions, indexed by the sorted acquisition dates.
    """
    # Work on a clean, chronologically ordered copy of the acquisitions
    acquisitions = insar_datetime.dropna().sort_values()

    # Regular calendar spanning first to last acquisition
    first_date = acquisitions.min()
    last_date = acquisitions.max()
    calendar = pd.date_range(start=first_date, end=last_date, freq=freq)

    # Position of every calendar date: 0, 1, 2, ...
    position_by_date = pd.Series(range(len(calendar)), index=calendar)

    # Keep only the positions of the dates that were actually acquired
    return position_by_date[acquisitions]


# ________________________________________________________________________________
def get_average_velocity(x, y):
    """
    Estimate an average velocity from a degree-1 polynomial trend of y over x.

    Parameters:
    - x (array-like): Array of x values (time indices).
    - y (array-like): Array of y values (displacement data).

    Returns:
    - float: The last fitted coefficient multiplied by 365.25, or NaN when
      the trend fit raises a ValueError.
    """
    # NOTE(review): 365.25 presumably converts a per-day rate to a per-year
    # rate, i.e. x is expected in days — confirm against the caller.
    days_per_year = 365.25
    try:
        _, coeff = analysis.get_polynomial_trend(x, y, 1)
        # NOTE(review): assumes the last coefficient is the slope — confirm
        # the coefficient ordering returned by analysis.get_polynomial_trend.
        slope = coeff[-1]
        return slope * days_per_year
    except ValueError as e:
        # Best effort: report the failure and let the caller carry on with NaN
        print(f"Error computing velocity for y={y}: {e}")
        return np.nan

In [None]:
# Load the zipped pickle holding the cumulative-displacement table
# (presumably the output of part 1 of this workflow).
# NOTE(review): unpickling can execute arbitrary code — only load files
# that come from a trusted source.
cdisp_pickle_path = r"20240919_CDISP_byPeaksTroughs_allStations_store.pkl"
cdisp_peakstroughs_df = pd.read_pickle(cdisp_pickle_path, compression="zip")
cdisp_peakstroughs_df.head(5)

In [None]:
# Distinct values of the "Station" column; preview the first five
station_column = cdisp_peakstroughs_df["Station"]
available_stations = station_column.unique()
available_stations[:5]

In [None]:
# Number of joblib workers used for the per-row velocity fits.
# Set to -1 to let joblib use every available core.
N_JOBS = 8

# One nested dict {station: {wellcode: {"velocities": {...}}}} per (station, well)
data_dict_cache = []

for select_station in tqdm(available_stations):
    df_byStation = cdisp_peakstroughs_df.query("Station==@select_station")

    wellcodes = df_byStation["WellCode"].unique()

    for select_wellcode in wellcodes:
        # Rows of this well only, re-indexed 0..n-1
        dfStation_byWellCode = df_byStation.query("WellCode==@select_wellcode").reset_index(drop=True)

        # Encode every entry of "peaktrough_pairs" as fixed-width "YYYYMMDD"
        # byte strings, stacked into one 2-D array (one row per table row).
        peaktrough_datetime_array = np.vstack(
            [
                np.array([elem.strftime("%Y%m%d") for elem in pair])
                for pair in dfStation_byWellCode["peaktrough_pairs"]
            ]
        ).astype("S8")

        velocity_array_cache = []

        for cdisp_byPeaksTroughs in dfStation_byWellCode["cdisp_df"]:
            # Time axis: position of each column date on a daily calendar.
            # NOTE(review): create_numeric_timerange drops missing dates and
            # sorts them, so this assumes the columns of cdisp_df are already
            # in chronological order without gaps in labelling — confirm.
            numeric_datetime = create_numeric_timerange(cdisp_byPeaksTroughs.columns)
            # Convert once per frame instead of once per row
            time_axis = numeric_datetime.to_numpy()

            # Reference every row to its value in the first column
            df_to_calculate = cdisp_byPeaksTroughs.subtract(cdisp_byPeaksTroughs.iloc[:, 0], axis=0)

            # Fit every row in parallel. The time axis is passed explicitly so
            # the worker function does not depend on a global that changes
            # from one loop iteration to the next.
            result = Parallel(n_jobs=N_JOBS)(
                delayed(get_average_velocity)(time_axis, row) for _, row in df_to_calculate.iterrows()
            )

            # One velocity per row of the displacement frame
            velocity_arr = np.array(result)
            velocity_array_cache.append(velocity_arr)

        # Stack so that each row corresponds to one row of dfStation_byWellCode
        output_velocity_arr = np.vstack(velocity_array_cache)

        output_data_dict = {
            select_station: {
                select_wellcode: {"velocities": {"date": peaktrough_datetime_array, "value": output_velocity_arr}}
            }
        }

        data_dict_cache.append(output_data_dict)

In [None]:
# Existing HDF5 store that the new results will be merged into
gwl_hdf5_file = "20240919_GWL_CRFP_peakstroughs.h5"

# Read the stored data and metadata into plain dictionaries (read-only access)
h5tools = gwatertools.h5pytools
with h5py.File(gwl_hdf5_file, mode="r") as hdf5_file:
    existing_data_dict = h5tools.hdf5_to_data_dict(hdf5_file)
    existing_metadata_dict = h5tools.hdf5_to_metadata_dict(hdf5_file)

In [None]:
# Metadata entry describing what this processing step adds
new_metadata_dict = {
    "Update005": "2024/09/19 : Calculate linear velocities of all PS points for each peak-trough time range"
}

# Combine the per-(station, well) dictionaries collected in data_dict_cache
new_data_dict = gwatertools.h5pytools.merge_dicts(*data_dict_cache)

In [None]:
# Combine the content loaded from disk with the newly computed entries
h5tools = gwatertools.h5pytools
updated_data_dict = h5tools.update_data_dict(existing_data_dict, new_data_dict)
updated_metadata_dict = h5tools.update_metadata_dict(existing_metadata_dict, new_metadata_dict)

In [None]:
# Prefix the output file name with today's date (YYYYMMDD)
today_string = datetime.now().strftime("%Y%m%d")

# Write the merged data together with the UPDATED metadata. The previous
# version wrote `existing_metadata_dict`, which left `updated_metadata_dict`
# unused, so the new metadata entry was not passed on to the output file.
with h5py.File(f"{today_string}_GWL_peakstroughs&velocities.h5", "w") as hdf5_file:
    gwatertools.h5pytools.metadata_to_hdf5(hdf5_file, updated_metadata_dict)
    gwatertools.h5pytools.data_to_hdf5(hdf5_file, updated_data_dict)