In [1]:
from my_packages import *

from appgeopy import *

2024/09/19 - **Part 1**

1. For each well code, store the cumulative-displacement tables that correspond to its peak-trough pairs

2. All tables are collected into a single DataFrame, which is saved as a zip-compressed pickle file

3. Another script then loads this DataFrame to calculate linear velocities and save them to an HDF5 file

In [2]:
# Element-wise decoder turning an array of UTF-8 byte strings (such as the
# date strings read from the HDF5 file) into Python/NumPy text strings.
# `otypes` is given explicitly so the call also works on size-0 input: without
# it np.vectorize has to probe the first element to infer the output type and
# raises ValueError when there is no element to probe.
string_decode_func = np.vectorize(lambda x: x.decode("utf-8"), otypes=[str])

# ________________________________________________________________________________


def transform_to_dataframe(dict_byWellCode, extreme_type):
    """
    Build a table of one kind of extreme (peaks or troughs) for a well code.

    Parameters:
    - dict_byWellCode (dict): Mapping that holds an entry named after
      `extreme_type`; that entry provides a "date" array of UTF-8 byte strings
      in YYYYMMDD form and a "value" array.
    - extreme_type (str): Either "peaks" or "troughs".

    Returns:
    - pd.DataFrame: Columns "time" (parsed dates), "value" and "Type" (the
      requested extreme_type repeated on every row).

    Raises:
    - ValueError: If extreme_type is neither "peaks" nor "troughs".
    """
    allowed_types = ("peaks", "troughs")
    if extreme_type not in allowed_types:
        raise ValueError("extreme_type must be either 'peaks' or 'troughs'")

    extreme_records = dict_byWellCode[extreme_type]

    # Dates are stored as bytes; decode them before parsing
    decoded_dates = string_decode_func(extreme_records["date"])

    table = pd.DataFrame(
        {
            "time": pd.to_datetime(decoded_dates, format="%Y%m%d"),
            "value": extreme_records["value"],
        }
    )
    table["Type"] = extreme_type
    return table


# ________________________________________________________________________________


def create_numeric_timerange(insar_datetime, freq="D"):
    """
    Map each InSAR date to its position on a regular time grid.

    A regular date range is generated from the earliest to the latest input
    date at the given frequency, its steps are numbered 0, 1, 2, ..., and the
    numbers belonging to the input dates are returned.

    Parameters:
    - insar_datetime (pd.Series or pd.DatetimeIndex): InSAR datetime values.
      Null entries are dropped and the remainder is sorted before use.
    - freq (str): Frequency for generating the full time range, default is daily ('D').

    Returns:
    - pd.Series: Integer grid positions indexed by the sorted, non-null InSAR
      dates. An empty int64 Series is returned when there are no valid dates.

    Notes:
    - Every input date has to fall exactly on the generated grid; the final
      label lookup is expected to raise KeyError otherwise (for example when
      `freq` is coarser than the spacing of the dates).
    """
    # Ensure the values are sorted and contain no nulls
    insar_datetime = insar_datetime.dropna().sort_values()

    # Without any valid date min()/max() are NaT and pd.date_range cannot
    # build a range, so return an empty result instead of failing
    if len(insar_datetime) == 0:
        return pd.Series([], index=pd.DatetimeIndex([]), dtype="int64")

    # Regular grid spanning the whole observation period
    full_timerange = pd.date_range(start=insar_datetime.min(), end=insar_datetime.max(), freq=freq)

    # Number every step of the grid
    numeric_indices = pd.Series(range(len(full_timerange)), index=full_timerange)

    # Keep only the grid positions that correspond to the original InSAR dates
    return numeric_indices[insar_datetime]


# ________________________________________________________________________________

In [3]:
# Open the HDF5 file read-only and pull out its dataset listing and data

# File name is resolved relative to the current working directory
gwl_hdf5_file = "20240919_GWL_CRFP_peakstroughs.h5"

with h5py.File(gwl_hdf5_file, "r") as hdf5_file:
    # presumably the "/"-separated dataset paths found in the file (they are
    # split on "/" further down) - TODO confirm against gwatertools
    datasets = gwatertools.h5pytools.list_datasets(hdf5_file)
    # presumably a nested dict mirroring the file's groups and datasets (it is
    # read with .get() and nested keys further down) - TODO confirm
    existing_data_dict = gwatertools.h5pytools.hdf5_to_data_dict(hdf5_file)

In [4]:
# get InSAR datetime array
# The "InSAR_datetime" entry is decoded from UTF-8 byte strings and parsed as
# YYYYMMDD dates

insar_datetime = pd.to_datetime(string_decode_func(existing_data_dict["InSAR_datetime"]), format="%Y%m%d")
# Preview the first five entries
insar_datetime[:5]

DatetimeIndex(['2016-05-08', '2016-05-20', '2016-06-01', '2016-06-13',
               '2016-07-07'],
              dtype='datetime64[ns]', freq=None)

In [5]:
# Station names are the first "/"-separated component of every dataset path
# that does not contain the substring "date"; duplicates are removed and the
# names are sorted alphabetically
available_stations = sorted(
    {path.split("/")[0] for path in datasets if "date" not in path}
)
available_stations[:5]

['ANHE', 'ANNAN', 'BEIGANG', 'BOZI', 'CAICUO']

In [9]:
# Result table for all stations and well codes (starts out empty)
cdisp_allWellCode = pd.DataFrame()

# Helper function to process peak-trough pairs and cumulative displacement
def process_peak_trough_pairs(cdisp_df, combined, min_epochs=6):
    """
    Slice the cumulative displacement table for consecutive pairs of extremes.

    Rows of `combined` are paired by position: (0, 1), (2, 3), ... The first
    row of each pair is treated as the peak and the second as the trough.
    A trailing unpaired row (odd number of extremes) is ignored.

    NOTE(review): the "Type" column is not consulted, so the peak/trough
    labelling of each pair is only correct if `combined` is sorted by time and
    alternates peak, trough, ... starting with a peak - confirm upstream.

    Parameters:
    - cdisp_df (pd.DataFrame): Cumulative displacement table whose column
      labels are the datetimes used for slicing.
    - combined (pd.DataFrame): Table of extremes with a "time" column.
    - min_epochs (int): Minimum number of columns a time window must contain
      to be kept. The default of 6 reproduces the earlier rule of discarding
      windows with five or fewer columns.

    Returns:
    - pd.DataFrame: One row per retained pair with the columns
      "peaktrough_pairs" ([start time, end time]) and "cdisp_df" (the column
      slice of `cdisp_df` between the two times, both ends included).
    """
    cache = {"peaktrough_pairs": [], "cdisp_df": []}

    # Positional access keeps the pairing independent of the caller's index
    extreme_times = combined["time"]

    # Stop one short of the end so an odd-length table cannot index past its
    # last row
    for i in range(0, len(combined) - 1, 2):
        peak_time = extreme_times.iloc[i]
        trough_time = extreme_times.iloc[i + 1]

        # Get cumulative displacement data between peak and trough
        cdisp_byTime = cdisp_df.loc[:, peak_time:trough_time]

        # Skip empty windows and windows with too few columns
        if cdisp_byTime.empty or cdisp_byTime.columns.size < min_epochs:
            continue

        # Store the valid data
        cache["peaktrough_pairs"].append([peak_time, trough_time])
        cache["cdisp_df"].append(cdisp_byTime)

    return pd.DataFrame(cache)

# Loop through stations
# The per-well-code tables are collected in a list and concatenated once after
# the loop; calling pd.concat on the growing result in every iteration copies
# all previously accumulated rows again each time.
cdisp_tables = []

for select_station in tqdm(available_stations, desc="Processing Stations", leave=False):
    data_dict_byStation = existing_data_dict.get(select_station)

    # Skip names that do not map to a station group (missing or a plain dataset)
    if not isinstance(data_dict_byStation, dict):
        continue

    # Skip if no CDISP data is found in the station
    cdisp_arr = data_dict_byStation.get("CDISP")
    if cdisp_arr is None:
        continue

    # Well codes are the nested dict entries stored under the station
    wellcodes = [key for key, entry in data_dict_byStation.items() if isinstance(entry, dict)]

    # Cumulative displacement table of the station, one column per InSAR date
    cdisp_df = pd.DataFrame(data=cdisp_arr, columns=insar_datetime)

    # Loop through wellcodes
    for select_wellcode in wellcodes:
        data_dict_byWellCode = data_dict_byStation[select_wellcode]

        # Both kinds of extremes are needed to form peak-trough pairs
        if "peaks" not in data_dict_byWellCode or "troughs" not in data_dict_byWellCode:
            continue

        # Retrieve peaks and troughs
        peak_df = transform_to_dataframe(data_dict_byWellCode, "peaks")
        trough_df = transform_to_dataframe(data_dict_byWellCode, "troughs")

        # Combine peaks and troughs, sorted by time
        combined = pd.concat([peak_df, trough_df], ignore_index=True).sort_values(by="time").reset_index(drop=True)

        # Process peak-trough pairs and store cumulative displacement
        cdisp_df_byWellCode = process_peak_trough_pairs(cdisp_df, combined)

        # Add station and well code metadata
        cdisp_df_byWellCode["Station"] = select_station
        cdisp_df_byWellCode["WellCode"] = select_wellcode

        cdisp_tables.append(cdisp_df_byWellCode)

# Build the final table in one step; pd.concat rejects an empty list, so fall
# back to an empty DataFrame when nothing was collected
cdisp_allWellCode = pd.concat(cdisp_tables, ignore_index=True) if cdisp_tables else pd.DataFrame()

                                                                                                                       

In [10]:
# Prefix the output file name with the run date (YYYYMMDD) and write the
# combined table as a zip-compressed pickle
today_string = datetime.now().strftime("%Y%m%d")
output_pickle = f"{today_string}_CDISP_byPeaksTroughs_allStations_store.pkl"
cdisp_allWellCode.to_pickle(output_pickle, compression="zip")