**2024/10/22**

- Part 1: This script generates an HDF5 file containing the measurement values from the MLCWs
- Part 2: Add the wells' metadata and write the result to a new HDF5 file (`*_v2.h5`)

In [1]:
import pinyin
from appgeopy import *
from my_packages import *

# Raise the pandas display limits so that up to 500 columns/rows are shown
# in the notebook output before pandas truncates the view.
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)

#### PART 1

In [2]:
def differ_to_ref(series, convert_to=None):
    """
    Calculate differential values relative to the first measurement in the series.

    Parameters:
    - series (pd.Series): A pandas Series containing the measurements.
    - convert_to (str or None): Unit to convert the differential values to.
                                Accepts 'milimeter' (legacy spelling), 'millimeter',
                                'centimeter', or None to keep the input unit
                                (treated as meters by the multipliers below).

    Returns:
    - np.ndarray: float64 array of differential values in the requested unit.
                  An empty series yields an empty array.

    Raises:
    - ValueError: If 'convert_to' is not one of the accepted values.

    """
    # Conversion multipliers from the input unit (meters) to the requested unit.
    unit_multipliers = {
        None: 1,  # Keep the original unit (meters).
        "centimeter": 100,
        "milimeter": 1000,  # Legacy spelling, kept for backward compatibility.
        "millimeter": 1000,
    }
    try:
        multiplier = unit_multipliers[convert_to]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments, which are just as invalid as unknown strings.
        raise ValueError(
            "Invalid 'convert_to' value. Must be 'milimeter', 'centimeter', or None for meters."
        ) from None

    # Nothing to reference against: return an empty result instead of failing on iloc[0].
    if len(series) == 0:
        return np.array([], dtype=np.float64)

    # Calculate differential values relative to the first measurement and apply conversion.
    # float64 is used on purpose: float16 overflows to inf above 65504 and keeps only
    # about three significant digits, which silently corrupts converted values.
    return np.array((series - series.iloc[0]) * multiplier, dtype=np.float64)


# -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
def cdisp_from_base(input_array):
    """
    Accumulate values from the last element towards the first.

    Element ``i`` of the result is the sum of ``input_array[i:]``, with NaN
    entries counted as zero. The accumulation is carried out in float64.

    Parameters:
    - input_array: A one-dimensional sequence supporting ``[::-1]`` slicing
                   (e.g. np.ndarray, pd.Series or list).

    Returns:
    - np.ndarray: Cumulative sums in the same order as the input.

    """
    # Walk the sequence from its last entry so the running total starts at the end.
    bottom_up = input_array[::-1]
    running_total = np.nancumsum(bottom_up, dtype=np.float64)
    # Flip back so the output lines up with the original ordering.
    return running_total[::-1]


# -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
def numeric_depth(input_array):
    """
    Convert depth labels into numeric depths rounded to three decimals.

    For each label, the text after the last underscore and before the first
    "m" is parsed as a float (e.g. "ring_12.3456m" -> 12.346).

    Parameters:
    - input_array (iterable of str): Depth labels.

    Returns:
    - np.ndarray: Parsed depths rounded to 3 decimal places.

    Raises:
    - ValueError: If the extracted text is not a valid number.

    """
    # float() instead of eval(): no arbitrary code execution from label text,
    # and zero-padded numbers such as "007" parse correctly.
    depths = [float(label.split("_")[-1].split("m")[0]) for label in input_array]
    return np.round(depths, 3)

In [None]:
data_dir = r"E:\SUBSIDENCE_PROJECT_DATA\地陷資料整理\地陷井\監測井_資料清理結果"

# Retrieve all well data file paths
data_fpath = glob(os.path.join(data_dir, "*.xz"))

all_mlcw_station = []

# Debugging tip: to process a single well, iterate over e.g. [data_fpath[5]] instead.

# Loop through each well data file for processing.
for select_fpath in tqdm(data_fpath):

    # Extract the basename of the file without the extension (e.g., "僑義").
    basename = os.path.basename(select_fpath).split(".")[0]

    # Convert the Chinese basename to its pinyin equivalent and convert it to uppercase.
    # This is used as the station key in the output dictionary.
    ename = pinyin.get(basename, format="strip").upper()

    # Read the well data from the .xz file using pandas, resulting in a DataFrame.
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    well_df = pd.read_pickle(select_fpath)
    # The index holds the measurement dates; make sure it is a datetime index.
    well_df.index = pd.to_datetime(well_df.index)

    # Differences between adjacent columns. The columns are the depth labels (they
    # become the "Depth" axis after the transpose below), so each value isolates
    # the measurement of a single interval between neighbouring rings.
    column_diffs = well_df.diff(axis=1)
    # Preserve the first column of original values as there is no prior column to compare with.
    column_diffs.iloc[:, 0] = well_df.iloc[:, 0]

    # Fill missing values by carrying the last valid observation forward along the
    # date axis. `DataFrame.ffill` replaces the deprecated `fillna(method="ffill")`.
    filled_diffs = column_diffs.ffill(axis=0)

    # Express every column relative to its value on the first date, then transpose
    # so that rows are depths and columns are dates.
    diffs_ref2first_transposed = filled_diffs.apply(differ_to_ref, axis=0).transpose()

    ## Cumulative Compaction Calculation ##
    # For each date (column), accumulate the values from the last depth row up to the first.
    cdisp_ref2base = diffs_ref2first_transposed.apply(cdisp_from_base, axis="index")

    # Convert the datetime column labels into strings using the project helper
    # (expected to yield labels such as "Nyyyymmdd"; see the "N" handling below).
    cdisp_ref2base.columns = [datetime_handle.datetime_to_string(x) for x in cdisp_ref2base.columns]

    # Move the depth labels from the index into a "Depth" column.
    # Naming the axis first makes this independent of any pre-existing index name.
    cdisp_ref2base_final = cdisp_ref2base.rename_axis("Depth").reset_index(drop=False)

    # Convert the "Depth" labels to numeric values.
    mlcw_depth_arr = numeric_depth(cdisp_ref2base_final["Depth"].tolist())

    # Extract the date strings, dropping the leading "N" marker.
    # Columns that do not start with "N" are skipped.
    mlcw_datetime_arr = [col[1:] for col in cdisp_ref2base.columns if col.startswith("N")]

    # Prepare a dictionary structure to store processed well data for easy access.
    # `col_diff` keeps the unfilled adjacent-column differences (depth x date);
    # `ref2base` holds the cumulative values computed above.
    station_data = {
        ename: {
            "values": {"col_diff": column_diffs.transpose().to_numpy(), "ref2base": cdisp_ref2base.to_numpy()},
            "date": mlcw_datetime_arr,
            "depth": mlcw_depth_arr,
        }
    }

    # Append the processed data for the current well to the main list.
    all_mlcw_station.append(station_data)
# -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
# Merge the per-station dictionaries into a single dictionary.
all_mlcw_station_ddict = gwatertools.merge_dicts(*all_mlcw_station)
# -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
today_string = datetime.now().strftime("%Y%m%d")

# Write the measurement data to a new date-stamped HDF5 file.
# Only data is written here; the metadata is added in Part 2.
with h5py.File(f"{today_string}_MLCW_CRFP.h5", "w") as hdf5_file:
    gwatertools.h5pytools.data_to_hdf5(hdf5_file, all_mlcw_station_ddict)

#### PART 2

In [None]:
# Excel workbook with the well metadata; its rows are matched to the data
# files through the "檔案名稱" (file name) column further below.
metadata_fpath = r"E:\SUBSIDENCE_PROJECT_DATA\地陷資料整理\地陷井\well_meta.xlsx"

df = pd.read_excel(metadata_fpath)
# Show the available column names as the cell output.
df.columns

In [None]:
# Metadata columns that are extracted for each well and stored in the HDF5 file.
select_columns = [
    "CountyName",
    "GroundWaterZoneCode",
    "GroundWaterZoneName",
    "LandSubsidenceMonitoringWellIdentifier",
    "LandSubsidenceMonitoringWellName",
    "LocationByWGS84_Latitude",
    "LocationByWGS84_Longitude",
    "SetTime",
    "TownName",
    "檔案名稱",  # file name; used to match a metadata row to a data file
    "表格對照",  # presumably a table cross-reference; TODO confirm meaning
]

In [None]:
# -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
# The metadata collection starts with a file-level description entry;
# one {station: metadata} dictionary per well is appended in the loop below.
all_mlcw_station = [
    {
        "Description": """2024/10/22: Convert MLCW data into HDF5 format along with well's metadata. `col_diff`: differential values between rings to isolate the measurements of each single ring. `ref2base`: cumulative compaction reference to the base of the well"""
    }
]
# -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

data_dir = r"E:\SUBSIDENCE_PROJECT_DATA\地陷資料整理\地陷井\監測井_資料清理結果"

# Retrieve all well data file paths
data_fpath = glob(os.path.join(data_dir, "*.xz"))

for select_fpath in tqdm(data_fpath):
    # Same station naming as in Part 1: file stem -> upper-case pinyin.
    basename = os.path.basename(select_fpath).split(".")[0]
    ename = pinyin.get(basename, format="strip").upper()

    # Metadata rows whose file-name column equals this well's file stem,
    # renumbered from 0 so the first match can be addressed by label 0.
    df_byBaseName = df.query("檔案名稱==@basename").loc[:, select_columns].reset_index(drop=True)

    # No metadata for this well: report the file path and move on.
    if df_byBaseName.empty:
        print(select_fpath)
        continue

    # Keep the first matching row as a flat {column: value} dictionary.
    ddict_byBaseName = {key: value[0] for key, value in df_byBaseName.to_dict().items()}

    all_mlcw_station.append({ename: ddict_byBaseName})

# -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
all_mlcw_station_metadata = gwatertools.merge_dicts(*all_mlcw_station)
# -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
# NOTE(review): the input file name is pinned to 2024-10-22, while Part 1 writes a
# file stamped with the current date. Confirm that this is the file to read when
# re-running on another day.
gwl_hdf5_file = r"20241022_MLCW_CRFP.h5"
# Read back the data and metadata already stored in the Part 1 file.
with h5py.File(gwl_hdf5_file, "r") as hdf5_file:
    existing_data_dict = gwatertools.h5pytools.hdf5_to_data_dict(hdf5_file)
    existing_metadata_dict = gwatertools.h5pytools.hdf5_to_metadata_dict(hdf5_file)
# -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
# Combine the stored metadata with the per-well metadata collected above.
updated_metadata_dict = gwatertools.h5pytools.update_metadata_dict(existing_metadata_dict, all_mlcw_station_metadata)
# -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
today_string = datetime.now().strftime("%Y%m%d")

# Write the unchanged data together with the updated metadata to a new "_v2" file.
with h5py.File(f"{today_string}_MLCW_CRFP_v2.h5", "w") as hdf5_file:
    gwatertools.h5pytools.metadata_to_hdf5(hdf5_file, updated_metadata_dict)
    gwatertools.h5pytools.data_to_hdf5(hdf5_file, existing_data_dict)