Apply the new functions in **`h5pytools`** to merge newly downloaded groundwater-level records into the existing HDF5 file

In [1]:
from appgeopy import *
from my_packages import *

In [2]:
# Remember the directory the notebook was started from (script folder).
# This has to run before the later os.chdir() call: script_folder is reused
# further down to build the path of the HDF5 file that gets rewritten.
script_folder = os.getcwd()

In [3]:
# Define the path to the HDF5 file to be updated
gwl_hdf5_file = r"D:\1000_SCRIPTS\002_PostQE_Scripts\20240819_Save_GWL_HDF5\20240828_GWL_CRFP.h5"
# Sanity check: the cell output is True only when the file exists at that path
os.path.isfile(gwl_hdf5_file)

True

In [4]:
# Workbook with station / well information. Later cells read the columns
# ENAME, X_TWD97, Y_TWD97, ADDRESS, WELL_CODE, WELL_NAME, WELL_ELEV(m),
# WELL_DEPTH(m), WELL_SCREEN(m) and ACTIVE from it.
station_info_xlsx = r"D:\VINHTRUONG\004_MODELING\001_STUDY_AREA\GroundwaterObservation\@DOWNLOAD_WRA_GWOB_YEARBOOK_PROJECT\Well_Information_CRAF_Active_Inactive_OneSheetOnly.xlsx"
station_info_excel = pd.read_excel(station_info_xlsx)

In [5]:
# Create a backup of the original HDF5 file.
# An already existing backup is left untouched: re-running this cell after the
# HDF5 file has been modified would otherwise silently replace the pristine
# copy with the modified data.
gwl_hdf5_backup = gwl_hdf5_file.replace(".h5", "_secure.h5")
if not os.path.exists(gwl_hdf5_backup):
    shutil.copy2(src=gwl_hdf5_file, dst=gwl_hdf5_backup)

# List available datasets in the HDF5 file for reference and read the shared
# date axis (stored under "date" and parsed as YYYYMMDD).
with h5py.File(gwl_hdf5_file, "r") as hdf5_file:
    available_datasets = gwatertools.h5pytools.list_datasets(hdf5_file)
    datetime_idx = pd.to_datetime(hdf5_file["date"][...], format="%Y%m%d")

# The first path component of every dataset name is taken as the station name;
# duplicates are removed and the result is sorted.
stations_in_datasets = sorted(
    {ele.split("/")[0] for ele in available_datasets}
)

In [6]:
# Main folder containing the new groundwater data to be imported
new_data_mainfolder = r"D:\VINHTRUONG\004_MODELING\001_STUDY_AREA\GroundwaterObservation\@DOWNLOAD_WRA_GWOB_YEARBOOK_PROJECT\@groundwater_level_PDF\GW_DATA_gweb.wra.gov.tw\COMBINE\2023"
# Make it the working directory: the station folders are addressed by
# relative path from here on.
os.chdir(new_data_mainfolder)

# Names of the station folders: every directory entry of the working
# directory that is itself a directory (plain files are skipped).
stations_fld = list(filter(os.path.isdir, os.listdir()))

In [7]:
# Results of this cell, keyed by station name (ENAME, upper case):
#   all_stations_measurement_data[ename][wellcode]["measure"]["daily_value"] -> array on the HDF5 date axis
#   all_stations_metadata[ename] -> station-level attributes plus one dict per well code
all_stations_measurement_data = {}
all_stations_metadata = {}


def _observation_span(frame):
    """Return the first and last dates holding a valid value as "%Y/%m/%d" strings.

    `frame` is a date-indexed DataFrame. When it contains no valid value at
    all, both entries fall back to the string "null" (the placeholder this
    notebook already uses for missing metadata) instead of failing on
    `None.strftime`.
    """
    first_valid = frame.first_valid_index()
    last_valid = frame.last_valid_index()
    if first_valid is None or last_valid is None:
        return "null", "null"
    return first_valid.strftime("%Y/%m/%d"), last_valid.strftime("%Y/%m/%d")


def _align_to_timeline(df_new):
    """Map the first column of `df_new` onto the HDF5 date axis (`datetime_idx`).

    Returns a DataFrame indexed by `datetime_idx` with a single column
    "daily_value"; dates of the axis that are absent from `df_new` become NaN.
    """
    aligned = pd.DataFrame(index=datetime_idx)
    aligned["daily_value"] = aligned.index.map(df_new.iloc[:, 0])
    return aligned


# _____________________________________________
# Loop through each station directory specified in `stations_fld`
for select_station in tqdm(stations_fld):
    # Retrieve all Excel files for the current station
    # (os.path.join instead of a hard-coded "\\" keeps the pattern portable)
    files_byStation = glob(os.path.join(select_station, "*.xlsx"))
    # Split the station name into components (ename, cname, abbrev);
    # a folder name without exactly three "_"-separated parts raises ValueError
    ename, cname, abbrev = select_station.upper().split("_")

    # Check if the station is already available in the existing datasets
    flag = ename in stations_in_datasets

    # Initialize dictionaries to hold measurement data and metadata for the current station
    well_measurement_data = {}
    well_metadata = {"UpdatedTime": datetime.now().strftime("%Y/%m/%d, %H:%M:%S")}

    # _____________________________________________
    # If the station is available in the datasets, update it; otherwise, add new entries
    if flag:
        # Open the HDF5 file once per station (read mode) instead of once per well
        with h5py.File(gwl_hdf5_file, "r") as hdf5_file:
            # _____________________________________________
            # Loop through each file related to the current station
            for select_file in files_byStation:
                # Extract the well code from the filename (text before the first ".")
                wellcode = os.path.basename(select_file).split(".")[0]

                # Find the first dataset whose name contains the current well code
                # NOTE(review): this is a substring match over all stations; a well code
                # that is a prefix of another one could pick the wrong dataset - confirm.
                dataset_by_wellcode = next(
                    (ele for ele in available_datasets if wellcode in ele), None
                )

                # Load new monitoring data from the Excel file into a DataFrame
                df_NewData = pd.read_excel(
                    select_file, parse_dates=[0], index_col=[0]
                )

                if dataset_by_wellcode is None:
                    # The station exists but this well has no dataset yet: there is
                    # nothing to merge with, so only align the new values to the
                    # date axis (previously this case crashed on hdf5_file[None]).
                    df_combined = _align_to_timeline(df_NewData)
                else:
                    # Existing series of the well, indexed by the HDF5 date axis
                    df_fromHDF5 = pd.DataFrame(
                        {
                            "time": datetime_idx,
                            "daily_value": hdf5_file[dataset_by_wellcode][...],
                        }
                    ).set_index("time")

                    # Name the first Excel column like the HDF5 column so that
                    # fillna can align both frames on index and column
                    df_NewData = df_NewData.rename(
                        {df_NewData.columns[0]: "daily_value"}, axis=1
                    )

                    # Combine existing data (from HDF5) with new data (from Excel).
                    # Existing values are kept; the Excel values only fill dates that
                    # are NaN in the HDF5 series.
                    # NOTE(review): an earlier comment claimed that new data takes
                    # precedence, which is not what fillna does - confirm the intent.
                    df_combined = df_fromHDF5.fillna(df_NewData)

                # _____________________________________________
                # Update the well measurement data dictionary with combined data
                well_measurement_data[wellcode] = {
                    "measure": {"daily_value": df_combined["daily_value"].values}
                }
                # _____________________________________________
                # Update well metadata with the first and last observation dates
                first_obs, last_obs = _observation_span(df_combined)
                well_metadata[wellcode] = {
                    "FIRST_OBS": first_obs,
                    "LAST_OBS": last_obs,
                }
    else:
        # _____________________________________________
        # If station is not available in the datasets, initialize metadata from an external source
        station_info = station_info_excel.query("ENAME == @ename.lower()")

        # Initialize station metadata, defaulting to "null" if no station info is found
        if station_info.empty:
            station_metadata = {"metadata": "null"}
        else:
            # Extract relevant station info such as coordinates, address, etc.
            x_twd97, y_twd97 = station_info.iloc[0][["X_TWD97", "Y_TWD97"]]
            address = station_info["ADDRESS"].iloc[0]
            num_of_wells = len(files_byStation)

            # Populate station metadata with details
            station_metadata = {
                "Chinese": cname,
                "Abbreviation": abbrev,
                "EPSG": 3826,
                "X": x_twd97,
                "Y": y_twd97,
                "BasinENG": "Choshuichi Fan",
                "BasinCHN": "濁水溪沖積扇",
                "Num_of_Wells": num_of_wells,
                "Address": address,
                "CreatedTime": datetime.now().strftime("%Y/%m/%d, %H:%M:%S"),
            }

        # Add the station-level attributes to this station's metadata
        well_metadata.update(station_metadata)
        # _____________________________________________
        # Loop through each file related to the current station to handle well-specific information
        for select_file in files_byStation:
            # Extract the well code from the filename
            wellcode = os.path.basename(select_file).split(".")[0]
            # Look up the well in the station rows. The comparison is done on
            # strings because `wellcode` comes from a file name, while read_excel
            # may have parsed the WELL_CODE column as integers, in which case a
            # direct comparison would never match.
            well_info = station_info[
                station_info["WELL_CODE"].astype(str) == wellcode
            ]

            # Initialize well metadata based on the presence of well info
            if well_info.empty:
                well_metadata[wellcode] = {"metadata": "null"}
            else:
                well_metadata[wellcode] = {
                    "WellName": well_info["WELL_NAME"].iloc[0],
                    "Well_Elev(m)": well_info["WELL_ELEV(m)"].iloc[0],
                    "Well_Depth(m)": well_info["WELL_DEPTH(m)"].iloc[0],
                    "Well_Screen(m)": well_info["WELL_SCREEN(m)"].iloc[0],
                    "Status": "Active"
                    if well_info["ACTIVE"].iloc[0] == 1
                    else "Inactive",
                }
            # _____________________________________________
            # Load new monitoring data from the Excel file and align it to the date axis
            df_NewData = pd.read_excel(
                select_file, parse_dates=[0], index_col=[0]
            )
            temp = _align_to_timeline(df_NewData)
            # _____________________________________________
            # Update well metadata with observation dates based on the aligned data
            first_obs, last_obs = _observation_span(temp)
            well_metadata[wellcode].update(
                {
                    "FIRST_OBS": first_obs,
                    "LAST_OBS": last_obs,
                }
            )
            # _____________________________________________
            # Update well measurement data with the aligned values
            well_measurement_data[wellcode] = {
                "measure": {"daily_value": temp["daily_value"].values}
            }

    # _____________________________________________
    # Update the overall measurement and metadata dictionaries with current station's data
    all_stations_measurement_data[ename] = well_measurement_data
    all_stations_metadata[ename] = well_metadata

100%|██████████████████████████████████████████████████████████████████████████████████| 92/92 [00:07<00:00, 12.04it/s]


#### Update the HDF5 file with new data from earlier dictionaries.

In [14]:
# Spot check: combined daily values of every well of station ANHE
all_stations_measurement_data['ANHE']

{'10070111': {'measure': {'daily_value': array([1.86, 1.67, 1.17, ...,  nan,  nan,  nan])}},
 '10070121': {'measure': {'daily_value': array([-7.6 , -7.72, -7.81, ...,   nan,   nan,   nan])}},
 '10070131': {'measure': {'daily_value': array([-10.89, -11.02, -11.31, ...,    nan,    nan,    nan])}},
 '10070141': {'measure': {'daily_value': array([-12.73, -12.73, -12.71, ...,    nan,    nan,    nan])}}}

In [15]:
# Spot check: update timestamp and per-well observation range of station ANHE
all_stations_metadata['ANHE']

{'UpdatedTime': '2024/08/29, 11:54:19',
 '10070111': {'FIRST_OBS': '2001/01/01', 'LAST_OBS': '2024/08/20'},
 '10070121': {'FIRST_OBS': '2001/01/01', 'LAST_OBS': '2024/08/20'},
 '10070131': {'FIRST_OBS': '2001/01/01', 'LAST_OBS': '2024/08/20'},
 '10070141': {'FIRST_OBS': '2001/01/01', 'LAST_OBS': '2024/08/20'}}

In [8]:
# HDF5 file that is read here and rewritten at the end of the notebook.
# NOTE(review): the merge loop read its existing values from `gwl_hdf5_file`
# (absolute path), while this path is built from `script_folder`; presumably
# both point to the same file - confirm.
HDF5_fpath = os.path.join(script_folder, r"20240828_GWL_CRFP.h5")

# Back up the file before it is modified. An already existing backup is kept:
# re-running this cell after the file has been rewritten would otherwise
# replace the pristine copy with the updated data.
HDF5_backup_fpath = HDF5_fpath.replace(".h5", "_secure.h5")
if not os.path.exists(HDF5_backup_fpath):
    shutil.copy2(src=HDF5_fpath, dst=HDF5_backup_fpath)

# Extract existing data and metadata
with h5py.File(HDF5_fpath, "r") as hdf5_file:
    existing_data_dict = gwatertools.h5pytools.hdf5_to_data_dict(hdf5_file)
    existing_metadata_dict = gwatertools.h5pytools.hdf5_to_metadata_dict(
        hdf5_file
    )

In [9]:
# Update dictionaries
updated_data_dict = gwatertools.h5pytools.update_data_dict(
    existing_data_dict, all_stations_measurement_data
)
updated_metadata_dict = gwatertools.h5pytools.update_metadata_dict(
    existing_metadata_dict, all_stations_metadata
)

In [10]:
# Write updated data and metadata back to the HDF5 file
with h5py.File(HDF5_fpath, "w") as hdf5_file:
    metadata_to_hdf5(hdf5_file, updated_metadata_dict)
    data_to_hdf5(hdf5_file, updated_data_dict)