In [None]:
import logging

from appgeopy import *
from my_packages import *

In [None]:
# Record the working directory at the time this cell runs (the script folder on a fresh kernel).
# NOTE: this cell later calls os.chdir(), so re-running it without restarting the
# kernel stores the data folder here instead of the script folder.
script_folder = os.getcwd()

# SECTION 1: Setup and Backup
# ----------------------------
# Path to the HDF5 file that will be updated in place
gwl_hdf5_file = r"D:\1000_SCRIPTS\002_PostQE_Scripts\20240819_Save_GWL_HDF5\20240826_GWL_CRFP.h5"

# Create a backup copy next to the original: "<name>_secure<ext>".
# The name is derived from the real file extension rather than by substring
# replacement, so a ".h5" occurring elsewhere in the path is never rewritten.
# NOTE: shutil.copy2 overwrites an existing destination, so every run of this cell
# replaces the previous backup with the current state of the file.
backup_root, backup_ext = os.path.splitext(gwl_hdf5_file)
gwl_hdf5_backup = f"{backup_root}_secure{backup_ext}"
shutil.copy2(src=gwl_hdf5_file, dst=gwl_hdf5_backup)

# List available datasets in the HDF5 file for reference (read-only access)
with h5py.File(gwl_hdf5_file, "r") as hdf5_file:
    available_datasets = gwatertools.h5pytools.list_datasets(hdf5_file)

# SECTION 2: Data Collection Setup
# --------------------------------
# Main folder containing the new groundwater data to be imported
new_data_mainfolder = r"D:\VINHTRUONG\004_MODELING\001_STUDY_AREA\GroundwaterObservation\@DOWNLOAD_WRA_GWOB_YEARBOOK_PROJECT\@groundwater_level_PDF\GW_DATA_gweb.wra.gov.tw\COMBINE\2023"
# Kept for backward compatibility: later code may use paths relative to this folder.
os.chdir(new_data_mainfolder)

# Station folders are the sub-directories of the main folder. The folder is named
# explicitly (not via the implicit cwd) and the result is sorted so the processing
# order is the same on every run.
stations_fld = sorted(
    entry
    for entry in os.listdir(new_data_mainfolder)
    if os.path.isdir(os.path.join(new_data_mainfolder, entry))
)
new_data = {}  # station name -> {well code -> merged series}
error_log = {}  # station folder -> exception raised while processing it

# SECTION 3: Data Extraction and Merging
# --------------------------------------
# Iterate through each station folder to extract and merge new data.
# A station is handled all-or-nothing: if anything raises while processing one of
# its files, the station is recorded in error_log and left out of new_data.
for select_station in tqdm(stations_fld):
    try:
        # Folder names are split on "_" into English name, Chinese name and abbreviation;
        # any other number of parts raises ValueError, which is handled below.
        ename, cname, abbrev = select_station.upper().split("_")
        station_wells = {}  # well code -> merged series for the current station

        # List all Excel files in the current station folder. The pattern is absolute
        # so it does not depend on the current working directory, and the result is
        # sorted for a reproducible processing order.
        station_dir = os.path.join(new_data_mainfolder, select_station)
        files_byStation = sorted(glob(os.path.join(station_dir, "*.xlsx")))

        for select_file in files_byStation:
            file_name = os.path.basename(select_file)

            # Excel creates "~$<name>.xlsx" lock files while a workbook is open; they
            # match the pattern but are not readable workbooks, and one failed read
            # would discard the whole station.
            if file_name.startswith("~$"):
                continue

            wellcode = file_name.split(".")[0]  # Well code = filename up to the first dot

            # Retrieve existing monitoring data from the HDF5 file
            df_fromHDF5 = gwatertools.h5pytools.export_data_to_dataframe(
                file_name=gwl_hdf5_file,
                location_name=ename,
                sensor_name=wellcode,
            )

            if df_fromHDF5 is None:
                logging.warning(f"No data found for {ename}, {wellcode} in HDF5.")
                continue

            df_fromHDF5 = df_fromHDF5.set_index("datetime")

            # Load new monitoring data from the Excel file (first column = timestamps)
            df_NewData = pd.read_excel(select_file, parse_dates=[0], index_col=[0])

            # Fill the existing record with the new data. combine_first keeps the
            # caller's values: where both frames hold a non-null value for the same
            # timestamp, the existing HDF5 value wins; the new data only supplies
            # missing values and timestamps not yet present.
            # NOTE(review): presumably the workbook's data column is also named
            # "value" so that it lines up with the HDF5 column - verify.
            df_fromHDF5_filled = df_fromHDF5.combine_first(df_NewData)

            station_wells[wellcode] = df_fromHDF5_filled["value"]

        # Register the station only after all of its files were processed
        new_data[ename] = station_wells
    except Exception as e:
        # Log the failure with its traceback and remember it for later inspection
        logging.exception("Error processing station %s: %s", select_station, e)
        error_log[select_station] = e

In [None]:
# SECTION 4: Prepare Update Dictionary
# ------------------------------------
# Transform the new data into the format required for updating the HDF5 file:
#   updates_dict[station] = {
#       "sensor_data":     {well: values aligned to "date", ..., "date": byte-string dates},
#       "metadata":        {"Updated Date": "YYYY-MM-DD"},
#       "sensor_metadata": {well: {"FIRST_OBS": "YYYYMMDD", "LAST_OBS": "YYYYMMDD"}},
#   }
updates_dict = {}

for station, wells in new_data.items():
    try:
        if not wells:
            # Nothing was merged for this station (e.g. no well matched the HDF5 file)
            logging.warning("No well data collected for station %s; skipping.", station)
            continue

        # Drop repeated timestamps (keeping the last occurrence) so every series can
        # be reindexed onto the shared date axis below.
        cleaned = {
            well: series[~series.index.duplicated(keep="last")]
            for well, series in wells.items()
        }

        # Shared date axis = union of the dates of ALL wells. Every well is reindexed
        # onto it (missing dates become NaN) so each value array has exactly one entry
        # per element of "date", even when wells cover different dates.
        date_index = None
        for series in cleaned.values():
            date_index = series.index if date_index is None else date_index.union(series.index)
        date_index = date_index.sort_values()
        aligned = {well: series.reindex(date_index) for well, series in cleaned.items()}

        # Per-well metadata: first and last date holding a non-null observation
        well_metadata = {}
        for well, series in aligned.items():
            first_obs = series.first_valid_index()
            last_obs = series.last_valid_index()
            if first_obs is None or last_obs is None:
                # A series without any valid value has no observation period; the
                # station is skipped (handled below) with an explicit reason.
                raise ValueError(f"well {well} has no valid observations")
            well_metadata[well] = {
                "FIRST_OBS": first_obs.strftime("%Y%m%d"),
                "LAST_OBS": last_obs.strftime("%Y%m%d"),
            }

        # Sensor arrays plus the date axis they are aligned to.
        # NOTE(review): assumes at most one observation per calendar day, otherwise
        # several entries share the same "YYYYMMDD" label - confirm.
        well_data = {well: series.values for well, series in aligned.items()}
        well_data["date"] = np.array(date_index.strftime("%Y%m%d").tolist(), dtype="S10")

        # Register the station only once its entry is complete, so a failure above
        # never leaves a partially built entry behind.
        updates_dict[station] = {
            "sensor_data": well_data,
            "metadata": {  # Station-level metadata
                "Updated Date": pd.Timestamp.now().strftime("%Y-%m-%d"),
            },
            "sensor_metadata": well_metadata,  # Metadata specific to each sensor
        }
    except Exception as e:
        # Keep going with the remaining stations, but record why this one was skipped
        logging.exception("Could not prepare update for station %s: %s", station, e)

In [None]:
# SECTION 5: Apply Updates to HDF5 File
# -------------------------------------
# NOTE: the INFO records below are only emitted when logging is configured at INFO
# level or lower; unless configured elsewhere, the root logger defaults to WARNING.
if updates_dict:
    # Log a summary (station names) rather than the full dictionary, whose repr
    # would contain every data array.
    logging.info(
        "Updates prepared for %d station(s): %s",
        len(updates_dict),
        sorted(updates_dict),
    )

    # Use the predefined function to update the HDF5 file with new data and metadata
    gwatertools.h5pytools.update_hdf5(gwl_hdf5_file, updates_dict)

    # Log successful completion of updates
    logging.info("HDF5 file has been successfully updated.")
else:
    # Nothing was prepared, so do not touch the file or report a successful update
    logging.warning("No updates were prepared; the HDF5 file was left unchanged.")