Apply the new functions in **`h5pytools`** to transform the groundwater level (GWL) data from Excel files into HDF5.

In [2]:
from appgeopy import *
from my_packages import *

In [3]:
# Remember the directory the notebook was started from, i.e. the current working
# directory at this point. This only records the path; it does not change it.
script_folder = os.getcwd()

In [4]:
# Root folder containing the groundwater level data (hard-coded, machine-specific
# absolute path).
mainfolder = r"D:\VINHTRUONG\004_MODELING\001_STUDY_AREA\GroundwaterObservation\@DOWNLOAD_WRA_GWOB_YEARBOOK_PROJECT\@groundwater_level_PDF\GWL_DATA_NEW_Nov2023"
# Make it the current working directory so that relative paths resolve against it.
os.chdir(mainfolder)
os.getcwd()  # Verify the current working directory

'D:\\VINHTRUONG\\004_MODELING\\001_STUDY_AREA\\GroundwaterObservation\\@DOWNLOAD_WRA_GWOB_YEARBOOK_PROJECT\\@groundwater_level_PDF\\GWL_DATA_NEW_Nov2023'

In [5]:
# Load the station/well information table from an Excel workbook (hard-coded,
# machine-specific absolute path).
station_info_excel = pd.read_excel(
    r"D:\VINHTRUONG\004_MODELING\001_STUDY_AREA\GroundwaterObservation\@DOWNLOAD_WRA_GWOB_YEARBOOK_PROJECT\Well_Information_CRAF_Active_Inactive_OneSheetOnly.xlsx"
)
# Display the first 5 rows of station information for verification.
# NOTE(review): the rendered output contains an "Unnamed: 0" column; presumably the
# sheet was saved together with a DataFrame index. Confirm whether it is needed.
station_info_excel.head(5)

Unnamed: 0,AREA_CODE,AREA_NAME,WELL_CODE,DISTRICT_IDENTIFIER,LAYER_IDENTIFIER,OLDNEW_IDENTIFIER,CNAME,ENAME,ECODE,WELL_NAME,X_TWD97,Y_TWD97,ADDRESS,WELL_ELEV(m),WELL_DEPTH(m),WELL_SCREEN(m),NOTE,ACTIVE
0,50,濁水溪沖積扇,7010111,70101,1,1,國聖,guosheng,GSG,國聖(1),206194.06,2665352.132,彰化縣彰化市國聖里中山路三段608號(國聖國小),21.053,24.0,8.00~14.00 24.00~30.00,1995/01~,1
1,50,濁水溪沖積扇,7010121,70101,2,1,國聖,guosheng,GSG,國聖(2),206194.06,2665352.132,彰化縣彰化市國聖里中山路三段608號(國聖國小),21.176,131.21,120.00~126.00,1994/11~,1
2,50,濁水溪沖積扇,7010131,70101,3,1,國聖,guosheng,GSG,國聖(3),206194.06,2665352.132,彰化縣彰化市國聖里中山路三段608號(國聖國小),21.528,200.0,185.00~197.00,1997/01~,1
3,50,濁水溪沖積扇,7010211,70102,1,1,東芳,dongfang,DFG,東芳(1),200779.08,2662059.143,彰化縣彰化市東芳里彰馬路45號(東芳國小),10.866,132.0,101.00~125.00,1997/07~,1
4,50,濁水溪沖積扇,7010221,70102,2,1,東芳,dongfang,DFG,東芳(2),200779.08,2662059.143,彰化縣彰化市東芳里彰馬路45號(東芳國小),10.86,181.0,162.00~174.00,1997/06~,1


In [6]:
# ________________________________________________________________________________________________
# Build two nested dictionaries from the station folders found under `mainfolder`:
#   all_stations_metadata         -> {station: {station attributes..., well_code: {well attributes}}}
#   all_stations_measurement_data -> {"date": [...], station: {well_code: {"measure": {...}}}}
# Each station folder is expected to be named "<ename>_<cname>_<abbrev>" and to hold one
# .xlsx file per well, named after the well code.
# ________________________________________________________________________________________________
all_stations_metadata = {
    "CreatedTime": datetime.now().strftime("%Y/%m/%d, %H:%M:%S"),
    "Description": "Transform GWL data from Excel to HDF5",
}

# Shared daily calendar: every well series is aligned to it so that all arrays have the
# same length and line up with the single "date" vector stored below.
common_datetime_index = pd.date_range(start="2001-01-01", end="2024-12-31")

all_stations_measurement_data = {
    "date": common_datetime_index.strftime("%Y%m%d").tolist()
}

# Station folders. The check is done on the full path so the result does not depend on
# the current working directory being `mainfolder`.
gwl_folders = [
    f for f in os.listdir(mainfolder) if os.path.isdir(os.path.join(mainfolder, f))
]

for select_folder in tqdm(gwl_folders):
    # A folder name that does not split into exactly three "_"-separated parts raises
    # ValueError here.
    ename, cname, abbrev = select_folder.upper().split("_")

    # Well workbooks of this station; globbed once and reused for both the well count
    # and the per-well loop.
    well_files = glob(os.path.join(mainfolder, select_folder, "*.xlsx"))

    # ________________________________________________________________________________________________
    # PREPARE METADATA FOR STATIONS
    # ________________________________________________________________________________________________

    # ENAME is compared in lower case (the folder name was upper-cased above).
    ename_lower = ename.lower()
    station_info = station_info_excel.query("ENAME == @ename_lower")

    if station_info.empty:
        # Station not listed in the information table: store a placeholder.
        station_metadata = {"metadata": "null"}
    else:
        first_row = station_info.iloc[0]
        station_metadata = {
            "Chinese": cname,
            "Abbreviation": abbrev,
            "EPSG": 3826,
            "X": first_row["X_TWD97"],
            "Y": first_row["Y_TWD97"],
            "BasinENG": "Choshuichi Fan",
            "BasinCHN": "濁水溪沖積扇",
            "Num_of_Wells": len(well_files),
            "Address": first_row["ADDRESS"],
            "CreatedTime": datetime.now().strftime("%Y/%m/%d, %H:%M:%S"),
        }

    all_stations_metadata[ename] = station_metadata

    # ________________________________________________________________________________________________
    # PREPARE METADATA AND MEASUREMENT_DATA FOR EACH WELL
    # ________________________________________________________________________________________________

    well_metadata = {}
    well_measurement_data = {}

    for select_file in well_files:
        # The well code is the file name up to the first dot.
        wellcode = os.path.basename(select_file).split(".")[0]

        # The file stem is a string, while WELL_CODE may have been read from Excel as a
        # number; compare on the string form so the lookup can actually match.
        well_info = station_info[station_info["WELL_CODE"].astype(str) == wellcode]

        if well_info.empty:
            well_metadata[wellcode] = {"metadata": "null"}
        else:
            well_row = well_info.iloc[0]
            well_metadata[wellcode] = {
                "WellName": well_row["WELL_NAME"],
                "Well_Elev(m)": well_row["WELL_ELEV(m)"],
                "Well_Depth(m)": well_row["WELL_DEPTH(m)"],
                "Well_Screen(m)": well_row["WELL_SCREEN(m)"],
                "Status": "Active" if well_row["ACTIVE"] == 1 else "Inactive",
            }

        # Load the workbook (first column parsed as the date index) and align its first
        # data column to the shared calendar; dates without a record become NaN.
        df = pd.read_excel(select_file, parse_dates=[0], index_col=[0])
        aligned = pd.DataFrame(index=common_datetime_index)
        aligned["daily_value"] = aligned.index.map(df.iloc[:, 0])

        # first/last_valid_index() return None when the well has no observation inside
        # the calendar range; record "null" instead of failing on None.strftime.
        first_idx = aligned.first_valid_index()
        last_idx = aligned.last_valid_index()
        well_metadata[wellcode].update(
            {
                "FIRST_OBS": "null" if first_idx is None else first_idx.strftime("%Y/%m/%d"),
                "LAST_OBS": "null" if last_idx is None else last_idx.strftime("%Y/%m/%d"),
            }
        )

        well_measurement_data[wellcode] = {
            "measure": {"daily_value": aligned["daily_value"].values}
        }

    # ________________________________________________________________________________________________
    # UPDATE GLOBAL DICTIONARIES
    # ________________________________________________________________________________________________

    all_stations_measurement_data[ename] = well_measurement_data

    # Well entries are stored next to the station attributes, keyed by well code.
    all_stations_metadata[ename].update(well_metadata)

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.69s/it]
