In [2]:
import pinyin.cedict
from appgeopy import *
from my_packages import *

#### Open the leveling benchmark metadata table

In [3]:
# Read the benchmark metadata workbook into a DataFrame and preview its first rows.
leveling_df = pd.read_excel(
    io=r"E:\SUBSIDENCE_PROJECT_DATA\地陷資料整理\水準點\LevelingBenchmarks_DavidNCU\metadata.xlsx"
)
leveling_df.head()

Unnamed: 0,樁號,中英對照表,點名,鄉鎮市區,97縱座標,97橫座標,備註,始有記錄時間,最終記錄時間,id
0,陸檢8212,LJ8212,閃光號誌,水上鄉,2592421,188518.0,,1997-05-15,2019-06-15,0
1,陸檢8674,LJ8674,三聖宮,北門鄉,2575550,161121.0,,2005-05-15,2019-06-15,1
2,內部071,NB071,港漧桿11之9,東石鄉,2595263,169106.0,101遺失,1997-05-15,2011-08-15,2
3,內部075,NB075,上茄苳,後壁鄉,2587251,185970.0,99年遺失,1997-05-15,2009-01-15,3
4,內部076,NB076,後壁陸橋,後壁鄉,2585766,184936.0,,1997-05-15,2019-06-15,4


In [4]:
# Show the column labels of the metadata table; the cells below select columns by these labels.
leveling_df.columns

Index(['樁號', '中英對照表', '點名', '鄉鎮市區', '97縱座標', '97橫座標', '備註', '始有記錄時間', '最終記錄時間',
       'id'],
      dtype='object')

In [5]:
# Pull the four descriptive columns out of the metadata table in one pass;
# the targets on the left line up with the column labels on the right.
station_numbers, station_codes, point_names, districts = (
    leveling_df[column_label] for column_label in ("樁號", "中英對照表", "點名", "鄉鎮市區")
)

#### files expected to exist

In [6]:
# Build the workbook name each metadata row is expected to have
# ("<station number> <point name>.xlsx"), then order the names alphabetically.
guess_station_fullname = [
    f"{number} {name}.xlsx" for number, name in zip(station_numbers, point_names)
]
guess_station_fullname.sort()
guess_station_fullname[:5], len(guess_station_fullname)

(['86-2 學甲國小.xlsx',
  '8676-1 溪底寮.xlsx',
  '8682-1 永吉橋.xlsx',
  '98-2 樹農橋水閘.xlsx',
  'B05 馬光北二號橋.xlsx'],
 1858)

#### Leveling files downloaded from the `LandSubsidence-wra-gov-tw` website

In [7]:
# List every .xlsx workbook located exactly one folder level below the download
# directory, and keep a sorted list of the bare file names for comparison.
download_file_fld = r"E:\SUBSIDENCE_PROJECT_DATA\website_LandSubsidence-wra-gov-tw\Leveling_Download_20251111"
files = glob(os.path.join(download_file_fld, "*", "*.xlsx"))
file_basename = sorted(map(os.path.basename, files))
file_basename[:5], len(file_basename)

(['86-2 學甲國小.xlsx',
  '8676-1 溪底寮.xlsx',
  '8682-1 永吉橋.xlsx',
  '98-2 樹農橋水閘.xlsx',
  'B05 馬光北二號橋.xlsx'],
 1830)

#### `expected files`∩`downloaded files`

In [8]:
# Names that are both expected from the metadata table and present in the download.
intersected_files = sorted(set(guess_station_fullname) & set(file_basename))
intersected_files[:5], len(intersected_files)

(['86-2 學甲國小.xlsx',
  '8676-1 溪底寮.xlsx',
  '8682-1 永吉橋.xlsx',
  '98-2 樹農橋水閘.xlsx',
  'B05 馬光北二號橋.xlsx'],
 1825)

In [9]:
# Resolve every matched file name to its full path using the listing already
# held in `files`, instead of running a fresh glob per name.  Besides avoiding
# one directory scan per file, this removes a failure mode of the per-name
# glob: a name containing glob metacharacters ("[", "]", "?", "*") is treated
# as a pattern, can match nothing, and then `[0]` raises IndexError.
#
# Iterating `files` in reverse makes the FIRST path listed for a given name the
# one that is kept, mirroring the original "take element [0]" choice.
path_by_basename = {os.path.basename(path): path for path in reversed(files)}

valid_files = sorted(path_by_basename[fname] for fname in intersected_files)
len(valid_files)

1825

#### start processing files

In [10]:
metadata_list = []
measure_data_list = []
# Workbooks that contributed nothing (no rows at all, or no row with a date).
# Kept so the gap between "files read" and "stations written" can be inspected.
skipped_files = []

# Index of metadata rows keyed by the workbook name each row is expected to have
# ("<樁號> <點名>.xlsx").  Matching on the full name rather than on 樁號 alone
# pairs a workbook with the row its name came from, even if several rows share
# a 樁號.  reversed() makes the FIRST row win when two rows yield the same name.
row_by_filename = {
    f"{number} {name}.xlsx": row_pos
    for row_pos, number, name in reversed(
        list(zip(range(len(leveling_df)), leveling_df["樁號"], leveling_df["點名"]))
    )
}

for select_file in tqdm(valid_files):
    filename = os.path.basename(select_file)
    # Output key: the part of the file name before the first space
    station_number = os.path.splitext(filename)[0].split(" ")[0]

    # Read the Excel file and parse dates in the second column
    file_data = pd.read_excel(select_file, parse_dates=[1])
    if len(file_data) == 0:
        skipped_files.append(select_file)
        continue

    # Rows without a date cannot be placed on the time axis, so they are dropped
    # together with their height value (the per-element strftime used before
    # raised on such rows).
    dates = pd.to_datetime(file_data["日期"])
    has_date = dates.notna()
    dates = dates[has_date]
    if dates.empty:
        skipped_files.append(select_file)
        continue

    # Resolve the metadata row BEFORE appending anything, so a failed lookup
    # cannot leave measure_data_list one entry ahead of metadata_list.
    if filename not in row_by_filename:
        raise KeyError(f"no row of leveling_df corresponds to {filename!r}")
    # One-row frame (not a Series) so that every column keeps its own dtype
    metadata_by_benchmark_id = leveling_df.iloc[[row_by_filename[filename]]]

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    date_arr = dates.dt.strftime("%Y%m%d").tolist()
    height_arr = file_data.loc[has_date, "高程"].tolist()
    measure_data_cache = {station_number: {"date": date_arr, "values": height_arr}}
    measure_data_list.append(measure_data_cache)
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    metadata_cache = {
        station_number: {
            "Number": metadata_by_benchmark_id["樁號"].iloc[0],
            "CE_Name": metadata_by_benchmark_id["中英對照表"].iloc[0],
            "Name": metadata_by_benchmark_id["點名"].iloc[0],
            "Township": metadata_by_benchmark_id["鄉鎮市區"].iloc[0],
            "X_TWD97": metadata_by_benchmark_id["97橫座標"].iloc[0],
            "Y_TWD97": metadata_by_benchmark_id["97縱座標"].iloc[0],
            # Earliest / latest date actually present, independent of row order
            "First_Record": dates.min().strftime("%Y-%m-%d"),
            "Last_Record": dates.max().strftime("%Y-%m-%d"),
            "EPSG": 3826,
            "Notes": metadata_by_benchmark_id["備註"].iloc[0],
        }
    }
    metadata_list.append(metadata_cache)
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

  0%|          | 0/1825 [00:00<?, ?it/s]

In [11]:
# Collapse the per-file single-station dicts into one mapping each, metadata
# first and measurements second.
# NOTE(review): presumably entries sharing a station key are combined or
# overwritten by merge_dicts - confirm, since fewer stations come out than
# files went in.
merged_meta_data, merged_measure_data = (
    gwatertools.merge_dicts(*per_file_dicts)
    for per_file_dicts in (metadata_list, measure_data_list)
)

In [12]:
# Run date as YYYYMMDD; used as the prefix of the output file name.
today_string = f"{datetime.now():%Y%m%d}"

# Mode "w" creates the HDF5 file (replacing any existing file of the same name);
# the merged metadata is written first, then the merged measurements.
with h5py.File(
    f"{today_string}_All_LevelingData_LandSubsidence-wra-gov-tw.h5",
    mode="w",
) as hdf5_file:
    gwatertools.h5pytools.metadata_to_hdf5(hdf5_file, merged_meta_data)
    gwatertools.h5pytools.data_to_hdf5(hdf5_file, merged_measure_data)

In [13]:
# Number of top-level entries in the merged measurement mapping.
len(merged_measure_data)

1788