In [1]:
import pinyin
import pinyin.cedict
from appgeopy import *
from my_packages import *

In [8]:
def process_measure_data(precipitation_df, station_name):
    # Extract data for the selected station
    df_byStation = precipitation_df.loc[:, [select_station]]

    # Ensure the data has an hourly frequency, filling in missing values if necessary
    df_byStation = df_byStation.asfreq("H")
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Compute the daily sum by resampling the hourly data to daily frequency
    df_byStation_dailySum = df_byStation.resample("D").sum()

    # Extract the daily summed values as a NumPy array for further processing
    df_byStation_dailySum_arr = df_byStation_dailySum.iloc[:, 0].values

    # Extract the daily timestamps as a NumPy array
    daily_time_arr = df_byStation_dailySum.index.strftime("%Y%m%d").to_numpy()
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Compute the monthly sum by resampling the daily data to the start of each month ("MS")
    df_byStation_monthlySum = df_byStation.resample("MS").sum()

    # Extract the monthly summed values as a NumPy array for further processing
    df_byStation_monthlySum_arr = df_byStation_monthlySum.iloc[:, 0].values

    # Extract the monthly timestamps as a NumPy array
    monthly_time_arr = df_byStation_monthlySum.index.strftime("%Y%m%d").to_numpy()
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    measure_byStation_dict = {
        "daily_values": df_byStation_dailySum_arr,
        "daily_date": daily_time_arr,
        "monthly_values": df_byStation_monthlySum_arr,
        "monthly_date": monthly_time_arr,
    }
    return measure_byStation_dict


def get_station_metadata(df, station_name):
    info_byStation = df.query("站號==@station_name")

    # Extract station metadata
    station_name = pinyin.get(info_byStation.get("站名", np.nan).values[0], format="strip").upper()
    station_type = info_byStation.get("站種", np.nan).values[0]
    station_height = info_byStation.get("海拔高度(m)", np.nan).values[0]
    x_wgs84 = info_byStation.get("經度", np.nan).values[0]
    y_wgs84 = info_byStation.get("緯度", np.nan).values[0]
    station_city = info_byStation.get("城市", np.nan).values[0]
    station_address = info_byStation.get("地址", np.nan).values[0]
    station_startdate = info_byStation.get("資料起始日期", np.nan).values[0]
    station_stopdate = info_byStation.get("撤站日期", np.nan).values[0]
    station_note = info_byStation.get("備註", np.nan).values[0]
    station_previous_code = info_byStation.get("原站號", np.nan).values[0]
    station_new_code = info_byStation.get("新站號", np.nan).values[0]
    x_twd97 = info_byStation.get("TWD97_x", np.nan).values[0]
    y_twd97 = info_byStation.get("TWD97_y", np.nan).values[0]

    # Store metadata in a structured dictionary
    metadata_byStation = {
        "EName": station_name,
        "Type": station_type,
        "Elevation(m)": station_height,
        "X_WGS84": x_wgs84,
        "Y_WGS84": y_wgs84,
        "City": station_city,
        "Address": station_address,
        "Start Date": station_startdate,
        "Stop Date": station_stopdate,
        "Notes": station_note,
        "Previous Code": station_previous_code,
        "New Code": station_new_code,
        "X_TWD97": x_twd97,
        "Y_TWD97": y_twd97,
    }

    return metadata_byStation

In [9]:
fpath = r"E:\SUBSIDENCE_PROJECT_DATA\20230915水位雨量地陷資料\sorted_data\降雨資料\rainfall_data.xz"
df = pd.read_pickle(fpath)
data_station_code = [ele.upper() for ele in df.columns.tolist()]
df.columns = data_station_code
df = df.sort_index(axis=0)

In [10]:
station_info = pd.read_excel(r"E:\SUBSIDENCE_PROJECT_DATA\20230915水位雨量地陷資料\sorted_data\降雨資料\降雨資料.xlsx")
station_code = [ele.upper() for ele in station_info.iloc[:, 0]]

In [11]:
mutual_stations = sorted(set(data_station_code).intersection(set(station_code)))
len(mutual_stations)

70

#### target: convert rainfall data into HDF5 file --> monthly data, sum of rainfall in one month

In [12]:
# Initialize dictionaries to store processed measurement data and metadata
rainfall_measure_data = {}
rainfall_station_metadata = {}
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Select the first station from the list of station codes
# select_station = data_station_code[0]
for select_station in tqdm(data_station_code):
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    measure_dict_byStation = process_measure_data(precipitation_df=df, station_name=select_station)
    metadata_dict_byStation = get_station_metadata(df=station_info, station_name=select_station)
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    rainfall_measure_data[select_station] = measure_dict_byStation
    rainfall_station_metadata[select_station] = metadata_dict_byStation

  0%|          | 0/71 [00:00<?, ?it/s]

In [13]:
# - - - - - - - - - - - - - - - - -
# Add new description and metadata
# - - - - - - - - - - - - - - - - -
today_string = datetime.now().strftime("%Y%m%d")

# Write updated data and metadata back to the HDF5 file
with h5py.File(f"{today_string}_Rainfall_CRFP_monthly_v1.h5", "w") as hdf5_file:
    gwatertools.h5pytools.data_to_hdf5(hdf5_file, rainfall_measure_data)
    gwatertools.h5pytools.metadata_to_hdf5(hdf5_file, rainfall_station_metadata)

unique_year = df.index.year.unique()
for year in unique_year:
    temp = df.loc[str(year), :]
    print(year, len(temp))

1992 5440
1993 8760
1994 8760
1995 8760
1996 8783
1997 8760
1998 8760
1999 8760
2000 8784
2001 8760
2002 8760
2003 8760
2004 8784
2005 8760
2006 8760
2007 8760
2008 8784
2009 8760
2010 8760
2011 8760
2012 8784
2013 8760
2014 8760
2015 8760
2016 8784
2017 8760
2018 8760
2019 8760
2020 8784
2021 8759
2022 8760
2023 745