In [1]:
# NOTE(review): `pd`, `np` and `os` are used below without explicit imports, so they
# presumably come from these wildcard imports — confirm, and prefer explicit imports.
from appgeopy import *
from my_packages import *

# Raise the pandas display limits to 999 columns / 999 rows for table output
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 999)

In [2]:
def string_to_float_converter(value):
    """
    Convert a raw cell value to float, mapping unusable entries to NaN.

    Parsing is attempted in this order:
      1. ``float(value)`` as-is (numbers and clean numeric strings).
      2. Strings containing ``"*"`` are treated as missing and give NaN.
      3. Otherwise the final character is dropped (presumably a trailing
         annotation mark in the source tables) and parsing is retried.

    Args:
        value: Input cell value, usually a string; numbers and ``None``
            are accepted as well.

    Returns:
        float: The parsed number, or NaN when the value is missing or
        cannot be interpreted as a number.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # Not directly parseable; try the string-specific recovery below.
        pass

    # Non-string leftovers (e.g. None) have nothing to strip or inspect.
    if not isinstance(value, str):
        return np.nan

    if "*" in value:
        return np.nan

    try:
        # Retry without the last character, e.g. "171.9a" -> 171.9
        return float(value[:-1])
    except ValueError:
        # Still not numeric (empty string, free text, ...): report as missing.
        return np.nan


def find_month_indices(series):
    """
    Locate the entries of a Series whose text mentions an English month abbreviation.

    Args:
    series (pd.Series): Series to scan; every value is converted with ``str`` before matching.

    Returns:
    list: Index labels, in iteration order, of the values that contain any of the
    three-letter abbreviations "Jan" ... "Dec" (case-sensitive substring match).
    """
    month_tokens = (
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    )

    matched_labels = []
    for label, cell in series.items():
        cell_text = str(cell)
        for token in month_tokens:
            if token in cell_text:
                # One hit is enough; record the label once and move on
                matched_labels.append(label)
                break

    return matched_labels


def find_station_info_indices(series):
    """
    Locate the entries of a Series whose text mentions a station-information header keyword.

    Args:
    series (pd.Series): Series to scan; every value is converted with ``str`` before matching.

    Returns:
    list: Index labels, in iteration order, of the values that contain at least one
    of the station header keywords listed below (substring match).
    """
    header_keywords = ("流域編號", "測站編號", "流域名稱", "河流名稱", "測站名稱")

    def mentions_station_field(cell):
        # True when the cell text contains any of the header keywords
        text = str(cell)
        return any(keyword in text for keyword in header_keywords)

    return [label for label, cell in series.items() if mentions_station_field(cell)]


def get_station_info(input_dataframe):
    """
    Extract station information from the input DataFrame.

    Row 0 is searched for the header keywords and the matching values are
    read from row 1. Each field is located by its own keyword and by column
    position, so the result does not depend on the left-to-right order of the
    header cells nor on the DataFrame's column labels.

    Args:
    input_dataframe (pd.DataFrame): The pandas DataFrame to extract information from.

    Returns:
    list: A list containing basin number, basin name, tributary, station number, and station name.

    Raises:
    ValueError: If one of the header keywords is not present in row 0.
    """
    # Header texts in positional order, so the positions can be used with iloc
    header_texts = [str(cell) for cell in input_dataframe.iloc[0, :].tolist()]

    def _column_position(keyword):
        # First column whose header text contains the keyword
        for position, text in enumerate(header_texts):
            if keyword in text:
                return position
        raise ValueError(
            f"Station header keyword {keyword!r} not found in the first row"
        )

    # Keyword order defines the order of the returned list:
    # basin number, basin name, tributary, station number, station name
    field_keywords = ("流域編號", "流域名稱", "河流名稱", "測站編號", "測站名稱")

    return [
        input_dataframe.iloc[1, _column_position(keyword)]
        for keyword in field_keywords
    ]

In [4]:
# Root folder under which the processed tables are written
savefolder = r"D:\002_DATA\__RIVER_STAGE\PROCESSED_DATA"

# Raw workbook to process
fpath = r"D:\002_DATA\__RIVER_STAGE\RAW_DATA\2021_River runoff_North.xlsx"

# The file name is split on "_": the first token is the year, the last token
# (without its extension) is the region, stored upper-cased
fname_tokens = os.path.basename(fpath).split("_")
processing_year = int(fname_tokens[0])
region = fname_tokens[-1].split(".")[0].upper()

# Read a single sheet with header=None so no row is consumed as column names
select_sheetname = "Table 18"
df = pd.read_excel(fpath, sheet_name=select_sheetname, header=None)

In [5]:
# Display the sheet as loaded (read with header=None, so rows and columns are integer-labelled)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,流域編號\nBasin No.,,測站編號\nStation No.,,,,流域名稱\nBasin,,,河流名稱\nTributary,,,,測站名稱\nStation,,,,流域面積Km²\nDrainage Area,,
1,1140,,H097,,,,淡水河,,,??魚堀溪,,,,大林橋,,,,0.9,,
2,114,,97,,,,Danshui River,,,Beishi River,,,,DA-LINE,,,,,,
3,Month 月\nDay 日,一 月\nJan.,,二 月\nFeb.,,三 月\nMar.,,四 月\nApr.,五 月\nMay,,六 月\nJune,七 月\nJuly,八 月\nAug.,,九 月\nSept.,,十 月\nOct.,,十一月\nNov.,十二月\nDec.
4,1,171.93,,171.81,,171.8,,171.78,171.94,,172.4,171.87,172.14,,171.81,,171.81,,171.95,172.18
5,2,171.91,,171.8,,171.8,,171.77,171.9,,172.43,171.85,172.03,,171.8,,171.81,,171.93,172.09
6,3,171.9,,171.8,,171.81,,171.76,171.88,,172.42,171.84,172.3,,171.82,,171.79,,171.92,172.03
7,4,171.9,,171.79,,171.8,,171.76,171.86,,173.02,171.82,172.53,,171.79,,171.78,,171.94,171.98
8,5,171.96,,171.79,,171.79,,171.76,171.84,,172.85,171.82,172.35,,171.78,,171.85,,171.97,171.99
9,6,172.08,,171.78,,171.78,,171.75,171.83,,172.61,171.82,172.2,,171.78,,171.93,,171.94,171.97


In [14]:
def remove_newline(input_string):
    """
    Return the text that precedes the first newline of ``str(input_string)``.

    The input is converted with ``str`` first; text without a newline is
    returned unchanged.
    """
    first_line, _, _ = str(input_string).partition("\n")
    return first_line


def remove_space(input_string):
    """
    Return the text that precedes the first space of ``str(input_string)``.

    The input is converted with ``str`` first; text without a space is
    returned unchanged.
    """
    input_string = str(input_string)
    # The guard must test for the same character that is split on (a space)
    return input_string.split(" ")[0] if " " in input_string else input_string


def remove_questionmark(input_string):
    """
    Return ``str(input_string)`` with every "?" character removed.

    The input is converted with ``str`` first; text without a "?" is
    returned unchanged.
    """
    return "".join(char for char in str(input_string) if char != "?")

In [15]:
# Extract station information from the DataFrame
(
    basin_number,
    basin_name,
    tributary,
    station_number,
    station_name,
) = get_station_info(df)

basin_number = remove_newline(basin_number)
basin_name = remove_newline(basin_name)
tributary = remove_newline(tributary)
tributary = remove_questionmark(tributary)
station_number = remove_space(station_number)
station_name = remove_newline(station_name)

basin_number, basin_name, tributary, station_number, station_name

('1140', '淡水河', '魚堀溪', 'H097', '大林橋')

In [None]:
# Locate the first row whose column-0 value equals 1 (the day-1 row of the daily table)
_temp = df.iloc[:, 0]
is_first_day = _temp == 1
first_day_idx = _temp.loc[is_first_day].index[0]
first_day_idx

In [None]:
# Output folder layout: <savefolder>/<region>/<basin number>_<basin name>/<tributary>/<year>
target_savefolder = os.path.join(
    savefolder,
    region,
    f"{basin_number}_{basin_name}",
    tributary,
    str(processing_year),
)
# exist_ok=True creates the folder tree when missing and does nothing when it
# already exists, without a separate (racy) existence check
os.makedirs(target_savefolder, exist_ok=True)

# Find the first day index in the DataFrame
_temp = df.iloc[:, 0]
first_day_idx = _temp[_temp == 1].index[0]

# Find the indices of the columns that contain month names
# NOTE(review): row 3 is hard-coded as the month header row; in the displayed
# sheet it is the row directly above the day-1 row — confirm for other sheets
row_contain_months = df.iloc[3, :]
months_index = find_month_indices(row_contain_months)

# if len(months_index) == 12:
#     cache_data = {"time": [], "daily_value": []}

#     # Loop through each month and extract daily values
#     for k, month_index in enumerate(months_index):
#         this_month = k + 1

#         start_date = pd.Timestamp(year=processing_year, month=this_month, day=1)
#         end_date = start_date + pd.offsets.MonthEnd(1)
#         days_in_month = pd.date_range(start=start_date, end=end_date, freq="D")
#         N = len(days_in_month)
#         values = df.iloc[first_day_idx : first_day_idx + N, month_index].apply(string_to_float_converter)

#         cache_data["time"].extend(days_in_month)
#         cache_data["daily_value"].extend(values)

#     # Create a new DataFrame for the extracted data and save it to an Excel file
#     new_datatable = pd.DataFrame(data=cache_data)

#     savepath = os.path.join(target_savefolder, f"{station_number}_{station_name}.xlsx")
#     new_datatable.to_excel(savepath, index=False)