In [1]:
from appgeopy import *
from my_packages import *

# Raise pandas' display limits so that up to 999 columns / rows are shown
# instead of being truncated when a DataFrame is displayed.
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 999)

In [2]:
def string_to_float_converter(value):
    """
    Convert a raw cell value to float, and handle invalid or missing values.

    The conversion is attempted in this order:

    1. ``float(value)`` directly.
    2. If that fails and the value is a string containing ``"*"``, the value
       is treated as missing.
    3. Otherwise the last character of the string is dropped (a trailing
       one-character flag) and the remainder is converted.

    Args:
        value (str): Input value in string format. Numbers are accepted too;
            any other non-string object (e.g. None) is treated as missing.

    Returns:
        float: Converted value in float format or NaN if value is invalid or missing.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # Fall through to the string-specific handling below.
        pass

    # Non-string objects (None, containers, ...) cannot be repaired.
    if not isinstance(value, str):
        return np.nan

    # An asterisk marks a missing observation.
    if "*" in value:
        return np.nan

    # Drop a single trailing flag character and retry; an empty or still
    # unparseable remainder is reported as NaN instead of raising.
    try:
        return float(value[:-1])
    except ValueError:
        return np.nan


def find_month_indices(series):
    """
    Collect the index labels of entries whose text contains a month abbreviation.

    Each value is converted with ``str`` and checked (case-sensitively) for
    any of the three-letter English month abbreviations.

    Args:
    series (pd.Series): The pandas Series to search.

    Returns:
    list: Index labels, in iteration order, of the matching entries.
    """
    month_tokens = (
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    )

    matching_labels = []
    for label, cell in series.items():
        cell_text = str(cell)
        for token in month_tokens:
            if token in cell_text:
                matching_labels.append(label)
                break  # one hit is enough for this entry

    return matching_labels


def find_station_info_indices(series):
    """
    Collect the index labels of entries whose text contains a station-info keyword.

    Each value is converted with ``str`` and checked for any of the header
    keywords listed below.

    Args:
    series (pd.Series): The pandas Series to search.

    Returns:
    list: Index labels, in iteration order, of the matching entries.
    """
    keywords = ("流域編號", "測站編號", "流域名稱", "河流名稱", "測站名稱")

    def mentions_keyword(cell):
        cell_text = str(cell)
        for keyword in keywords:
            if keyword in cell_text:
                return True
        return False

    matching_labels = []
    for label, cell in series.items():
        if mentions_keyword(cell):
            matching_labels.append(label)

    return matching_labels


def get_station_info(input_dataframe):
    """
    Extract station information from the input DataFrame.

    The first row is searched for the station-information header keywords and
    the value directly below each header (second row) is returned. Every
    keyword is resolved to its own column position, so the result does not
    depend on the left-to-right order of the header columns or on the
    DataFrame's column labels.

    Args:
    input_dataframe (pd.DataFrame): The pandas DataFrame to extract information from.

    Returns:
    list: A list containing basin number, basin name, tributary, station number, and station name.

    Raises:
    ValueError: If a header keyword is missing from the first row or appears
        in more than one column.
    """
    header_row = input_dataframe.iloc[0, :]

    # Header keywords, listed in the order of the returned values:
    # basin number, basin name, tributary (river name), station number,
    # station name.
    keywords_in_return_order = (
        "流域編號",
        "流域名稱",
        "河流名稱",
        "測站編號",
        "測站名稱",
    )

    station_info = []
    for keyword in keywords_in_return_order:
        # Work with integer positions because the values are read via .iloc.
        positions = [
            position
            for position, cell in enumerate(header_row)
            if keyword in str(cell)
        ]
        if len(positions) != 1:
            raise ValueError(
                f"expected exactly one header column containing {keyword!r}, "
                f"found {len(positions)}"
            )
        station_info.append(input_dataframe.iloc[1, positions[0]])

    return station_info


def remove_newline(input_string):
    """
    Return the text that precedes the first newline character.

    Args:
        input_string: Any object; it is converted with ``str`` first.

    Returns:
        str: The part before the first newline, or the whole text when it
        contains no newline.
    """
    text = str(input_string)
    first_line, _, _ = text.partition("\n")
    return first_line


def remove_space(input_string):
    """
    Return the text that precedes the first space character.

    Args:
        input_string: Any object; it is converted with ``str`` first.

    Returns:
        str: The part before the first space, or the whole text when it
        contains no space.
    """
    # Splitting a string that has no space yields the string itself, so no
    # separate membership test is needed.
    return str(input_string).split(" ")[0]


def remove_questionmark(input_string):
    """
    Return the text with every ``?`` character removed.

    Args:
        input_string: Any object; it is converted with ``str`` first.

    Returns:
        str: The text without question marks.
    """
    text = str(input_string)
    return text.replace("?", "")

In [3]:
from tqdm.notebook import tqdm

# Folder where processed data will be saved
savefolder = r"D:\002_DATA\__RIVER_STAGE\PROCESSED_DATA"

# Folder containing raw data files
raw_data_folder = r"D:\002_DATA\__RIVER_STAGE\RAW_DATA"

# # # # # # # # # # Get raw data files # # # # # # # # # #

# Get a list of all raw data files in the folder
raw_data_files = glob(os.path.join(raw_data_folder, "*.xlsx"))

# # # # # # # # # # Process each raw data file # # # # # # # # # #

# Summary of the output files written by this run (one entry per new file).
# Sheets whose output file already exists are not recorded.
file_info_summary_cache = {
    "BASIN": [],
    "TRIBUTARY": [],
    "STATION": [],
    "YEAR": [],
    "DATA": [],
}

# Loop through each raw data file
for fpath in tqdm(raw_data_files, desc="Raw Data Files", position=0, ncols=500):

    # The file name starts with the year and ends with the region,
    # separated by underscores.
    file_name = os.path.basename(fpath)
    processing_year = int(file_name.split("_")[0])
    region = file_name.split("_")[-1].split(".")[0].upper()

    # Get available sheet names from the Excel file
    available_sheetnames = data_io.get_sheetnames(fpath)

    # Loop through each sheet in the Excel file
    for select_sheetname in tqdm(
        available_sheetnames, desc="Sheetnames", position=0, ncols=500
    ):
        # Errors are handled per sheet so that one malformed sheet does not
        # prevent the remaining sheets of the same file from being processed.
        try:
            # Read the sheet into a DataFrame
            df = pd.read_excel(fpath, sheet_name=select_sheetname, header=None)

            # Extract station information from the DataFrame
            (
                basin_number,
                basin_name,
                tributary,
                station_number,
                station_name,
            ) = get_station_info(df)

            # Clean the raw header values before using them in paths
            basin_number = remove_newline(basin_number)
            basin_name = remove_newline(basin_name)
            tributary = remove_questionmark(remove_newline(tributary))
            station_number = remove_space(station_number)
            station_name = remove_newline(station_name)

            # Define the folder path where the processed data will be saved
            target_savefolder = os.path.join(
                savefolder,
                f"{region}",
                f"{basin_number}_{basin_name}",
                f"{tributary}",
                f"{processing_year}",
            )

            # Create the folder if it doesn't exist
            os.makedirs(target_savefolder, exist_ok=True)

            savepath = os.path.join(
                target_savefolder, f"{station_number}_{station_name}.xlsx"
            )

            # Skip sheets whose output was already written by an earlier run
            if os.path.isfile(savepath):
                continue

            # Find the row containing month names (positional, since the
            # row is accessed through .iloc below)
            month_header_rows = [
                position
                for position, value in enumerate(df.iloc[:, 0])
                if "Month" in str(value)
            ]
            if not month_header_rows:
                raise ValueError(
                    "no row containing 'Month' found in the first column"
                )
            idx_row_contain_months = month_header_rows[0]
            row_contain_months = df.iloc[idx_row_contain_months, :]
            months_index = find_month_indices(row_contain_months)

            # Without exactly twelve month columns nothing is written, so
            # there is nothing to record for this sheet either.
            if len(months_index) != 12:
                print(
                    f"Skipping sheet (expected 12 month columns, found {len(months_index)}): "
                    f"{region} - {processing_year} - {select_sheetname}"
                )
                print()
                continue

            # The daily values start on the row right below the month header
            first_day_idx = idx_row_contain_months + 1

            cache_data = {"time": [], "daily_value": []}

            # Loop through each month and extract daily values
            for this_month, month_index in enumerate(months_index, start=1):
                start_date = pd.Timestamp(
                    year=processing_year, month=this_month, day=1
                )
                end_date = start_date + pd.offsets.MonthEnd(1)
                days_in_month = pd.date_range(
                    start=start_date, end=end_date, freq="D"
                )
                n_days = len(days_in_month)
                values = df.iloc[
                    first_day_idx : first_day_idx + n_days, month_index
                ].apply(string_to_float_converter)

                cache_data["time"].extend(days_in_month)
                cache_data["daily_value"].extend(values)

            # Create a new DataFrame for the extracted data and save it to an Excel file
            new_datatable = pd.DataFrame(data=cache_data)
            new_datatable.to_excel(savepath, index=False)

            # Record the newly written file in the summary
            file_info_summary_cache["BASIN"].append(
                f"{basin_number}_{basin_name}"
            )
            file_info_summary_cache["TRIBUTARY"].append(f"{tributary}")
            file_info_summary_cache["STATION"].append(
                f"{station_number}_{station_name}"
            )
            file_info_summary_cache["YEAR"].append(f"{processing_year}")
            file_info_summary_cache["DATA"].append(len(new_datatable))

        except Exception as e:
            # Best-effort batch job: report the failing sheet and move on.
            print(
                f"Error processing file: {region} - {processing_year} - {select_sheetname}"
            )
            print(e)
            print()

Raw Data Files:   0%|                                                                                         …

Sheetnames:   0%|                                                                                             …

Sheetnames:   0%|                                                                                             …

Sheetnames:   0%|                                                                                             …

Sheetnames:   0%|                                                                                             …

Sheetnames:   0%|                                                                                             …

Sheetnames:   0%|                                                                                             …

Sheetnames:   0%|                                                                                             …

Sheetnames:   0%|                                                                                             …

Sheetnames:   0%|                                                                                             …

Sheetnames:   0%|                                                                                             …

Sheetnames:   0%|                                                                                             …

Sheetnames:   0%|                                                                                             …

Sheetnames:   0%|                                                                                             …

Sheetnames:   0%|                                                                                             …

Sheetnames:   0%|                                                                                             …

Sheetnames:   0%|                                                                                             …

Sheetnames:   0%|                                                                                             …

Sheetnames:   0%|                                                                                             …

Sheetnames:   0%|                                                                                             …