In [1]:
# Import necessary modules from my_packages.py
from my_packages import *  # Standard imports from a central location
from tqdm import tqdm  # For progress bar

In [2]:
def find_excel_files(root_folder):
    """
    Recursively collect the paths of all .xlsx workbooks under a folder.

    The extension is matched case-insensitively (so "DATA.XLSX" is found too),
    and Excel's temporary lock files (names starting with "~$", created while
    a workbook is open in Excel) are skipped because they are not readable
    workbooks. Folders and files are visited in sorted order so the result
    is reproducible from run to run.

    Parameters:
        root_folder (str): The path of the root folder to start the search.

    Returns:
        list: A list of paths to all found Excel files (empty if none are
        found or if root_folder does not exist).
    """
    excel_files = []

    # Walk through the directory structure
    for dirpath, dirnames, filenames in os.walk(root_folder):
        # Sorting dirnames in place makes os.walk descend in a fixed order
        dirnames.sort()

        for filename in sorted(filenames):
            # Skip the "~$<name>.xlsx" lock files Excel leaves next to open workbooks
            if filename.startswith("~$"):
                continue

            # Compare the extension in lower case so ".XLSX" is accepted as well
            if filename.lower().endswith(".xlsx"):
                excel_files.append(os.path.join(dirpath, filename))

    return excel_files


def find_files_with_keyword(root_folder, keyword):
    """
    Recursively list every file whose name contains a given substring.

    The match is a plain, case-sensitive substring test against the file
    name only (not against the folder part of the path).

    Parameters:
        root_folder (str): The path of the root folder to start the search.
        keyword (str): The keyword to search for in file names.

    Returns:
        list: A list of paths to all files containing the keyword.
    """
    # One pass over the tree; keep a file when the keyword occurs in its name
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(root_folder)
        for name in names
        if keyword in name
    ]

In [3]:
# Root output directory: the processing cell creates one subfolder per
# station underneath it and writes the combined workbooks there.
# (Adjacent raw strings are concatenated by Python, so this is one path.)
savefolder = (
    r"D:\VINHTRUONG\004_MODELING\001_STUDY_AREA"
    r"\GroundwaterObservation"
    r"\@DOWNLOAD_WRA_GWOB_YEARBOOK_PROJECT"
    r"\@groundwater_level_PDF"
    r"\GW_DATA_gweb.wra.gov.tw"
    r"\COMBINE\2023"
)

In [4]:
# Root input directory: it is searched recursively for the .xlsx files
# that get combined.
# (Adjacent raw strings are concatenated by Python, so this is one path.)
main_dir = (
    r"D:\VINHTRUONG\004_MODELING\001_STUDY_AREA"
    r"\GroundwaterObservation"
    r"\@DOWNLOAD_WRA_GWOB_YEARBOOK_PROJECT"
    r"\@groundwater_level_PDF"
    r"\GW_DATA_gweb.wra.gov.tw"
    r"\Data_2023_Output"
)

In [5]:
# Find all Excel files in the main directory
found_excel_files = find_excel_files(main_dir)

# Group the found paths by file name (the file name is used as the well code).
# Doing this once replaces a full directory walk per well, and the exact-name
# grouping cannot pick up a different well whose file name merely contains
# this one (e.g. "1.xlsx" inside "11.xlsx").
files_by_name = {}
for found_path in found_excel_files:
    files_by_name.setdefault(os.path.basename(found_path), []).append(found_path)

# Sorted list of all unique well codes (file names) from the found files
all_available_WellCode = sorted(files_by_name)

for select_WellCode in tqdm(all_available_WellCode):

    # All files that carry this well code as their file name
    files_byWellCode = files_by_name[select_WellCode]

    # The station folder is the immediate parent folder of the first file;
    # os.path is used so this does not depend on the path separator.
    station_folder = os.path.basename(os.path.dirname(files_byWellCode[0]))

    # Read every matching workbook and stack them into a single DataFrame
    cache = pd.concat([pd.read_excel(f) for f in files_byWellCode])

    # Sort chronologically and use the 'time' column as the index
    cache = cache.sort_values(by="time").set_index("time")

    # Blank out (NaN) any value greater than 1000 or less than -1000.
    # Vectorized replacement for the deprecated element-wise applymap.
    # NOTE(review): the +/-1000 limit is presumably a sentinel / bad-reading
    # filter - confirm it suits every column.
    cache = cache.mask((cache > 1_000) | (cache < -1_000))

    # Resample the data to daily frequency and take the mean for each day
    cache = cache.resample("D").mean()

    # Output folder for this station, created on demand
    temp_savefld = os.path.join(savefolder, station_folder)
    os.makedirs(temp_savefld, exist_ok=True)

    # Save the processed data under the same file name as the inputs
    cache.to_excel(os.path.join(temp_savefld, select_WellCode))

100%|████████████████████████████████████████████████████████████████████████████████| 230/230 [14:38<00:00,  3.82s/it]
