# In [2]:
import os
import pandas as pd
from datetime import datetime
from Constants import stations_mapping

# Name prefixes given to columns B..J (positions 1-9) of every station sheet.
_VALUE_PREFIXES = ('Tn', 'Tx', 'Tavg', 'RH_avg', 'RR', 'ss', 'ff_x', 'ddd_x', 'ff_avg')
# Rows directly below the header row that are discarded before any processing.
_SKIPPED_ROWS = 8
# Column K (position 10) is addressed explicitly, so a sheet needs at least 11 columns.
_MIN_COLUMNS = 11


def _clean_station_frame(raw, station_name):
    """Return a cleaned copy of one raw station sheet.

    Steps: discard the first 8 rows, blank out the 8888/9999 markers, coerce
    columns B..J to numbers and fill their gaps by linear interpolation, parse
    column A as ``%d-%m-%Y`` dates, drop rows without a valid date, drop
    column K and rename the remaining first ten columns.

    Args:
        raw: DataFrame as returned by ``pd.read_excel`` with at least 11
            columns and more than 8 rows.
        station_name: Text embedded in the value column names,
            e.g. ``Tn(<station_name>)``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index; ``raw`` is left untouched.
    """
    # Copy so that the column assignments below never write into a view of `raw`.
    df = raw.iloc[_SKIPPED_ROWS:].copy()

    # 8888 and 9999 are treated as "no value" markers wherever they occur.
    df = df.mask((df == 8888) | (df == 9999), pd.NA)

    value_cols = df.columns[1:10]
    # Anything that is not a number becomes NaN so it can be interpolated.
    df[value_cols] = df[value_cols].apply(pd.to_numeric, errors='coerce')
    # NOTE(review): 'ddd_x' looks like a direction column; linear interpolation
    # may not be appropriate for angular data - confirm with the data owner.
    df[value_cols] = df[value_cols].interpolate(axis=0)

    date_col = df.columns[0]
    df[date_col] = pd.to_datetime(df[date_col], format='%d-%m-%Y', errors='coerce')
    # Rows whose first cell is not a date are dropped.
    df = df.dropna(subset=[date_col])

    # Column K is not kept; remove it here, before the frame is handed back,
    # instead of relying on an in-place drop after the frame was stored.
    df = df.drop(columns=[df.columns[10]])

    new_names = ['Date'] + [f'{prefix}({station_name})' for prefix in _VALUE_PREFIXES]
    df = df.rename(columns=dict(zip(df.columns[:10], new_names)))

    return df.reset_index(drop=True)


def _report_missing_months(dates, station_name, period_start, period_end):
    """Print which calendar months of the period have no row in ``dates``."""
    # period_range avoids the 'M' month-end offset alias that newer pandas
    # versions deprecate for date_range, while producing the same month labels.
    expected = set(
        pd.period_range(start=period_start, end=period_end, freq='M').strftime('%Y-%m')
    )
    present = set(dates.dt.strftime('%Y-%m').unique())
    missing_months = expected - present

    if missing_months:
        print(f"{station_name}: missing months: {sorted(missing_months)}")
    else:
        print(f"{station_name}:All months in the period are present.")


def clean_excel_files(directory, station_name, period_start='2016-01', period_end='2024-05'):
    """Clean and merge every ``.xlsx`` workbook found in ``directory``.

    Each workbook is read with openpyxl and cleaned by
    ``_clean_station_frame``. The results are concatenated, de-duplicated on
    ``Date`` and sorted by ``Date``. A month-coverage report for the given
    period is printed.

    Args:
        directory: Folder containing the station's ``.xlsx`` files. Files whose
            name starts with ``~$`` are ignored.
        station_name: Station label used in column names and printed messages.
        period_start: First month (inclusive) of the coverage check; anything
            accepted by ``pd.period_range``.
        period_end: Last month (inclusive) of the coverage check.

    Returns:
        The merged DataFrame with a contiguous 0..n-1 index, or ``None`` when
        no workbook could be used.
    """
    # Sorted so that the result does not depend on the OS directory order
    # (it decides which row wins when two files contain the same date).
    files = sorted(
        f for f in os.listdir(directory)
        if f.endswith('.xlsx') and not f.startswith('~$')
    )

    all_data = []
    for file in files:
        file_path = os.path.join(directory, file)
        try:
            raw = pd.read_excel(file_path, engine='openpyxl')
        except Exception as e:
            # Best effort: an unreadable workbook must not stop the others.
            print(f"Skipping file {file} due to error: {e}")
            continue

        # A sheet that is too narrow or too short would otherwise fail deep
        # inside the cleaning step and abort the whole run.
        if raw.shape[1] < _MIN_COLUMNS or len(raw) <= _SKIPPED_ROWS:
            print(f"Skipping file {file}: unexpected sheet layout {raw.shape}")
            continue

        all_data.append(_clean_station_frame(raw, station_name))

    if not all_data:
        print("No valid Excel files found.")
        return None

    merged_df = pd.concat(all_data, ignore_index=True)
    # De-duplicate first (keeps the row from the earliest file name), then
    # sort and renumber so the returned index has no gaps.
    merged_df = merged_df.drop_duplicates(subset=['Date'])
    merged_df = merged_df.sort_values(by='Date').reset_index(drop=True)

    _report_missing_months(merged_df['Date'], station_name, period_start, period_end)

    return merged_df


def main():
    """Clean, merge and export the data of every station in ``stations_mapping``.

    ``stations_mapping`` is iterated as ``station name -> directory``. For each
    station the merged result of ``clean_excel_files`` is written to
    ``merged_data_<station name>.xlsx`` in the current working directory;
    stations for which nothing could be merged are skipped.
    """
    for station_name, directory in stations_mapping.items():
        # Errors are handled per station so that one bad directory or export
        # does not prevent the remaining stations from being processed.
        try:
            merged_data = clean_excel_files(directory, station_name)
            if merged_data is not None:
                # Save the merged data to a new Excel file
                merged_data.to_excel(f'merged_data_{station_name}.xlsx', index=False)
        except Exception as e:
            print(f"An error occurred while processing {station_name}: {e}")

# Entry point: run the whole pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


# Output:
# Iskandar:All months in the period are present.
# Japura:All months in the period are present.
# Nunukan:All months in the period are present.
# Oesman:All months in the period are present.
# Sumatera Selatan:All months in the period are present.
