# In [2]:
# Mount Google Drive at /content/drive so files under that path can be read and written.
from google.colab import drive
drive.mount('/content/drive')

# Output: Mounted at /content/drive


# In [None]:
import pandas as pd
import numpy as np
import os

def _longest_nan_run(series):
    """Return the length of the longest run of consecutive missing values."""
    missing = series.isnull()
    if not missing.any():
        return 0
    # Every non-missing value starts a new group, so each group holds one
    # valid value (or none, for leading NaNs) followed by its trailing NaNs.
    run_ids = (~missing).cumsum()
    return int(missing.groupby(run_ids).sum().max())


def process_station_data(file_path):
    """
    Reads, processes, and cleans station data from a single file.

    Steps, in order:
      1. Blank out measurements flagged by the 'mask' column.
      2. Parse and sort by 'time'; when a 'time_diff' column is also present,
         put the rows on a regular time grid (spacing = the most common gap
         between consecutive timestamps), inserting all-NaN rows for missing
         timestamps.
      3. Discard the file if any measurement column has a run of consecutive
         missing values longer than 24 hours' worth of grid steps (only
         checked when a grid spacing could be inferred).
      4. Linearly interpolate the remaining gaps.

    Missing optional columns only produce a printed warning. Any exception is
    caught, reported with print, and turned into a None result.

    Args:
        file_path (str): The full path to the data file.

    Returns:
        pandas.DataFrame: The processed DataFrame, or None if the file is
        discarded or could not be processed.
    """
    file_name = os.path.basename(file_path)
    mask_cols = ['tmp', 'dew', 'wnd_angle', 'wnd_rate', 'slp']
    max_allowed_nan_duration = pd.Timedelta(hours=24)

    try:
        # Read 'mask' as text. Left to type inference, an all-digit flag
        # string such as "00100" becomes the integer 100: leading flags are
        # lost and the .str accessor below raises. A dtype entry for a column
        # that is absent from the file is ignored by read_csv.
        df = pd.read_csv(file_path, dtype={'mask': str})

        # 1. Apply the quality mask.
        # Assumes character i of 'mask' belongs to mask_cols[i] and that '0'
        # means "value is good" -- TODO confirm against the dataset format.
        if 'mask' in df.columns:
            for i, col in enumerate(mask_cols):
                if col not in df.columns:
                    continue
                flagged = df['mask'].str[i] != '0'
                if flagged.any():
                    # NaN cannot live in an integer column; cast explicitly
                    # instead of relying on implicit upcasting on assignment.
                    if pd.api.types.is_integer_dtype(df[col]):
                        df[col] = df[col].astype('float64')
                    df.loc[flagged, col] = np.nan
        else:
            print(f"Warning: 'mask' column not found in {file_name}")

        # Stays None unless a grid spacing is inferred below; the gap-length
        # check in step 3 is skipped in that case.
        expected_interval = None

        # 2. Parse timestamps, sort, and regularise the time axis.
        if 'time' in df.columns:
            df['time'] = pd.to_datetime(df['time'])
            df = df.sort_values(by='time').reset_index(drop=True)

            if 'time_diff' in df.columns:
                # Assumes 'time_diff' is expressed in minutes -- TODO confirm.
                df['time_diff'] = pd.to_timedelta(df['time_diff'], unit='m')

                if not df.empty:
                    # reindex() raises on duplicate labels, so rows with a
                    # missing or repeated timestamp are dropped first (the
                    # first occurrence of each timestamp is kept).
                    unique_rows = (
                        df.dropna(subset=['time'])
                        .drop_duplicates(subset='time', keep='first')
                    )
                    steps = unique_rows['time'].diff().dropna()

                    if not steps.empty:
                        # Most common spacing is taken as the sampling period.
                        expected_interval = steps.mode()[0]
                        if pd.isna(expected_interval) or expected_interval <= pd.Timedelta(seconds=0):
                            # Fallback to a default interval if mode is not meaningful
                            expected_interval = pd.Timedelta(minutes=10)

                        # NOTE: observations that do not fall on this grid are
                        # not carried over by reindex().
                        full_time_range = pd.date_range(
                            start=unique_rows['time'].min(),
                            end=unique_rows['time'].max(),
                            freq=expected_interval,
                        )
                        df = (
                            unique_rows.set_index('time')
                            .reindex(full_time_range)
                            .rename_axis('time')
                            .reset_index()
                        )
                    else:
                        print(f"Warning: Could not infer time frequency for {file_name}. Skipping reindexing.")
            else:
                print(f"Warning: 'time_diff' column not found in {file_name}")
        else:
            print(f"Warning: 'time' column not found in {file_name}")

        interpolated_df = df.copy()
        present_cols = [col for col in mask_cols if col in interpolated_df.columns]

        # 3. Reject the file before doing any interpolation work.
        if expected_interval is not None:
            max_allowed_nan_count = int(max_allowed_nan_duration / expected_interval)
            for col in present_cols:
                if _longest_nan_run(interpolated_df[col]) > max_allowed_nan_count:
                    print(f"Discarding file {file_name} due to excessive continuous missing data in column '{col}'.")
                    return None  # Discard the file

        # 4. Fill the remaining (short) gaps.
        for col in present_cols:
            interpolated_df[col] = interpolated_df[col].interpolate(method='linear')

        return interpolated_df

    except Exception as e:
        # Deliberate best-effort boundary: one bad file must not stop a batch.
        print(f"Error processing file {file_name}: {e}")
        return None

# --- Batch driver: clean every station CSV and save the ones that survive ---
data_folder = '/content/drive/MyDrive/weather_5k'  # input directory; point this at your data
output_folder = '/content/drive/MyDrive/processed_station_data'  # where cleaned files are written

# Make sure the destination directory exists before anything is written to it.
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
    print(f"Created output folder: {output_folder}")

# Running tallies reported in the final summary line.
processed_files_count = 0
discarded_files_count = 0

if os.path.exists(data_folder):
    # Only files whose name ends in '.csv' are considered.
    file_list = [f for f in os.listdir(data_folder) if f.endswith('.csv')]
    total_files = len(file_list)
    print(f"Found {total_files} files to process.")

    for position, file_name in enumerate(file_list, start=1):
        print(f"Processing file {position}/{total_files}: {file_name}")
        processed_df = process_station_data(os.path.join(data_folder, file_name))

        # None means the file was rejected or failed; count it and move on.
        if processed_df is None:
            discarded_files_count += 1
            continue

        # Write the cleaned data under the same file name in the output folder.
        processed_df.to_csv(os.path.join(output_folder, file_name), index=False)
        print(f"Successfully processed and saved file: {file_name}")
        processed_files_count += 1

    print(f"\nFinished processing. Successfully processed and saved {processed_files_count} files. Discarded {discarded_files_count} files.")
else:
    print(f"Error: Data folder not found at {data_folder}")