In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import stats
from datetime import datetime
import glob


In [3]:
def load_and_combine_buoy_data(location_code, data_dir):
    """
    Load and combine all buoy data files for a specific location.
    
    Parameters:
    -----------
    location_code : str
        Location identifier (e.g., '0N90E')
    data_dir : str
        Directory containing the data files
        
    Returns:
    --------
    dict
        Dictionary containing DataFrames for each variable type
    """
    print(f"Loading data for buoy location {location_code}...")
    
    # Define variable types and their corresponding filenames
    var_types = {
        'radiation': f'rad{location_code.lower()}',
        'rainfall': f'rain{location_code.lower()}',
        'humidity': f'rh{location_code.lower()}',
        'sst': f'sst{location_code.lower()}',
        'temperature': f't{location_code.lower()}',
        'wind': f'w{location_code.lower()}'
    }
    
    data_dict = {}
    
    # Load each variable type
    for var_type, file_prefix in var_types.items():
        # Look for matching files (could be .csv, .txt, etc.)
        file_pattern = os.path.join(data_dir, f"{file_prefix}*")
        matching_files = glob.glob(file_pattern)
        
        if matching_files:
            file_path = matching_files[0]
            try:
                # Load the file
                df = pd.read_csv(file_path)
                print(f"Successfully loaded {var_type} data from {file_path}")
                
                # Convert date column to datetime
                if 'Date' in df.columns:
                    df['Date'] = pd.to_datetime(df['Date'])
                    df.set_index('Date', inplace=True)
                
                # Store in dictionary
                data_dict[var_type] = df
                
            except Exception as e:
                print(f"Error loading {var_type} data: {e}")
        else:
            print(f"No {var_type} data file found matching pattern: {file_pattern}")
    
    return data_dict


In [4]:
def apply_quality_filtering(df, variable_type):
    """
    Filter data based on quality codes (Q) and source codes (S).
    
    Parameters:
    -----------
    df : DataFrame
        DataFrame containing the data
    variable_type : str
        Type of variable (for logging purposes)
        
    Returns:
    --------
    DataFrame
        Filtered DataFrame
    """
    if df is None or df.empty:
        return df
    
    original_length = len(df)
    
    # Make a copy to avoid modifying the original
    df_filtered = df.copy()
    
    # Filter based on quality codes
    if 'Q' in df_filtered.columns:
        # Create a boolean mask for quality filtering
        # Q=0: Missing data - remove
        # Q=1: Highest quality - keep
        # Q=2: Default quality - keep
        # Q=3: Adjusted data - keep but flag
        # Q=4: Lower quality - keep but flag
        # Q=5: Sensor failed - remove
        
        # Remove missing data and failed sensors
        quality_mask = (df_filtered['Q'] > 0) & (df_filtered['Q'] < 5)
        df_filtered = df_filtered[quality_mask].copy()
        
        # Add flag for adjusted data and lower quality
        if 'data_quality' not in df_filtered.columns:
            df_filtered['data_quality'] = 'high'
        
        # Mark adjusted data
        if 'Q' in df_filtered.columns:
            df_filtered.loc[df_filtered['Q'] == 3, 'data_quality'] = 'adjusted'
            df_filtered.loc[df_filtered['Q'] == 4, 'data_quality'] = 'low'
        
        quality_filtered_length = len(df_filtered)
        print(f"Quality filtering for {variable_type}: Kept {quality_filtered_length}/{original_length} rows ({quality_filtered_length/original_length*100:.2f}%)")
    
    # Filter based on source codes
    # Focus on source codes 5 (Recovered from Instrument RAM) as it seems to be the most common and reliable
    if 'S' in df_filtered.columns:
        # Prioritize data from RAM (delayed mode) over telemetry
        df_filtered['source_priority'] = 1  # Default priority
        df_filtered.loc[df_filtered['S'] == 5, 'source_priority'] = 5  # Recovered from RAM (highest)
        df_filtered.loc[df_filtered['S'] == 6, 'source_priority'] = 4  # Derived from RAM
        df_filtered.loc[df_filtered['S'] == 7, 'source_priority'] = 3  # Temporally interpolated from RAM
        df_filtered.loc[df_filtered['S'] == 8, 'source_priority'] = 2  # Spatially interpolated from RAM
        
        # Keep this information for reference but don't filter based on source yet
        # We'll use it when merging/combining data sources
    
    return df_filtered

In [5]:
def handle_missing_values(df, variable_type, variables=None):
    """
    Handle missing values using appropriate strategies for each variable type.
    
    Parameters:
    -----------
    df : DataFrame
        DataFrame containing the data
    variable_type : str
        Type of variable ('radiation', 'rainfall', etc.)
    variables : list
        List of specific variables to process (optional)
        
    Returns:
    --------
    DataFrame
        DataFrame with imputed values
    """
    if df is None or df.empty:
        return df
    
    # Make a copy to avoid modifying the original
    df_imputed = df.copy()
    
    # Get the variables to process
    if variables is None:
        # Process all numeric columns except Q and S
        variables = [col for col in df_imputed.columns if col not in ['Q', 'S', 'data_quality', 'source_priority'] 
                    and pd.api.types.is_numeric_dtype(df_imputed[col])]
    
    print(f"Handling missing values for {variable_type} variables: {variables}")
    
    # Apply specific imputation strategies based on variable type
    if variable_type == 'radiation':
        # For radiation data (SWRad, StDev, Max) - use time-based interpolation
        for var in variables:
            missing_count = df_imputed[var].isna().sum()
            if missing_count > 0:
                print(f"  Imputing {missing_count} missing values for {var}")
                
                # First try time-based interpolation (for short gaps)
                df_imputed[var] = df_imputed[var].interpolate(method='time', limit=3)
                
                # For remaining gaps, use forward fill with a limit
                remaining_missing = df_imputed[var].isna().sum()
                if remaining_missing > 0:
                    df_imputed[var] = df_imputed[var].fillna(method='ffill', limit=2)
                    
                # Calculate how many were filled
                final_missing = df_imputed[var].isna().sum()
                filled_count = missing_count - final_missing
                print(f"    Filled {filled_count}/{missing_count} values. {final_missing} remain missing.")
    
    elif variable_type == 'rainfall':
        # For rainfall (Prec) - missing values often mean no rain
        for var in variables:
            if var == 'Prec':
                missing_count = df_imputed[var].isna().sum()
                if missing_count > 0:
                    print(f"  Imputing {missing_count} missing values for {var} with zeros")
                    df_imputed[var] = df_imputed[var].fillna(0)
            else:
                # For other rainfall-related variables (StDev, %Time)
                missing_count = df_imputed[var].isna().sum()
                if missing_count > 0:
                    print(f"  Imputing {missing_count} missing values for {var}")
                    df_imputed[var] = df_imputed[var].interpolate(method='linear', limit=2)
    
    elif variable_type == 'humidity':
        # For humidity (RH) - tends to be stable, use ffill+bfill
        for var in variables:
            missing_count = df_imputed[var].isna().sum()
            if missing_count > 0:
                print(f"  Imputing {missing_count} missing values for {var}")
                df_imputed[var] = df_imputed[var].fillna(method='ffill', limit=2)
                remaining_missing = df_imputed[var].isna().sum()
                if remaining_missing > 0:
                    df_imputed[var] = df_imputed[var].fillna(method='bfill', limit=2)
                final_missing = df_imputed[var].isna().sum()
                filled_count = missing_count - final_missing
                print(f"    Filled {filled_count}/{missing_count} values. {final_missing} remain missing.")
    
    elif variable_type in ['sst', 'temperature']:
        # For temperature data - linear interpolation in both directions
        for var in variables:
            missing_count = df_imputed[var].isna().sum()
            if missing_count > 0:
                print(f"  Imputing {missing_count} missing values for {var}")
                df_imputed[var] = df_imputed[var].interpolate(method='linear', limit_direction='both', limit=3)
                final_missing = df_imputed[var].isna().sum()
                filled_count = missing_count - final_missing
                print(f"    Filled {filled_count}/{missing_count} values. {final_missing} remain missing.")
    
    elif variable_type == 'wind':
        # For wind data - linear interpolation
        for var in variables:
            missing_count = df_imputed[var].isna().sum()
            if missing_count > 0:
                print(f"  Imputing {missing_count} missing values for {var}")
                df_imputed[var] = df_imputed[var].interpolate(method='linear', limit_direction='both', limit=2)
                final_missing = df_imputed[var].isna().sum()
                filled_count = missing_count - final_missing
                print(f"    Filled {filled_count}/{missing_count} values. {final_missing} remain missing.")
    
    return df_imputed

In [6]:
def preprocess_buoy_data(data_dict, location_code, output_dir="output", cleaned_dir="cleaned"):
    """
    Preprocess all buoy data variables.
    
    Parameters:
    -----------
    data_dict : dict
        Dictionary containing DataFrames for each variable type
    location_code : str
        Location identifier (e.g., '0N90E')
    output_dir : str
        Directory for saving visualization outputs
    cleaned_dir : str
        Directory for saving cleaned data
        
    Returns:
    --------
    dict
        Dictionary containing cleaned DataFrames
    """
    # Create output directories
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(cleaned_dir, exist_ok=True)
    loc_output_dir = os.path.join(output_dir, location_code)
    os.makedirs(loc_output_dir, exist_ok=True)
    
    cleaned_data = {}
    
    # Process radiation data
    if 'radiation' in data_dict:
        print("\nProcessing Short Wave Radiation data...")
        df_rad = data_dict['radiation'].copy()
        
        # Apply quality filtering
        df_rad_filtered = apply_quality_filtering(df_rad, "radiation")
        
        # Apply missing value handling
        df_rad_clean = handle_missing_values(df_rad_filtered, "radiation", ['SWRad', 'StDev', 'Max'])
        
        # Process SWRad variable
        if 'SWRad' in df_rad_clean.columns:
            # Handle outliers
            df_rad_clean = handle_outliers(df_rad_clean, 'SWRad')
            
            # Visualize
            plot_time_series(df_rad_clean, 'SWRad', 'Short Wave Radiation', 'W/m²', location_code, loc_output_dir)
            plot_seasonal_patterns(df_rad_clean, 'SWRad', 'Short Wave Radiation', 'W/m²', location_code, loc_output_dir)
            plot_annual_trends(df_rad_clean, 'SWRad', 'Short Wave Radiation', 'W/m²', location_code, loc_output_dir)
            
            # Store cleaned data
            cleaned_data['SWRad'] = df_rad_clean[['SWRad']].copy()
            
            # Save to CSV
            df_rad_clean.to_csv(f"{cleaned_dir}/{location_code}_SWRad_clean.csv")
            print(f"Saved cleaned radiation data to {cleaned_dir}/{location_code}_SWRad_clean.csv")
    
    # Process rainfall data
    if 'rainfall' in data_dict:
        print("\nProcessing Rainfall data...")
        df_rain = data_dict['rainfall'].copy()
        
        # Apply quality filtering
        df_rain_filtered = apply_quality_filtering(df_rain, "rainfall")
        
        # Apply missing value handling
        df_rain_clean = handle_missing_values(df_rain_filtered, "rainfall", ['Prec', 'StDev', '%Time'])
        
        # Process Prec variable
        if 'Prec' in df_rain_clean.columns:
            # Handle outliers
            df_rain_clean = handle_outliers(df_rain_clean, 'Prec')
            
            # Visualize
            plot_time_series(df_rain_clean, 'Prec', 'Rainfall', 'mm', location_code, loc_output_dir)
            plot_seasonal_patterns(df_rain_clean, 'Prec', 'Rainfall', 'mm', location_code, loc_output_dir)
            plot_annual_trends(df_rain_clean, 'Prec', 'Rainfall', 'mm', location_code, loc_output_dir)
            
            # Store cleaned data
            cleaned_data['Prec'] = df_rain_clean[['Prec']].copy()
            
            # Save to CSV
            df_rain_clean.to_csv(f"{cleaned_dir}/{location_code}_Prec_clean.csv")
            print(f"Saved cleaned rainfall data to {cleaned_dir}/{location_code}_Prec_clean.csv")
    
    # Process humidity data
    if 'humidity' in data_dict:
        print("\nProcessing Relative Humidity data...")
        df_rh = data_dict['humidity'].copy()
        
        # Apply quality filtering
        df_rh_filtered = apply_quality_filtering(df_rh, "humidity")
        
        # Apply missing value handling
        df_rh_clean = handle_missing_values(df_rh_filtered, "humidity", ['RH'])
        
        # Process RH variable
        if 'RH' in df_rh_clean.columns:
            # Handle outliers
            df_rh_clean = handle_outliers(df_rh_clean, 'RH')
            
            # Visualize
            plot_time_series(df_rh_clean, 'RH', 'Relative Humidity', '%', location_code, loc_output_dir)
            plot_seasonal_patterns(df_rh_clean, 'RH', 'Relative Humidity', '%', location_code, loc_output_dir)
            plot_annual_trends(df_rh_clean, 'RH', 'Relative Humidity', '%', location_code, loc_output_dir)
            
            # Store cleaned data
            cleaned_data['RH'] = df_rh_clean[['RH']].copy()
            
            # Save to CSV
            df_rh_clean.to_csv(f"{cleaned_dir}/{location_code}_RH_clean.csv")
            print(f"Saved cleaned humidity data to {cleaned_dir}/{location_code}_RH_clean.csv")
    
    # Process SST data - prioritize sst0n90e.csv over t0n90e.csv for SST
    if 'sst' in data_dict:
        print("\nProcessing Sea Surface Temperature data...")
        df_sst = data_dict['sst'].copy()
        
        # Apply quality filtering
        df_sst_filtered = apply_quality_filtering(df_sst, "sst")
        
        # Apply missing value handling
        df_sst_clean = handle_missing_values(df_sst_filtered, "sst", ['SST'])
        
        # Process SST variable
        if 'SST' in df_sst_clean.columns:
            # Handle outliers
            df_sst_clean = handle_outliers(df_sst_clean, 'SST')
            
            # Visualize
            plot_time_series(df_sst_clean, 'SST', 'Sea Surface Temperature', '°C', location_code, loc_output_dir)
            plot_seasonal_patterns(df_sst_clean, 'SST', 'Sea Surface Temperature', '°C', location_code, loc_output_dir)
            plot_annual_trends(df_sst_clean, 'SST', 'Sea Surface Temperature', '°C', location_code, loc_output_dir)
            
            # Store cleaned data
            cleaned_data['SST'] = df_sst_clean[['SST']].copy()
            
            # Save to CSV
            df_sst_clean.to_csv(f"{cleaned_dir}/{location_code}_SST_clean.csv")
            print(f"Saved cleaned SST data to {cleaned_dir}/{location_code}_SST_clean.csv")
    
    # Process temperature profile data
    if 'temperature' in data_dict:
        print("\nProcessing Temperature Profile data...")
        df_temp = data_dict['temperature'].copy()
        
        # Remove the SST column since we're using the one from sst0n90e.csv
        if 'SST' in df_temp.columns and 'SST' in cleaned_data:
            print("  Removing duplicate SST column from temperature profile data (using the one from SST data)")
            df_temp = df_temp.drop(columns=['SST'])
        
        # Find temperature columns (look for columns with 'TEMP' prefix)
        temp_cols = [col for col in df_temp.columns if col.startswith('TEMP_')]
        
        if temp_cols:
            print(f"Found {len(temp_cols)} temperature depth measurements")
            
            # Apply quality filtering (if quality columns exist)
            df_temp_filtered = apply_quality_filtering(df_temp, "temperature")
            
            # Apply missing value handling for all temperature columns
            df_temp_clean = handle_missing_values(df_temp_filtered, "temperature", temp_cols)
            
            # Process each depth
            for col in temp_cols:
                # Extract depth from column name
                depth = col.split('_')[1].replace('m', '')
                print(f"Processing temperature at depth {depth}m")
                
                if pd.api.types.is_numeric_dtype(df_temp_clean[col]):
                    # Handle outliers
                    df_temp_clean = handle_outliers(df_temp_clean, col)
                    
                    # Visualize
                    plot_time_series(df_temp_clean, col, f'Water Temperature {depth}m', '°C', location_code, loc_output_dir)
                    
                    # Store cleaned data
                    cleaned_data[f'TEMP_{depth}'] = df_temp_clean[[col]].copy()
            
            # Create a simplified dataframe with selected depths (if needed)
            selected_depths = ['10.0m', '100.0m', '300.0m'] if len(temp_cols) > 3 else temp_cols
            selected_cols = [f'TEMP_{depth}' for depth in selected_depths if f'TEMP_{depth}' in df_temp_clean.columns]
            
            if selected_cols:
                # Plot temperature profiles
                plot_temperature_profile(df_temp_clean, selected_cols, location_code, loc_output_dir)
            
            # Save to CSV
            df_temp_clean.to_csv(f"{cleaned_dir}/{location_code}_TEMP_clean.csv")
            print(f"Saved cleaned temperature profile data to {cleaned_dir}/{location_code}_TEMP_clean.csv")
    
    # Process wind data
    if 'wind' in data_dict:
        print("\nProcessing Wind data...")
        df_wind = data_dict['wind'].copy()
        
        # Apply quality filtering
        # Note: Wind data sometimes lacks Q and S columns
        if 'Q' in df_wind.columns or 'S' in df_wind.columns:
            df_wind_filtered = apply_quality_filtering(df_wind, "wind")
        else:
            print("  No quality or source columns found in wind data")
            df_wind_filtered = df_wind.copy()
        
        # Apply missing value handling
        wind_cols = [col for col in ['UWND', 'VWND', 'WSPD', 'WDIR'] if col in df_wind_filtered.columns]
        df_wind_clean = handle_missing_values(df_wind_filtered, "wind", wind_cols)
        
        # Process wind components
        for col in wind_cols:
            if col in df_wind_clean.columns:
                # Handle outliers
                df_wind_clean = handle_outliers(df_wind_clean, col)
        
        # Visualize wind speed
        if 'WSPD' in df_wind_clean.columns:
            plot_time_series(df_wind_clean, 'WSPD', 'Wind Speed', 'm/s', location_code, loc_output_dir)
            plot_seasonal_patterns(df_wind_clean, 'WSPD', 'Wind Speed', 'm/s', location_code, loc_output_dir)
            plot_annual_trends(df_wind_clean, 'WSPD', 'Wind Speed', 'm/s', location_code, loc_output_dir)
            
            # Store cleaned data
            cleaned_data['WSPD'] = df_wind_clean[['WSPD']].copy()
        
        # Wind direction visualization (if both components available)
        if 'UWND' in df_wind_clean.columns and 'VWND' in df_wind_clean.columns:
            plot_wind_rose(df_wind_clean, location_code, loc_output_dir)
        
        # Save to CSV
        df_wind_clean.to_csv(f"{cleaned_dir}/{location_code}_WIND_clean.csv")
        print(f"Saved cleaned wind data to {cleaned_dir}/{location_code}_WIND_clean.csv")
    
    # Create a combined dataset with key variables
    print("\nCreating combined dataset...")
    combine_key_variables(cleaned_data, location_code, cleaned_dir)
    
    return cleaned_data

In [7]:
def handle_outliers(df, variable, method='zscore', threshold=3):
    """Handle outliers in the specified variable."""
    df_result = df.copy()
    
    # Skip if variable doesn't exist or is non-numeric
    if variable not in df_result.columns:
        return df_result
        
    if not pd.api.types.is_numeric_dtype(df_result[variable]):
        print(f"Skipping outlier detection for non-numeric variable: {variable}")
        return df_result
    
    # Skip if too many NaN values
    nan_count = df_result[variable].isna().sum()
    if nan_count > len(df_result) * 0.5:
        print(f"Skipping outlier detection for {variable}: too many NaN values ({nan_count} / {len(df_result)})")
        return df_result
    
    # Get original count
    valid_data = df_result[variable].dropna()
    original_count = len(valid_data)
    
    if original_count == 0:
        return df_result
    
    # Detect outliers
    try:
        if method == 'zscore':
            z_scores = np.abs(stats.zscore(valid_data))
            outliers = z_scores > threshold
            outlier_indices = valid_data.index[outliers]
        elif method == 'iqr':
            Q1 = valid_data.quantile(0.25)
            Q3 = valid_data.quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - threshold * IQR
            upper_bound = Q3 + threshold * IQR
            outlier_indices = valid_data[(valid_data < lower_bound) | (valid_data > upper_bound)].index
        else:
            print(f"Unknown outlier detection method: {method}")
            return df_result
    except Exception as e:
        print(f"Error detecting outliers for {variable}: {e}")
        return df_result
    
    # Mark outliers
    if len(outlier_indices) > 0:
        print(f"Detected {len(outlier_indices)} outliers in {variable} ({len(outlier_indices)/original_count*100:.2f}%)")
        
        # Create an 'is_outlier_[var]' column
        outlier_col = f"is_outlier_{variable.replace('(', '').replace(')', '').replace('.', '_')}"
        df_result[outlier_col] = False
        df_result.loc[outlier_indices, outlier_col] = True
        
        # For modeling preparation, we might want to replace outliers with NaN
        # rather than removing them, so the time series structure is preserved
        df_result.loc[outlier_indices, variable] = np.nan
        
        print(f"Marked outliers in column '{outlier_col}' and replaced values with NaN")
    else:
        print(f"No outliers detected in {variable}")
    
    return df_result


In [8]:
def plot_time_series(df, variable, var_name, unit, location_code, output_dir):
    """Plot time series for a variable."""
    plt.figure(figsize=(14, 6))
    plt.plot(df.index, df[variable], 'o-', alpha=0.5, markersize=2)
    
    plt.title(f'{var_name} at {location_code}')
    plt.ylabel(f'{var_name} ({unit})')
    plt.xlabel('Date')
    plt.grid(True)
    
    # Add a 30-day rolling average to show trend
    if len(df) > 30:
        valid_data = df[variable].dropna()
        if len(valid_data) > 30:
            rolling_avg = valid_data.rolling(window=30, center=True).mean()
            plt.plot(valid_data.index, rolling_avg, 'r-', linewidth=2, label='30-day Rolling Average')
            plt.legend()
    
    plt.tight_layout()
    var_file = variable.replace('(', '').replace(')', '').replace('.', '_')
    plt.savefig(f'{output_dir}/{var_file}_time_series.png')
    plt.close()

In [9]:
def plot_seasonal_patterns(df, variable, var_name, unit, location_code, output_dir):
    """Plot seasonal patterns for a variable."""
    # Skip if not enough data
    if len(df) < 30:
        print(f"Skipping seasonal analysis for {variable}: insufficient data")
        return
    
    # Resample to monthly data
    try:
        monthly_data = df[variable].resample('M').mean()
        
        # Create month and year columns
        monthly_df = pd.DataFrame(monthly_data)
        monthly_df['month'] = monthly_df.index.month
        monthly_df['year'] = monthly_df.index.year
        
        # Plot monthly patterns
        monthly_pattern = monthly_df.groupby('month')[variable].mean()
        months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        
        plt.figure(figsize=(12, 6))
        monthly_pattern.plot(kind='bar')
        plt.title(f'Monthly {var_name} Pattern at {location_code}')
        plt.ylabel(f'{var_name} ({unit})')
        plt.xlabel('Month')
        plt.xticks(np.arange(12), months, rotation=45)
        plt.grid(True, axis='y')
        
        plt.tight_layout()
        var_file = variable.replace('(', '').replace(')', '').replace('.', '_')
        plt.savefig(f'{output_dir}/{var_file}_monthly_pattern.png')
        plt.close()
        
        # Boxplot of monthly values (showing variation within each month)
        if len(monthly_df) >= 12:  # Only if we have enough data
            plt.figure(figsize=(14, 6))
            sns.boxplot(x='month', y=variable, data=monthly_df)
            plt.title(f'Monthly {var_name} Distribution at {location_code}')
            plt.ylabel(f'{var_name} ({unit})')
            plt.xlabel('Month')
            plt.xticks(np.arange(12), months, rotation=45)
            plt.grid(True, axis='y')
            
            plt.tight_layout()
            plt.savefig(f'{output_dir}/{var_file}_monthly_boxplot.png')
            plt.close()
    except Exception as e:
        print(f"Error in seasonal analysis for {variable}: {e}")

In [10]:
def plot_annual_trends(df, variable, var_name, unit, location_code, output_dir):
    """Plot annual trends for a variable."""
    # Skip if not enough data
    if len(df) < 365:
        print(f"Skipping annual trend analysis for {variable}: insufficient data")
        return
    
    try:
        # Resample to annual data
        annual_data = df[variable].resample('Y').mean()
        
        # Skip if we have too few years
        if len(annual_data) < 3:
            print(f"Skipping annual trend analysis for {variable}: less than 3 years of data")
            return
        
        # Plot annual trend
        plt.figure(figsize=(14, 6))
        annual_data.plot()
        
        plt.title(f'Annual {var_name} Trend at {location_code}')
        plt.ylabel(f'{var_name} ({unit})')
        plt.xlabel('Year')
        plt.grid(True)
        
        # Add trend line
        years_numeric = np.arange(len(annual_data))
        
        # Only calculate trend if we have enough valid data points
        valid_data = annual_data.dropna()
        if len(valid_data) >= 3:
            numeric_idx = np.arange(len(valid_data))
            slope, intercept, r_value, p_value, std_err = stats.linregress(numeric_idx, valid_data)
            
            trend_line = intercept + slope * numeric_idx
            plt.plot(valid_data.index, trend_line, 'r--', 
                    label=f'Trend: {slope:.4f} per year (p={p_value:.4f}, R²={r_value**2:.4f})')
            plt.legend()
        
        plt.tight_layout()
        var_file = variable.replace('(', '').replace(')', '').replace('.', '_')
        plt.savefig(f'{output_dir}/{var_file}_annual_trend.png')
        plt.close()
    except Exception as e:
        print(f"Error in annual trend analysis for {variable}: {e}")

In [11]:
def plot_temperature_profile(df, temp_cols, location_code, output_dir):
    """Plot temperature profile at different depths."""
    try:
        # Get average temperature at each depth
        avg_temps = df[temp_cols].mean()
        depths = [float(col.split('_')[1].replace('m', '')) for col in temp_cols]
        
        # Plot temperature profile
        plt.figure(figsize=(8, 10))
        plt.plot(avg_temps, depths, 'o-', linewidth=2)
        plt.title(f'Average Temperature Profile at {location_code}')
        plt.xlabel('Temperature (°C)')
        plt.ylabel('Depth (m)')
        plt.grid(True)
        plt.gca().invert_yaxis()  # Invert y-axis to show depth increasing downward
        
        plt.tight_layout()
        plt.savefig(f'{output_dir}/temperature_profile.png')
        plt.close()
    except Exception as e:
        print(f"Error plotting temperature profile: {e}")

In [12]:
def plot_wind_rose(df, location_code, output_dir):
    """Plot wind rose diagram using wind components."""
    try:
        # Check if required libraries are installed
        try:
            from windrose import WindroseAxes
        except ImportError:
            print("windrose package not found. Skipping wind rose plot.")
            return
            
        # Skip if not enough data
        if len(df) < 30:
            print("Skipping wind rose plot: insufficient data")
            return
            
        # Calculate wind speed and direction if not available
        if 'WSPD' not in df.columns or 'WDIR' not in df.columns:
            if 'UWND' in df.columns and 'VWND' in df.columns:
                # Calculate wind speed and direction from U and V components
                uwnd = df['UWND'].values
                vwnd = df['VWND'].values
                
                # Skip rows with missing values
                mask = ~(np.isnan(uwnd) | np.isnan(vwnd))
                uwnd = uwnd[mask]
                vwnd = vwnd[mask]
                
                if len(uwnd) < 30:
                    print("Skipping wind rose plot: insufficient valid data")
                    return
                
                wspd = np.sqrt(uwnd**2 + vwnd**2)
                wdir = (270 - np.arctan2(vwnd, uwnd) * 180 / np.pi) % 360
                
                # Create temporary DataFrame with calculated values
                temp_df = pd.DataFrame({
                    'wspd': wspd,
                    'wdir': wdir
                })
            else:
                print("Skipping wind rose plot: required wind components not available")
                return
        else:
            # Use available wind speed and direction
            wspd = df['WSPD'].dropna().values
            wdir = df['WDIR'].dropna().values
            
            # Create temporary DataFrame with values
            temp_df = pd.DataFrame({
                'wspd': wspd,
                'wdir': wdir
            })
        
        # Create wind rose
        plt.figure(figsize=(10, 10))
        ax = WindroseAxes.from_ax()
        ax.bar(temp_df['wdir'], temp_df['wspd'], normed=True, opening=0.8, edgecolor='white')
        ax.set_legend(title='Wind Speed (m/s)')
        plt.title(f'Wind Rose at {location_code}')
        
        plt.tight_layout()
        plt.savefig(f'{output_dir}/wind_rose.png')
        plt.close()
    except Exception as e:
        print(f"Error creating wind rose plot: {e}")


In [13]:
def combine_key_variables(cleaned_data, location_code, cleaned_dir):
    """Combine key variables into a single dataset."""
    try:
        # Get key variables
        key_vars = ['SST', 'Prec', 'RH', 'WSPD', 'SWRad']
        available_vars = [var for var in key_vars if var in cleaned_data]
        
        if len(available_vars) <= 1:
            print("Not enough variables available to create combined dataset")
            return
        
        # Combine into single DataFrame
        combined_df = pd.DataFrame()
        
        for var in available_vars:
            if combined_df.empty:
                combined_df = cleaned_data[var].copy()
            else:
                combined_df = combined_df.join(cleaned_data[var], how='outer')
        
        # Save combined dataset
        combined_file = f"{cleaned_dir}/{location_code}_combined_clean.csv"
        combined_df.to_csv(combined_file)
        print(f"Saved combined dataset with {len(available_vars)} variables to {combined_file}")
        
        # Create correlation matrix if we have enough variables
        if len(available_vars) >= 2:
            plt.figure(figsize=(10, 8))
            corr_matrix = combined_df.corr()
            sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
            plt.title(f'Correlation Matrix for {location_code}')
            plt.tight_layout()
            plt.savefig(f"{cleaned_dir}/{location_code}_correlation_matrix.png")
            plt.close()
            
            print("Created correlation matrix visualization")
    except Exception as e:
        print(f"Error creating combined dataset: {e}")


In [14]:
if __name__ == "__main__":

    
    # Define data directories for each location
    data_dirs = {
        '0N90E': '/run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/0N90E/CSV',
        '4N90E': '/run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/4N90E/CSV',
        '8N90E': '/run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/8N90E/CSV'
    }
    
    # Define variable information (name and unit)
    variable_info = {
        'SST': ('Sea Surface Temperature', '°C'),
        'RH': ('Relative Humidity', '%'),
        'Prec': ('Rainfall', 'mm'),
        'WSPD': ('Wind Speed', 'm/s'),
        'SWRad': ('Short Wave Radiation', 'W/m²'),
        'UWND': ('Zonal Wind', 'm/s'),
        'VWND': ('Meridional Wind', 'm/s'),
        'WDIR': ('Wind Direction', '°')
        # Add other variables as needed
    }
    
    # Define temperature columns for profile plotting
    temp_cols = [
        'TEMP_10.0m', 'TEMP_20.0m', 'TEMP_40.0m', 'TEMP_60.0m', 'TEMP_80.0m', 
        'TEMP_100.0m', 'TEMP_120.0m', 'TEMP_140.0m', 'TEMP_180.0m', 
        'TEMP_300.0m', 'TEMP_500.0m'
    ]
    
    # Create timestamp for this preprocessing run
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    print(f"Starting preprocessing run at {timestamp}")
    
    # Process each location
    for location, data_dir in data_dirs.items():
        print(f"\n{'='*50}")
        print(f"Processing location: {location}")
        print(f"{'='*50}")
        
        # Load data for the location
        data_dict = load_and_combine_buoy_data(location, data_dir)
        
        # Skip if no data was loaded
        if not data_dict:
            print(f"No data found for location {location}. Skipping...")
            continue
        
        # Define output and cleaned directories for this location
        output_dir = os.path.join(data_dir, "../CSV CLEANED")
        cleaned_dir = os.path.join(data_dir, "../CSV CLEANED")
        
        # Ensure the directories exist
        os.makedirs(output_dir, exist_ok=True)
        os.makedirs(cleaned_dir, exist_ok=True)
        
        # Preprocess the data
        cleaned_data = preprocess_buoy_data(data_dict, location, output_dir, cleaned_dir)
        
        # Generate and save plots
        for var_type, df in cleaned_data.items():
            # Check if this DataFrame has variables we can plot
            for variable in df.columns:
                if variable in variable_info:
                    var_name, unit = variable_info[variable]
                    
                    # Generate time series plot
                    plot_time_series(df, variable, var_name, unit, location, output_dir)
                    
                    # Generate seasonal patterns plot
                    plot_seasonal_patterns(df, variable, var_name, unit, location, output_dir)
                    
                    # Generate annual trends plot
                    plot_annual_trends(df, variable, var_name, unit, location, output_dir)
            
            # Check if this is the temperature dataframe and has the needed columns
            if all(col in df.columns for col in temp_cols):
                plot_temperature_profile(df, temp_cols, location, output_dir)
            
            # Check if this is the wind dataframe with required columns
            if all(col in df.columns for col in ['UWND', 'VWND', 'WDIR']):
                plot_wind_rose(df, location, output_dir)
        
        # Check if preprocessing was successful
        if cleaned_data:
            print(f"Successfully processed data for location {location}")
        else:
            print(f"Failed to process data for location {location}")
    
    print(f"\nPreprocessing run completed at {datetime.now().strftime('%Y%m%d_%H%M%S')}")

Starting preprocessing run at 20250503_224547

Processing location: 0N90E
Loading data for buoy location 0N90E...
Successfully loaded radiation data from /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/0N90E/CSV/rad0n90e_dy.csv
Successfully loaded rainfall data from /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/0N90E/CSV/rain0n90e_dy.csv
Successfully loaded humidity data from /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/0N90E/CSV/rh0n90e_dy.csv
Successfully loaded sst data from /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/0N90E/CSV/sst0n90e_dy.csv
Successfully loaded temperature data from /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/0N90E/CSV/t0n90e_dy.csv
Successfully loaded wind data from /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/0N90E/CSV/w0n90e_dy.csv

Processing Short Wave Radiation data...
Quality filtering for radiation: Kept 3545/3559 rows (99.61%)
Handling missing v

  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',


Saved cleaned radiation data to /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/0N90E/CSV/../CSV CLEANED/0N90E_SWRad_clean.csv

Processing Rainfall data...
Quality filtering for rainfall: Kept 3194/3275 rows (97.53%)
Handling missing values for rainfall variables: ['Prec', 'StDev', '%Time']
Detected 52 outliers in Prec (1.63%)
Marked outliers in column 'is_outlier_Prec' and replaced values with NaN


  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',


Saved cleaned rainfall data to /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/0N90E/CSV/../CSV CLEANED/0N90E_Prec_clean.csv

Processing Relative Humidity data...
Quality filtering for humidity: Kept 3440/3446 rows (99.83%)
Handling missing values for humidity variables: ['RH']
Detected 12 outliers in RH (0.35%)
Marked outliers in column 'is_outlier_RH' and replaced values with NaN


  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',


Saved cleaned humidity data to /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/0N90E/CSV/../CSV CLEANED/0N90E_RH_clean.csv

Processing Sea Surface Temperature data...
Quality filtering for sst: Kept 4192/4203 rows (99.74%)
Handling missing values for sst variables: ['SST']
Detected 8 outliers in SST (0.19%)
Marked outliers in column 'is_outlier_SST' and replaced values with NaN


  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',


Saved cleaned SST data to /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/0N90E/CSV/../CSV CLEANED/0N90E_SST_clean.csv

Processing Temperature Profile data...
  Removing duplicate SST column from temperature profile data (using the one from SST data)
Found 11 temperature depth measurements
Handling missing values for temperature variables: ['TEMP_10.0m', 'TEMP_20.0m', 'TEMP_40.0m', 'TEMP_60.0m', 'TEMP_80.0m', 'TEMP_100.0m', 'TEMP_120.0m', 'TEMP_140.0m', 'TEMP_180.0m', 'TEMP_300.0m', 'TEMP_500.0m']
  Imputing 708 missing values for TEMP_10.0m
    Filled 60/708 values. 648 remain missing.
  Imputing 117 missing values for TEMP_20.0m
    Filled 78/117 values. 39 remain missing.
  Imputing 976 missing values for TEMP_40.0m
    Filled 71/976 values. 905 remain missing.
  Imputing 774 missing values for TEMP_60.0m
    Filled 61/774 values. 713 remain missing.
  Imputing 1263 missing values for TEMP_80.0m
    Filled 88/1263 values. 1175 remain missing.
  Imputing 559 missing v

  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',


windrose package not found. Skipping wind rose plot.
Saved cleaned wind data to /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/0N90E/CSV/../CSV CLEANED/0N90E_WIND_clean.csv

Creating combined dataset...
Saved combined dataset with 5 variables to /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/0N90E/CSV/../CSV CLEANED/0N90E_combined_clean.csv
Created correlation matrix visualization


  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',
  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',
  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',
  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',
  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',


Successfully processed data for location 0N90E

Processing location: 4N90E
Loading data for buoy location 4N90E...
Successfully loaded radiation data from /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/4N90E/CSV/rad4n90e_dy.csv
Successfully loaded rainfall data from /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/4N90E/CSV/rain4n90e_dy.csv
Successfully loaded humidity data from /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/4N90E/CSV/rh4n90e_dy.csv
Successfully loaded sst data from /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/4N90E/CSV/sst4n90e_dy.csv
Successfully loaded temperature data from /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/4N90E/CSV/t4n90e_dy.csv
Successfully loaded wind data from /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/4N90E/CSV/w4n90e_dy.csv

Processing Short Wave Radiation data...
Quality filtering for radiation: Kept 1823/1826 rows (99.84%)
Handling missing 

  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',


Saved cleaned radiation data to /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/4N90E/CSV/../CSV CLEANED/4N90E_SWRad_clean.csv

Processing Rainfall data...
Quality filtering for rainfall: Kept 1868/1908 rows (97.90%)
Handling missing values for rainfall variables: ['Prec', 'StDev', '%Time']
Detected 35 outliers in Prec (1.87%)
Marked outliers in column 'is_outlier_Prec' and replaced values with NaN


  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',


Saved cleaned rainfall data to /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/4N90E/CSV/../CSV CLEANED/4N90E_Prec_clean.csv

Processing Relative Humidity data...
Quality filtering for humidity: Kept 1848/1852 rows (99.78%)
Handling missing values for humidity variables: ['RH']
Detected 15 outliers in RH (0.81%)
Marked outliers in column 'is_outlier_RH' and replaced values with NaN


  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',


Saved cleaned humidity data to /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/4N90E/CSV/../CSV CLEANED/4N90E_RH_clean.csv

Processing Sea Surface Temperature data...
Quality filtering for sst: Kept 2357/2397 rows (98.33%)
Handling missing values for sst variables: ['SST']
Detected 27 outliers in SST (1.15%)
Marked outliers in column 'is_outlier_SST' and replaced values with NaN


  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',


Saved cleaned SST data to /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/4N90E/CSV/../CSV CLEANED/4N90E_SST_clean.csv

Processing Temperature Profile data...
  Removing duplicate SST column from temperature profile data (using the one from SST data)
Found 12 temperature depth measurements
Handling missing values for temperature variables: ['TEMP_10.0m', 'TEMP_13.0m', 'TEMP_20.0m', 'TEMP_40.0m', 'TEMP_60.0m', 'TEMP_80.0m', 'TEMP_100.0m', 'TEMP_120.0m', 'TEMP_140.0m', 'TEMP_180.0m', 'TEMP_300.0m', 'TEMP_500.0m']
  Imputing 1821 missing values for TEMP_10.0m
    Filled 50/1821 values. 1771 remain missing.
  Imputing 998 missing values for TEMP_13.0m
    Filled 39/998 values. 959 remain missing.
  Imputing 1558 missing values for TEMP_20.0m
    Filled 63/1558 values. 1495 remain missing.
  Imputing 1041 missing values for TEMP_40.0m
    Filled 54/1041 values. 987 remain missing.
  Imputing 1656 missing values for TEMP_60.0m
    Filled 68/1656 values. 1588 remain missing.
 

  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',


windrose package not found. Skipping wind rose plot.
Saved cleaned wind data to /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/4N90E/CSV/../CSV CLEANED/4N90E_WIND_clean.csv

Creating combined dataset...
Saved combined dataset with 5 variables to /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/4N90E/CSV/../CSV CLEANED/4N90E_combined_clean.csv
Created correlation matrix visualization


  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',
  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',
  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',
  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',
  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',


Successfully processed data for location 4N90E

Processing location: 8N90E
Loading data for buoy location 8N90E...
Successfully loaded radiation data from /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/8N90E/CSV/rad8n90e_dy.csv
Successfully loaded rainfall data from /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/8N90E/CSV/rain8n90e_dy.csv
Successfully loaded humidity data from /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/8N90E/CSV/rh8n90e_dy.csv
Successfully loaded sst data from /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/8N90E/CSV/sst8n90e_dy.csv
Successfully loaded temperature data from /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/8N90E/CSV/t8n90e_dy.csv
Successfully loaded wind data from /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/8N90E/CSV/w8n90e_dy.csv

Processing Short Wave Radiation data...
Quality filtering for radiation: Kept 3152/3166 rows (99.56%)
Handling missing 

  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',


Saved cleaned radiation data to /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/8N90E/CSV/../CSV CLEANED/8N90E_SWRad_clean.csv

Processing Rainfall data...
Quality filtering for rainfall: Kept 1692/1713 rows (98.77%)
Handling missing values for rainfall variables: ['Prec', 'StDev', '%Time']
Detected 28 outliers in Prec (1.65%)
Marked outliers in column 'is_outlier_Prec' and replaced values with NaN


  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',


Saved cleaned rainfall data to /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/8N90E/CSV/../CSV CLEANED/8N90E_Prec_clean.csv

Processing Relative Humidity data...
Quality filtering for humidity: Kept 3106/3113 rows (99.78%)
Handling missing values for humidity variables: ['RH']
Detected 30 outliers in RH (0.97%)
Marked outliers in column 'is_outlier_RH' and replaced values with NaN


  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',


Saved cleaned humidity data to /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/8N90E/CSV/../CSV CLEANED/8N90E_RH_clean.csv

Processing Sea Surface Temperature data...
Quality filtering for sst: Kept 3006/3009 rows (99.90%)
Handling missing values for sst variables: ['SST']
Detected 42 outliers in SST (1.40%)
Marked outliers in column 'is_outlier_SST' and replaced values with NaN


  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',


Saved cleaned SST data to /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/8N90E/CSV/../CSV CLEANED/8N90E_SST_clean.csv

Processing Temperature Profile data...
  Removing duplicate SST column from temperature profile data (using the one from SST data)
Found 12 temperature depth measurements
Handling missing values for temperature variables: ['TEMP_10.0m', 'TEMP_13.0m', 'TEMP_20.0m', 'TEMP_40.0m', 'TEMP_60.0m', 'TEMP_80.0m', 'TEMP_100.0m', 'TEMP_120.0m', 'TEMP_140.0m', 'TEMP_180.0m', 'TEMP_300.0m', 'TEMP_500.0m']
  Imputing 728 missing values for TEMP_10.0m
    Filled 48/728 values. 680 remain missing.
  Imputing 581 missing values for TEMP_13.0m
    Filled 49/581 values. 532 remain missing.
  Imputing 644 missing values for TEMP_20.0m
    Filled 60/644 values. 584 remain missing.
  Imputing 560 missing values for TEMP_40.0m
    Filled 51/560 values. 509 remain missing.
  Imputing 510 missing values for TEMP_60.0m
    Filled 50/510 values. 460 remain missing.
  Imputing 4

  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',


windrose package not found. Skipping wind rose plot.
Saved cleaned wind data to /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/8N90E/CSV/../CSV CLEANED/8N90E_WIND_clean.csv

Creating combined dataset...
Saved combined dataset with 5 variables to /run/media/cryptedlm/localdisk/Kuliah/Tugas Akhir/Dataset/Buoys/8N90E/CSV/../CSV CLEANED/8N90E_combined_clean.csv
Created correlation matrix visualization


  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',
  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',
  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',
  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',
  monthly_data = df[variable].resample('M').mean()
  annual_data = df[variable].resample('Y').mean()
  plt.plot(valid_data.index, trend_line, 'r--',


Successfully processed data for location 8N90E

Preprocessing run completed at 20250503_224640
