In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import logging
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
import shap
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import confusion_matrix, classification_report
import warnings

# Suppress warnings for cleaner output
# NOTE(review): this silences every warning category (including deprecation
# warnings from pandas); consider narrowing the filter once the pipeline is stable.
warnings.filterwarnings('ignore')

# ==============================================
# SETUP LOGGING AND OUTPUT FOLDER
# ==============================================

# Create output folder with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_folder = f'Results_{timestamp}'
os.makedirs(output_folder, exist_ok=True)

# logging.basicConfig() does nothing when the root logger already has handlers,
# which is the case whenever this cell is re-run in the same kernel. Detach and
# close any existing handlers first so that 'execution.log' is really created
# inside the folder made above instead of logging into the previous run's file.
logger = logging.getLogger()
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Configure logging: every record goes both to the run's log file and to the
# notebook output stream.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(os.path.join(output_folder, 'execution.log')),
        logging.StreamHandler()
    ]
)

logger.info(f"Script execution started. Output will be saved in: {output_folder}")


  from .autonotebook import tqdm as notebook_tqdm
2025-05-12 13:52:01,327 - INFO - Script execution started. Output will be saved in: Results_20250512_135201


In [None]:

# ==============================================
# DATA LOADING AND CLEANING
# ==============================================

def load_and_clean_data(csv_path='download-csv-3.csv'):
    """Load the raw CSV export and coerce its columns to usable dtypes.

    The file's own header line is skipped and replaced with the fixed column
    names below; the row that follows it (the units row) is dropped.

    Args:
        csv_path: Path of the CSV file to read. Defaults to the file name the
            notebook was originally written against.

    Returns:
        DataFrame with ``time`` parsed as datetime, the measurement columns
        converted to numeric (unparseable entries become NaN/NaT) and
        ``OC_Flag`` as int with missing entries set to 0.

    Side effects:
        Writes the first 20 cleaned rows to ``cleaned_data_sample.csv`` inside
        the module-level ``output_folder``.

    Raises:
        Any exception raised while reading or converting is logged with its
        traceback and re-raised unchanged.
    """
    try:
        logger.info("Loading and cleaning dataset...")

        # Define column names
        column_names = [
            'station_id', 'longitude', 'latitude', 'time', 'AtmosphericPressure',
            'WindDirection', 'WindSpeed', 'Gust', 'Wavelength', 'WavePeriod',
            'MeanWaveDirection', 'Hmax', 'AirTemperature', 'DownPoint',
            'SeaTemperature', 'RelativeHumidity', 'OC_Flag'
        ]

        # Load data (skiprows=1 discards the file's header line; names= supplies ours)
        raw_data = pd.read_csv(csv_path, low_memory=False, skiprows=1, names=column_names)

        # Basic info before cleaning
        logger.info(f"\nRaw dataset shape: {raw_data.shape}")
        logger.info("\nFirst 5 rows before cleaning:\n" + str(raw_data.head()))

        # Remove the units row (first row after header)
        data = raw_data.iloc[1:].reset_index(drop=True)

        # Convert time column; entries that do not match the format become NaT
        data['time'] = pd.to_datetime(data['time'], format='%Y-%m-%dT%H:%M:%SZ', errors='coerce')

        # Convert numeric columns; entries that cannot be parsed become NaN
        numeric_cols = [
            'longitude', 'latitude', 'AtmosphericPressure', 'WindDirection',
            'WindSpeed', 'Gust', 'Wavelength', 'WavePeriod', 'MeanWaveDirection',
            'Hmax', 'AirTemperature', 'DownPoint', 'SeaTemperature', 'RelativeHumidity'
        ]
        for col in numeric_cols:
            data[col] = pd.to_numeric(data[col], errors='coerce')

        # Handle OC_Flag. The column can arrive as text because the units row
        # shares it, so coerce it the same way as the other numeric columns
        # before casting; a direct astype(int) fails on text such as '1.0'.
        data['OC_Flag'] = pd.to_numeric(data['OC_Flag'], errors='coerce').fillna(0).astype(int)

        # Save cleaned data sample
        data.head(20).to_csv(os.path.join(output_folder, 'cleaned_data_sample.csv'), index=False)

        return data

    except Exception as e:
        logger.error(f"Error during data loading/cleaning: {str(e)}", exc_info=True)
        raise

# ==============================================
# MISSING DATA HANDLING
# ==============================================

def handle_missing_values(data):
    """Drop sparse columns and fill the remaining numeric gaps.

    Steps, in order:
      1. Log the per-column missing-value counts and, when at least one column
         has missing values, save a bar chart of them as
         ``missing_values_before.png`` in the module-level ``output_folder``.
      2. Drop every column in which fewer than half of the rows are non-null.
      3. Forward-fill, then back-fill, the remaining numeric columns.

    Non-numeric columns are not filled, so they can still contain missing
    values in the result.

    Args:
        data: DataFrame to clean. It is not modified.

    Returns:
        A new DataFrame with sparse columns removed and numeric gaps filled.

    Raises:
        Any exception raised along the way is logged with its traceback and
        re-raised unchanged.
    """
    try:
        logger.info("\nHandling missing values...")

        # Initial missing values analysis
        missing_stats = data.isnull().sum().sort_values(ascending=False)
        missing_stats = missing_stats[missing_stats > 0]
        logger.info("Missing values before handling:\n" + str(missing_stats))

        # Plot missing values. Skipped when nothing is missing: plotting an
        # empty Series as a bar chart raises instead of drawing an empty figure.
        if not missing_stats.empty:
            plt.figure(figsize=(12, 6))
            missing_stats.plot(kind='bar')
            plt.title('Missing Values Before Handling')
            plt.ylabel('Count')
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.savefig(os.path.join(output_folder, 'missing_values_before.png'))
            plt.close()

        # Drop columns with >50% missing values. dropna(thresh=...) expects an
        # integer count of required non-null values; rounding half the row count
        # up keeps exactly the same columns as comparing against len(data) * 0.5.
        threshold = (len(data) + 1) // 2
        # .copy() gives an independent frame so the assignment below neither
        # touches the caller's data nor triggers chained-assignment warnings.
        data = data.dropna(thresh=threshold, axis=1).copy()

        # For remaining numeric columns: forward fill then backfill.
        # (.ffill()/.bfill() replace the deprecated fillna(method=...) form.)
        # NOTE(review): the fill runs over the whole frame in row order; if the
        # frame holds several stations, a value can carry over from one
        # station's rows into the next - confirm this is intended.
        numeric_cols = data.select_dtypes(include=[np.number]).columns
        data[numeric_cols] = data[numeric_cols].ffill().bfill()

        # Final missing values check
        logger.info("\nMissing values after handling:\n" + str(data.isnull().sum()))
        logger.info(f"\nAfter Cleaning dataset shape: {data.shape}")
        return data

    except Exception as e:
        logger.error(f"Error during missing value handling: {str(e)}", exc_info=True)
        raise

# ==============================================
# FEATURE ENGINEERING
# ==============================================

def add_features(data):
    """Add time-based, derived, lagged, rolling, and interaction features.

    New columns:
      * ``hour``, ``day_of_week``, ``month``, ``year`` taken from ``time``,
        plus sine/cosine encodings of hour (period 24) and month (period 12).
      * ``WindChill`` and ``HeatIndex`` computed from ``AirTemperature``,
        ``WindSpeed`` and ``RelativeHumidity``.
      * One-row lags of ``WindSpeed`` and ``AirTemperature``; rolling means of
        ``WindSpeed`` (3 rows) and ``AtmosphericPressure`` (6 rows).
      * If ``SeaTemperature`` is present: lags of 1, 2, 3 and 6 rows, rolling
        means over 3, 6 and 12 rows, and its product with ``WindSpeed``.

    Rows containing a missing value in any column are dropped at the end.

    Args:
        data: DataFrame with at least ``time`` (datetime), ``AirTemperature``,
            ``WindSpeed``, ``RelativeHumidity`` and ``AtmosphericPressure``.
            It is not modified.

    Returns:
        A new DataFrame with the extra columns and without incomplete rows.

    Raises:
        Any exception raised along the way (for example a missing required
        column) is logged with its traceback and re-raised unchanged.
    """
    try:
        logger.info("\nAdding engineered features...")

        # Work on a copy so the caller's DataFrame does not silently gain
        # columns as a side effect of calling this function.
        data = data.copy()

        original_columns = set(data.columns)

        # Time features
        data['hour'] = data['time'].dt.hour
        data['day_of_week'] = data['time'].dt.dayofweek
        data['month'] = data['time'].dt.month
        data['year'] = data['time'].dt.year

        # Cyclical encoding for time features
        data['hour_sin'] = np.sin(2 * np.pi * data['hour']/24)
        data['hour_cos'] = np.cos(2 * np.pi * data['hour']/24)
        data['month_sin'] = np.sin(2 * np.pi * data['month']/12)
        data['month_cos'] = np.cos(2 * np.pi * data['month']/12)

        # Weather indices
        # NOTE(review): these coefficients look like the wind-chill index that
        # takes air temperature in degrees C and wind speed in km/h, while the
        # raw file's units row appears to list WindSpeed in knots - confirm
        # whether a unit conversion is needed.
        data['WindChill'] = 13.12 + 0.6215*data['AirTemperature'] - \
                            11.37*(data['WindSpeed']**0.16) + \
                            0.3965*data['AirTemperature']*(data['WindSpeed']**0.16)

        # NOTE(review): only a constant, two linear terms and one product term
        # are used here; confirm that leaving out higher-order terms is intended.
        data['HeatIndex'] = -8.78469475556 + 1.61139411*data['AirTemperature'] + \
                            2.33854883889*data['RelativeHumidity'] - \
                            0.14611605*data['AirTemperature']*data['RelativeHumidity']

        # Original lag features
        # NOTE(review): shift()/rolling() work on row position over the whole
        # frame; if several stations are stacked in it, the first rows of one
        # station see values from the previous one - confirm this is intended.
        data['WindSpeed_lag1'] = data['WindSpeed'].shift(1)
        data['AirTemp_lag1'] = data['AirTemperature'].shift(1)

        # Original rolling features. The windows count rows, not hours; the
        # "3h"/"6h" names presumably assume one row per hour - TODO confirm.
        data['WindSpeed_rolling_3h'] = data['WindSpeed'].rolling(window=3).mean()
        data['Pressure_rolling_6h'] = data['AtmosphericPressure'].rolling(window=6).mean()

        # New SeaTemperature features
        target = 'SeaTemperature'
        if target in data.columns:
            # Lagged features
            data['SeaTemperature_lag1'] = data[target].shift(1)
            data['SeaTemperature_lag2'] = data[target].shift(2)
            data['SeaTemperature_lag3'] = data[target].shift(3)
            data['SeaTemperature_lag6'] = data[target].shift(6)

            # Rolling features
            data['SeaTemperature_rolling_3h'] = data[target].rolling(window=3).mean()
            data['SeaTemperature_rolling_6h'] = data[target].rolling(window=6).mean()
            data['SeaTemperature_rolling_12h'] = data[target].rolling(window=12).mean()

            # Interaction term
            if 'WindSpeed' in data.columns:
                data['SeaTemp_WindSpeed_interaction'] = data[target] * data['WindSpeed']
            else:
                logger.warning("WindSpeed column not found for interaction term")
        else:
            logger.warning("SeaTemperature column not found for new features")

        # Drop rows with NA values created by lag/rolling features.
        # dropna() removes a row when ANY column is missing, so rows with a
        # gap in an original column (e.g. an unparsed time) are removed too.
        data = data.dropna()

        # Calculate number of new features added
        new_features = set(data.columns) - original_columns
        logger.info(f"\nAdded {len(new_features)} new features")
        # Sorted so the logged list is stable between runs (set order is not).
        logger.info("New feature columns:\n" + "\n".join(sorted(new_features)))

        return data

    except Exception as e:
        logger.error(f"Error during feature engineering: {str(e)}", exc_info=True)
        raise

# ==============================================
# DATA VISUALIZATION
# ==============================================

def visualize_data(data, output_folder):
    """Render the standard diagnostic figures for a cleaned dataset.

    Four PNG files are written into ``output_folder``:
      * ``timeseries_key_vars.png``  - stacked line plots against ``time``
      * ``correlation_matrix.png``   - annotated heatmap of numeric columns
      * ``distributions.png``        - 2x2 grid of histograms with KDE
      * ``wind_rose.png``            - polar bar chart of ``WindDirection``
        (only when that column exists)

    Any exception is logged with its traceback and re-raised.
    """

    def _save_and_close(filename):
        # Persist the active pyplot figure under output_folder, then release it.
        plt.savefig(os.path.join(output_folder, filename))
        plt.close()

    try:
        logger.info("\nGenerating visualizations...")

        # --- Figure 1: one time-series panel per key variable ---
        plt.figure(figsize=(15, 12))
        series_names = ['WindSpeed', 'AirTemperature', 'AtmosphericPressure', 'SeaTemperature']
        panel_count = len(series_names)

        for panel, name in enumerate(series_names, start=1):
            plt.subplot(panel_count, 1, panel)
            plt.plot(data['time'], data[name], linewidth=0.5)
            plt.title(f'{name} Time Series')
            plt.xlabel('Time')
            plt.ylabel(name)
        plt.tight_layout()
        _save_and_close('timeseries_key_vars.png')

        # --- Figure 2: correlation heatmap over every numeric column ---
        numeric_names = data.select_dtypes(include=[np.number]).columns
        plt.figure(figsize=(16, 14))
        correlations = data[numeric_names].corr()
        sns.heatmap(
            correlations,
            annot=True,
            fmt=".2f",
            cmap='coolwarm',
            center=0,
            vmin=-1,
            vmax=1,
            annot_kws={"size": 8},
        )
        plt.title('Feature Correlation Matrix')
        plt.xticks(rotation=45)
        plt.yticks(rotation=0)
        plt.tight_layout()
        _save_and_close('correlation_matrix.png')

        # --- Figure 3: histograms (with KDE) on a 2x2 grid ---
        plt.figure(figsize=(15, 10))
        histogram_names = ['WindSpeed', 'AirTemperature', 'AtmosphericPressure', 'RelativeHumidity']

        for cell, name in enumerate(histogram_names, start=1):
            plt.subplot(2, 2, cell)
            sns.histplot(data[name], kde=True, bins=30)
            plt.title(f'{name} Distribution')
        plt.tight_layout()
        _save_and_close('distributions.png')

        # --- Figure 4: wind rose, only if the direction column is available ---
        if 'WindDirection' in data.columns:
            plt.figure(figsize=(8, 8))
            polar_ax = plt.subplot(111, projection='polar')
            directions = data['WindDirection'].dropna()
            # 36 bins of 10 degrees each over the full circle.
            counts, _ = np.histogram(directions, bins=36, range=(0, 360))
            bar_angles = np.deg2rad(np.arange(0, 360, 10))
            polar_ax.bar(bar_angles, counts, width=np.deg2rad(10), bottom=0.0, alpha=0.5)
            # Compass orientation: 0 at the top, angles increasing clockwise.
            polar_ax.set_theta_zero_location('N')
            polar_ax.set_theta_direction(-1)
            polar_ax.set_title('Wind Direction Distribution')
            _save_and_close('wind_rose.png')

        logger.info("Visualizations completed successfully")

    except Exception as e:
        logger.error(f"Error during visualization: {e!s}", exc_info=True)
        raise

# ==============================================
# MAIN EXECUTION
# ==============================================

def main():
    """Run the whole pipeline: load, clean, engineer features, plot, save.

    The processed frame is written to ``processed_data.csv`` inside
    ``output_folder``. Any exception is logged with its traceback and
    re-raised.
    """
    try:
        # Step 1: raw data in
        frame = load_and_clean_data()

        # Steps 2-3: each stage takes a frame and hands back the next one
        for stage in (handle_missing_values, add_features):
            frame = stage(frame)

        # Step 4: diagnostic figures
        visualize_data(frame, output_folder)

        # Step 5: persist the result and report where it went
        destination = os.path.join(output_folder, 'processed_data.csv')
        frame.to_csv(destination, index=False)
        logger.info(f"\nFinal processed data saved to: {destination}")
        logger.info(f"Final dataset shape: {frame.shape}")
        logger.info("\nProcessing pipeline completed successfully")

    except Exception as e:
        logger.error(f"Error in main execution: {e!s}", exc_info=True)
        raise

if __name__ == "__main__":
    main()

2025-05-12 14:12:54,014 - INFO - Loading and cleaning dataset...
2025-05-12 14:12:55,325 - INFO - 
Raw dataset shape: (613393, 17)
2025-05-12 14:12:55,326 - INFO - 
First 5 rows before cleaning:
  station_id     longitude       latitude                  time  \
0        NaN  degrees_east  degrees_north                   UTC   
1         M1         -11.2        53.1266  2001-02-06T13:00:00Z   
2         M1         -11.2        53.1266  2001-02-06T14:00:00Z   
3         M1         -11.2        53.1266  2001-02-06T15:00:00Z   
4         M1         -11.2        53.1266  2001-02-06T16:00:00Z   

  AtmosphericPressure WindDirection WindSpeed  Gust Wavelength WavePeriod  \
0                  mb  degrees_true        kn    kn          m          s   
1               967.6         270.0     21.98  33.1        NaN        NaN   
2               969.8         270.0     23.93  35.0        NaN        NaN   
3               972.0         270.0     19.07  31.1        NaN        NaN   
4               9

In [8]:
# main() writes processed_data.csv into this run's output folder, not into the
# current working directory, so read it back from there.
data = pd.read_csv(os.path.join(output_folder, 'processed_data.csv'))
data.head(5)

Unnamed: 0,station_id,longitude,latitude,time,AtmosphericPressure,WindDirection,WindSpeed,Gust,Wavelength,WavePeriod,...,WindSpeed_rolling_3h,Pressure_rolling_6h,SeaTemperature_lag1,SeaTemperature_lag2,SeaTemperature_lag3,SeaTemperature_lag6,SeaTemperature_rolling_3h,SeaTemperature_rolling_6h,SeaTemperature_rolling_12h,SeaTemp_WindSpeed_interaction
0,M1,-11.2,53.1266,2001-02-07 03:00:00,984.2,350.0,7.0,13.6,1.4,7.0,...,7.326667,981.9,9.0,9.0,9.0,9.0,9.0,9.0,9.0,63.0
1,M1,-11.2,53.1266,2001-02-07 05:00:00,985.0,30.0,12.06,19.5,1.4,7.0,...,9.013333,982.766667,9.0,9.0,9.0,9.0,9.0,9.0,9.0,108.54
2,M1,-11.2,53.1266,2001-02-07 06:00:00,986.8,10.0,19.07,29.2,1.4,7.0,...,12.71,983.8,9.0,9.0,9.0,9.0,9.0,9.0,9.0,171.63
3,M1,-11.2,53.1266,2001-02-07 07:00:00,987.4,40.0,16.93,31.1,1.4,7.0,...,16.02,984.766667,9.0,9.0,9.0,9.0,9.0,9.0,9.0,152.37
4,M1,-11.2,53.1266,2001-02-07 08:00:00,988.4,30.0,21.01,29.2,1.4,7.0,...,19.003333,985.8,9.0,9.0,9.0,9.0,9.0,9.0,9.0,189.09


In [9]:
import pandas as pd

def detect_outliers_iqr(df, multiplier=1.5):
    """Locate IQR outliers in every numeric column of ``df``.

    A value is an outlier when it lies more than ``multiplier`` interquartile
    ranges below the first quartile or above the third quartile of its column.

    Returns a dict mapping column name -> list of index labels of the
    outlying rows. Columns without outliers are left out of the dict.
    """
    flagged = {}

    for name in df.select_dtypes(include='number').columns:
        column = df[name]
        q_low = column.quantile(0.25)
        q_high = column.quantile(0.75)
        spread = q_high - q_low

        below = column < q_low - multiplier * spread
        above = column > q_high + multiplier * spread
        hit_labels = df[below | above].index

        if len(hit_labels) > 0:
            flagged[name] = hit_labels.tolist()

    return flagged

# Example usage:
# Read the pipeline output from this run's output folder instead of a
# machine-specific absolute path, so the cell works on any machine.
data_path = os.path.join(output_folder, 'processed_data.csv')
df = pd.read_csv(data_path)
outliers = detect_outliers_iqr(df)

# Print outliers summary
# Only a short preview of the indices is printed (the full lists stay available
# in `outliers`); dumping thousands of indices per column floods the output.
MAX_INDICES_SHOWN = 10
for col, indices in outliers.items():
    print(f"Column '{col}' has {len(indices)} outliers; first indices: {indices[:MAX_INDICES_SHOWN]}")



Column 'AtmosphericPressure' has 9242 outliers at indices: [555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 8957, 8958, 8960, 8962, 8964, 8967, 8968, 8971, 8972, 8975, 8977, 8979, 8980, 8983, 8985, 8987, 8988, 8990, 8993, 8994, 9217, 9220, 9222, 9224, 9226, 9227, 9230, 9231, 9234, 9236, 9237, 9239, 9241, 9244, 9246, 9247, 9250, 9270, 9271, 9274, 9276, 9277, 9280, 9282, 9284, 9285, 9288, 9289, 9290, 9291, 9292, 9293, 9294, 9296, 9297, 9299, 9302, 9304, 9306, 9307, 9309, 9311, 9314, 14403, 14404, 14407, 14408, 14411, 14413, 14414, 14415, 14416, 14417, 14418, 14419, 14420, 14421, 14422, 14423, 14424, 14425, 14426, 14427, 14428, 14429, 14430, 14431, 14432, 14433, 14434, 14435, 14436, 14437, 14438, 14439, 14440, 14441, 14442, 14443, 14444, 14445, 14447, 14448, 14450, 14452, 14455, 14457, 14458, 14555, 14557, 14559, 14560, 14563, 14836, 14838, 14840, 14845, 14847, 14848, 14851, 14853, 14855, 14857, 14869, 14871, 14873, 14874, 1487

In [12]:
import pandas as pd

def remove_outliers_iqr(df, multiplier=1.5, columns=None):
    """Return ``df`` without the rows that hold an IQR outlier.

    For each checked column the bounds are
    ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``, computed on the
    full input frame. A row is removed when its value in at least one checked
    column lies outside those bounds. Missing values are not outliers: a row
    is never removed merely because a checked column is NaN, which keeps this
    function in agreement with ``detect_outliers_iqr``.

    Args:
        df: Input DataFrame. It is not modified.
        multiplier: Width of the accepted band in units of the IQR.
        columns: Optional iterable of column names to check. Defaults to every
            numeric column. Restricting it is useful for columns whose IQR is
            zero (flags, constants), where any differing value would otherwise
            count as an outlier.

    Returns:
        A new DataFrame with the original index labels of the retained rows.
    """
    if columns is None:
        checked_cols = df.select_dtypes(include='number').columns
    else:
        checked_cols = list(columns)

    # One boolean mask for the whole frame; the frame is sliced once at the
    # end instead of being re-filtered for every column.
    keep = pd.Series(True, index=df.index)

    for col in checked_cols:
        values = df[col]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - multiplier * IQR
        upper_bound = Q3 + multiplier * IQR

        # between() is inclusive on both ends; NaN fails every comparison, so
        # missing values are let through explicitly.
        keep = keep & (values.between(lower_bound, upper_bound) | values.isna())

    return df[keep]

# Load your dataset
# Read the pipeline output from this run's output folder instead of a
# machine-specific absolute path, so the cell works on any machine.
data_path = os.path.join(output_folder, 'processed_data.csv')
df = pd.read_csv(data_path)

# Remove outliers
df_no_outliers = remove_outliers_iqr(df)

# Save or inspect the cleaned dataset
print(f"Original shape: {df.shape}")
print(f"Shape after outlier removal: {df_no_outliers.shape}")

# Optional: Save to new CSV
# df_no_outliers.to_csv('cleaned_data.csv', index=False)


Original shape: (613387, 29)
Shape after outlier removal: (515298, 29)
