In [0]:
# ============================================================================
# EUROPEAN GRID STRESS PREDICTION - MACHINE LEARNING PIPELINE
# Author: Pedro Miguel
# Description: Complete ML pipeline for predicting grid stress scores and 
#              blackout risk classification across 26 European countries
# ============================================================================

# Install required packages
# xgboost is installed at runtime into the interpreter that is executing this
# notebook (sys.executable) because the model-training cell imports it later.
# check_call raises CalledProcessError if pip exits with a non-zero status.
import sys
import subprocess
subprocess.check_call([sys.executable, "-m", "pip", "install", "xgboost", "--quiet"])

# Import libraries
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
import warnings
# NOTE(review): this silences *every* warning category for the whole session,
# including pandas/sklearn deprecation and numerical (divide-by-zero) warnings.
warnings.filterwarnings('ignore')

# Initialize Spark session
# getOrCreate() reuses the session already attached to the notebook, if any.
spark = SparkSession.builder.getOrCreate()

print("=" * 80)
print("MODULE 1: DATA LOADING & CLEANING PIPELINE")
print("=" * 80)

# ============================================================================
# STEP 1.1: DEFINE COLUMNS TO EXCLUDE
# ============================================================================

# Explicit exclusions: columns flagged as leaky or derived from the target.
EXCLUDE_COLS_EXPLICIT = [
    'reserve_margin_ml',      # ML-derived feature (potential leakage)
    'forecast_load_error',    # Derived from target components
    'load_rel_error',         # Derived from target components
]

# Generation columns found to be 100% NaN during data exploration.
EXCLUDE_COLS_NAN = [
    'Fossil_Peat__Actual_Consumption',
    'Marine__Actual_Aggregated',
    'Energy_storage__Actual_Consumption',
    'Fossil_Brown_coal_Lignite__Actual_Aggregated',
    'Fossil_Coal_derived_gas__Actual_Aggregated',
    'Wind_Offshore__Actual_Aggregated',
    'Nuclear__Actual_Consumption',
    'Fossil_Oil_shale__Actual_Aggregated',
    'Fossil_Peat__Actual_Aggregated',
    'Energy_storage__Actual_Aggregated',
    'Wind_Offshore__Actual_Consumption',
    'Nuclear__Actual_Aggregated',
]

# Full drop list: explicit exclusions first, then the all-NaN columns.
EXCLUDE_COLS_ALL = [*EXCLUDE_COLS_EXPLICIT, *EXCLUDE_COLS_NAN]

print("\n" + "-" * 80, "COLUMNS TO EXCLUDE:", "-" * 80, sep="\n")
print(
    f"  Explicit exclusions: {len(EXCLUDE_COLS_EXPLICIT)}",
    f"  NaN columns (100%):  {len(EXCLUDE_COLS_NAN)}",
    f"  Total to exclude:    {len(EXCLUDE_COLS_ALL)}",
    sep="\n",
)

# ============================================================================
# STEP 1.2: LOAD DATASETS FROM EXISTING SPLITS
# ============================================================================

print("\n" + "-" * 80)
print("LOADING EXISTING DATA SPLITS:")
print("-" * 80)

# Load the pre-split datasets (tables are read from the `default` schema)
train_spark = spark.table("default.train_set")
val_spark = spark.table("default.validation_set")
test_spark = spark.table("default.test_set")

# Convert to Pandas for easier manipulation
# Note: toPandas() collects every row onto the driver - ensure sufficient RAM
print("\n  Converting to Pandas DataFrames...")
train_df = train_spark.toPandas()
val_df = val_spark.toPandas()
test_df = test_spark.toPandas()

# Status lines use real "✓" / "×" glyphs; the previous literals were
# mis-decoded UTF-8 and rendered as garbage characters.
print(f"\n  ✓ Train set loaded:      {train_df.shape[0]:,} rows × {train_df.shape[1]} columns")
print(f"  ✓ Validation set loaded: {val_df.shape[0]:,} rows × {val_df.shape[1]} columns")
print(f"  ✓ Test set loaded:       {test_df.shape[0]:,} rows × {test_df.shape[1]} columns")

# ============================================================================
# STEP 1.3: DATA CLEANING FUNCTION
# ============================================================================

def clean_dataset(df, name="Dataset", exclude_cols=None):
    """
    Drop excluded columns from a dataset and report the NaNs that remain.

    No rows are removed and no values are imputed: the function only drops
    columns and counts the missing values left behind.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe to clean (not modified)
    name : str
        Name of the dataset (for logging purposes)
    exclude_cols : iterable of str, optional
        Column names to drop. Names absent from ``df`` are ignored.
        Defaults to the module-level ``EXCLUDE_COLS_ALL``.

    Returns:
    --------
    pd.DataFrame
        Dataframe without the excluded columns
    dict
        Cleaning report with keys ``initial_shape``, ``final_shape``,
        ``columns_removed``, ``remaining_nan_cols`` and ``nan_details``
        (mapping column name -> NaN count for columns that still have NaN)
    """
    if exclude_cols is None:
        exclude_cols = EXCLUDE_COLS_ALL

    print(f"\n  Processing {name}...")

    # Store initial shape
    initial_shape = df.shape

    # Only drop columns that actually exist; dict.fromkeys de-duplicates the
    # request while preserving order so `columns_removed` is not inflated.
    cols_to_drop = [col for col in dict.fromkeys(exclude_cols) if col in df.columns]
    df_cleaned = df.drop(columns=cols_to_drop)

    # Check for remaining NaN values in non-excluded columns
    nan_counts = df_cleaned.isnull().sum()
    cols_with_nan = nan_counts[nan_counts > 0]

    # Create cleaning report (to_dict() of an empty Series is already {})
    report = {
        'initial_shape': initial_shape,
        'final_shape': df_cleaned.shape,
        'columns_removed': len(cols_to_drop),
        'remaining_nan_cols': len(cols_with_nan),
        'nan_details': cols_with_nan.to_dict(),
    }

    print(f"    Initial: {initial_shape[0]:,} rows × {initial_shape[1]} cols")
    print(f"    Final:   {df_cleaned.shape[0]:,} rows × {df_cleaned.shape[1]} cols")
    print(f"    Removed: {len(cols_to_drop)} columns")

    return df_cleaned, report

# ============================================================================
# STEP 1.4: APPLY CLEANING TO ALL DATASETS
# ============================================================================

print("\n" + "-" * 80)
print("CLEANING DATASETS:")
print("-" * 80)

# Each call returns (cleaned frame, report dict with shapes and NaN counts)
train_clean, train_report = clean_dataset(train_df, "Train Set")
val_clean, val_report = clean_dataset(val_df, "Validation Set")
test_clean, test_report = clean_dataset(test_df, "Test Set")

# ============================================================================
# STEP 1.5: VERIFY DATA INTEGRITY
# ============================================================================

print("\n" + "-" * 80)
print("DATA INTEGRITY CHECK:")
print("-" * 80)

# Check if target variable exists and has no missing values
target_col = 'grid_stress_score'

for name, df in [("Train", train_clean), ("Validation", val_clean), ("Test", test_clean)]:
    if target_col in df.columns:
        missing = df[target_col].isnull().sum()
        print(f"\n  {name} - {target_col}:")
        print(f"    Present: ✓")
        print(f"    Missing: {missing} ({missing/len(df)*100:.2f}%)")
        print(f"    Range:   [{df[target_col].min():.1f}, {df[target_col].max():.1f}]")
    else:
        print(f"\n  {name} - {target_col}: ✗ NOT FOUND")

# Check for any remaining columns with high NaN percentage
print("\n" + "-" * 80)
print("REMAINING NaN ANALYSIS:")
print("-" * 80)

nan_threshold = 50  # Flag columns with >50% NaN
for name, df in [("Train", train_clean), ("Validation", val_clean), ("Test", test_clean)]:
    # Per-column share of missing values, in percent
    nan_pct = (df.isnull().sum() / len(df) * 100)
    high_nan_cols = nan_pct[nan_pct > nan_threshold]

    if len(high_nan_cols) > 0:
        print(f"\n  {name} - Columns with >{nan_threshold}% NaN:")
        for col, pct in high_nan_cols.items():
            print(f"    {col}: {pct:.2f}%")
    else:
        print(f"\n  {name}: ✓ No columns with >{nan_threshold}% NaN")

print("\n" + "=" * 80)
print("MODULE 1 COMPLETE: Data loaded and cleaned successfully!")
print("=" * 80)

In [0]:
# ============================================================================
# MODULE 2: ADVANCED CLEANING - COMPLETE DATA ONLY
# ============================================================================

# Module banner
print(
    "\n" + "=" * 80,
    "MODULE 2: ADVANCED CLEANING - COMPLETE DATA ONLY",
    "=" * 80,
    sep="\n",
)

# ============================================================================
# STEP 2.1: KEEP ONLY COLUMNS WITH ZERO NaN (COMPLETE DATA)
# ============================================================================

# Step banner
print(
    "\n" + "-" * 80,
    "FILTERING FOR COMPLETE COLUMNS (0% NaN):",
    "-" * 80,
    sep="\n",
)

def keep_complete_columns_only(df, name="Dataset"):
    """
    Keep only columns that have absolutely no missing values.
    Protected columns (index, country, target) are always kept, even when
    they contain NaN.

    The kept columns retain the order they have in ``df`` so that repeated
    calls on differently-split frames produce a deterministic, comparable
    column layout (a set-based selection would yield arbitrary order).

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe (not modified)
    name : str
        Dataset name for logging

    Returns:
    --------
    pd.DataFrame
        Dataframe with only complete columns (plus protected columns)
    list
        List of removed column names, in ``df`` column order
    list
        List of kept column names, in ``df`` column order

    Raises:
    -------
    KeyError
        If any protected column is missing from ``df``
    """
    # Protected columns that must be kept regardless
    protected_cols = ['index', 'country', 'grid_stress_score']

    # Fail early with an explicit message instead of an opaque indexing error
    missing_protected = [col for col in protected_cols if col not in df.columns]
    if missing_protected:
        raise KeyError(f"{name}: protected columns missing from dataframe: {missing_protected}")

    # Calculate NaN count for each column
    nan_counts = df.isnull().sum()

    # Identify complete columns (0 NaN)
    complete_cols = nan_counts[nan_counts == 0].index.tolist()

    # Sets give O(1) membership tests; ordering always comes from df.columns
    complete_set = set(complete_cols)
    protected_set = set(protected_cols)

    # Incomplete columns are those that are neither complete nor protected
    incomplete_cols = [
        col for col in df.columns
        if col not in protected_set and col not in complete_set
    ]

    # Keep only complete columns + protected columns, in original order
    cols_to_keep = [
        col for col in df.columns
        if col in complete_set or col in protected_set
    ]
    df_complete = df[cols_to_keep]

    print(f"\n  {name}:")
    print(f"    Initial columns: {len(df.columns)}")
    print(f"    Complete columns: {len(complete_cols)}")
    print(f"    Incomplete columns removed: {len(incomplete_cols)}")
    print(f"    Final columns: {len(df_complete.columns)}")
    print(f"    Shape: {df_complete.shape}")

    return df_complete, incomplete_cols, cols_to_keep

# Apply to all datasets
train_complete, train_removed, train_kept = keep_complete_columns_only(train_clean, "Train")
val_complete, val_removed, val_kept = keep_complete_columns_only(val_clean, "Validation")
test_complete, test_removed, test_kept = keep_complete_columns_only(test_clean, "Test")

# Each split is filtered independently above, so a column that is complete in
# one split but has NaN in another would survive in only some of them and the
# downstream scaler/model would see mismatched feature sets. Keep only the
# columns retained in *all* splits, ordered as in the training frame.
# NOTE(review): this lets validation/test missingness influence which features
# are kept - confirm that is acceptable for this project.
_shared_cols = set(train_kept) & set(val_kept) & set(test_kept)
_aligned_cols = [col for col in train_clean.columns if col in _shared_cols]

if any(len(set(kept)) != len(_aligned_cols) for kept in (train_kept, val_kept, test_kept)):
    print("\n  NOTE: splits disagreed on complete columns; "
          f"aligned all splits to the {len(_aligned_cols)} shared columns")

train_complete = train_complete[_aligned_cols]
val_complete = val_complete[_aligned_cols]
test_complete = test_complete[_aligned_cols]

# Keep the bookkeeping lists consistent with the aligned frames
train_removed = [col for col in train_clean.columns if col not in _shared_cols]
val_removed = [col for col in val_clean.columns if col not in _shared_cols]
test_removed = [col for col in test_clean.columns if col not in _shared_cols]
train_kept = list(_aligned_cols)
val_kept = list(_aligned_cols)
test_kept = list(_aligned_cols)

# ============================================================================
# STEP 2.2: SHOW REMOVED COLUMNS
# ============================================================================

print("\n" + "-" * 80, "COLUMNS REMOVED (had NaN values):", "-" * 80, sep="\n")
# Numbered, alphabetically sorted listing of what the training split lost
for position, column in enumerate(sorted(train_removed), start=1):
    print(f"  {position:2d}. {column}")

# ============================================================================
# STEP 2.3: SHOW KEPT COLUMNS (FEATURES)
# ============================================================================

print("\n" + "-" * 80, "COLUMNS KEPT (complete data):", "-" * 80, sep="\n")

# Everything kept in train that is not metadata or the target is a feature
feature_cols = [
    column for column in train_kept
    if column not in ('index', 'country', 'grid_stress_score')
]

print(
    "\nProtected columns (3):",
    "  - index",
    "  - country",
    "  - grid_stress_score (target)",
    sep="\n",
)

print(f"\nFeature columns ({len(feature_cols)}):")
for position, column in enumerate(sorted(feature_cols), start=1):
    print(f"  {position:2d}. {column}")

# ============================================================================
# STEP 2.4: VERIFY DATA INTEGRITY
# ============================================================================

print("\n" + "-" * 80)
print("DATA INTEGRITY VERIFICATION:")
print("-" * 80)

for name, df in [("Train", train_complete), ("Validation", val_complete), ("Test", test_complete)]:
    total_nans = df.isnull().sum().sum()
    print(f"\n  {name} Set:")
    print(f"    Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
    # The "- 3" discounts the protected columns (index, country, target)
    print(f"    Features: {df.shape[1] - 3}")
    print(f"    Total NaN values: {total_nans}")

    # Protected columns are kept even when incomplete, so NaN can still remain
    if total_nans == 0:
        print(f"    Status: ✓ COMPLETE DATA (0% missing)")
    else:
        print(f"    Status: ✗ WARNING - Still has NaN values!")

# ============================================================================
# STEP 2.5: FINAL DATASET SUMMARY
# ============================================================================

print("\n" + "-" * 80)
print("FINAL CLEANED DATASETS SUMMARY:")
print("-" * 80)

for name, df in [("Train", train_complete), ("Validation", val_complete), ("Test", test_complete)]:
    print(f"\n  {name} Set:")
    print(f"    Rows: {df.shape[0]:,}")
    print(f"    Features: {df.shape[1] - 3} (excluding index, country, target)")
    print(f"    Date range: {df['index'].min()} to {df['index'].max()}")
    print(f"    Countries: {df['country'].nunique()}")

    # Target statistics
    print(f"    Target (grid_stress_score):")
    print(f"      Mean: {df['grid_stress_score'].mean():.2f}")
    print(f"      Std:  {df['grid_stress_score'].std():.2f}")
    print(f"      Range: [{df['grid_stress_score'].min():.0f}, {df['grid_stress_score'].max():.0f}]")

print("\n" + "=" * 80)
print("MODULE 2 COMPLETE: Only complete data retained!")
print("=" * 80)
print("\nNext: Module 3 will prepare features for modeling")
print("=" * 80)

In [0]:
# ============================================================================
# MODULE 2.5: REMOVE TARGET LEAKAGE FEATURES
# ============================================================================

print("\n" + "=" * 80)
print("MODULE 2.5: REMOVING TARGET LEAKAGE FEATURES")
print("=" * 80)

# ============================================================================
# STEP 2.5.1: IDENTIFY AND REMOVE SCORE COMPONENTS
# ============================================================================

print("\n" + "-" * 80)
print("REMOVING SCORE COMPONENTS (TARGET LEAKAGE):")
print("-" * 80)

# These columns are components of grid_stress_score and create data leakage
LEAKAGE_COLS = [
    'score_T7',
    'score_T8',
    'score_load_error',
    'score_reserve_margin'
]

print("\nColumns to remove (components of target variable):")
for i, col in enumerate(LEAKAGE_COLS, 1):
    print(f"  {i}. {col}")

# Remove from all datasets.
# errors='ignore': a leakage column that contained NaN was already dropped by
# the complete-columns filter, and a plain drop() would raise KeyError for it.
train_final = train_complete.drop(columns=LEAKAGE_COLS, errors='ignore')
val_final = val_complete.drop(columns=LEAKAGE_COLS, errors='ignore')
test_final = test_complete.drop(columns=LEAKAGE_COLS, errors='ignore')

print("\n" + "-" * 80)
print("RESULTS:")
print("-" * 80)

for name, df_before, df_after in [
    ("Train", train_complete, train_final),
    ("Validation", val_complete, val_final),
    ("Test", test_complete, test_final)
]:
    print(f"\n  {name} Set:")
    print(f"    Before: {df_before.shape[0]:,} rows × {df_before.shape[1]} columns")
    print(f"    After:  {df_after.shape[0]:,} rows × {df_after.shape[1]} columns")
    # The "- 3" discounts index, country and the target
    print(f"    Features: {df_after.shape[1] - 3} (excluding index, country, target)")

# ============================================================================
# STEP 2.5.2: FINAL FEATURE LIST
# ============================================================================

print("\n" + "-" * 80, "FINAL FEATURE SET (NO LEAKAGE):", "-" * 80, sep="\n")

# Every remaining column except metadata and the target is a model feature
final_features = [
    column for column in train_final.columns
    if column not in ('index', 'country', 'grid_stress_score')
]

print(f"\nTotal features: {len(final_features)}")
print("\nFeature list:")
for position, column in enumerate(sorted(final_features), start=1):
    print(f"  {position:2d}. {column}")

# Group features by case-sensitive substring match on the column name;
# a feature may land in several groups or in none.
load_features = [column for column in final_features if 'Load' in column]
forecast_features = [column for column in final_features if 'forecast' in column]
weather_features = [
    column for column in final_features
    if any(tag in column for tag in ('mean_', 'ssrd'))
]
network_features = [
    column for column in final_features
    if any(tag in column for tag in ('import', 'export', 'P10', 'P90'))
]

print("\n" + "-" * 80, "FEATURE CATEGORIES:", "-" * 80, sep="\n")

# Heading text carries its own padding (and leading blank line) so the
# report layout stays exactly as before.
for heading, members in (
    ("  Load-related:     ", load_features),
    ("\n  Forecast-related: ", forecast_features),
    ("\n  Weather-related:  ", weather_features),
    ("\n  Network-related:  ", network_features),
):
    print(f"{heading}{len(members)} features")
    for member in members:
        print(f"    - {member}")

# ============================================================================
# STEP 2.5.3: VERIFY NO LEAKAGE & DATA QUALITY
# ============================================================================

print("\n" + "-" * 80)
print("DATA QUALITY VERIFICATION:")
print("-" * 80)

for name, df in [("Train", train_final), ("Validation", val_final), ("Test", test_final)]:
    print(f"\n  {name} Set:")
    print(f"    ✓ Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
    # The "- 3" discounts index, country and the target
    print(f"    ✓ Features: {df.shape[1] - 3}")
    print(f"    ✓ NaN values: {df.isnull().sum().sum()}")
    print(f"    ✓ Target present: {'grid_stress_score' in df.columns}")
    # True only when none of the score-component columns survived
    print(f"    ✓ No leakage: {all(col not in df.columns for col in LEAKAGE_COLS)}")

# This cell is Module 2.5; the banner previously repeated "MODULE 2 COMPLETE"
print("\n" + "=" * 80)
print("MODULE 2.5 COMPLETE: Clean data with NO leakage!")
print("=" * 80)
print("\nReady for Module 3: Feature Engineering & Preprocessing")
print("=" * 80)

In [0]:
# ============================================================================
# MODULE 3: FEATURE ENGINEERING & PREPROCESSING PIPELINE (FIXED)
# ============================================================================

print("\n" + "=" * 80)
print("MODULE 3: FEATURE ENGINEERING & PREPROCESSING PIPELINE")
print("=" * 80)

# StandardScaler and LabelEncoder are used later in this cell.
# NOTE(review): Pipeline and pickle are not referenced in the part of the
# notebook visible here - presumably used by a later cell; confirm.
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
import pickle

# ============================================================================
# STEP 3.1: SEPARATE FEATURES AND TARGET
# ============================================================================

print("\n" + "-" * 80)
print("SEPARATING FEATURES AND TARGET:")
print("-" * 80)

def separate_features_target(df, name="Dataset"):
    """
    Split a prepared dataset into model inputs, target and metadata.

    Parameters:
    -----------
    df : pd.DataFrame
        Frame containing 'index', 'country', 'grid_stress_score' and the
        feature columns
    name : str
        Label used in the printed shape summary

    Returns:
    --------
    tuple: (X, y, metadata)
        X: copy of every column other than 'index', 'country' and
           'grid_stress_score' (no dtype filtering is applied)
        y: copy of the 'grid_stress_score' column
        metadata: copy of the 'index' and 'country' columns
    """
    meta_names = ['index', 'country']
    target_name = 'grid_stress_score'
    reserved = set(meta_names) | {target_name}

    # Independent copies so later feature engineering cannot touch `df`
    metadata = df.loc[:, meta_names].copy()
    y = df[target_name].copy()
    X = df.loc[:, [column for column in df.columns if column not in reserved]].copy()

    print(f"\n  {name}:")
    for label, part in (
        ("Features (X):", X),
        ("Target (y):  ", y),
        ("Metadata:    ", metadata),
    ):
        print(f"    {label} {part.shape}")

    return X, y, metadata

# Separate all datasets (train, then validation, then test)
(
    (X_train, y_train, meta_train),
    (X_val, y_val, meta_val),
    (X_test, y_test, meta_test),
) = (
    separate_features_target(frame, label)
    for frame, label in (
        (train_final, "Train"),
        (val_final, "Validation"),
        (test_final, "Test"),
    )
)

# ============================================================================
# STEP 3.2: CHECK FOR PROBLEMATIC VALUES BEFORE ENGINEERING
# ============================================================================

print("\n" + "-" * 80, "CHECKING FOR PROBLEMATIC VALUES:", "-" * 80, sep="\n")

def check_data_issues(X, name="Dataset", denominator_cols=('Forecasted_Load', 'Actual_Load')):
    """
    Report infinite values and zero-valued denominator columns.

    Parameters:
    -----------
    X : pd.DataFrame
        Feature dataframe to inspect (not modified)
    name : str
        Dataset name for logging
    denominator_cols : iterable of str
        Columns whose zeros are counted because they are later used as
        divisors. Columns absent from ``X`` are reported and skipped.

    Returns:
    --------
    list
        Names of numeric columns containing +/-inf, in ``X`` column order
    dict
        Mapping denominator column -> number of rows equal to zero
    """
    print(f"\n  {name}:")

    # np.isinf is undefined for object/string columns, so only numeric
    # columns are inspected; nullable NA values are mapped to NaN first.
    numeric = X.select_dtypes(include=[np.number])
    has_inf = np.isinf(numeric.to_numpy(dtype=float, na_value=np.nan)).any(axis=0)
    inf_cols = numeric.columns[has_inf].tolist()
    if inf_cols:
        print(f"    ✗ Infinity values in: {inf_cols}")
    else:
        print(f"    ✓ No infinity values")

    # Check for zeros in denominator columns
    zero_checks = {}
    for col in denominator_cols:
        if col not in X.columns:
            print(f"    ⚠ {col} not present (zero check skipped)")
            continue

        zero_count = (X[col] == 0).sum()
        zero_checks[col] = zero_count
        if zero_count > 0:
            print(f"    ⚠ {col} has {zero_count} zero values (potential division issue)")
        else:
            print(f"    ✓ {col} has no zeros")

    return inf_cols, zero_checks

# Run the inf/zero diagnostics on each split (train, validation, test)
(
    (inf_train, zeros_train),
    (inf_val, zeros_val),
    (inf_test, zeros_test),
) = (
    check_data_issues(features, label)
    for features, label in (
        (X_train, "Train"),
        (X_val, "Validation"),
        (X_test, "Test"),
    )
)

# ============================================================================
# STEP 3.3: CREATE DERIVED FEATURES (WITH SAFETY CHECKS)
# ============================================================================

print("\n" + "-" * 80, "ENGINEERING DERIVED FEATURES (WITH SAFETY):", "-" * 80, sep="\n")

def engineer_features_safe(X, meta, name="Dataset"):
    """
    Create derived features with safety checks to avoid inf/nan.

    Adds 15 columns to a copy of ``X``: load forecast error (absolute and
    percent), net-position spread/average, a trade balance flag, renewable
    forecast total and its ratio to load, calendar fields (hour,
    day_of_week, month, is_weekend) and sin/cos encodings of hour and month.

    Parameters:
    -----------
    X : pd.DataFrame
        Feature dataframe (not modified). Must contain 'Actual_Load',
        'Forecasted_Load', 'P90_net', 'P10_net', 'T7_high_exports',
        'T8_high_imports', 'solar_forecast' and 'wind_forecast'.
    meta : pd.DataFrame
        Metadata whose 'index' column is parseable by ``pd.to_datetime``.
        Calendar features are assigned by label, so ``meta`` must share
        ``X``'s row index.
    name : str
        Dataset name (for logging and error messages)

    Returns:
    --------
    pd.DataFrame
        Enhanced feature dataframe with new features

    Raises:
    -------
    KeyError
        If a required column is missing from ``X`` or 'index' from ``meta``
    """
    # NOTE(review): 'load_forecast_error' below is computed from the same
    # inputs as 'forecast_load_error', which Module 1 excludes as "derived
    # from target components" - confirm this does not reintroduce leakage.
    required_cols = [
        'Actual_Load', 'Forecasted_Load', 'P90_net', 'P10_net',
        'T7_high_exports', 'T8_high_imports', 'solar_forecast', 'wind_forecast',
    ]
    missing_cols = [col for col in required_cols if col not in X.columns]
    if missing_cols:
        raise KeyError(f"{name}: cannot engineer features, missing columns: {missing_cols}")
    if 'index' not in meta.columns:
        raise KeyError(f"{name}: metadata has no 'index' column for temporal features")

    X_eng = X.copy()

    print(f"\n  {name}: Creating derived features...")

    # Safety constant to avoid division by zero
    EPSILON = 1e-6
    # Shared strictly-positive denominator for both ratio features
    safe_forecast_load = X_eng['Forecasted_Load'].abs() + EPSILON

    # 1. Load forecast error (actual vs forecasted)
    X_eng['load_forecast_error'] = X_eng['Actual_Load'] - X_eng['Forecasted_Load']

    # Percent error; clipped because a near-zero forecast explodes the ratio
    X_eng['load_forecast_error_pct'] = (
        X_eng['load_forecast_error'] / safe_forecast_load * 100
    ).clip(-1000, 1000)

    # 2. Net position metrics (P10 and P90 spread)
    X_eng['net_position_spread'] = X_eng['P90_net'] - X_eng['P10_net']
    X_eng['net_position_avg'] = (X_eng['P90_net'] + X_eng['P10_net']) / 2

    # 3. Import/Export balance: +1 exports flag only, -1 imports flag only,
    #    0 when both or neither flag is set
    X_eng['trade_balance_flag'] = (X_eng['T7_high_exports'].astype(int) -
                                   X_eng['T8_high_imports'].astype(int))

    # 4. Renewable energy potential (solar + wind forecasts)
    X_eng['renewable_forecast_total'] = X_eng['solar_forecast'] + X_eng['wind_forecast']

    # Ratio to forecasted load, bounded to [0, 10]
    X_eng['renewable_to_load_ratio'] = (
        X_eng['renewable_forecast_total'] / safe_forecast_load
    ).clip(0, 10)

    # 5. Temporal features from datetime index
    datetime_series = pd.to_datetime(meta['index'])
    X_eng['hour'] = datetime_series.dt.hour
    X_eng['day_of_week'] = datetime_series.dt.dayofweek
    X_eng['month'] = datetime_series.dt.month
    X_eng['is_weekend'] = (datetime_series.dt.dayofweek >= 5).astype(int)

    # 6. Cyclical encoding for hour (24-hour cycle)
    X_eng['hour_sin'] = np.sin(2 * np.pi * X_eng['hour'] / 24)
    X_eng['hour_cos'] = np.cos(2 * np.pi * X_eng['hour'] / 24)

    # 7. Cyclical encoding for month (12-month cycle)
    X_eng['month_sin'] = np.sin(2 * np.pi * X_eng['month'] / 12)
    X_eng['month_cos'] = np.cos(2 * np.pi * X_eng['month'] / 12)

    new_features = X_eng.shape[1] - X.shape[1]
    print(f"    Original features: {X.shape[1]}")
    print(f"    New features added: {new_features}")
    print(f"    Total features: {X_eng.shape[1]}")

    # Check for inf/nan after engineering. np.isinf is undefined for
    # object/string columns, so the inf check covers numeric columns only.
    numeric_values = X_eng.select_dtypes(include=[np.number]).to_numpy(
        dtype=float, na_value=np.nan
    )
    inf_count = int(np.isinf(numeric_values).sum())
    nan_count = int(X_eng.isna().sum().sum())

    if inf_count > 0:
        print(f"    ✗ WARNING: {inf_count} infinity values detected!")
    else:
        print(f"    ✓ No infinity values")

    if nan_count > 0:
        print(f"    ✗ WARNING: {nan_count} NaN values detected!")
    else:
        print(f"    ✓ No NaN values")

    return X_eng

# Apply feature engineering to all datasets (train, validation, test)
X_train_eng, X_val_eng, X_test_eng = (
    engineer_features_safe(features, metadata, label)
    for features, metadata, label in (
        (X_train, meta_train, "Train"),
        (X_val, meta_val, "Validation"),
        (X_test, meta_test, "Test"),
    )
)

# ============================================================================
# STEP 3.4: ENCODE COUNTRY VARIABLE
# ============================================================================

print("\n" + "-" * 80, "ENCODING COUNTRY VARIABLE:", "-" * 80, sep="\n")

# Learn the country -> integer mapping from the training split only
country_encoder = LabelEncoder().fit(meta_train['country'])

# Apply the fitted mapping to every split.
# NOTE(review): LabelEncoder.transform raises ValueError for a country that
# did not appear in the training split.
for _features, _metadata in (
    (X_train_eng, meta_train),
    (X_val_eng, meta_val),
    (X_test_eng, meta_test),
):
    _features['country_encoded'] = country_encoder.transform(_metadata['country'])

print(f"\n  Countries encoded: {len(country_encoder.classes_)}")
print(f"  Sample countries: {list(country_encoder.classes_[:5])}...")

# ============================================================================
# STEP 3.5: FINAL CHECK BEFORE SCALING
# ============================================================================

print("\n" + "-" * 80)
print("FINAL CHECK BEFORE SCALING:")
print("-" * 80)

# Report-only check: a failing split is flagged but execution continues
for name, X in [("Train", X_train_eng), ("Validation", X_val_eng), ("Test", X_test_eng)]:
    inf_count = np.isinf(X).sum().sum()
    nan_count = X.isna().sum().sum()
    print(f"\n  {name}:")
    print(f"    Shape: {X.shape}")
    print(f"    Infinity values: {inf_count}")
    print(f"    NaN values: {nan_count}")
    if inf_count == 0 and nan_count == 0:
        print(f"    Status: ✓ READY FOR SCALING")
    else:
        print(f"    Status: ✗ NEEDS CLEANING")

# ============================================================================
# STEP 3.6: FEATURE SCALING
# ============================================================================

print("\n" + "-" * 80)
print("SCALING FEATURES:")
print("-" * 80)

# Initialize scaler
scaler = StandardScaler()

# The splits were column-filtered independently upstream, so their column
# order is not guaranteed to match. Validate that the feature sets agree and
# put validation/test into the training column order before transforming.
_train_columns = list(X_train_eng.columns)
for _split_name, _split_X in (("Validation", X_val_eng), ("Test", X_test_eng)):
    _mismatch = set(_split_X.columns) ^ set(_train_columns)
    if _mismatch:
        raise ValueError(
            f"{_split_name} features differ from train features: "
            f"{sorted(map(str, _mismatch))}"
        )
X_val_eng = X_val_eng[_train_columns]
X_test_eng = X_test_eng[_train_columns]

# Fit on training data only (avoid data leakage)
X_train_scaled = scaler.fit_transform(X_train_eng)
X_val_scaled = scaler.transform(X_val_eng)
X_test_scaled = scaler.transform(X_test_eng)

# Convert back to DataFrame with column names (the scaler returns ndarrays)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train_eng.columns, index=X_train_eng.index)
X_val_scaled = pd.DataFrame(X_val_scaled, columns=X_val_eng.columns, index=X_val_eng.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test_eng.columns, index=X_test_eng.index)

print(f"\n  ✓ Scaler fitted on train set")
print(f"  ✓ Train scaled: {X_train_scaled.shape}")
print(f"  ✓ Validation scaled: {X_val_scaled.shape}")
print(f"  ✓ Test scaled: {X_test_scaled.shape}")

print("\n" + "=" * 80)
print("MODULE 3 COMPLETE: Features engineered and preprocessed safely!")
print("=" * 80)
print("\nReady for Module 4: Model Training")
print("=" * 80)

In [0]:
# ============================================================================
# MODULE 4: MODEL TRAINING PIPELINE (REGRESSION + CLASSIFICATION)
# ============================================================================

print("\n" + "=" * 80)
print("MODULE 4: MODEL TRAINING PIPELINE")
print("=" * 80)

# Estimators and metrics for both tasks: regression on grid_stress_score and
# binary classification of blackout risk. xgboost is the package installed
# via pip in the first cell.
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.metrics import (mean_squared_error, mean_absolute_error, r2_score,
                             accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, classification_report)
import time

# ============================================================================
# STEP 4.1: CREATE CLASSIFICATION TARGET (BLACKOUT RISK)
# ============================================================================

print("\n" + "-" * 80)
print("CREATING CLASSIFICATION TARGET (BLACKOUT RISK):")
print("-" * 80)

def create_blackout_risk_target(y, threshold=50):
    """
    Derive the binary blackout-risk label from grid stress scores.

    A score at or above ``threshold`` is labelled high risk (1); a score
    below it is labelled low risk (0).

    Parameters:
    -----------
    y : pd.Series
        Grid stress score
    threshold : float
        Cut-off at or above which a score counts as high risk

    Returns:
    --------
    pd.Series
        Integer 0/1 labels, one per element of ``y``
    """
    # Boolean mask first, then cast so downstream code receives ints
    is_high_risk = y >= threshold
    return is_high_risk.astype(int)

# Label every split with the same 50-point cut-off
y_train_class = create_blackout_risk_target(y_train, threshold=50)
y_val_class = create_blackout_risk_target(y_val, threshold=50)
y_test_class = create_blackout_risk_target(y_test, threshold=50)

print(f"\n  Classification threshold: grid_stress_score >= 50")

# Report the class balance of each split (count and share of rows)
for split_title, split_labels in (
    ("Train", y_train_class),
    ("Validation", y_val_class),
    ("Test", y_test_class),
):
    n_low = (split_labels == 0).sum()
    n_high = (split_labels == 1).sum()
    n_rows = len(split_labels)
    print(f"\n  {split_title} set class distribution:")
    print(f"    Low Risk (0):  {n_low:,} ({n_low/n_rows*100:.2f}%)")
    print(f"    High Risk (1): {n_high:,} ({n_high/n_rows*100:.2f}%)")

# ============================================================================
# STEP 4.2: DEFINE MODEL CONFIGURATIONS
# ============================================================================

print(f"\n{'-' * 80}")
print("MODEL CONFIGURATIONS:")
print('-' * 80)

# Hyperparameters shared by the regressor and classifier of each family
forest_params = dict(n_estimators=100, max_depth=20, min_samples_split=10,
                     random_state=42, n_jobs=-1)
boosting_params = dict(n_estimators=100, max_depth=5, learning_rate=0.1,
                       random_state=42)
xgb_params = dict(n_estimators=100, max_depth=6, learning_rate=0.1,
                  random_state=42, n_jobs=-1)

# Regression candidates (predict the continuous grid stress score)
regression_models = {
    'Ridge': Ridge(alpha=1.0, random_state=42),
    'Random Forest': RandomForestRegressor(**forest_params),
    'Gradient Boosting': GradientBoostingRegressor(**boosting_params),
    'XGBoost': XGBRegressor(**xgb_params),
}

# Classification candidates (predict the binary blackout-risk label)
classification_models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1),
    'Random Forest': RandomForestClassifier(**forest_params),
    'Gradient Boosting': GradientBoostingClassifier(**boosting_params),
    'XGBoost': XGBClassifier(**xgb_params, eval_metric='logloss'),
}

# List the configured candidates per task
for family_label, family_models in (
    ("Regression", regression_models),
    ("Classification", classification_models),
):
    print(f"\n  {family_label} models: {len(family_models)}")
    for model_name in family_models:
        print(f"    - {model_name}")

# ============================================================================
# STEP 4.3: TRAIN REGRESSION MODELS
# ============================================================================

print(f"\n{'=' * 80}")
print("TRAINING REGRESSION MODELS (GRID STRESS SCORE PREDICTION)")
print('=' * 80)


def _regression_scores(y_true, y_pred):
    """Return R2, RMSE and MAE of ``y_pred`` against ``y_true`` as a dict."""
    return {
        'r2': r2_score(y_true, y_pred),
        'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
        'mae': mean_absolute_error(y_true, y_pred),
    }


# model name -> fitted estimator, fit time and per-split metrics
regression_results = {}

for model_name, model in regression_models.items():
    dash_rule = '-' * 80
    print(f"\n{dash_rule}")
    print(f"Training: {model_name}")
    print(dash_rule)

    # Only the fit call is timed
    start_time = time.time()
    model.fit(X_train_scaled, y_train)
    train_time = time.time() - start_time

    # Score the fitted model on every split
    y_train_pred = model.predict(X_train_scaled)
    y_val_pred = model.predict(X_val_scaled)
    y_test_pred = model.predict(X_test_scaled)

    results = {
        'model': model,
        'train_time': train_time,
        'train_metrics': _regression_scores(y_train, y_train_pred),
        'val_metrics': _regression_scores(y_val, y_val_pred),
        'test_metrics': _regression_scores(y_test, y_test_pred),
    }
    regression_results[model_name] = results

    # Per-split report
    print(f"\n  Training time: {train_time:.2f} seconds")
    for split_title, metrics_key in (
        ("Train Set", 'train_metrics'),
        ("Validation Set", 'val_metrics'),
        ("Test Set", 'test_metrics'),
    ):
        split_scores = results[metrics_key]
        print(f"\n  {split_title}:")
        print(f"    RÂ²:   {split_scores['r2']:.6f}")
        print(f"    RMSE: {split_scores['rmse']:.4f}")
        print(f"    MAE:  {split_scores['mae']:.4f}")

# ============================================================================
# STEP 4.4: REGRESSION MODELS COMPARISON
# ============================================================================

heavy_rule = '=' * 80
print(f"\n{heavy_rule}")
print("REGRESSION MODELS COMPARISON")
print(heavy_rule)

# One summary row per trained regressor
comparison_data = [
    {
        'Model': name,
        'Train RÂ²': fitted['train_metrics']['r2'],
        'Val RÂ²': fitted['val_metrics']['r2'],
        'Test RÂ²': fitted['test_metrics']['r2'],
        'Val RMSE': fitted['val_metrics']['rmse'],
        'Test RMSE': fitted['test_metrics']['rmse'],
        'Train Time (s)': fitted['train_time'],
    }
    for name, fitted in regression_results.items()
]

# Rank on the validation score so the top row is the selected model
comparison_df = pd.DataFrame(comparison_data).sort_values('Val RÂ²', ascending=False)

print("\n")
print(comparison_df.to_string(index=False))

# The first row after sorting is the best model
best_regression_model = comparison_df['Model'].iloc[0]
print(f"\n{heavy_rule}")
print(f"BEST REGRESSION MODEL: {best_regression_model}")
print(heavy_rule)

print("\n" + heavy_rule)
print("MODULE 4A COMPLETE: Regression models trained!")
print(heavy_rule)

In [0]:
# ============================================================================
# MODULE 4B: CLASSIFICATION MODEL TRAINING (BLACKOUT RISK)
# ============================================================================

print("\n" + "=" * 80)
print("MODULE 4B: CLASSIFICATION MODEL TRAINING (BLACKOUT RISK)")
print("=" * 80)

# Both risk classes in fixed order (0 = low risk, 1 = high risk). Passing
# them explicitly to confusion_matrix guarantees a 2x2 result even when a
# split (or a model's predictions) contains only one class; without it the
# matrix collapses to 1x1 and the TN/FP/FN/TP lookups raise IndexError.
_RISK_LABELS = [0, 1]


def _classification_scores(y_true, y_pred):
    """Return accuracy, precision, recall and F1 for one split as a dict.

    ``zero_division=0`` makes undefined precision/recall/F1 evaluate to 0
    instead of emitting a warning.
    """
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1': f1_score(y_true, y_pred, zero_division=0),
    }


# model name -> fitted estimator, fit time, per-split metrics and matrices
classification_results = {}

for model_name, model in classification_models.items():
    print(f"\n{'-'*80}")
    print(f"Training: {model_name}")
    print(f"{'-'*80}")

    # Train (only the fit call is timed)
    start_time = time.time()
    model.fit(X_train_scaled, y_train_class)
    train_time = time.time() - start_time

    print(f"  âœ“ Training completed in {train_time:.2f} seconds")

    # Predict on all sets
    print(f"  Making predictions...")
    y_train_pred_class = model.predict(X_train_scaled)
    y_val_pred_class = model.predict(X_val_scaled)
    y_test_pred_class = model.predict(X_test_scaled)

    # Calculate metrics
    results = {
        'model': model,
        'train_time': train_time,
        'train_metrics': _classification_scores(y_train_class, y_train_pred_class),
        'val_metrics': _classification_scores(y_val_class, y_val_pred_class),
        'test_metrics': _classification_scores(y_test_class, y_test_pred_class),
        'confusion_matrix': {
            'train': confusion_matrix(y_train_class, y_train_pred_class, labels=_RISK_LABELS),
            'val': confusion_matrix(y_val_class, y_val_pred_class, labels=_RISK_LABELS),
            'test': confusion_matrix(y_test_class, y_test_pred_class, labels=_RISK_LABELS)
        }
    }

    classification_results[model_name] = results

    # Display results for every split
    for split_title, metrics_key in (
        ("Train Set", 'train_metrics'),
        ("Validation Set", 'val_metrics'),
        ("Test Set", 'test_metrics'),
    ):
        split_scores = results[metrics_key]
        print(f"\n  {split_title}:")
        print(f"    Accuracy:  {split_scores['accuracy']:.6f}")
        print(f"    Precision: {split_scores['precision']:.6f}")
        print(f"    Recall:    {split_scores['recall']:.6f}")
        print(f"    F1-Score:  {split_scores['f1']:.6f}")

    # Show confusion matrix for test set (rows = actual, columns = predicted)
    print(f"\n  Test Confusion Matrix:")
    cm = results['confusion_matrix']['test']
    true_neg, false_pos, false_neg, true_pos = cm.ravel()
    print(f"    True Negatives:  {true_neg:,}")
    print(f"    False Positives: {false_pos:,}")
    print(f"    False Negatives: {false_neg:,}")
    print(f"    True Positives:  {true_pos:,}")

# ============================================================================
# CLASSIFICATION MODELS COMPARISON
# ============================================================================

print(f"\n{'='*80}")
print("CLASSIFICATION MODELS COMPARISON")
print(f"{'='*80}")

# One summary row per trained classifier. 'Val F1' is included because it is
# the ranking criterion below.
comparison_class_data = []
for model_name, results in classification_results.items():
    comparison_class_data.append({
        'Model': model_name,
        'Train Acc': results['train_metrics']['accuracy'],
        'Val Acc': results['val_metrics']['accuracy'],
        'Val F1': results['val_metrics']['f1'],
        'Test Acc': results['test_metrics']['accuracy'],
        'Test Precision': results['test_metrics']['precision'],
        'Test Recall': results['test_metrics']['recall'],
        'Test F1': results['test_metrics']['f1'],
        'Train Time (s)': results['train_time']
    })

comparison_class_df = pd.DataFrame(comparison_class_data)
# Rank on the validation score, as the regression comparison does. Ranking on
# a test metric would select the model on the held-out set and make the
# reported test scores optimistically biased.
comparison_class_df = comparison_class_df.sort_values('Val F1', ascending=False)

print("\n")
print(comparison_class_df.to_string(index=False))

# Identify best model (first row after sorting by validation F1)
best_classification_model = comparison_class_df.iloc[0]['Model']
print(f"\n{'='*80}")
print(f"BEST CLASSIFICATION MODEL: {best_classification_model}")
print(f"{'='*80}")

print("\n" + "=" * 80)
print("MODULE 4B COMPLETE: Classification models trained!")
print("=" * 80)
print("\nNext: Module 5 will create visualizations and final model selection")
print("=" * 80)

In [0]:
# ============================================================================
# MODULE 5: MODEL EVALUATION & VISUALIZATION
# ============================================================================

module5_rule = "=" * 80
print(f"\n{module5_rule}")
print("MODULE 5: MODEL EVALUATION & VISUALIZATION")
print(module5_rule)

# Plotting stack used by the evaluation figures below
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Global plot defaults: seaborn grid theme and a 12x8 default figure size
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

# ============================================================================
# STEP 5.1: REGRESSION - ACTUAL VS PREDICTED PLOTS
# ============================================================================

print("\n" + "-" * 80)
print("CREATING REGRESSION VISUALIZATIONS:")
print("-" * 80)

# NOTE(review): the plotted model is fixed to Random Forest rather than taken
# from the validation ranking (best_regression_model) — confirm this is the
# intended choice.
plotted_reg_name = 'Random Forest'
best_reg_model = regression_results[plotted_reg_name]['model']
y_test_pred_reg = best_reg_model.predict(X_test_scaled)

# 2x2 grid: actual-vs-predicted, residual scatter, residual histogram and a
# cross-model R2 comparison
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle(f'Regression Model Evaluation - {plotted_reg_name}', fontsize=16, fontweight='bold')

# 1. Actual vs Predicted scatter plot
ax1 = axes[0, 0]
ax1.scatter(y_test, y_test_pred_reg, alpha=0.3, s=10)
# Span the identity line over the observed value range instead of a fixed
# [0, 75] segment, so it stays complete if scores fall outside that range.
diag_lo = min(np.min(y_test), np.min(y_test_pred_reg))
diag_hi = max(np.max(y_test), np.max(y_test_pred_reg))
ax1.plot([diag_lo, diag_hi], [diag_lo, diag_hi], 'r--', lw=2, label='Perfect Prediction')
ax1.set_xlabel('Actual Grid Stress Score', fontsize=12)
ax1.set_ylabel('Predicted Grid Stress Score', fontsize=12)
ax1.set_title('Actual vs Predicted (Test Set)', fontsize=14, fontweight='bold')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Annotate the panel with the stored test R2 of the plotted model
r2_test = regression_results[plotted_reg_name]['test_metrics']['r2']
ax1.text(0.05, 0.95, f'RÂ² = {r2_test:.4f}', transform=ax1.transAxes,
         fontsize=12, verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

# 2. Residuals plot (residual = actual - predicted)
residuals = y_test - y_test_pred_reg
ax2 = axes[0, 1]
ax2.scatter(y_test_pred_reg, residuals, alpha=0.3, s=10)
ax2.axhline(y=0, color='r', linestyle='--', lw=2)
ax2.set_xlabel('Predicted Grid Stress Score', fontsize=12)
ax2.set_ylabel('Residuals', fontsize=12)
ax2.set_title('Residuals Plot', fontsize=14, fontweight='bold')
ax2.grid(True, alpha=0.3)

# 3. Residuals distribution
ax3 = axes[1, 0]
ax3.hist(residuals, bins=50, edgecolor='black', alpha=0.7)
ax3.axvline(x=0, color='r', linestyle='--', lw=2)
ax3.set_xlabel('Residuals', fontsize=12)
ax3.set_ylabel('Frequency', fontsize=12)
ax3.set_title('Residuals Distribution', fontsize=14, fontweight='bold')
ax3.grid(True, alpha=0.3)

# Add mean/std of the residuals as an in-plot annotation
mean_res = residuals.mean()
std_res = residuals.std()
ax3.text(0.05, 0.95, f'Mean: {mean_res:.4f}\nStd: {std_res:.4f}',
         transform=ax3.transAxes, fontsize=12, verticalalignment='top',
         bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

# 4. Model comparison (test R2 of every trained regressor)
ax4 = axes[1, 1]
models = list(regression_results.keys())
test_r2 = [regression_results[m]['test_metrics']['r2'] for m in models]
# Compute the maximum once; the highest-scoring bar(s) are drawn in green
best_test_r2 = max(test_r2)
colors = ['green' if r2 == best_test_r2 else 'skyblue' for r2 in test_r2]

bars = ax4.barh(models, test_r2, color=colors, edgecolor='black')
ax4.set_xlabel('RÂ² Score', fontsize=12)
ax4.set_title('Model Comparison (Test RÂ²)', fontsize=14, fontweight='bold')
ax4.set_xlim([0, 1])
ax4.grid(True, alpha=0.3, axis='x')

# Add values just right of each bar; bar i sits at y position i
for i, r2 in enumerate(test_r2):
    ax4.text(r2 + 0.01, i, f'{r2:.4f}', va='center', fontsize=10)

plt.tight_layout()
plt.show()

print("  âœ“ Regression visualizations created")

# ============================================================================
# STEP 5.2: CLASSIFICATION - CONFUSION MATRIX & METRICS
# ============================================================================

print("\n" + "-" * 80)
print("CREATING CLASSIFICATION VISUALIZATIONS:")
print("-" * 80)

# NOTE(review): the plotted model is fixed to Random Forest rather than taken
# from the model ranking (best_classification_model) — confirm this is the
# intended choice.
plotted_class_name = 'Random Forest'
best_class_model = classification_results[plotted_class_name]['model']
y_test_pred_class = best_class_model.predict(X_test_scaled)

# 2x2 grid: confusion matrix, grouped metric bars, class counts, F1 ranking
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle(f'Classification Model Evaluation - {plotted_class_name}', fontsize=16, fontweight='bold')

# 1. Confusion Matrix
ax1 = axes[0, 0]
# Pin both classes so the matrix is always 2x2 and matches the two tick
# labels, even if the test split or the predictions contain a single class.
cm = confusion_matrix(y_test_class, y_test_pred_class, labels=[0, 1])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax1, cbar=False,
            xticklabels=['Low Risk', 'High Risk'],
            yticklabels=['Low Risk', 'High Risk'])
ax1.set_xlabel('Predicted', fontsize=12)
ax1.set_ylabel('Actual', fontsize=12)
ax1.set_title('Confusion Matrix (Test Set)', fontsize=14, fontweight='bold')

# 2. Classification metrics comparison (four bars per model)
ax2 = axes[0, 1]
models_class = list(classification_results.keys())
test_f1 = [classification_results[m]['test_metrics']['f1'] for m in models_class]
test_acc = [classification_results[m]['test_metrics']['accuracy'] for m in models_class]
test_prec = [classification_results[m]['test_metrics']['precision'] for m in models_class]
test_rec = [classification_results[m]['test_metrics']['recall'] for m in models_class]

x = np.arange(len(models_class))
metric_bar_width = 0.2

# Offsets -1.5w, -0.5w, +0.5w, +1.5w centre the four bars on each tick
for slot, (metric_label, metric_values) in enumerate((
    ('Accuracy', test_acc),
    ('Precision', test_prec),
    ('Recall', test_rec),
    ('F1-Score', test_f1),
)):
    ax2.bar(x + (slot - 1.5) * metric_bar_width, metric_values, metric_bar_width,
            label=metric_label, alpha=0.8)

ax2.set_xlabel('Models', fontsize=12)
ax2.set_ylabel('Score', fontsize=12)
ax2.set_title('Classification Metrics Comparison', fontsize=14, fontweight='bold')
ax2.set_xticks(x)
ax2.set_xticklabels(models_class, rotation=45, ha='right')
ax2.legend()
ax2.set_ylim([0, 1.1])
ax2.grid(True, alpha=0.3, axis='y')

# 3. Class distribution in predictions
ax3 = axes[1, 0]
# Vectorized counts: the builtin sum() would iterate the labels one by one
actual_counts = [int((y_test_class == label).sum()) for label in (0, 1)]
pred_counts = [int((y_test_pred_class == label).sum()) for label in (0, 1)]

x_pos = np.arange(2)
count_bar_width = 0.35

bars1 = ax3.bar(x_pos - count_bar_width/2, actual_counts, count_bar_width, label='Actual', alpha=0.8)
bars2 = ax3.bar(x_pos + count_bar_width/2, pred_counts, count_bar_width, label='Predicted', alpha=0.8)

ax3.set_xlabel('Class', fontsize=12)
ax3.set_ylabel('Count', fontsize=12)
ax3.set_title('Class Distribution: Actual vs Predicted', fontsize=14, fontweight='bold')
ax3.set_xticks(x_pos)
ax3.set_xticklabels(['Low Risk (0)', 'High Risk (1)'])
ax3.legend()
ax3.grid(True, alpha=0.3, axis='y')

# Add count labels on top of every bar
for bar_group in (bars1, bars2):
    for bar in bar_group:
        height = bar.get_height()
        ax3.text(bar.get_x() + bar.get_width()/2., height,
                 f'{int(height):,}', ha='center', va='bottom', fontsize=9)

# 4. Model F1-Score comparison
ax4 = axes[1, 1]
# Compute the maximum once; the highest-scoring bar(s) are drawn in green
best_test_f1 = max(test_f1)
colors_f1 = ['green' if f1 == best_test_f1 else 'coral' for f1 in test_f1]

bars = ax4.barh(models_class, test_f1, color=colors_f1, edgecolor='black')
ax4.set_xlabel('F1-Score', fontsize=12)
ax4.set_title('Model Comparison (Test F1-Score)', fontsize=14, fontweight='bold')
ax4.set_xlim([0, 1])
ax4.grid(True, alpha=0.3, axis='x')

# Add values just right of each bar; bar i sits at y position i
for i, f1 in enumerate(test_f1):
    ax4.text(f1 + 0.01, i, f'{f1:.4f}', va='center', fontsize=10)

plt.tight_layout()
plt.show()

print("  âœ“ Classification visualizations created")

# ============================================================================
# STEP 5.3: FEATURE IMPORTANCE (RANDOM FOREST)
# ============================================================================

print("\n" + "-" * 80)
print("FEATURE IMPORTANCE ANALYSIS:")
print("-" * 80)

# Importances of the regression model selected for plotting (best_reg_model),
# paired with the scaled training column names
feature_importance_reg = best_reg_model.feature_importances_
feature_names = X_train_scaled.columns

# Create DataFrame and sort, most important feature first. sort_values keeps
# the original row labels, so the index is NOT the rank after this step.
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance_reg
}).sort_values('Importance', ascending=False)

# Plot top 15 features
fig, ax = plt.subplots(figsize=(12, 8))
top_n = 15
top_features = importance_df.head(top_n)

bars = ax.barh(range(len(top_features)), top_features['Importance'],
               color='steelblue', edgecolor='black')
ax.set_yticks(range(len(top_features)))
ax.set_yticklabels(top_features['Feature'])
ax.set_xlabel('Importance', fontsize=12)
ax.set_title(f'Top {top_n} Feature Importance - Random Forest Regression',
             fontsize=14, fontweight='bold')
# Largest importance at the top of the chart
ax.invert_yaxis()
ax.grid(True, alpha=0.3, axis='x')

# Add values just right of each bar; bar i sits at y position i
for i, imp in enumerate(top_features['Importance']):
    ax.text(imp + 0.001, i, f'{imp:.4f}', va='center', fontsize=9)

plt.tight_layout()
plt.show()

# Number the listing by position in the sorted frame. The index label from
# iterrows() is the feature's original column position, so using it printed
# wrong rank numbers.
print(f"\n  Top 10 Most Important Features:")
top_ten = importance_df.head(10)
for rank, (feature, importance) in enumerate(
        zip(top_ten['Feature'], top_ten['Importance']), start=1):
    print(f"    {rank:2d}. {feature:40s} {importance:.6f}")

print("\n" + "=" * 80)
print("MODULE 5 COMPLETE: Visualizations and analysis finished!")
print("=" * 80)

In [0]:
# ============================================================================
# MODULE 6: MODEL SAVING & PIPELINE SUMMARY
# ============================================================================

print("\n" + "=" * 80)
print("MODULE 6: MODEL SAVING & PIPELINE SUMMARY")
print("=" * 80)

import pickle
from datetime import datetime
import os

# ============================================================================
# STEP 6.0: CREATE OUTPUT FOLDER USING RELATIVE PATH
# ============================================================================

print("\n" + "-" * 80)
print("CREATING OUTPUT FOLDER:")
print("-" * 80)

# Get current working directory and create relative path
current_dir = os.getcwd()
print(f"  Current directory: {current_dir}")

# Create output folder name
output_folder_name = 'pipeline_model_v2'

# Try to create in current directory first
output_folder = os.path.join(current_dir, output_folder_name)

try:
    os.makedirs(output_folder, exist_ok=True)
except OSError as e:
    # Only filesystem failures (permissions, read-only or unsupported mount,
    # ...) trigger the fallback; any other exception is a programming error
    # and is allowed to propagate instead of being masked.
    print(f"  Error with current directory: {e}")
    # Fallback: use the /tmp directory
    output_folder = f'/tmp/{output_folder_name}'
    os.makedirs(output_folder, exist_ok=True)
    print(f"  âœ“ Using temp folder: {output_folder}")
else:
    # Reached only when the folder in the working directory is usable
    print(f"  âœ“ Folder created: {output_folder_name}")

print(f"  âœ“ Output folder ready")

# ============================================================================
# STEP 6.1: SAVE PREPROCESSING OBJECTS
# ============================================================================

print(f"\n{'-' * 80}")
print("SAVING PREPROCESSING OBJECTS:")
print('-' * 80)

# Bundle everything needed to reproduce the preprocessing at inference time:
# the fitted scaler and country encoder, the feature order, the target name
# and the risk cut-off.
preprocessing_pipeline = dict(
    scaler=scaler,
    country_encoder=country_encoder,
    feature_names=list(X_train_scaled.columns),
    target_name='grid_stress_score',
    classification_threshold=50,
)

# Persist the bundle as a pickle inside the output folder
preprocessing_path = os.path.join(output_folder, 'grid_stress_preprocessing.pkl')
with open(preprocessing_path, 'wb') as pipeline_file:
    pickle.dump(preprocessing_pipeline, pipeline_file)

print(f"  âœ“ grid_stress_preprocessing.pkl")

# ============================================================================
# STEP 6.2: SAVE BEST MODELS
# ============================================================================

print(f"\n{'-' * 80}")
print("SAVING BEST MODELS:")
print('-' * 80)

# NOTE(review): both "best" models are fixed to Random Forest here rather than
# taken from the rankings computed during training — confirm this is intended.
best_reg_model_name = 'Random Forest'
best_class_model_name = 'Random Forest'

best_reg_model = regression_results[best_reg_model_name]['model']
best_class_model = classification_results[best_class_model_name]['model']

regression_path = os.path.join(output_folder, 'grid_stress_regression_model.pkl')
classification_path = os.path.join(output_folder, 'grid_stress_classification_model.pkl')

# Pickle the regression model first, then the classification model
for model_path, fitted_model in (
    (regression_path, best_reg_model),
    (classification_path, best_class_model),
):
    with open(model_path, 'wb') as model_file:
        pickle.dump(fitted_model, model_file)
    print(f"  âœ“ {os.path.basename(model_path)}")

# ============================================================================
# STEP 6.3: SAVE ALL MODEL RESULTS
# ============================================================================

print(f"\n{'-' * 80}")
print("SAVING ALL MODEL RESULTS:")
print('-' * 80)

# Entries copied into the saved summaries; the fitted 'model' object of each
# result is left out.
_summary_keys = ('train_time', 'train_metrics', 'val_metrics', 'test_metrics')

# Regression summary: timing and per-split metrics for every model
regression_results_summary = {
    name: {key: fitted[key] for key in _summary_keys}
    for name, fitted in regression_results.items()
}

results_reg_path = os.path.join(output_folder, 'regression_results_all.pkl')
with open(results_reg_path, 'wb') as summary_file:
    pickle.dump(regression_results_summary, summary_file)

print(f"  âœ“ regression_results_all.pkl")

# Classification summary: same entries plus the confusion matrices
classification_results_summary = {
    name: {key: fitted[key] for key in _summary_keys + ('confusion_matrix',)}
    for name, fitted in classification_results.items()
}

results_class_path = os.path.join(output_folder, 'classification_results_all.pkl')
with open(results_class_path, 'wb') as summary_file:
    pickle.dump(classification_results_summary, summary_file)

print(f"  âœ“ classification_results_all.pkl")

# ============================================================================
# STEP 6.4: CREATE MODEL METADATA
# ============================================================================

print(f"\n{'-' * 80}")
print("CREATING MODEL METADATA:")
print('-' * 80)


def _date_span(meta_frame):
    """Format the min and max of the 'index' entry as '<min> to <max>'."""
    stamps = meta_frame['index']
    return str(stamps.min()) + " to " + str(stamps.max())


# Results of the two saved models, looked up once
best_reg_entry = regression_results[best_reg_model_name]
best_class_entry = classification_results[best_class_model_name]

split_sizes = (len(X_train), len(X_val), len(X_test))
country_list = list(country_encoder.classes_)
feature_columns = list(X_train_scaled.columns)

# Data information
data_info = {
    'train_samples': split_sizes[0],
    'val_samples': split_sizes[1],
    'test_samples': split_sizes[2],
    'total_samples': split_sizes[0] + split_sizes[1] + split_sizes[2],
    'num_features': X_train_scaled.shape[1],
    'num_countries': len(country_list),
    'countries': country_list,
    'date_range_train': _date_span(meta_train),
    'date_range_val': _date_span(meta_val),
    'date_range_test': _date_span(meta_test),
}

# Regression model performance (cast to plain floats for serialization)
regression_info = {
    'best_model': best_reg_model_name,
    'test_r2': float(best_reg_entry['test_metrics']['r2']),
    'test_rmse': float(best_reg_entry['test_metrics']['rmse']),
    'test_mae': float(best_reg_entry['test_metrics']['mae']),
    'val_r2': float(best_reg_entry['val_metrics']['r2']),
    'train_time_seconds': float(best_reg_entry['train_time']),
}

# Low/high risk row counts per split, as plain ints
class_distribution = {
    split_key: {
        'low_risk': int((split_labels == 0).sum()),
        'high_risk': int((split_labels == 1).sum()),
    }
    for split_key, split_labels in (
        ('train', y_train_class),
        ('val', y_val_class),
        ('test', y_test_class),
    )
}

# Classification model performance
classification_info = {
    'best_model': best_class_model_name,
    'threshold': 50,
    'test_accuracy': float(best_class_entry['test_metrics']['accuracy']),
    'test_precision': float(best_class_entry['test_metrics']['precision']),
    'test_recall': float(best_class_entry['test_metrics']['recall']),
    'test_f1': float(best_class_entry['test_metrics']['f1']),
    'val_accuracy': float(best_class_entry['val_metrics']['accuracy']),
    'val_f1': float(best_class_entry['val_metrics']['f1']),
    'train_time_seconds': float(best_class_entry['train_time']),
    'class_distribution': class_distribution,
}

# Feature information
# NOTE(review): the original/engineered/encoded counts are literals, not
# derived from the data — confirm they still match the feature list.
features_info = {
    'total': len(feature_columns),
    'original': 12,
    'engineered': 15,
    'encoded': 1,
    'feature_list': feature_columns,
}

metadata = {
    'created_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'author': 'Peter Leme',
    'project': 'European Grid Stress Prediction',
    'version': 'v2',
    'data_info': data_info,
    'regression': regression_info,
    'classification': classification_info,
    'features': features_info,
}

# Save metadata as pickle
metadata_path = os.path.join(output_folder, 'model_metadata.pkl')
with open(metadata_path, 'wb') as metadata_file:
    pickle.dump(metadata, metadata_file)

print(f"  âœ“ model_metadata.pkl")

# Save the same metadata as JSON
import json
metadata_json_path = os.path.join(output_folder, 'model_metadata.json')
with open(metadata_json_path, 'w') as metadata_file:
    json.dump(metadata, metadata_file, indent=2)

print(f"  âœ“ model_metadata.json")

# ============================================================================
# STEP 6.5: CREATE README
# ============================================================================

print(f"\n{'-' * 80}")
print("CREATING README:")
print('-' * 80)

# Artifacts listed in the README, in the order they are numbered
readme_files = (
    'grid_stress_preprocessing.pkl',
    'grid_stress_regression_model.pkl',
    'grid_stress_classification_model.pkl',
    'regression_results_all.pkl',
    'classification_results_all.pkl',
    'model_metadata.pkl',
    'model_metadata.json',
    'README.txt',
)

# Assemble the README line by line; the text ends with a trailing newline
readme_lines = [
    "European Grid Stress Prediction - ML Pipeline v2",
    "Author: Peter Leme",
    f"Created: {metadata['created_date']}",
    "",
    "MODELS:",
    f"- Regression (Random Forest): Test R2 = {metadata['regression']['test_r2']:.4f}",
    f"- Classification (Random Forest): Test F1 = {metadata['classification']['test_f1']:.4f}",
    "",
    "DATA:",
    f"- Total: {metadata['data_info']['total_samples']:,} samples",
    f"- Features: {metadata['features']['total']}",
    f"- Countries: {metadata['data_info']['num_countries']}",
    "",
    "FILES:",
]
readme_lines.extend(
    f"{number}. {file_name}" for number, file_name in enumerate(readme_files, 1)
)
readme_content = "\n".join(readme_lines) + "\n"

readme_path = os.path.join(output_folder, 'README.txt')
with open(readme_path, 'w') as readme_file:
    readme_file.write(readme_content)

print(f"  âœ“ README.txt")

# ============================================================================
# STEP 6.6: VERIFY AND SUMMARIZE
# ============================================================================

thin_rule = "-" * 80
print(f"\n{thin_rule}")
print("SAVED FILES:")
print(thin_rule)

# List everything in the output folder, alphabetically, with its size in MB
saved_files = os.listdir(output_folder)
for position, file_name in enumerate(sorted(saved_files), 1):
    size_mb = os.path.getsize(os.path.join(output_folder, file_name)) / (1024 * 1024)
    print(f"  {position}. {file_name:45s} ({size_mb:.2f} MB)")

print(f"\n  Total files: {len(saved_files)}")
print(f"  Location: {output_folder}")

# ============================================================================
# FINAL SUMMARY
# ============================================================================

thick_rule = "=" * 80
print(f"\n{thick_rule}")
print("ðŸŽ‰ COMPLETE PIPELINE FINISHED! ðŸŽ‰")
print(thick_rule)

# Headline test metrics of the two saved models
reg_summary = metadata['regression']
class_summary = metadata['classification']

print("\nMODELS PERFORMANCE:")
print(f"  Regression  - Test RÂ²: {reg_summary['test_r2']:.4f}, RMSE: {reg_summary['test_rmse']:.2f}")
print(f"  Classification - Test F1: {class_summary['test_f1']:.4f}, Accuracy: {class_summary['test_accuracy']:.4f}")

print(f"\nALL FILES SAVED TO: {output_folder}")
print("\nâœ“ Ready for Streamlit deployment!")
print(thick_rule)

In [0]:
# Copy one file from a DBFS path to a `file:` path using the notebook's
# `dbutils` helper.
# NOTE(review): the source looks like a template placeholder path — confirm
# or replace it before running this cell.
dbfs_source = "dbfs:/path/to/file.csv"
local_target = "file:/tmp/file.csv"
dbutils.fs.cp(dbfs_source, local_target)
