In [None]:
!pip install keras-tuner shap tf-explain tqdm

In [8]:
# ===============================================================
# CELL 1: MAIN CODE - RUN THIS FIRST
# ===============================================================

#!/usr/bin/env python
"""
Comprehensive hyper-parameter tuner for Thermal-NN.

• Optimized for Google Colab T4 GPU
• Works on CPU, GPU (Colab T4/V100) or headless servers.
• Handles any CSV that has a numeric target column named 'pm'.
• Writes:
      artifacts/tnn_best.keras
      artifacts/metrics.json

Usage in Google Colab:
    Just run all cells in order, modify the CONFIG section as needed.
"""

# --------- 0. Configuration (modify these as needed) ----------------
CONFIG = {
    'csv_path': '/content/measures_v2.csv',        # Path to your CSV file
    'trials': 20,                                   # Number of hyperparameter trials
    'epochs': 30,                                   # Epochs per trial
    'final_epochs': 50,                            # Epochs for final model
    'val_split': 0.2,                              # Validation split ratio
    'batch_size': 512,                             # Batch size (increased for better GPU usage)
    'random_seed': 42                              # Random seed for reproducibility
}

# --------- 1. environment tweaks (before TF import) -----------------
import os
os.environ.setdefault("TF_FORCE_GPU_ALLOW_GROWTH", "true")
os.environ.setdefault("TF_GPU_ALLOCATOR", "cuda_malloc_async")
os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")  # Reduce TF logging

# --------- 2. imports ------------------------------------------------
from pathlib import Path
import json, gc, time, sys
import numpy as np, pandas as pd
import tensorflow as tf

# Install keras-tuner if not available
try:
    import keras_tuner as kt
except ImportError:
    print("Installing keras-tuner...")
    import subprocess
    subprocess.run(["pip", "install", "keras-tuner"], check=True)
    import keras_tuner as kt

from sklearn.preprocessing import StandardScaler
from keras import backend as K

# --------- 3. GPU setup and verification ----------------------------
def setup_gpu():
    """Configure GPU settings for optimal performance in Colab"""
    print("Setting up GPU configuration...")

    # Check TensorFlow version
    print(f"TensorFlow version: {tf.__version__}")

    # List available devices
    physical_devices = tf.config.list_physical_devices()
    print(f"Available devices: {[device.name for device in physical_devices]}")

    # Configure GPU memory growth
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            # Enable memory growth for all GPUs
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            print(f"✅ GPU memory growth enabled for {len(gpus)} GPU(s)")

            # Verify GPU is accessible
            with tf.device('/GPU:0'):
                test_tensor = tf.constant([[1.0]])
                result = tf.matmul(test_tensor, test_tensor)
                print(f"✅ GPU test successful: {result.numpy()}")

        except RuntimeError as e:
            print(f"⚠️  GPU setup error: {e}")
            print("Falling back to CPU")

    else:
        print("⚠️  No GPU detected - using CPU")

    return len(gpus) > 0

# Set reproducibility
tf.keras.utils.set_random_seed(CONFIG['random_seed'])
np.random.seed(CONFIG['random_seed'])

# --------- 4. load & scale data -------------------------------------
def load_scaled(csv_path, val_split=0.2):
    """Load and scale data with improved error handling"""
    print(f"Loading data from {csv_path}")

    # Handle both string and Path objects
    csv_path = Path(csv_path)

    if not csv_path.exists():
        raise FileNotFoundError(f"CSV file not found: {csv_path}")

    try:
        df = pd.read_csv(csv_path)
        print(f"Data shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")

        if "pm" not in df.columns:
            available_cols = list(df.columns)
            raise KeyError(f"'pm' target column not found in CSV. Available columns: {available_cols}")

        # Check for missing values
        missing_count = df.isnull().sum().sum()
        if missing_count > 0:
            print(f"⚠️  Warning: Found {missing_count} missing values, filling with column means")
            df = df.fillna(df.mean())

        X_raw = df.drop(columns=["pm"]).values.astype("float32")
        y_raw = df["pm"].values.reshape(-1, 1).astype("float32")

        print(f"Features shape: {X_raw.shape}, Target shape: {y_raw.shape}")

        xs, ys = StandardScaler(), StandardScaler()
        X = xs.fit_transform(X_raw)
        y = ys.fit_transform(y_raw).ravel()

        # Stratified split to maintain data distribution
        idx = np.random.permutation(len(X))
        split = int(len(X) * (1 - val_split))
        x_train, y_train = X[idx[:split]], y[idx[:split]]
        x_val, y_val = X[idx[split:]], y[idx[split:]]

        print(f"Train set: {x_train.shape}, Validation set: {x_val.shape}")

        return (x_train, y_train), (x_val, y_val), (xs, ys)

    except Exception as e:
        raise RuntimeError(f"Error loading data: {e}")

# --------- 5. model factory -----------------------------------------
class ThermalNN(tf.keras.Model):
    """Two-layer dense surrogate, same topology as original notebook."""
    def __init__(self, width=32, act="relu", dropout_rate=0.0):
        super().__init__()
        self.g = tf.keras.Sequential([
            tf.keras.layers.Dense(width, activation=act),
            tf.keras.layers.Dropout(dropout_rate),
            tf.keras.layers.Dense(1)
        ])
        self.c = tf.keras.Sequential([
            tf.keras.layers.Dense(width, activation=act),
            tf.keras.layers.Dropout(dropout_rate),
            tf.keras.layers.Dense(1)
        ])
        self.p = tf.keras.Sequential([
            tf.keras.layers.Dense(width, activation=act),
            tf.keras.layers.Dropout(dropout_rate),
            tf.keras.layers.Dense(1)
        ])

    def call(self, x, training=None):
        g = self.g(x, training=training)
        c = self.c(x, training=training)
        p = self.p(x, training=training)
        return p / (g + 1e-7) + c

def build_tnn(hp: kt.HyperParameters):
    """Build model with hyperparameters"""
    width = hp.Int("width", 16, 128, step=16)  # Adjusted range
    act = hp.Choice("activation", ["relu", "elu", "swish"])
    lr = hp.Float("lr", 1e-5, 1e-2, sampling="log")
    dropout = hp.Float("dropout", 0.0, 0.3, step=0.1)

    # Force model creation on GPU
    with tf.device('/GPU:0'):
        model = ThermalNN(width, act, dropout)

        # Use mixed precision for better GPU performance
        optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

        model.compile(
            optimizer=optimizer,
            loss="mse",
            metrics=["mse", "mae"]
        )
    return model

# --------- 6. callbacks for better training -------------------------
def get_callbacks(patience=6):
    """Get training callbacks"""
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=patience,
            restore_best_weights=True,
            verbose=1
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=3,
            min_lr=1e-7,
            verbose=1
        ),
        tf.keras.callbacks.LambdaCallback(
            on_epoch_end=lambda epoch, logs: gc.collect() if epoch % 5 == 0 else None
        )
    ]
    return callbacks

# --------- 7. main training function --------------------------------
def run_thermal_nn_tuning(config=None):
    """Main function to run hyperparameter tuning"""
    if config is None:
        config = CONFIG

    print("🔥 Starting Thermal-NN Hyperparameter Tuning")
    print("=" * 50)

    # Setup GPU
    gpu_available = setup_gpu()

    # Load and prepare data
    try:
        (x_tr, y_tr), (x_val, y_val), scalers = load_scaled(
            config['csv_path'],
            config['val_split']
        )
    except Exception as e:
        print(f"❌ Error loading data: {e}")
        return None

    # Setup tuner
    print(f"\n🔍 Starting hyperparameter search with {config['trials']} trials")

    tuner = kt.RandomSearch(
        build_tnn,
        objective="val_loss",
        max_trials=config['trials'],
        directory="tuner_logs",
        project_name="thermal_nn",
        max_consecutive_failed_trials=max(5, config['trials'] // 4),
        overwrite=True  # Clean start each run
    )

    # Memory cleanup callback
    clear_cb = tf.keras.callbacks.LambdaCallback(
        on_trial_end=lambda *_: (K.clear_session(), gc.collect(),
                                 tf.keras.backend.clear_session())
    )

    # Start tuning
    t0 = time.time()
    try:
        # Verify GPU usage before training
        print("\n🔍 Verifying GPU usage...")

        # Force data to GPU
        with tf.device('/GPU:0'):
            x_tr_gpu = tf.constant(x_tr)
            y_tr_gpu = tf.constant(y_tr)
            x_val_gpu = tf.constant(x_val)
            y_val_gpu = tf.constant(y_val)
            print(f"✅ Training data moved to GPU: {x_tr_gpu.device}")
            print(f"✅ Validation data moved to GPU: {x_val_gpu.device}")

        # Monitor GPU memory usage
        def gpu_memory_callback():
            if tf.config.list_physical_devices('GPU'):
                gpu_info = tf.config.experimental.get_memory_info('GPU:0')
                current_mb = gpu_info['current'] / 1024 / 1024
                peak_mb = gpu_info['peak'] / 1024 / 1024
                print(f"🔥 GPU Memory - Current: {current_mb:.1f}MB, Peak: {peak_mb:.1f}MB")

        gpu_monitor_cb = tf.keras.callbacks.LambdaCallback(
            on_epoch_begin=lambda epoch, logs: gpu_memory_callback() if epoch % 5 == 0 else None
        )

        tuner.search(
            x_tr_gpu.numpy(), y_tr_gpu.numpy(),
            epochs=config['epochs'],
            batch_size=config['batch_size'],
            validation_data=(x_val_gpu.numpy(), y_val_gpu.numpy()),
            callbacks=[clear_cb, gpu_monitor_cb] + get_callbacks(patience=6),
            verbose=1
        )
        print(f"✅ Tuning completed in {time.time()-t0:.1f}s")

        # Get best model and retrain
        best_hp = tuner.get_best_hyperparameters(1)[0]
        print(f"\n🏆 Best hyperparameters:")
        for param, value in best_hp.values.items():
            print(f"  {param}: {value}")

        print("\n🔄 Training final model with best hyperparameters...")

        # Force final model to GPU
        with tf.device('/GPU:0'):
            best_model = tuner.hypermodel.build(best_hp)

        # Train final model with more epochs on GPU
        with tf.device('/GPU:0'):
            history = best_model.fit(
                x_tr_gpu.numpy(), y_tr_gpu.numpy(),
                validation_data=(x_val_gpu.numpy(), y_val_gpu.numpy()),
                epochs=config['final_epochs'],
                batch_size=config['batch_size'],
                callbacks=[gpu_monitor_cb] + get_callbacks(patience=8),
                verbose=1
            )

        # Evaluate final model
        with tf.device('/GPU:0'):
            val_loss, val_mse, val_mae = best_model.evaluate(x_val_gpu.numpy(), y_val_gpu.numpy(), verbose=0)
        print(f"✅ Final validation metrics:")
        print(f"  Loss: {val_loss:.4f}")
        print(f"  MSE: {val_mse:.4f}")
        print(f"  MAE: {val_mae:.4f}")

        # Final GPU memory check
        if tf.config.list_physical_devices('GPU'):
            gpu_info = tf.config.experimental.get_memory_info('GPU:0')
            final_mb = gpu_info['current'] / 1024 / 1024
            peak_mb = gpu_info['peak'] / 1024 / 1024
            print(f"🔥 Final GPU Memory - Current: {final_mb:.1f}MB, Peak: {peak_mb:.1f}MB")

        # Save results
        art = Path("artifacts")
        art.mkdir(exist_ok=True)

        best_model.save(art / "tnn_best.keras", include_optimizer=False)

        metrics = {
            "val_loss": float(val_loss),
            "val_mse": float(val_mse),
            "val_mae": float(val_mae),
            "hyperparameters": best_hp.values,
            "training_time_seconds": time.time() - t0,
            "gpu_used": gpu_available,
            "config_used": config
        }

        with open(art / "metrics.json", "w") as fp:
            json.dump(metrics, fp, indent=2)

        print(f"\n💾 Model saved to: {art / 'tnn_best.keras'}")
        print(f"📊 Metrics saved to: {art / 'metrics.json'}")

        return best_model, metrics, history

    except Exception as e:
        print(f"❌ Error during tuning: {e}")
        return None

    finally:
        # Final cleanup
        K.clear_session()
        gc.collect()

# --------- 8. Helper functions for Colab ----------------------------
def show_config():
    """Display current configuration"""
    print("📋 Current Configuration:")
    print("-" * 30)
    for key, value in CONFIG.items():
        print(f"  {key}: {value}")

def update_config(**kwargs):
    """Update configuration parameters"""
    for key, value in kwargs.items():
        if key in CONFIG:
            CONFIG[key] = value
            print(f"✅ Updated {key}: {value}")
        else:
            print(f"⚠️  Warning: {key} is not a valid config parameter")

# --------- 9. Ready to run! -----------------------------------------
print("🚀 Thermal-NN Tuner loaded and ready!")
print("\nTo run with default settings:")
print(">>> result = run_thermal_nn_tuning()")
print("\nTo see current config:")
print(">>> show_config()")
print("\nTo update config:")
print(">>> update_config(trials=30, epochs=40)")

🚀 Thermal-NN Tuner loaded and ready!

To run with default settings:
>>> result = run_thermal_nn_tuning()

To see current config:
>>> show_config()

To update config:
>>> update_config(trials=30, epochs=40)


In [6]:
# ===============================================================
# CELL 2: GPU MONITORING SETUP - RUN THIS SECOND
# ===============================================================

# Force mixed precision for better GPU performance
try:
    policy = tf.keras.mixed_precision.Policy('mixed_float16')
    tf.keras.mixed_precision.set_global_policy(policy)
    print('✅ Mixed precision enabled (float16)')
except:
    print('⚠️  Mixed precision not available, using float32')

# GPU monitoring functions
def check_gpu_utilization():
    """Check if GPU is actually being used"""
    import subprocess

    if tf.config.list_physical_devices('GPU'):
        try:
            # Check nvidia-smi for GPU utilization
            result = subprocess.run(['nvidia-smi', '--query-gpu=utilization.gpu,memory.used,memory.total',
                                   '--format=csv,noheader,nounits'],
                                  capture_output=True, text=True)
            if result.returncode == 0:
                gpu_util, mem_used, mem_total = result.stdout.strip().split(', ')
                print(f"🔥 GPU Utilization: {gpu_util}% | Memory: {mem_used}MB/{mem_total}MB")
                return int(gpu_util) > 0
            else:
                print("Could not get GPU utilization")
                return False
        except:
            print("nvidia-smi not available")
            return False
    else:
        print("No GPU available")
        return False

def monitor_gpu_usage(duration=300):
    """Monitor GPU usage for specified duration (default 5 minutes)"""
    import threading
    import time

    def monitor():
        start_time = time.time()
        while time.time() - start_time < duration:
            check_gpu_utilization()
            time.sleep(10)

    thread = threading.Thread(target=monitor)
    thread.daemon = True
    thread.start()
    return thread

# Check current GPU status
print("Current GPU status:")
check_gpu_utilization()

print("\n✅ GPU monitoring functions ready!")
print("Start monitoring with: monitor_thread = monitor_gpu_usage(300)")

✅ Mixed precision enabled (float16)
Current GPU status:
🔥 GPU Utilization: 0% | Memory: 278MB/15360MB

✅ GPU monitoring functions ready!
Start monitoring with: monitor_thread = monitor_gpu_usage(300)


In [9]:
# ===============================================================
# CELL 3: CHECK AND MODIFY CONFIGURATION - RUN THIS THIRD
# ===============================================================

# Check current configuration
show_config()

# Optional: Modify configuration for better GPU utilization
print("\n🔧 Optimizing for GPU performance...")
update_config(
    batch_size=1024,    # Larger batch for better GPU utilization
    trials=15,          # Fewer trials but more focused
    epochs=25           # Slightly fewer epochs per trial
)

print("\n📁 Checking if data file exists...")
import os
if os.path.exists(CONFIG['csv_path']):
    print(f"✅ Data file found: {CONFIG['csv_path']}")
else:
    print(f"❌ Data file not found: {CONFIG['csv_path']}")
    print("Please upload your CSV file or update the path in CONFIG")

📋 Current Configuration:
------------------------------
  csv_path: /content/measures_v2.csv
  trials: 20
  epochs: 30
  final_epochs: 50
  val_split: 0.2
  batch_size: 512
  random_seed: 42

🔧 Optimizing for GPU performance...
✅ Updated batch_size: 1024
✅ Updated trials: 15
✅ Updated epochs: 25

📁 Checking if data file exists...
✅ Data file found: /content/measures_v2.csv


In [10]:
# ===============================================================
# CELL 4: START TRAINING - RUN THIS FOURTH
# ===============================================================

# Start GPU monitoring (optional but recommended)
print("🔍 Starting GPU monitoring...")
monitor_thread = monitor_gpu_usage(600)  # Monitor for 10 minutes

# Start training
print("\n🚀 Starting hyperparameter tuning...")
result = run_thermal_nn_tuning()

if result:
    best_model, metrics, history = result
    print("\n🎉 Training completed successfully!")
    print("Model and metrics saved in 'artifacts' folder")
else:
    print("\n❌ Training failed")

Trial 15 Complete [00h 00m 42s]
val_loss: 2434.235595703125

Best val_loss So Far: 348.73736572265625
Total elapsed time: 00h 14m 38s
✅ Tuning completed in 878.2s

🏆 Best hyperparameters:
  width: 48
  activation: elu
  lr: 0.0003204119265097899
  dropout: 0.1

🔄 Training final model with best hyperparameters...
🔥 GPU Memory - Current: 171.7MB, Peak: 267.5MB
Epoch 1/50
[1m1040/1040[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 7ms/step - loss: 75989.6094 - mae: 4.5771 - mse: 75989.6094 - val_loss: 3187.2876 - val_mae: 2.0370 - val_mse: 3187.2876 - learning_rate: 3.2041e-04
Epoch 2/50
[1m1040/1040[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 371446080.0000 - mae: 30.5517 - mse: 371446080.0000 - val_loss: 159824.4375 - val_mae: 4.8457 - val_mse: 159824.4375 - learning_rate: 3.2041e-04
Epoch 3/50
[1m1040/1040[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - loss: 52221.1992 - mae: 3.8650 - mse: 52221.1992 - val_loss: 18649.5664 - val_m

In [12]:
# ===============================================================
# CELL 5: CHECK RESULTS - RUN THIS AFTER TRAINING
# ===============================================================

# Check final GPU utilization
print("Final GPU check:")
check_gpu_utilization()

# Load and display results
import json
from pathlib import Path

artifacts_dir = Path("artifacts")
if artifacts_dir.exists():
    # Display metrics
    if (artifacts_dir / "metrics.json").exists():
        with open(artifacts_dir / "metrics.json", "r") as f:
            metrics = json.load(f)

        print("\n📊 Final Results:")
        print("=" * 40)
        print(f"Validation Loss: {metrics['val_loss']:.4f}")
        print(f"Validation MSE: {metrics['val_mse']:.4f}")
        print(f"Validation MAE: {metrics['val_mae']:.4f}")
        print(f"Training Time: {metrics['training_time_seconds']:.1f} seconds")
        print(f"GPU Used: {metrics['gpu_used']}")

        print(f"\n🏆 Best Hyperparameters:")
        for param, value in metrics['hyperparameters'].items():
            print(f"  {param}: {value}")

    # List saved files
    print(f"\n💾 Saved Files:")
    for file in artifacts_dir.iterdir():
        print(f"  {file.name} ({file.stat().st_size / 1024:.1f} KB)")

else:
    print("❌ No artifacts directory found - training may have failed")

Final GPU check:
🔥 GPU Utilization: 0% | Memory: 410MB/15360MB

📊 Final Results:
Validation Loss: 3187.2996
Validation MSE: 3187.2996
Validation MAE: 2.0369
Training Time: 944.6 seconds
GPU Used: True

🏆 Best Hyperparameters:
  width: 48
  activation: elu
  lr: 0.0003204119265097899
  dropout: 0.1

💾 Saved Files:
  metrics.json (0.5 KB)
  tnn_best.keras (72.2 KB)
