# XGBOOST EXOPLANET CLASSIFICATION PIPELINE

In [23]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, 
    f1_score, precision_score, recall_score
)
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
import json
import pickle
from pathlib import Path
from itertools import product

warnings.filterwarnings('ignore')

# CONFIGURATION GLOBALE

In [24]:
# Style configuration
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# GitHub-compatible paths
DATA_PATH = Path('../../data/processed')
MODEL_PATH = Path('./xgboost_model')
MODEL_PATH.mkdir(parents=True, exist_ok=True)

# Class names
CLASS_NAMES = {
    0: 'False Positive',
    1: 'Candidate',
    2: 'Confirmed'
}

print("=" * 80)
print("XGBOOST - HYPERPARAMETER TUNING + FINAL TRAINING")
print("=" * 80)

XGBOOST - HYPERPARAMETER TUNING + FINAL TRAINING


# VERIFICATION CUDA/GPU

In [25]:
def check_cuda_availability():
    """Check CUDA availability for XGBoost"""
    print("\n" + "=" * 80)
    print("1. CUDA VERIFICATION")
    print("=" * 80)
    
    print(f"XGBoost version: {xgb.__version__}")
    build_info = xgb.build_info()
    
    if not build_info.get('USE_CUDA', False):
        print("WARNING: XGBoost not compiled with CUDA support")
        print("INFO: Model will use CPU")
        device = 'cpu'
    else:
        print("SUCCESS: GPU support enabled")
        device = 'cuda'
    
    return device

device = check_cuda_availability()


1. CUDA VERIFICATION
XGBoost version: 2.1.4
SUCCESS: GPU support enabled


# Data Load

In [26]:
def load_datasets():
    """Load train, validation and test datasets"""
    print("\n" + "=" * 80)
    print("2. LOADING DATASETS")
    print("=" * 80)
    
    X_train = pd.read_csv(DATA_PATH / 'step6_X_train.csv')
    y_train = pd.read_csv(DATA_PATH / 'step6_y_train.csv').squeeze()
    
    X_validate = pd.read_csv(DATA_PATH / 'step6_X_val.csv')
    y_validate = pd.read_csv(DATA_PATH / 'step6_y_val.csv').squeeze()
    
    X_test = pd.read_csv(DATA_PATH / 'step6_X_test.csv')
    y_test = pd.read_csv(DATA_PATH / 'step6_y_test.csv').squeeze()
    
    print(f"\nDataset dimensions:")
    print(f"   Train:      {X_train.shape[0]:>6} samples, {X_train.shape[1]:>3} features")
    print(f"   Validation: {X_validate.shape[0]:>6} samples, {X_validate.shape[1]:>3} features")
    print(f"   Test:       {X_test.shape[0]:>6} samples, {X_test.shape[1]:>3} features")
    
    print(f"\nClass distribution (Train):")
    for cls in [0, 1, 2]:
        count = (y_train == cls).sum()
        pct = count / len(y_train) * 100
        print(f"   {CLASS_NAMES[cls]:20s}: {count:>5} ({pct:>5.2f}%)")
    
    return X_train, y_train, X_validate, y_validate, X_test, y_test

X_train, y_train, X_validate, y_validate, X_test, y_test = load_datasets()


2. LOADING DATASETS

Dataset dimensions:
   Train:        6694 samples,  13 features
   Validation:    956 samples,  13 features
   Test:          957 samples,  13 features

Class distribution (Train):
   False Positive      :  3387 (50.60%)
   Candidate           :  1385 (20.69%)
   Confirmed           :  1922 (28.71%)


# Features Verification


In [27]:
def verify_required_features(X):
    """Verify presence of critical features"""
    print("\n" + "=" * 80)
    print("3. FEATURE VERIFICATION")
    print("=" * 80)
    
    required_features = [
        'koi_duration',
        'koi_duration_err1',
        'koi_depth',
        'koi_depth_err1',
        'koi_model_snr'
    ]
    
    print("\nRequired features:")
    all_present = True
    for feat in required_features:
        if feat in X.columns:
            print(f"   [OK] {feat}")
        else:
            print(f"   [MISSING] {feat}")
            all_present = False
    
    if not all_present:
        raise ValueError("Critical features missing")
    
    print("\nAll required features present")

verify_required_features(X_train)


3. FEATURE VERIFICATION

Required features:
   [OK] koi_duration
   [OK] koi_duration_err1
   [OK] koi_depth
   [OK] koi_depth_err1
   [OK] koi_model_snr

All required features present


# FEATURE ENGINEERING

In [28]:
def engineer_transit_features(X):
    """
    Create additional features based on planetary transits
    
    Features created:
    1. transit_depth_duration_ratio: Depth/duration ratio
    2. snr_log: Log of SNR for normalization
    3. snr_squared: Squared SNR for non-linear effects
    4. transit_detectability: Combined detectability index
    """
    print("\n" + "=" * 80)
    print("4. FEATURE ENGINEERING")
    print("=" * 80)
    
    X_enhanced = X.copy()
    features_created = []
    
    # Feature 1: Depth/Duration ratio
    if 'koi_depth' in X.columns and 'koi_duration' in X.columns:
        X_enhanced['transit_depth_duration_ratio'] = (
            X['koi_depth'] / (X['koi_duration'] + 1e-6)
        )
        features_created.append('transit_depth_duration_ratio')
        print("   [Created] transit_depth_duration_ratio")
    
    # Features 2-3: SNR transformations
    if 'koi_model_snr' in X.columns:
        X_enhanced['snr_log'] = np.log1p(X['koi_model_snr'])
        X_enhanced['snr_squared'] = X['koi_model_snr'] ** 2
        features_created.extend(['snr_log', 'snr_squared'])
        print("   [Created] snr_log")
        print("   [Created] snr_squared")
    
    # Feature 4: Transit detectability
    if all(f in X.columns for f in ['koi_duration', 'koi_depth', 'koi_model_snr']):
        X_enhanced['transit_detectability'] = (
            X['koi_depth'] * X['koi_model_snr'] / (X['koi_duration'] + 1)
        )
        features_created.append('transit_detectability')
        print("   [Created] transit_detectability")
    
    print(f"\nFeatures created: {len(features_created)}")
    print(f"Total features: {X_enhanced.shape[1]} (original: {X.shape[1]})")
    
    return X_enhanced

# Apply feature engineering
X_train_enh = engineer_transit_features(X_train)
X_val_enh = engineer_transit_features(X_validate)
X_test_enh = engineer_transit_features(X_test)



4. FEATURE ENGINEERING
   [Created] transit_depth_duration_ratio
   [Created] snr_log
   [Created] snr_squared
   [Created] transit_detectability

Features created: 4
Total features: 17 (original: 13)

4. FEATURE ENGINEERING
   [Created] transit_depth_duration_ratio
   [Created] snr_log
   [Created] snr_squared
   [Created] transit_detectability

Features created: 4
Total features: 17 (original: 13)

4. FEATURE ENGINEERING
   [Created] transit_depth_duration_ratio
   [Created] snr_log
   [Created] snr_squared
   [Created] transit_detectability

Features created: 4
Total features: 17 (original: 13)


# CALCULATION OF CLASS WEIGHTS

In [29]:
def compute_balanced_class_weights(y):
    """
    Compute class weights to handle imbalance
    Special boost for class 1 (Candidate)
    """
    print("\n" + "=" * 80)
    print("5. CLASS WEIGHT CALCULATION")
    print("=" * 80)
    
    all_classes = np.array([0, 1, 2])
    auto_weights = compute_class_weight(
        class_weight='balanced',
        classes=all_classes,
        y=y
    )
    
    # Manual weight adjustment
    class_weights = {
        0: auto_weights[0] * 0.8,   # False Positive: slight reduction
        1: auto_weights[1] * 1.5,   # Candidate: BOOST +50%
        2: auto_weights[2] * 1.0    # Confirmed: standard weight
    }
    
    print("\nComputed weights:")
    for cls, weight in class_weights.items():
        boost = " [BOOST +50%]" if cls == 1 else ""
        print(f"   {CLASS_NAMES[cls]:20s}: {weight:.4f}x{boost}")
    
    return class_weights

class_weights = compute_balanced_class_weights(y_train)
sample_weights_train = np.array([class_weights[cls] for cls in y_train])



5. CLASS WEIGHT CALCULATION

Computed weights:
   False Positive      : 0.5270x
   Candidate           : 2.4166x [BOOST +50%]
   Confirmed           : 1.1609x


# HYPERPARAMETER TUNING (Train/Val)

In [None]:
def perform_hyperparameter_tuning(X_train, y_train, X_val, y_val, 
                                   sample_weights, device):
    """
    Perform Grid Search on hyperparameters
    Uses Train for training and Val for validation
    """
    print("\n" + "=" * 80)
    print("PHASE 1: HYPERPARAMETER TUNING")
    print("=" * 80)
    
    # Hyperparameter grid
    param_grid = {
        'learning_rate': [0.01, 0.03, 0.05],
        'max_depth': [8, 10, 12],
        'min_child_weight': [20, 30, 40],
        'subsample': [0.7, 0.8, 0.9],
        'colsample_bytree': [0.7, 0.8, 0.9]
    }
    
    total_combinations = np.prod([len(v) for v in param_grid.values()])
    print(f"\nTotal combinations to test: {total_combinations}")
    print("Testing in progress...\n")
    
    # Prepare datasets
    dtrain = xgb.DMatrix(X_train, label=y_train, weight=sample_weights)
    dval = xgb.DMatrix(X_val, label=y_val)
    
    best_score = float('inf')
    best_params = None
    results = []
    
    # Grid Search
    for lr, md, mcw, sub, col in product(
        param_grid['learning_rate'],
        param_grid['max_depth'],
        param_grid['min_child_weight'],
        param_grid['subsample'],
        param_grid['colsample_bytree']
    ):
        params = {
            'objective': 'multi:softprob',
            'num_class': 3,
            'eval_metric': 'mlogloss',
            'tree_method': 'hist' if device == 'cpu' else 'gpu_hist',
            'device': device,
            'learning_rate': lr,
            'max_depth': md,
            'min_child_weight': mcw,
            'subsample': sub,
            'colsample_bytree': col,
            'alpha': 0.1,
            'lambda': 0.1,
            'gamma': 0.001,
            'random_state': 42,
            'verbosity': 0
        }
        
        evals = [(dtrain, 'train'), (dval, 'valid')]
        model_temp = xgb.train(
            params,
            dtrain,
            num_boost_round=1000,
            evals=evals,
            early_stopping_rounds=50,
            verbose_eval=False
        )
        
        val_score = model_temp.best_score
        
        results.append({
            'learning_rate': lr,
            'max_depth': md,
            'min_child_weight': mcw,
            'subsample': sub,
            'colsample_bytree': col,
            'val_score': val_score,
            'best_iteration': model_temp.best_iteration
        })
        
        if val_score < best_score:
            best_score = val_score
            best_params = params.copy()
        
        print(f"lr={lr:.2f}, md={md}, mcw={mcw}, sub={sub:.1f}, "
              f"col={col:.1f} -> val_loss={val_score:.6f}")
    
    # Display best results
    print("\n" + "=" * 80)
    print("BEST HYPERPARAMETERS FOUND")
    print("=" * 80)
    print(f"\nBest validation score: {best_score:.6f}")
    print("\nOptimal hyperparameters:")
    print(f"   learning_rate:     {best_params['learning_rate']}")
    print(f"   max_depth:         {best_params['max_depth']}")
    print(f"   min_child_weight:  {best_params['min_child_weight']}")
    print(f"   subsample:         {best_params['subsample']}")
    print(f"   colsample_bytree:  {best_params['colsample_bytree']}")
    
    # Save results
    results_df = pd.DataFrame(results).sort_values('val_score')
    results_df.to_csv(MODEL_PATH / 'hyperparameter_tuning_results.csv', 
                      index=False)
    print(f"\nResults saved: hyperparameter_tuning_results.csv")
    
    return best_params, best_score

best_params, best_score = perform_hyperparameter_tuning(
    X_train_enh, y_train, X_val_enh, y_validate,
    sample_weights_train, device
)


PHASE 1: HYPERPARAMETER TUNING

🔬 Nombre total de combinaisons: 243
⏳ Test en cours...

lr=0.01, md=8, mcw=20, sub=0.7, col=0.7 → val_loss=0.523210
lr=0.01, md=8, mcw=20, sub=0.7, col=0.8 → val_loss=0.521443
lr=0.01, md=8, mcw=20, sub=0.7, col=0.9 → val_loss=0.520415
lr=0.01, md=8, mcw=20, sub=0.8, col=0.7 → val_loss=0.519092
lr=0.01, md=8, mcw=20, sub=0.8, col=0.8 → val_loss=0.518899
lr=0.01, md=8, mcw=20, sub=0.8, col=0.9 → val_loss=0.519555
lr=0.01, md=8, mcw=20, sub=0.9, col=0.7 → val_loss=0.518344
lr=0.01, md=8, mcw=20, sub=0.9, col=0.8 → val_loss=0.516290
lr=0.01, md=8, mcw=20, sub=0.9, col=0.9 → val_loss=0.517444
lr=0.01, md=8, mcw=30, sub=0.7, col=0.7 → val_loss=0.534345
lr=0.01, md=8, mcw=30, sub=0.7, col=0.8 → val_loss=0.533988
lr=0.01, md=8, mcw=30, sub=0.7, col=0.9 → val_loss=0.533704
lr=0.01, md=8, mcw=30, sub=0.8, col=0.7 → val_loss=0.530052
lr=0.01, md=8, mcw=30, sub=0.8, col=0.8 → val_loss=0.530225
lr=0.01, md=8, mcw=30, sub=0.8, col=0.9 → val_loss=0.530042
lr=0.01, md

# FINAL TRAINING (Train + Val)

In [30]:
def train_final_model(X_train, y_train, X_val, y_val, X_test, y_test,
                      best_params, class_weights, device):
    """
    Train final model combining Train and Val
    Evaluate on Test Set
    """
    print("\n" + "=" * 80)
    print("PHASE 2: FINAL TRAINING")
    print("=" * 80)
    
    # Combine Train and Val
    X_train_val = pd.concat([X_train, X_val], axis=0)
    y_train_val = pd.concat([y_train, y_val], axis=0)
    
    # Recalculate weights
    sample_weights_combined = np.array([class_weights[cls] for cls in y_train_val])
    
    print(f"\nCombined dataset (Train+Val): {X_train_val.shape[0]} samples")
    
    # Prepare datasets
    dtrain_final = xgb.DMatrix(X_train_val, label=y_train_val, 
                                weight=sample_weights_combined)
    dtest_final = xgb.DMatrix(X_test, label=y_test)
    
    print("\nFinal training in progress...\n")
    
    evals_final = [(dtrain_final, 'train'), (dtest_final, 'test')]
    
    model_final = xgb.train(
        best_params,
        dtrain_final,
        num_boost_round=3000,
        evals=evals_final,
        early_stopping_rounds=100,
        verbose_eval=100
    )
    
    print(f"\nTraining completed")
    print(f"   Best Iteration: {model_final.best_iteration}")
    print(f"   Best Test Loss: {model_final.best_score:.6f}")
    
    return model_final, X_train_val, y_train_val

model_final, X_train_val, y_train_val = train_final_model(
    X_train_enh, y_train, X_val_enh, y_validate, X_test_enh, y_test,
    best_params, class_weights, device
)


PHASE 2: FINAL TRAINING

Combined dataset (Train+Val): 7650 samples

Final training in progress...

[0]	train-mlogloss:1.07581	test-mlogloss:1.07869
[100]	train-mlogloss:0.45049	test-mlogloss:0.61102
[200]	train-mlogloss:0.35567	test-mlogloss:0.57265
[300]	train-mlogloss:0.30371	test-mlogloss:0.55954
[400]	train-mlogloss:0.26578	test-mlogloss:0.55399
[500]	train-mlogloss:0.23610	test-mlogloss:0.55258
[590]	train-mlogloss:0.21388	test-mlogloss:0.55362

Training completed
   Best Iteration: 490
   Best Test Loss: 0.551759


# FINAL EVALUATION ON TEST SET

In [31]:
def evaluate_final_model(model, X_test, y_test):
    """Evaluate final model on Test Set"""
    print("\n" + "=" * 80)
    print("FINAL EVALUATION ON TEST SET")
    print("=" * 80)
    
    dtest = xgb.DMatrix(X_test, label=y_test)
    
    # Predictions
    y_pred_proba = model.predict(dtest, 
                                  iteration_range=(0, model.best_iteration))
    y_pred = y_pred_proba.argmax(axis=1)
    
    # Global metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', 
                                zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', 
                         zero_division=0)
    f1_macro = f1_score(y_test, y_pred, average='macro', zero_division=0)
    f1_weighted = f1_score(y_test, y_pred, average='weighted', 
                          zero_division=0)
    
    print("\nPerformance Metrics:")
    print(f"   Accuracy:           {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"   Precision (Avg):    {precision:.4f}")
    print(f"   Recall (Avg):       {recall:.4f}")
    print(f"   F1-Score (Macro):   {f1_macro:.4f}")
    print(f"   F1-Score (Weighted): {f1_weighted:.4f}")
    
    print("\nClassification Report:")
    print(classification_report(
        y_test, y_pred,
        target_names=['False Positive', 'Candidate', 'Confirmed'],
        digits=4,
        zero_division=0
    ))
    
    # Class 1 (Candidate) performance
    class1_mask = y_test == 1
    if class1_mask.sum() > 0:
        class1_acc = accuracy_score(y_test[class1_mask], y_pred[class1_mask])
        print(f"\nCLASS 1 (Candidate) PERFORMANCE: "
              f"{class1_acc:.4f} ({class1_acc*100:.2f}%)")
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'y_pred': y_pred
    }

metrics = evaluate_final_model(model_final, X_test_enh, y_test)


FINAL EVALUATION ON TEST SET

Performance Metrics:
   Accuracy:           0.7513 (75.13%)
   Precision (Avg):    0.7879
   Recall (Avg):       0.7513
   F1-Score (Macro):   0.7358
   F1-Score (Weighted): 0.7610

Classification Report:
                precision    recall  f1-score   support

False Positive     0.8878    0.7190    0.7945       484
     Candidate     0.4930    0.7121    0.5826       198
     Confirmed     0.8244    0.8364    0.8303       275

      accuracy                         0.7513       957
     macro avg     0.7350    0.7558    0.7358       957
  weighted avg     0.7879    0.7513    0.7610       957


CLASS 1 (Candidate) PERFORMANCE: 0.7121 (71.21%)


# VISUALISATIONS

In [32]:
def create_visualizations(model, y_test, y_pred):
    """Create model visualizations"""
    print("\n" + "=" * 80)
    print("9. VISUALIZATIONS")
    print("=" * 80)
    
    # 1. Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['False Positive', 'Candidate', 'Confirmed'],
                yticklabels=['False Positive', 'Candidate', 'Confirmed'])
    plt.title('Confusion Matrix - Test Set', fontsize=16, fontweight='bold')
    plt.ylabel('True Class')
    plt.xlabel('Predicted Class')
    plt.tight_layout()
    plt.savefig(MODEL_PATH / 'confusion_matrix_final.png', dpi=300)
    print("   [Saved] confusion_matrix_final.png")
    plt.close()
    
    # 2. Feature importance
    importance_dict = model.get_score(importance_type='gain')
    feature_importance = pd.DataFrame({
        'feature': list(importance_dict.keys()),
        'importance': list(importance_dict.values())
    }).sort_values('importance', ascending=False)
    
    print("\nTOP 15 FEATURES:")
    print(feature_importance.head(15).to_string(index=False))
    
    plt.figure(figsize=(12, 8))
    top_20 = feature_importance.head(20)
    plt.barh(top_20['feature'], top_20['importance'], color='steelblue')
    plt.xlabel('Importance (Gain)')
    plt.title('Top 20 Features', fontsize=16, fontweight='bold')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.savefig(MODEL_PATH / 'feature_importance_final.png', dpi=300)
    print("   [Saved] feature_importance_final.png")
    plt.close()

create_visualizations(model_final, y_test, metrics['y_pred'])


9. VISUALIZATIONS
   [Saved] confusion_matrix_final.png

TOP 15 FEATURES:
              feature  importance
        koi_model_snr    7.934336
              snr_log    7.769337
             koi_prad    6.345869
    koi_duration_err1    4.268429
          snr_squared    4.063711
        koi_prad_err1    3.747423
           koi_period    3.699139
      koi_period_err1    2.630286
        koi_prad_err2    2.568449
         koi_duration    2.511777
        koi_srad_err1    2.163721
       koi_depth_err1    2.036059
            koi_depth    1.847937
transit_detectability    1.789605
             koi_srad    1.777681
   [Saved] feature_importance_final.png


# SAVE

In [33]:
def save_model_and_metadata(model, best_params, metrics, class_weights, 
                            X_train_val, X_test):
    """Save model and metadata"""
    print("\n" + "=" * 80)
    print("10. SAVING MODEL")
    print("=" * 80)
    
    # Save XGBoost model (JSON format)
    model.save_model(str(MODEL_PATH / 'exoplanet_xgboost_final.json'))
    print("   [Saved] exoplanet_xgboost_final.json")
    
    # Save as pickle for ensemble compatibility
    with open(MODEL_PATH / 'xgboost_model_v1.pkl', 'wb') as f:
        pickle.dump(model, f)
    print("   [Saved] xgboost_model_v1.pkl")
    
    # Metadata
    metadata = {
        'timestamp': datetime.now().isoformat(),
        'xgboost_version': xgb.__version__,
        'device': device,
        'training_strategy': 'Train+Val combined for final model',
        'best_hyperparameters': {
            'learning_rate': best_params['learning_rate'],
            'max_depth': best_params['max_depth'],
            'min_child_weight': best_params['min_child_weight'],
            'subsample': best_params['subsample'],
            'colsample_bytree': best_params['colsample_bytree']
        },
        'best_iteration': int(model.best_iteration),
        'best_score': float(model.best_score),
        'test_metrics': {
            'accuracy': float(metrics['accuracy']),
            'precision': float(metrics['precision']),
            'recall': float(metrics['recall']),
            'f1_macro': float(metrics['f1_macro']),
            'f1_weighted': float(metrics['f1_weighted'])
        },
        'class_weights': {str(k): float(v) for k, v in class_weights.items()},
        'features_engineered': [
            'transit_depth_duration_ratio',
            'snr_log',
            'snr_squared',
            'transit_detectability'
        ],
        'training_samples': int(X_train_val.shape[0]),
        'test_samples': int(X_test.shape[0])
    }
    
    with open(MODEL_PATH / 'model_metadata_final.json', 'w') as f:
        json.dump(metadata, f, indent=4)
    print("   [Saved] model_metadata_final.json")

save_model_and_metadata(model_final, best_params, metrics, class_weights,
                        X_train_val, X_test_enh)

print("\n" + "=" * 80)
print("TRAINING COMPLETED SUCCESSFULLY")
print("=" * 80)
print(f"\nModel saved in: {MODEL_PATH}")
print("Ready for ensemble integration")


10. SAVING MODEL
   [Saved] exoplanet_xgboost_final.json
   [Saved] xgboost_model_v1.pkl
   [Saved] model_metadata_final.json

TRAINING COMPLETED SUCCESSFULLY

Model saved in: xgboost_model
Ready for ensemble integration
