# XGBoost

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, confusion_matrix, classification_report)
import xgboost as xgb
from xgboost import XGBClassifier
import time
import warnings
warnings.filterwarnings('ignore')

# ====================================================================
# CUDA CHECK AND CONFIGURATION
# ====================================================================

print("=" * 80, "V√âRIFICATION CUDA", "=" * 80, sep="\n")

# Report the installed XGBoost version followed by its build-info
# dictionary, which is where the library's compile-time flags are listed.
print(f"XGBoost version: {xgb.__version__}", "XGBoost build info:", sep="\n")
print(xgb.build_info())

# Probe CUDA through PyTorch. PyTorch is optional here: when it cannot be
# imported, a notice is printed and the script carries on.
try:
    import torch
    cuda_available = torch.cuda.is_available()
    if not cuda_available:
        print("\n‚ö†Ô∏è  CUDA non disponible avec PyTorch")
    else:
        print("\n‚úÖ CUDA disponible!")
        print(f"   ‚Ä¢ Nombre de GPUs: {torch.cuda.device_count()}")
        print(f"   ‚Ä¢ GPU actuel: {torch.cuda.get_device_name(0)}")
        print(f"   ‚Ä¢ Version CUDA: {torch.version.cuda}")
except ImportError:
    print("\n‚ö†Ô∏è  PyTorch non install√© (optionnel pour la v√©rification)")

# ====================================================================
# STEP 7 - XGBOOST WITH GRID SEARCH AND GPU (CUDA)
# ====================================================================

print("\n" + "=" * 80, "√âTAPE 7 - XGBOOST AVEC GRIDSEARCH (GPU ACCELERATED)", "=" * 80, sep="\n")

# 1. DATA LOADING
print("\n1. CHARGEMENT DES DONN√âES", "-" * 80, sep="\n")

# Feature matrices produced by the previous pipeline step (step 6).
X_train = pd.read_csv('C:/Users/chaym/Desktop/NasaProject/data/processed/step6_X_train.csv')
X_val = pd.read_csv('C:/Users/chaym/Desktop/NasaProject/data/processed/step6_X_val.csv')

# Targets are read as DataFrames and flattened straight away into 1-D arrays.
y_train = pd.read_csv('C:/Users/chaym/Desktop/NasaProject/data/processed/step6_y_train.csv').to_numpy().ravel()
y_val = pd.read_csv('C:/Users/chaym/Desktop/NasaProject/data/processed/step6_y_val.csv').to_numpy().ravel()

# Shape report, one line per loaded object.
for split_name, split_data in (
    ("X_train", X_train),
    ("X_val", X_val),
    ("y_train", y_train),
    ("y_val", y_val),
):
    print(f"‚úì {split_name}: {split_data.shape}")

# 2. HYPERPARAMETER GRID DEFINITION
print("\n2. D√âFINITION DE LA GRILLE D'HYPERPARAM√àTRES", "-" * 80, sep="\n")

# Candidate values explored by the grid search, keyed by XGBClassifier
# parameter name.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.3],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    min_child_weight=[1, 3, 5],
)

print("‚úì Param√®tres √† tester:")
print("\n".join(f"  ‚Ä¢ {name}: {candidates}" for name, candidates in param_grid.items()))

# Size of the full grid: the product of the number of candidates per parameter.
total_combinations = np.prod(list(map(len, param_grid.values())))
print(f"\n‚úì Nombre total de combinaisons: {total_combinations}")

# 3. GRID SEARCH WITH CROSS-VALIDATION (GPU)
print("\n3. GRIDSEARCH AVEC VALIDATION CROIS√âE (GPU)")
print("-"*80)

# Estimator settings shared by the GPU attempt and the CPU fallback.
xgb_common_params = dict(
    objective='multi:softmax',
    num_class=3,
    eval_metric='mlogloss',
    random_state=42,
    tree_method='hist',
    enable_categorical=False,
)

# Search settings shared by both attempts; only n_jobs differs between them.
search_common_params = dict(
    param_grid=param_grid,
    cv=5,
    scoring='f1_weighted',
    verbose=2,
    return_train_score=True,
)

# GPU estimator. Since XGBoost 2.0 the GPU is selected with the `device`
# parameter together with tree_method='hist'; the former
# tree_method='gpu_hist', gpu_id and predictor arguments are deprecated or
# no longer used. This script silences warnings at import time, so the
# deprecation messages would otherwise go unnoticed.
base_model = XGBClassifier(**xgb_common_params, device='cuda:0')

# n_jobs=1 for the GPU run: the fits are executed one at a time.
grid_search = GridSearchCV(estimator=base_model, n_jobs=1, **search_common_params)

print("üîç D√©marrage du GridSearch avec GPU...")
print("   (Devrait √™tre beaucoup plus rapide qu'avec CPU)")
start_time = time.time()

try:
    grid_search.fit(X_train, y_train)
    elapsed_time = time.time() - start_time
    print(f"\n‚úÖ GridSearch termin√© en {elapsed_time/60:.2f} minutes")
    gpu_success = True
except Exception as e:
    # Deliberately broad best-effort handler: any failure of the GPU run
    # triggers a second attempt on the CPU instead of aborting the script.
    print(f"\n‚ùå Erreur GPU: {e}")
    print("‚ö†Ô∏è  Retour au mode CPU...")

    # CPU fallback: same estimator settings without a device, and the
    # search is parallelised across all cores (n_jobs=-1).
    base_model = XGBClassifier(**xgb_common_params)
    grid_search = GridSearchCV(estimator=base_model, n_jobs=-1, **search_common_params)

    grid_search.fit(X_train, y_train)
    # start_time is not reset, so this duration also includes the time
    # spent on the failed GPU attempt.
    elapsed_time = time.time() - start_time
    print(f"\n‚úì GridSearch termin√© (CPU) en {elapsed_time/60:.2f} minutes")
    gpu_success = False

# 4. BEST PARAMETERS
print("\n4. MEILLEURS PARAM√àTRES TROUV√âS", "-" * 80, sep="\n")

# Winning parameter combination, one parameter per line.
print("‚úì Meilleurs hyperparam√®tres:")
for param, value in grid_search.best_params_.items():
    print(f"  ‚Ä¢ {param}: {value}")

# Mean cross-validated score of that combination (scoring is f1_weighted).
best_cv_score = grid_search.best_score_
print(f"\n‚úì Meilleur score (F1 weighted) CV: {best_cv_score:.4f}")

# Dump the complete cross-validation results table to CSV.
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results.to_csv(
    'C:/Users/chaym/Desktop/NasaProject/data/processed/step7_gridsearch_results.csv',
    index=False,
)
print("‚úì R√©sultats GridSearch sauvegard√©s: step7_gridsearch_results.csv")

# 5. FINAL MODEL TRAINING WITH LOSS CURVES (GPU)
print("\n5. ENTRA√éNEMENT DU MOD√àLE FINAL AVEC EVAL SET (GPU)")
print("-"*80)

# Final estimator: the best grid-search parameters plus the fixed settings
# used during the search, with early stopping after 20 rounds without
# improvement.
final_model_params = dict(
    **grid_search.best_params_,
    objective='multi:softmax',
    num_class=3,
    eval_metric='mlogloss',
    random_state=42,
    tree_method='hist',
    enable_categorical=False,
    early_stopping_rounds=20,
)

if gpu_success:
    # Since XGBoost 2.0 the GPU is selected with `device`; the former
    # tree_method='gpu_hist', gpu_id and predictor arguments are deprecated
    # or no longer used.
    final_model_params['device'] = 'cuda:0'

best_model = XGBClassifier(**final_model_params)

# Both splits are passed as evaluation sets so the per-iteration loss can be
# plotted later: train is recorded first, validation second.
eval_set = [(X_train, y_train), (X_val, y_val)]
eval_names = ['train', 'val']  # not referenced again in the visible script

best_model.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    verbose=False
)

# n_estimators is the configured number of boosting rounds; best_iteration
# is the round retained by early stopping.
print(f"‚úì Mod√®le entra√Æn√© avec {best_model.n_estimators} estimateurs")
print(f"‚úì Meilleure it√©ration: {best_model.best_iteration}")
print(f"‚úì Acc√©l√©ration: {'GPU (CUDA)' if gpu_success else 'CPU'}")

# 6. PREDICTIONS
print("\n6. PR√âDICTIONS SUR TRAIN ET VAL", "-" * 80, sep="\n")

# Predict the training split first, then the validation split.
y_train_pred, y_val_pred = (best_model.predict(features) for features in (X_train, X_val))

print("‚úì Pr√©dictions effectu√©es")

# 7. EVALUATION METRICS
print("\n7. M√âTRIQUES D'√âVALUATION", "-" * 80, sep="\n")

def calculate_metrics(y_true, y_pred, dataset_name):
    """Compute, print and return the headline classification metrics.

    Accuracy is computed as-is; precision, recall and F1 use weighted
    averaging with ``zero_division=0``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        dataset_name: Name printed as the heading and stored under
            ``'Dataset'`` in the returned dictionary.

    Returns:
        A dict with the keys ``'Dataset'``, ``'Accuracy'``, ``'Precision'``,
        ``'Recall'`` and ``'F1-Score'``.
    """
    weighted = {'average': 'weighted', 'zero_division': 0}
    scores = {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, **weighted),
        'Recall': recall_score(y_true, y_pred, **weighted),
        'F1-Score': f1_score(y_true, y_pred, **weighted),
    }

    print(f"\n{dataset_name}:")
    for metric_name, score in scores.items():
        # "<name>:" is padded to 11 characters so the values line up.
        print(f"  {metric_name + ':':<11}{score:.4f}")

    return {'Dataset': dataset_name, **scores}

# Headline metrics for each split (also printed by calculate_metrics).
metrics_train = calculate_metrics(y_train, y_train_pred, "TRAIN")
metrics_val = calculate_metrics(y_val, y_val_pred, "VAL")

# Save the metrics, one row per split.
metrics_df = pd.DataFrame([metrics_train, metrics_val])
metrics_df.to_csv('C:/Users/chaym/Desktop/NasaProject/data/processed/step7_metrics.csv', index=False)
print("\n‚úì M√©triques sauvegard√©es: step7_metrics.csv")

# 8. DETAILED CLASSIFICATION REPORT
print("\n8. CLASSIFICATION REPORT D√âTAILL√â")
print("-"*80)

# Display names in class-index order (index 0 first).
label_names = ['FALSE POSITIVE', 'CANDIDATE', 'CONFIRMED']
# Explicit integer labels 0..2, matching the three-class model. Without
# `labels`, scikit-learn infers the classes from the data and raises a
# ValueError when a class is absent from a split, because the inferred
# count no longer matches target_names.
class_labels = list(range(len(label_names)))

print("\nTRAIN SET:")
print(classification_report(y_train, y_train_pred, labels=class_labels,
                            target_names=label_names, digits=4, zero_division=0))

print("\nVAL SET:")
print(classification_report(y_val, y_val_pred, labels=class_labels,
                            target_names=label_names, digits=4, zero_division=0))

# 9. LOSS CURVE VISUALISATION
print("\n9. VISUALISATION DES COURBES DE LOSS", "-" * 80, sep="\n")

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Per-iteration evaluation history recorded during fit: 'validation_0' is
# the first eval set (train), 'validation_1' the second (validation).
results = best_model.evals_result()
train_loss = results['validation_0']['mlogloss']
val_loss = results['validation_1']['mlogloss']
best_iter = best_model.best_iteration

# Left panel: the full loss history, with the best iteration marked.
ax1 = axes[0]
epochs = range(len(train_loss))
ax1.plot(epochs, train_loss, label='Train Loss', linewidth=2, color='#3498db')
ax1.plot(epochs, val_loss, label='Val Loss', linewidth=2, color='#e74c3c')
ax1.axvline(x=best_iter, color='green', linestyle='--',
            label=f'Best Iteration ({best_iter})', linewidth=2)

# Right panel: zoom on (at most) the last 100 recorded iterations.
ax2 = axes[1]
start_idx = max(0, len(epochs) - 100)
ax2.plot(epochs[start_idx:], train_loss[start_idx:],
         label='Train Loss', linewidth=2, color='#3498db')
ax2.plot(epochs[start_idx:], val_loss[start_idx:],
         label='Val Loss', linewidth=2, color='#e74c3c')
# Only mark the best iteration when it falls inside the zoomed window.
if best_iter >= start_idx:
    ax2.axvline(x=best_iter, color='green', linestyle='--',
                label='Best Iteration', linewidth=2)

# Axis decoration common to both panels; only the title differs.
axis_label_font = dict(fontsize=12, fontweight='bold')
panel_titles = (
    'Courbes de Loss - Train vs Val',
    'Courbes de Loss - Zoom (derni√®res 100 it√©rations)',
)
for panel, panel_title in zip((ax1, ax2), panel_titles):
    panel.set_xlabel('Iterations', **axis_label_font)
    panel.set_ylabel('Multiclass Log Loss', **axis_label_font)
    panel.set_title(panel_title, fontsize=14, fontweight='bold')
    panel.legend(fontsize=10)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('C:/Users/chaym/Desktop/NasaProject/data/processed/step7_loss_curves.png', dpi=300, bbox_inches='tight')
print("‚úì Courbes de loss sauvegard√©es: step7_loss_curves.png")
plt.close()

# 10. CONFUSION MATRICES
print("\n10. MATRICES DE CONFUSION")
print("-"*80)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Explicit integer labels 0..2 keep both matrices at a fixed 3x3 shape whose
# rows and columns line up with label_names. Without `labels`, a class that
# is absent from a split is dropped from the matrix and the three tick
# labels no longer match its rows and columns.
class_labels = list(range(len(label_names)))
cm_train = confusion_matrix(y_train, y_train_pred, labels=class_labels)
cm_val = confusion_matrix(y_val, y_val_pred, labels=class_labels)

# One heatmap per split: TRAIN on the left (blue), VAL on the right (orange).
for ax, cm, split_name, cmap in zip(axes, (cm_train, cm_val), ('TRAIN', 'VAL'), ('Blues', 'Oranges')):
    sns.heatmap(cm, annot=True, fmt='d', cmap=cmap,
                xticklabels=label_names, yticklabels=label_names,
                ax=ax, cbar_kws={'label': 'Nombre de pr√©dictions'})
    ax.set_title(f'Matrice de Confusion - {split_name}', fontsize=14, fontweight='bold')
    ax.set_ylabel('Vraie Classe', fontsize=12, fontweight='bold')
    ax.set_xlabel('Classe Pr√©dite', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig('C:/Users/chaym/Desktop/NasaProject/data/processed/step7_confusion_matrices.png', dpi=300, bbox_inches='tight')
print("‚úì Matrices de confusion sauvegard√©es: step7_confusion_matrices.png")
plt.close()

# 11. FEATURE IMPORTANCE
print("\n11. IMPORTANCE DES FEATURES", "-" * 80, sep="\n")

# Pair each training column with the model's importance score, then rank
# the rows from most to least important.
importance_table = pd.DataFrame({
    'feature': X_train.columns,
    'importance': best_model.feature_importances_,
})
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print("\nTop 10 features les plus importantes:")
print(feature_importance.head(10).to_string(index=False))

# Horizontal bar chart of the 15 highest-ranked features.
top_features = feature_importance.head(15)
bar_positions = range(len(top_features))

plt.figure(figsize=(12, 8))
plt.barh(bar_positions, top_features['importance'], color='steelblue')
plt.yticks(bar_positions, top_features['feature'])
plt.title('Top 15 Features les Plus Importantes (XGBoost)', fontsize=14, fontweight='bold')
plt.xlabel('Importance', fontsize=12, fontweight='bold')
plt.ylabel('Features', fontsize=12, fontweight='bold')
# Flip the y axis so the highest-ranked feature is drawn at the top.
plt.gca().invert_yaxis()
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.savefig('C:/Users/chaym/Desktop/NasaProject/data/processed/step7_feature_importance.png', dpi=300, bbox_inches='tight')
print("‚úì Feature importance sauvegard√©e: step7_feature_importance.png")
plt.close()

# Also save the full ranking as CSV.
feature_importance.to_csv(
    'C:/Users/chaym/Desktop/NasaProject/data/processed/step7_feature_importance.csv',
    index=False,
)
print("‚úì Feature importance CSV: step7_feature_importance.csv")

# 12. TRAIN VS VAL COMPARISON
print("\n12. COMPARAISON TRAIN VS VAL", "-" * 80, sep="\n")

fig, ax = plt.subplots(figsize=(12, 6))

# Metrics to compare, in display order, with the score of each split.
metrics_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
train_scores = [metrics_train[name] for name in metrics_names]
val_scores = [metrics_val[name] for name in metrics_names]

# Grouped bars: one group per metric, train on the left and val on the right.
x = np.arange(len(metrics_names))
width = 0.35
half_width = width / 2

bars1 = ax.bar(x - half_width, train_scores, width, label='Train', color='#3498db', alpha=0.8)
bars2 = ax.bar(x + half_width, val_scores, width, label='Val', color='#e74c3c', alpha=0.8)

ax.set_title('Comparaison des M√©triques - Train vs Val', fontsize=14, fontweight='bold')
ax.set_ylabel('Score', fontsize=12, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(metrics_names)
# Upper limit slightly above 1.0 leaves room for the value labels.
ax.set_ylim(0, 1.1)
ax.grid(axis='y', alpha=0.3)
ax.legend(fontsize=11)

# Write each bar's value just above it.
for bar in (*bars1, *bars2):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2.0, height,
            f'{height:.3f}', ha='center', va='bottom', fontsize=9, fontweight='bold')

plt.tight_layout()
plt.savefig('C:/Users/chaym/Desktop/NasaProject/data/processed/step7_metrics_comparison.png', dpi=300, bbox_inches='tight')
print("‚úì Comparaison des m√©triques sauvegard√©e: step7_metrics_comparison.png")
plt.close()

# 13. SAVING THE MODEL
print("\n13. SAUVEGARDE DU MOD√àLE")
print("-"*80)

# joblib is imported here, at its only point of use in the visible script,
# rather than with the imports at the top.
import joblib
# Serialise the fitted estimator object to disk with joblib.
# NOTE(review): a pickled estimator is presumably tied to the installed
# xgboost/scikit-learn versions; consider also exporting with
# best_model.save_model(...) if the model must be reloaded elsewhere - confirm.
joblib.dump(best_model, 'C:/Users/chaym/Desktop/NasaProject/data/processed/step7_xgboost_model.pkl')
print("‚úì Mod√®le sauvegard√©: step7_xgboost_model.pkl")

# 14. FINAL SUMMARY
print("\n" + "="*80)
print("R√âSUM√â FINAL - XGBOOST")
print("="*80)

# Report the CUDA version XGBoost was actually built against instead of a
# hard-coded one: build_info() exposes it as a list such as [12, 4]. When
# the key is missing, fall back to a version-less label.
cuda_version = xgb.build_info().get('CUDA_VERSION')
if cuda_version:
    gpu_label = f"GPU (CUDA {'.'.join(str(part) for part in cuda_version)})"
else:
    gpu_label = 'GPU (CUDA)'

print(f"\nüöÄ ACC√âL√âRATION: {gpu_label if gpu_success else 'CPU (Fallback)'}")
# elapsed_time is the duration measured around the grid search, not the
# runtime of the whole script.
print(f"‚è±Ô∏è  TEMPS D'EX√âCUTION: {elapsed_time/60:.2f} minutes")

print(f"\nüéØ MEILLEURS HYPERPARAM√àTRES:")
for param, value in grid_search.best_params_.items():
    print(f"  ‚Ä¢ {param}: {value}")

print(f"\nüìä PERFORMANCES:")
print(f"  TRAIN:")
print(f"    ‚Ä¢ Accuracy:  {metrics_train['Accuracy']:.4f}")
print(f"    ‚Ä¢ F1-Score:  {metrics_train['F1-Score']:.4f}")
print(f"  VAL:")
print(f"    ‚Ä¢ Accuracy:  {metrics_val['Accuracy']:.4f}")
print(f"    ‚Ä¢ F1-Score:  {metrics_val['F1-Score']:.4f}")

# Overfitting indicator: weighted-F1 gap between train and validation.
# Below 0.05 is reported as fine, below 0.10 as slight overfitting, and
# anything larger as overfitting.
overfitting = metrics_train['F1-Score'] - metrics_val['F1-Score']
print(f"\n  üìâ √âcart Train-Val (F1): {overfitting:.4f}")
if overfitting < 0.05:
    print(f"     ‚úÖ Pas d'overfitting significatif!")
elif overfitting < 0.10:
    print(f"     ‚ö†Ô∏è  L√©ger overfitting")
else:
    print(f"     ‚ùå Overfitting d√©tect√©!")

print("\n" + "="*80)
print("√âTAPE 7 TERMIN√âE ‚úì")
print("="*80)

# Recap of the artefacts written by this step.
print("\nüìÅ Fichiers g√©n√©r√©s:")
print("  1. step7_gridsearch_results.csv - R√©sultats du GridSearch")
print("  2. step7_metrics.csv - M√©triques Train/Val")
print("  3. step7_loss_curves.png - Courbes de loss")
print("  4. step7_confusion_matrices.png - Matrices de confusion")
print("  5. step7_feature_importance.png - Importance des features")
print("  6. step7_feature_importance.csv - Importance (CSV)")
print("  7. step7_metrics_comparison.png - Comparaison m√©triques")
print("  8. step7_xgboost_model.pkl - Mod√®le entra√Æn√©")

# Suggested follow-up work.
print("\nüí° Prochaines √©tapes:")
print("  1. Analyser les r√©sultats et l'overfitting")
print("  2. Tester d'autres mod√®les (Random Forest, etc.)")
print("  3. √âvaluation finale sur le test set")
print("  4. Interpr√©tation des features importantes")

VÉRIFICATION CUDA
XGBoost version: 2.1.4
XGBoost build info:
{'BUILTIN_PREFETCH_PRESENT': False, 'CUDA_VERSION': [12, 4], 'DEBUG': False, 'MM_PREFETCH_PRESENT': True, 'THRUST_VERSION': [2, 3, 2], 'USE_CUDA': True, 'USE_DLOPEN_NCCL': False, 'USE_FEDERATED': False, 'USE_NCCL': False, 'USE_OPENMP': True, 'USE_RMM': False, 'libxgboost': 'c:\\Users\\chaym\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\xgboost\\lib\\xgboost.dll'}

⚠️  PyTorch non installé (optionnel pour la vérification)

ÉTAPE 7 - XGBOOST AVEC GRIDSEARCH (GPU ACCELERATED)

1. CHARGEMENT DES DONNÉES
--------------------------------------------------------------------------------
✓ X_train: (7288, 27)
✓ X_val: (911, 27)
✓ y_train: (7288,)
✓ y_val: (911,)

2. DÉFINITION DE LA GRILLE D'HYPERPARAMÈTRES
--------------------------------------------------------------------------------
✓ Paramètres à tester:
  • n_estimators: [100, 200, 300]
  • max_depth: [3, 5, 7]
  ‚Ä¢ learning_rate: [