# Hyperparameter Tuning with GridSearch

In [7]:
"""
SCRIPT 1: RECHERCHE D'HYPERPARAMÈTRES AVEC RANDOMIZED SEARCH (GPU)
==================================================================
Recherche sur Train, évaluation sur Val (SANS cross-validation)
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_sample_weight
from xgboost import XGBClassifier
import xgboost as xgb
import time
import json
import warnings
warnings.filterwarnings('ignore')

# ====================================================================
# CONFIGURATION
# ====================================================================
BASE_PATH = 'C:/Users/chaym/Desktop/NasaChallenge'
DATA_PATH = f'{BASE_PATH}/data/processed'
MODEL_PATH = f'{BASE_PATH}/models/xgboost_model'

print("="*80)
print("SCRIPT 1: RECHERCHE D'HYPERPARAMÈTRES (GPU - SANS CV)")
print("="*80)

# ====================================================================
# VÉRIFICATION CUDA
# ====================================================================
print("\n1. VÉRIFICATION CUDA")
print("-"*80)

print(f"XGBoost version: {xgb.__version__}")
build_info = xgb.build_info()

if not build_info.get('USE_CUDA', False):
    print("ERREUR: XGBoost n'a pas été compilé avec CUDA")
    exit(1)
print("Support GPU: OUI")

# ====================================================================
# CHARGEMENT DES DONNÉES
# ====================================================================
print("\n2. CHARGEMENT DES DONNÉES")
print("-"*80)

X_train = pd.read_csv(f'{DATA_PATH}/step6_X_train.csv')
X_val = pd.read_csv(f'{DATA_PATH}/step6_X_val.csv')
y_train = pd.read_csv(f'{DATA_PATH}/step6_y_train.csv').values.ravel()
y_val = pd.read_csv(f'{DATA_PATH}/step6_y_val.csv').values.ravel()

print(f"X_train: {X_train.shape}")
print(f"X_val: {X_val.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_val: {y_val.shape}")

print("\nDistribution des classes (Train):")
train_dist = pd.Series(y_train).value_counts().sort_index()
for class_name, count in train_dist.items():
    print(f"  {class_name}: {count} ({count/len(y_train)*100:.1f}%)")

# ====================================================================
# POIDS DES CLASSES
# ====================================================================
print("\n3. CALCUL DES POIDS DES CLASSES")
print("-"*80)

sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)
print("Poids calculés pour gérer le déséquilibre")

# ====================================================================
# RECHERCHE D'HYPERPARAMÈTRES MANUELLE
# ====================================================================
print("\n4. RECHERCHE D'HYPERPARAMÈTRES (TRAIN → VAL)")
print("-"*80)

param_dist = {
    'n_estimators': [100, 120, 150, 200],
    'max_depth': [5, 7, 9, 11],
    'learning_rate': [0.01, 0.02, 0.03, 0.05],
    'subsample': [0.6, 0.7, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 1.0],
    'min_child_weight': [1, 3, 5, 7],
    'reg_alpha': [0, 0.1, 0.5, 1.0],
    'reg_lambda': [1.0, 2.0, 3.0, 5.0],
    'gamma': [0, 0.1, 0.3, 0.5]
}

n_iter = 1000
print(f"Nombre de combinaisons à tester: {n_iter}")
print("Évaluation: Train → Val (pas de cross-validation)")

# Configuration GPU de base
base_params = {
    'objective': 'multi:softmax',
    'num_class': 3,
    'eval_metric': 'mlogloss',
    'random_state': 42,
    'tree_method': 'gpu_hist',
    'gpu_id': 0,
    'predictor': 'gpu_predictor',
    'enable_categorical': False
}

results = []
best_val_f1 = 0
best_params = None
best_model = None

np.random.seed(42)
start_time = time.time()

print("\nDémarrage de la recherche...")

for i in range(n_iter):
    # Génération aléatoire des hyperparamètres
    params = {
        'n_estimators': int(np.random.choice(param_dist['n_estimators'])),
        'max_depth': int(np.random.choice(param_dist['max_depth'])),
        'learning_rate': float(np.random.choice(param_dist['learning_rate'])),
        'subsample': float(np.random.choice(param_dist['subsample'])),
        'colsample_bytree': float(np.random.choice(param_dist['colsample_bytree'])),
        'min_child_weight': int(np.random.choice(param_dist['min_child_weight'])),
        'reg_alpha': float(np.random.choice(param_dist['reg_alpha'])),
        'reg_lambda': float(np.random.choice(param_dist['reg_lambda'])),
        'gamma': float(np.random.choice(param_dist['gamma']))
    }
    
    # Combiner avec les paramètres de base
    all_params = {**base_params, **params}
    
    # Entraînement
    model = XGBClassifier(**all_params)
    model.fit(X_train, y_train, sample_weight=sample_weights, verbose=False)
    
    # Prédictions
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    
    # Calcul des scores
    train_f1 = f1_score(y_train, y_train_pred, average='weighted')
    val_f1 = f1_score(y_val, y_val_pred, average='weighted')
    
    # Sauvegarde des résultats
    results.append({
        'iteration': i + 1,
        'train_f1': train_f1,
        'val_f1': val_f1,
        'overfitting': train_f1 - val_f1,
        **params
    })
    
    # Mise à jour du meilleur modèle
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        best_params = params.copy()
        best_model = model
        print(f"  [{i+1:4d}/{n_iter}] ⭐ NOUVEAU MEILLEUR: Val F1 = {val_f1:.4f} (Train = {train_f1:.4f})")
    elif (i + 1) % 50 == 0:
        print(f"  [{i+1:4d}/{n_iter}] Val F1 = {val_f1:.4f} (Meilleur = {best_val_f1:.4f})")

elapsed_time = time.time() - start_time

# ====================================================================
# MEILLEURS RÉSULTATS
# ====================================================================
print("\n5. MEILLEURS HYPERPARAMÈTRES TROUVÉS")
print("-"*80)

print("Meilleurs hyperparamètres:")
for k, v in sorted(best_params.items()):
    print(f"  {k}: {v}")

print(f"\nMeilleur F1 sur validation: {best_val_f1:.4f}")
print(f"Temps de recherche: {elapsed_time/60:.2f} minutes")

# Sauvegarde JSON
config = {
    'best_params': best_params,
    'best_val_f1': float(best_val_f1),
    'search_time_minutes': float(elapsed_time/60),
    'n_iter': int(n_iter),
    'cross_validation': False,
    'gpu_used': True,
    'sample_weights_used': True,
    'dataset_size': int(len(y_train)),
    'n_features': int(X_train.shape[1])
}

with open(f'{MODEL_PATH}/best_hyperparameters.json', 'w') as f:
    json.dump(config, f, indent=4)
print("\nMeilleurs paramètres sauvegardés: best_hyperparameters.json")

# Sauvegarde de tous les résultats
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('val_f1', ascending=False)
results_df.to_csv(f'{MODEL_PATH}/hyperparameter_search_results.csv', index=False)
print("Tous les résultats: hyperparameter_search_results.csv")

# ====================================================================
# ANALYSE DES RÉSULTATS
# ====================================================================
print("\n6. ANALYSE DES RÉSULTATS")
print("-"*80)

print("\nTop 5 meilleures combinaisons:")
for idx, row in results_df.head(5).iterrows():
    print(f"\nRank {idx+1}:")
    print(f"  Val F1: {row['val_f1']:.4f}")
    print(f"  Train F1: {row['train_f1']:.4f}")
    print(f"  Overfitting: {row['overfitting']:.4f}")

# ====================================================================
# VISUALISATION
# ====================================================================
print("\n7. VISUALISATION")
print("-"*80)

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Distribution des scores Val
ax1 = axes[0, 0]
ax1.hist(results_df['val_f1'], bins=30, edgecolor='black', alpha=0.7, color='steelblue')
ax1.axvline(best_val_f1, color='red', linestyle='--', linewidth=2, 
            label=f'Meilleur: {best_val_f1:.4f}')
ax1.set_xlabel('F1 Score (Validation)', fontsize=12, fontweight='bold')
ax1.set_ylabel('Fréquence', fontsize=12, fontweight='bold')
ax1.set_title('Distribution des Scores Validation', fontsize=14, fontweight='bold')
ax1.legend()
ax1.grid(True, alpha=0.3)

# 2. Top 20 configurations
ax2 = axes[0, 1]
top_20 = results_df.head(20)
ax2.barh(range(len(top_20)), top_20['val_f1'], color='steelblue')
ax2.set_xlabel('F1 Score (Validation)', fontsize=12, fontweight='bold')
ax2.set_ylabel('Rang', fontsize=12, fontweight='bold')
ax2.set_title('Top 20 Configurations', fontsize=14, fontweight='bold')
ax2.invert_yaxis()
ax2.grid(axis='x', alpha=0.3)

# 3. Train vs Val
ax3 = axes[1, 0]
ax3.scatter(results_df['train_f1'], results_df['val_f1'], alpha=0.5, s=30)
ax3.plot([0.5, 1], [0.5, 1], 'r--', linewidth=2, label='Train = Val')
best_row = results_df.iloc[0]
ax3.scatter(best_row['train_f1'], best_row['val_f1'], color='red', s=200, 
            marker='*', label='Meilleur', zorder=5, edgecolor='black', linewidth=2)
ax3.set_xlabel('F1 Score (Train)', fontsize=12, fontweight='bold')
ax3.set_ylabel('F1 Score (Validation)', fontsize=12, fontweight='bold')
ax3.set_title('Train vs Validation', fontsize=14, fontweight='bold')
ax3.legend()
ax3.grid(True, alpha=0.3)

# 4. Overfitting
ax4 = axes[1, 1]
ax4.hist(results_df['overfitting'], bins=30, edgecolor='black', alpha=0.7, color='coral')
ax4.axvline(best_row['overfitting'], color='red', linestyle='--', linewidth=2,
            label=f'Meilleur: {best_row["overfitting"]:.4f}')
ax4.set_xlabel('Overfitting (Train - Val)', fontsize=12, fontweight='bold')
ax4.set_ylabel('Fréquence', fontsize=12, fontweight='bold')
ax4.set_title('Distribution de l\'Overfitting', fontsize=14, fontweight='bold')
ax4.legend()
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(f'{MODEL_PATH}/hyperparameter_search_analysis.png', dpi=300, bbox_inches='tight')
print("Visualisation sauvegardée: hyperparameter_search_analysis.png")
plt.close()

# ====================================================================
# RÉSUMÉ
# ====================================================================
print("\n" + "="*80)
print("RÉSUMÉ - RECHERCHE TERMINÉE")
print("="*80)
print(f"Temps total: {elapsed_time/60:.2f} minutes")
print(f"Combinaisons testées: {n_iter}")
print(f"Meilleur F1 (Val): {best_val_f1:.4f}")
print(f"Cross-validation: NON (split Train/Val)")
print(f"GPU utilisé: OUI")
print("\nMeilleurs hyperparamètres:")
for k, v in sorted(best_params.items()):
    print(f"  {k}: {v}")
print("="*80)
print("\nFichiers générés:")
print("  - best_hyperparameters.json")
print("  - hyperparameter_search_results.csv")
print("  - hyperparameter_search_analysis.png")
print("\nProchaine étape: Entraîner le modèle final avec ces hyperparamètres")
print("="*80)

SCRIPT 1: RECHERCHE D'HYPERPARAMÈTRES (GPU - SANS CV)

1. VÉRIFICATION CUDA
--------------------------------------------------------------------------------
XGBoost version: 2.1.4
Support GPU: OUI

2. CHARGEMENT DES DONNÉES
--------------------------------------------------------------------------------
X_train: (6694, 13)
X_val: (956, 13)
y_train: (6694,)
y_val: (956,)

Distribution des classes (Train):
  0: 3387 (50.6%)
  1: 1385 (20.7%)
  2: 1922 (28.7%)

3. CALCUL DES POIDS DES CLASSES
--------------------------------------------------------------------------------
Poids calculés pour gérer le déséquilibre

4. RECHERCHE D'HYPERPARAMÈTRES (TRAIN → VAL)
--------------------------------------------------------------------------------
Nombre de combinaisons à tester: 1000
Évaluation: Train → Val (pas de cross-validation)

Démarrage de la recherche...
  [   1/1000] ⭐ NOUVEAU MEILLEUR: Val F1 = 0.7879 (Train = 0.8631)
  [   3/1000] ⭐ NOUVEAU MEILLEUR: Val F1 = 0.7937 (Train = 0.8469)
  [

In [8]:
"""
SCRIPT 2: ENTRAÎNEMENT DU MODÈLE FINAL AVEC MEILLEURS HYPERPARAMÈTRES
=====================================================================
Entraîne sur Train+Val, évalue sur Test
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (f1_score, classification_report, confusion_matrix, 
                             accuracy_score, precision_score, recall_score)
from sklearn.utils.class_weight import compute_sample_weight
from xgboost import XGBClassifier
import xgboost as xgb
import joblib
import json
import warnings
warnings.filterwarnings('ignore')

# ====================================================================
# CONFIGURATION
# ====================================================================
BASE_PATH = 'C:/Users/chaym/Desktop/NasaChallenge'
DATA_PATH = f'{BASE_PATH}/data/processed'
MODEL_PATH = f'{BASE_PATH}/models/xgboost_model'

print("="*80)
print("SCRIPT 2: ENTRAÎNEMENT DU MODÈLE FINAL")
print("="*80)

# ====================================================================
# VÉRIFICATION CUDA
# ====================================================================
print("\n1. VÉRIFICATION CUDA")
print("-"*80)

print(f"XGBoost version: {xgb.__version__}")
build_info = xgb.build_info()

if not build_info.get('USE_CUDA', False):
    print("ERREUR: XGBoost n'a pas été compilé avec CUDA")
    exit(1)
print("Support GPU: OUI")

# ====================================================================
# CHARGEMENT DES DONNÉES
# ====================================================================
print("\n2. CHARGEMENT DES DONNÉES")
print("-"*80)

X_train = pd.read_csv(f'{DATA_PATH}/step6_X_train.csv')
X_val = pd.read_csv(f'{DATA_PATH}/step6_X_val.csv')
X_test = pd.read_csv(f'{DATA_PATH}/step6_X_test.csv')

y_train = pd.read_csv(f'{DATA_PATH}/step6_y_train.csv').values.ravel()
y_val = pd.read_csv(f'{DATA_PATH}/step6_y_val.csv').values.ravel()
y_test = pd.read_csv(f'{DATA_PATH}/step6_y_test.csv').values.ravel()

print(f"X_train: {X_train.shape}")
print(f"X_val: {X_val.shape}")
print(f"X_test: {X_test.shape}")

# Combiner Train + Val pour l'entraînement final
X_train_full = pd.concat([X_train, X_val], axis=0)
y_train_full = np.concatenate([y_train, y_val])

print(f"\nX_train_full (Train+Val): {X_train_full.shape}")
print(f"y_train_full (Train+Val): {y_train_full.shape}")

print("\nDistribution des classes (Train+Val):")
train_full_dist = pd.Series(y_train_full).value_counts().sort_index()
for class_name, count in train_full_dist.items():
    print(f"  Classe {class_name}: {count} ({count/len(y_train_full)*100:.1f}%)")

print("\nDistribution des classes (Test):")
test_dist = pd.Series(y_test).value_counts().sort_index()
for class_name, count in test_dist.items():
    print(f"  Classe {class_name}: {count} ({count/len(y_test)*100:.1f}%)")

# ====================================================================
# CALCUL DES POIDS
# ====================================================================
print("\n3. CALCUL DES POIDS DES CLASSES")
print("-"*80)

sample_weights = compute_sample_weight(class_weight='balanced', y=y_train_full)
print("Poids calculés pour gérer le déséquilibre des classes")

# ====================================================================
# MEILLEURS HYPERPARAMÈTRES
# ====================================================================
print("\n4. CHARGEMENT DES MEILLEURS HYPERPARAMÈTRES")
print("-"*80)

# Meilleurs hyperparamètres trouvés
best_params = {
    'colsample_bytree': 0.7,
    'gamma': 0.0,
    'learning_rate': 0.05,
    'max_depth': 11,
    'min_child_weight': 5,
    'n_estimators': 200,
    'reg_alpha': 0.0,
    'reg_lambda': 3.0,
    'subsample': 0.6
}

print("Hyperparamètres utilisés:")
for k, v in sorted(best_params.items()):
    print(f"  {k}: {v}")

# ====================================================================
# ENTRAÎNEMENT DU MODÈLE FINAL
# ====================================================================
print("\n5. ENTRAÎNEMENT DU MODÈLE FINAL")
print("-"*80)

# Configuration complète
final_model = XGBClassifier(
    # Meilleurs hyperparamètres
    **best_params,
    
    # Paramètres de base
    objective='multi:softmax',
    num_class=3,
    eval_metric='mlogloss',
    random_state=42,
    
    # GPU
    tree_method='gpu_hist',
    gpu_id=0,
    predictor='gpu_predictor',
    enable_categorical=False
)

print("Entraînement sur Train+Val...")
print(f"  Taille: {X_train_full.shape[0]} exemples")
print(f"  Features: {X_train_full.shape[1]}")

import time
start_time = time.time()

final_model.fit(
    X_train_full, 
    y_train_full, 
    sample_weight=sample_weights,
    verbose=True
)

training_time = time.time() - start_time
print(f"\nEntraînement terminé en {training_time:.2f} secondes")

# ====================================================================
# ÉVALUATION SUR TEST
# ====================================================================
print("\n6. ÉVALUATION SUR LE SET DE TEST")
print("-"*80)

# Prédictions
y_test_pred = final_model.predict(X_test)
y_train_full_pred = final_model.predict(X_train_full)

# Métriques
test_f1 = f1_score(y_test, y_test_pred, average='weighted')
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred, average='weighted')
test_recall = recall_score(y_test, y_test_pred, average='weighted')

train_full_f1 = f1_score(y_train_full, y_train_full_pred, average='weighted')
train_full_accuracy = accuracy_score(y_train_full, y_train_full_pred)

print("RÉSULTATS SUR TEST:")
print(f"  Accuracy: {test_accuracy:.4f}")
print(f"  Precision: {test_precision:.4f}")
print(f"  Recall: {test_recall:.4f}")
print(f"  F1 Score: {test_f1:.4f}")

print("\nRÉSULTATS SUR TRAIN+VAL:")
print(f"  Accuracy: {train_full_accuracy:.4f}")
print(f"  F1 Score: {train_full_f1:.4f}")

print(f"\nOverfitting: {train_full_f1 - test_f1:.4f}")

# ====================================================================
# RAPPORT DE CLASSIFICATION DÉTAILLÉ
# ====================================================================
print("\n7. RAPPORT DE CLASSIFICATION (TEST)")
print("-"*80)

print("\nRapport détaillé par classe:")
print(classification_report(y_test, y_test_pred, target_names=['Classe 0', 'Classe 1', 'Classe 2']))

# ====================================================================
# MATRICE DE CONFUSION
# ====================================================================
print("\n8. MATRICE DE CONFUSION")
print("-"*80)

cm = confusion_matrix(y_test, y_test_pred)
print("\nMatrice de confusion:")
print(cm)

# Visualisation de la matrice de confusion
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=True,
            xticklabels=['Classe 0', 'Classe 1', 'Classe 2'],
            yticklabels=['Classe 0', 'Classe 1', 'Classe 2'])
plt.title('Matrice de Confusion - Test Set', fontsize=16, fontweight='bold')
plt.ylabel('Vraie Classe', fontsize=12, fontweight='bold')
plt.xlabel('Classe Prédite', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.savefig(f'{MODEL_PATH}/confusion_matrix_test.png', dpi=300, bbox_inches='tight')
print("\nMatrice de confusion sauvegardée: confusion_matrix_test.png")
plt.close()

# ====================================================================
# FEATURE IMPORTANCE
# ====================================================================
print("\n9. IMPORTANCE DES FEATURES")
print("-"*80)

# Obtenir l'importance des features
feature_importance = final_model.feature_importances_
feature_names = X_train_full.columns

# Créer un DataFrame pour l'importance
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

print("\nTop 15 features les plus importantes:")
for idx, row in importance_df.head(15).iterrows():
    print(f"  {row['feature']}: {row['importance']:.4f}")

# Visualisation
fig, ax = plt.subplots(figsize=(12, 8))
top_features = importance_df.head(20)
ax.barh(range(len(top_features)), top_features['importance'], color='steelblue')
ax.set_yticks(range(len(top_features)))
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Importance', fontsize=12, fontweight='bold')
ax.set_ylabel('Features', fontsize=12, fontweight='bold')
ax.set_title('Top 20 Features les Plus Importantes', fontsize=14, fontweight='bold')
ax.invert_yaxis()
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.savefig(f'{MODEL_PATH}/feature_importance.png', dpi=300, bbox_inches='tight')
print("\nImportance des features sauvegardée: feature_importance.png")
plt.close()

# Sauvegarder l'importance dans un CSV
importance_df.to_csv(f'{MODEL_PATH}/feature_importance.csv', index=False)

# ====================================================================
# SAUVEGARDE DU MODÈLE
# ====================================================================
print("\n10. SAUVEGARDE DU MODÈLE FINAL")
print("-"*80)

# Sauvegarder le modèle
joblib.dump(final_model, f'{MODEL_PATH}/xgboost_final_model.pkl')
print("Modèle sauvegardé: xgboost_final_model.pkl")

# Sauvegarder les résultats finaux
final_results = {
    'model_type': 'XGBoost',
    'training_data': 'Train + Val',
    'test_data': 'Test',
    'n_train_samples': int(len(y_train_full)),
    'n_test_samples': int(len(y_test)),
    'n_features': int(X_train_full.shape[1]),
    
    'hyperparameters': best_params,
    
    'test_metrics': {
        'accuracy': float(test_accuracy),
        'precision': float(test_precision),
        'recall': float(test_recall),
        'f1_score': float(test_f1)
    },
    
    'train_metrics': {
        'accuracy': float(train_full_accuracy),
        'f1_score': float(train_full_f1)
    },
    
    'overfitting': float(train_full_f1 - test_f1),
    'training_time_seconds': float(training_time),
    'gpu_used': True,
    'sample_weights_used': True
}

with open(f'{MODEL_PATH}/final_model_results.json', 'w') as f:
    json.dump(final_results, f, indent=4)
print("Résultats finaux sauvegardés: final_model_results.json")

# ====================================================================
# VISUALISATION COMPARATIVE
# ====================================================================
print("\n11. VISUALISATIONS COMPARATIVES")
print("-"*80)

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Comparaison des métriques Train vs Test
ax1 = axes[0]
metrics = ['Accuracy', 'F1 Score']
train_scores = [train_full_accuracy, train_full_f1]
test_scores = [test_accuracy, test_f1]

x = np.arange(len(metrics))
width = 0.35

bars1 = ax1.bar(x - width/2, train_scores, width, label='Train+Val', color='steelblue')
bars2 = ax1.bar(x + width/2, test_scores, width, label='Test', color='coral')

ax1.set_ylabel('Score', fontsize=12, fontweight='bold')
ax1.set_title('Comparaison Train+Val vs Test', fontsize=14, fontweight='bold')
ax1.set_xticks(x)
ax1.set_xticklabels(metrics)
ax1.legend()
ax1.grid(axis='y', alpha=0.3)
ax1.set_ylim([0, 1])

# Ajouter les valeurs sur les barres
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax1.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:.3f}', ha='center', va='bottom', fontsize=10)

# Distribution des prédictions par classe
ax2 = axes[1]
pred_dist = pd.Series(y_test_pred).value_counts().sort_index()
true_dist = pd.Series(y_test).value_counts().sort_index()

x = np.arange(len(pred_dist))
width = 0.35

bars1 = ax2.bar(x - width/2, true_dist.values, width, label='Vraies classes', color='steelblue')
bars2 = ax2.bar(x + width/2, pred_dist.values, width, label='Prédictions', color='coral')

ax2.set_ylabel('Nombre d\'exemples', fontsize=12, fontweight='bold')
ax2.set_title('Distribution des Classes (Test)', fontsize=14, fontweight='bold')
ax2.set_xticks(x)
ax2.set_xticklabels([f'Classe {i}' for i in range(len(pred_dist))])
ax2.legend()
ax2.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(f'{MODEL_PATH}/final_model_comparison.png', dpi=300, bbox_inches='tight')
print("Visualisations comparatives sauvegardées: final_model_comparison.png")
plt.close()

# ====================================================================
# RÉSUMÉ FINAL
# ====================================================================
print("\n" + "="*80)
print("RÉSUMÉ FINAL - MODÈLE ENTRAÎNÉ ET ÉVALUÉ")
print("="*80)

print(f"\nDonnées d'entraînement: Train + Val ({len(y_train_full)} exemples)")
print(f"Données de test: Test ({len(y_test)} exemples)")
print(f"Nombre de features: {X_train_full.shape[1]}")

print("\nHyperparamètres utilisés:")
for k, v in sorted(best_params.items()):
    print(f"  {k}: {v}")

print("\nPERFORMANCES SUR TEST:")
print(f"  Accuracy:  {test_accuracy:.4f}")
print(f"  Precision: {test_precision:.4f}")
print(f"  Recall:    {test_recall:.4f}")
print(f"  F1 Score:  {test_f1:.4f}")

print(f"\nTemps d'entraînement: {training_time:.2f} secondes")
print(f"GPU utilisé: OUI")
print(f"Sample weights: OUI")

print("\n" + "="*80)
print("FICHIERS GÉNÉRÉS:")
print("="*80)
print("  1. xgboost_final_model.pkl - Modèle entraîné")
print("  2. final_model_results.json - Résultats détaillés")
print("  3. confusion_matrix_test.png - Matrice de confusion")
print("  4. feature_importance.png - Importance des features")
print("  5. feature_importance.csv - Importance (CSV)")
print("  6. final_model_comparison.png - Comparaisons")
print("="*80)

print("\n✅ ENTRAÎNEMENT ET ÉVALUATION TERMINÉS AVEC SUCCÈS!")
print("="*80)

SCRIPT 2: ENTRAÎNEMENT DU MODÈLE FINAL

1. VÉRIFICATION CUDA
--------------------------------------------------------------------------------
XGBoost version: 2.1.4
Support GPU: OUI

2. CHARGEMENT DES DONNÉES
--------------------------------------------------------------------------------
X_train: (6694, 13)
X_val: (956, 13)
X_test: (957, 13)

X_train_full (Train+Val): (7650, 13)
y_train_full (Train+Val): (7650,)

Distribution des classes (Train+Val):
  Classe 0: 3871 (50.6%)
  Classe 1: 1583 (20.7%)
  Classe 2: 2196 (28.7%)

Distribution des classes (Test):
  Classe 0: 484 (50.6%)
  Classe 1: 198 (20.7%)
  Classe 2: 275 (28.7%)

3. CALCUL DES POIDS DES CLASSES
--------------------------------------------------------------------------------
Poids calculés pour gérer le déséquilibre des classes

4. CHARGEMENT DES MEILLEURS HYPERPARAMÈTRES
--------------------------------------------------------------------------------
Hyperparamètres utilisés:
  colsample_bytree: 0.7
  gamma: 0.0
  lear

In [9]:
"""
SCRIPT 3: AJUSTEMENT DES POIDS
=======================================================
Teste différents poids pour améliorer la performance de la Classe 1
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (f1_score, classification_report, confusion_matrix,
                             precision_score, recall_score, accuracy_score)
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier
import xgboost as xgb
import json
import warnings
warnings.filterwarnings('ignore')

# ====================================================================
# CONFIGURATION
# ====================================================================
BASE_PATH = 'C:/Users/chaym/Desktop/NasaChallenge'
DATA_PATH = f'{BASE_PATH}/data/processed'
MODEL_PATH = f'{BASE_PATH}/models/xgboost_model'

print("="*80)
print("SCRIPT 3: AMÉLIORATION CLASSE 1 - AJUSTEMENT DES POIDS")
print("="*80)

# ====================================================================
# VÉRIFICATION CUDA
# ====================================================================
print("\n1. VÉRIFICATION CUDA")
print("-"*80)

print(f"XGBoost version: {xgb.__version__}")
build_info = xgb.build_info()

if not build_info.get('USE_CUDA', False):
    print("ERREUR: XGBoost n'a pas été compilé avec CUDA")
    exit(1)
print("Support GPU: OUI")

# ====================================================================
# CHARGEMENT DES DONNÉES
# ====================================================================
print("\n2. CHARGEMENT DES DONNÉES")
print("-"*80)

X_train = pd.read_csv(f'{DATA_PATH}/step6_X_train.csv')
X_val = pd.read_csv(f'{DATA_PATH}/step6_X_val.csv')
X_test = pd.read_csv(f'{DATA_PATH}/step6_X_test.csv')

y_train = pd.read_csv(f'{DATA_PATH}/step6_y_train.csv').values.ravel()
y_val = pd.read_csv(f'{DATA_PATH}/step6_y_val.csv').values.ravel()
y_test = pd.read_csv(f'{DATA_PATH}/step6_y_test.csv').values.ravel()

# Combiner Train + Val
X_train_full = pd.concat([X_train, X_val], axis=0)
y_train_full = np.concatenate([y_train, y_val])

print(f"X_train_full: {X_train_full.shape}")
print(f"X_test: {X_test.shape}")

print("\nDistribution des classes (Train+Val):")
for class_name, count in pd.Series(y_train_full).value_counts().sort_index().items():
    print(f"  Classe {class_name}: {count} ({count/len(y_train_full)*100:.1f}%)")

# ====================================================================
# MEILLEURS HYPERPARAMÈTRES (BASELINE)
# ====================================================================
print("\n3. HYPERPARAMÈTRES DE BASE")
print("-"*80)

best_params = {
    'colsample_bytree': 0.7,
    'gamma': 0.0,
    'learning_rate': 0.05,
    'max_depth': 11,
    'min_child_weight': 5,
    'n_estimators': 200,
    'reg_alpha': 0.0,
    'reg_lambda': 3.0,
    'subsample': 0.6,
    
    # Paramètres de base
    'objective': 'multi:softmax',
    'num_class': 3,
    'eval_metric': 'mlogloss',
    'random_state': 42,
    'tree_method': 'gpu_hist',
    'gpu_id': 0,
    'predictor': 'gpu_predictor',
    'enable_categorical': False
}

print("Hyperparamètres:")
for k, v in sorted(best_params.items()):
    print(f"  {k}: {v}")

# ====================================================================
# BASELINE - POIDS BALANCED STANDARD
# ====================================================================
print("\n4. BASELINE - POIDS BALANCED STANDARD")
print("-"*80)

# Calcul des poids standard
class_weights_base = compute_class_weight('balanced', 
                                          classes=np.unique(y_train_full), 
                                          y=y_train_full)
sample_weights_base = np.array([class_weights_base[y] for y in y_train_full])

print("Poids de classes (balanced):")
for i, w in enumerate(class_weights_base):
    print(f"  Classe {i}: {w:.4f}")

# Entraîner modèle baseline
print("\nEntraînement modèle baseline...")
baseline_model = XGBClassifier(**best_params)
baseline_model.fit(X_train_full, y_train_full, 
                   sample_weight=sample_weights_base, verbose=False)

# Évaluation baseline
y_test_pred_base = baseline_model.predict(X_test)
baseline_f1_overall = f1_score(y_test, y_test_pred_base, average='weighted')
baseline_f1_class1 = f1_score(y_test, y_test_pred_base, average=None)[1]
baseline_precision_class1 = precision_score(y_test, y_test_pred_base, average=None)[1]
baseline_recall_class1 = recall_score(y_test, y_test_pred_base, average=None)[1]

print("\nRÉSULTATS BASELINE:")
print(f"  F1 global (Test): {baseline_f1_overall:.4f}")
print(f"  F1 Classe 1: {baseline_f1_class1:.4f}")
print(f"  Precision Classe 1: {baseline_precision_class1:.4f}")
print(f"  Recall Classe 1: {baseline_recall_class1:.4f}")

# ====================================================================
# TEST DE DIFFÉRENTS MULTIPLICATEURS POUR CLASSE 1
# ====================================================================
print("\n5. TEST DE DIFFÉRENTS MULTIPLICATEURS POUR CLASSE 1")
print("-"*80)

# Multiplicateurs à tester
multipliers = [1.0, 1.2, 1.5, 1.8, 2.0, 2.5, 3.0, 3.5, 4.0]

results = []

print("\nTest des multiplicateurs...")
print("-"*80)

for mult in multipliers:
    # Calculer les nouveaux poids
    class_weights = class_weights_base.copy()
    class_weights[1] *= mult  # Multiplier le poids de Classe 1
    
    sample_weights = np.array([class_weights[y] for y in y_train_full])
    
    # Entraîner le modèle
    model = XGBClassifier(**best_params)
    model.fit(X_train_full, y_train_full, 
              sample_weight=sample_weights, verbose=False)
    
    # Prédictions
    y_pred = model.predict(X_test)
    
    # Métriques globales
    f1_overall = f1_score(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)
    
    # Métriques par classe
    f1_per_class = f1_score(y_test, y_pred, average=None)
    precision_per_class = precision_score(y_test, y_pred, average=None)
    recall_per_class = recall_score(y_test, y_pred, average=None)
    
    # Matrice de confusion
    cm = confusion_matrix(y_test, y_pred)
    
    # Sauvegarder les résultats
    result = {
        'multiplier': mult,
        'f1_overall': f1_overall,
        'accuracy': accuracy,
        'f1_class0': f1_per_class[0],
        'f1_class1': f1_per_class[1],
        'f1_class2': f1_per_class[2],
        'precision_class1': precision_per_class[1],
        'recall_class1': recall_per_class[1],
        'confusion_matrix': cm,
        'class_weights': class_weights.copy()
    }
    results.append(result)
    
    # Afficher
    print(f"Mult {mult:.1f}x: F1={f1_overall:.4f} | "
          f"F1_C1={f1_per_class[1]:.4f} (P={precision_per_class[1]:.4f}, R={recall_per_class[1]:.4f})")

# ====================================================================
# ANALYSE DES RÉSULTATS
# ====================================================================
print("\n6. ANALYSE DES RÉSULTATS")
print("-"*80)

results_df = pd.DataFrame(results)

# Trouver le meilleur multiplicateur selon différents critères
best_overall_idx = results_df['f1_overall'].idxmax()
best_class1_idx = results_df['f1_class1'].idxmax()
best_balanced_idx = (results_df['f1_overall'] + results_df['f1_class1']).idxmax()

print("\nMEILLEURS MULTIPLICATEURS:")
print("-"*80)

print(f"\n1. Meilleur F1 Global:")
print(f"   Multiplicateur: {results_df.loc[best_overall_idx, 'multiplier']:.1f}x")
print(f"   F1 Global: {results_df.loc[best_overall_idx, 'f1_overall']:.4f}")
print(f"   F1 Classe 1: {results_df.loc[best_overall_idx, 'f1_class1']:.4f}")

print(f"\n2. Meilleur F1 Classe 1:")
print(f"   Multiplicateur: {results_df.loc[best_class1_idx, 'multiplier']:.1f}x")
print(f"   F1 Global: {results_df.loc[best_class1_idx, 'f1_overall']:.4f}")
print(f"   F1 Classe 1: {results_df.loc[best_class1_idx, 'f1_class1']:.4f}")

print(f"\n3. Meilleur Équilibre (F1 Global + F1 Classe 1):")
print(f"   Multiplicateur: {results_df.loc[best_balanced_idx, 'multiplier']:.1f}x")
print(f"   F1 Global: {results_df.loc[best_balanced_idx, 'f1_overall']:.4f}")
print(f"   F1 Classe 1: {results_df.loc[best_balanced_idx, 'f1_class1']:.4f}")

# Amélioration par rapport au baseline
improvement_overall = results_df.loc[best_overall_idx, 'f1_overall'] - baseline_f1_overall
improvement_class1 = results_df.loc[best_class1_idx, 'f1_class1'] - baseline_f1_class1

print(f"\nAMÉLIORATIONS:")
print(f"  F1 Global: +{improvement_overall:.4f}")
print(f"  F1 Classe 1: +{improvement_class1:.4f}")

# ====================================================================
# VISUALISATIONS
# ====================================================================
print("\n7. VISUALISATIONS")
print("-"*80)

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. F1 Score vs Multiplicateur
ax1 = axes[0, 0]
ax1.plot(results_df['multiplier'], results_df['f1_overall'], 
         marker='o', linewidth=2, markersize=8, label='F1 Global', color='steelblue')
ax1.plot(results_df['multiplier'], results_df['f1_class1'], 
         marker='s', linewidth=2, markersize=8, label='F1 Classe 1', color='coral')
ax1.axhline(baseline_f1_overall, linestyle='--', color='steelblue', 
            alpha=0.5, label='Baseline Global')
ax1.axhline(baseline_f1_class1, linestyle='--', color='coral', 
            alpha=0.5, label='Baseline Classe 1')
ax1.set_xlabel('Multiplicateur Classe 1', fontsize=12, fontweight='bold')
ax1.set_ylabel('F1 Score', fontsize=12, fontweight='bold')
ax1.set_title('F1 Score vs Multiplicateur', fontsize=14, fontweight='bold')
ax1.legend()
ax1.grid(True, alpha=0.3)

# 2. Precision vs Recall Classe 1
ax2 = axes[0, 1]
scatter = ax2.scatter(results_df['recall_class1'], results_df['precision_class1'], 
                      c=results_df['multiplier'], s=200, cmap='viridis', 
                      edgecolors='black', linewidth=2)
ax2.scatter(baseline_recall_class1, baseline_precision_class1, 
            s=300, marker='*', color='red', edgecolors='black', 
            linewidth=2, label='Baseline', zorder=5)
ax2.set_xlabel('Recall Classe 1', fontsize=12, fontweight='bold')
ax2.set_ylabel('Precision Classe 1', fontsize=12, fontweight='bold')
ax2.set_title('Precision vs Recall - Classe 1', fontsize=14, fontweight='bold')
ax2.legend()
ax2.grid(True, alpha=0.3)
cbar = plt.colorbar(scatter, ax=ax2)
cbar.set_label('Multiplicateur', fontsize=10, fontweight='bold')

# 3. F1 par classe
ax3 = axes[1, 0]
best_mult = results_df.loc[best_balanced_idx, 'multiplier']
best_result = results_df.loc[best_balanced_idx]

x = np.arange(3)
width = 0.35

baseline_f1s = [f1_score(y_test, y_test_pred_base, average=None)[i] for i in range(3)]
best_f1s = [best_result['f1_class0'], best_result['f1_class1'], best_result['f1_class2']]

bars1 = ax3.bar(x - width/2, baseline_f1s, width, label='Baseline', color='lightgray')
bars2 = ax3.bar(x + width/2, best_f1s, width, label=f'Mult {best_mult:.1f}x', color='coral')

ax3.set_ylabel('F1 Score', fontsize=12, fontweight='bold')
ax3.set_title('F1 par Classe - Comparaison', fontsize=14, fontweight='bold')
ax3.set_xticks(x)
ax3.set_xticklabels(['Classe 0', 'Classe 1', 'Classe 2'])
ax3.legend()
ax3.grid(axis='y', alpha=0.3)

# Ajouter les valeurs
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax3.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:.3f}', ha='center', va='bottom', fontsize=9)

# 4. Tableau récapitulatif
ax4 = axes[1, 1]
ax4.axis('off')

table_data = []
for idx, row in results_df.iterrows():
    table_data.append([
        f"{row['multiplier']:.1f}x",
        f"{row['f1_overall']:.4f}",
        f"{row['f1_class1']:.4f}",
        f"{row['precision_class1']:.4f}",
        f"{row['recall_class1']:.4f}"
    ])

table = ax4.table(cellText=table_data,
                  colLabels=['Mult', 'F1 Global', 'F1 C1', 'Prec C1', 'Rec C1'],
                  cellLoc='center',
                  loc='center',
                  bbox=[0, 0, 1, 1])
table.auto_set_font_size(False)
table.set_fontsize(9)
table.scale(1, 2)

# Colorer la meilleure ligne
for i in range(len(table_data[0])):
    table[(best_balanced_idx+1, i)].set_facecolor('#90EE90')
    table[(best_balanced_idx+1, i)].set_text_props(weight='bold')

ax4.set_title('Résultats Détaillés', fontsize=14, fontweight='bold', pad=20)

plt.tight_layout()
plt.savefig(f'{MODEL_PATH}/class1_weight_optimization.png', dpi=300, bbox_inches='tight')
print("Visualisations sauvegardées: class1_weight_optimization.png")
plt.close()

# ====================================================================
# MATRICE DE CONFUSION - COMPARAISON
# ====================================================================
print("\n8. MATRICES DE CONFUSION")
print("-"*80)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Baseline
ax1 = axes[0]
cm_base = confusion_matrix(y_test, y_test_pred_base)
sns.heatmap(cm_base, annot=True, fmt='d', cmap='Blues', ax=ax1, cbar=True,
            xticklabels=['C0', 'C1', 'C2'],
            yticklabels=['C0', 'C1', 'C2'])
ax1.set_title(f'Baseline (Mult 1.0x)\nF1 C1 = {baseline_f1_class1:.4f}', 
              fontsize=14, fontweight='bold')
ax1.set_ylabel('Vraie Classe', fontsize=12, fontweight='bold')
ax1.set_xlabel('Classe Prédite', fontsize=12, fontweight='bold')

# Meilleur multiplicateur
ax2 = axes[1]
best_cm = results_df.loc[best_balanced_idx, 'confusion_matrix']
best_mult = results_df.loc[best_balanced_idx, 'multiplier']
best_f1_c1 = results_df.loc[best_balanced_idx, 'f1_class1']

sns.heatmap(best_cm, annot=True, fmt='d', cmap='Greens', ax=ax2, cbar=True,
            xticklabels=['C0', 'C1', 'C2'],
            yticklabels=['C0', 'C1', 'C2'])
ax2.set_title(f'Optimisé (Mult {best_mult:.1f}x)\nF1 C1 = {best_f1_c1:.4f}', 
              fontsize=14, fontweight='bold')
ax2.set_ylabel('Vraie Classe', fontsize=12, fontweight='bold')
ax2.set_xlabel('Classe Prédite', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig(f'{MODEL_PATH}/confusion_matrix_comparison.png', dpi=300, bbox_inches='tight')
print("Matrices de confusion sauvegardées: confusion_matrix_comparison.png")
plt.close()

# ====================================================================
# ENTRAÎNEMENT DU MODÈLE FINAL OPTIMISÉ
# ====================================================================
print("\n9. ENTRAÎNEMENT DU MODÈLE FINAL OPTIMISÉ")
print("-"*80)

best_multiplier = results_df.loc[best_balanced_idx, 'multiplier']
print(f"Multiplicateur optimal: {best_multiplier:.1f}x")

# Calculer les poids finaux
final_class_weights = class_weights_base.copy()
final_class_weights[1] *= best_multiplier
final_sample_weights = np.array([final_class_weights[y] for y in y_train_full])

print("\nPoids finaux des classes:")
for i, w in enumerate(final_class_weights):
    print(f"  Classe {i}: {w:.4f}")

# Entraîner le modèle final
print("\nEntraînement du modèle final...")
final_model = XGBClassifier(**best_params)
final_model.fit(X_train_full, y_train_full, 
                sample_weight=final_sample_weights, verbose=False)

# Évaluation finale
y_test_pred_final = final_model.predict(X_test)

print("\nRAPPORT DE CLASSIFICATION FINAL:")
print(classification_report(y_test, y_test_pred_final, 
                           target_names=['Classe 0', 'Classe 1', 'Classe 2']))

# ====================================================================
# SAUVEGARDE DES RÉSULTATS
# ====================================================================
print("\n10. SAUVEGARDE DES RÉSULTATS")
print("-"*80)

# Sauvegarder les résultats CSV
results_df.to_csv(f'{MODEL_PATH}/class1_optimization_results.csv', index=False)
print("Résultats sauvegardés: class1_optimization_results.csv")

# Sauvegarder la configuration optimale
optimal_config = {
    'baseline': {
        'f1_overall': float(baseline_f1_overall),
        'f1_class1': float(baseline_f1_class1),
        'precision_class1': float(baseline_precision_class1),
        'recall_class1': float(baseline_recall_class1)
    },
    'optimized': {
        'multiplier': float(best_multiplier),
        'f1_overall': float(results_df.loc[best_balanced_idx, 'f1_overall']),
        'f1_class1': float(results_df.loc[best_balanced_idx, 'f1_class1']),
        'precision_class1': float(results_df.loc[best_balanced_idx, 'precision_class1']),
        'recall_class1': float(results_df.loc[best_balanced_idx, 'recall_class1']),
        'class_weights': final_class_weights.tolist()
    },
    'improvements': {
        'f1_overall': float(results_df.loc[best_balanced_idx, 'f1_overall'] - baseline_f1_overall),
        'f1_class1': float(results_df.loc[best_balanced_idx, 'f1_class1'] - baseline_f1_class1)
    }
}

with open(f'{MODEL_PATH}/class1_optimization_config.json', 'w') as f:
    json.dump(optimal_config, f, indent=4)
print("Configuration optimale: class1_optimization_config.json")

# Sauvegarder le modèle optimisé
import joblib
joblib.dump(final_model, f'{MODEL_PATH}/xgboost_optimized_class1.pkl')
print("Modèle optimisé: xgboost_optimized_class1.pkl")

# ====================================================================
# RÉSUMÉ FINAL
# ====================================================================
print("\n" + "="*80)
print("RÉSUMÉ - OPTIMISATION CLASSE 1")
print("="*80)

print(f"\nMULTIPLICATEUR OPTIMAL: {best_multiplier:.1f}x")

print("\nBASELINE:")
print(f"  F1 Global: {baseline_f1_overall:.4f}")
print(f"  F1 Classe 1: {baseline_f1_class1:.4f}")

print("\nOPTIMISÉ:")
print(f"  F1 Global: {results_df.loc[best_balanced_idx, 'f1_overall']:.4f}")
print(f"  F1 Classe 1: {results_df.loc[best_balanced_idx, 'f1_class1']:.4f}")

print("\nAMÉLIORATIONS:")
print(f"  F1 Global: +{improvement_overall:.4f} ({improvement_overall/baseline_f1_overall*100:+.1f}%)")
print(f"  F1 Classe 1: +{improvement_class1:.4f} ({improvement_class1/baseline_f1_class1*100:+.1f}%)")

print("\n" + "="*80)
print("FICHIERS GÉNÉRÉS:")
print("="*80)
print("  1. class1_optimization_results.csv - Tous les résultats")
print("  2. class1_optimization_config.json - Configuration optimale")
print("  3. class1_weight_optimization.png - Visualisations")
print("  4. confusion_matrix_comparison.png - Comparaison matrices")
print("  5. xgboost_optimized_class1.pkl - Modèle optimisé")
print("="*80)

print("\n✅ OPTIMISATION CLASSE 1 TERMINÉE!")
print("="*80)

SCRIPT 3: AMÉLIORATION CLASSE 1 - AJUSTEMENT DES POIDS

1. VÉRIFICATION CUDA
--------------------------------------------------------------------------------
XGBoost version: 2.1.4
Support GPU: OUI

2. CHARGEMENT DES DONNÉES
--------------------------------------------------------------------------------
X_train_full: (7650, 13)
X_test: (957, 13)

Distribution des classes (Train+Val):
  Classe 0: 3871 (50.6%)
  Classe 1: 1583 (20.7%)
  Classe 2: 2196 (28.7%)

3. HYPERPARAMÈTRES DE BASE
--------------------------------------------------------------------------------
Hyperparamètres:
  colsample_bytree: 0.7
  enable_categorical: False
  eval_metric: mlogloss
  gamma: 0.0
  gpu_id: 0
  learning_rate: 0.05
  max_depth: 11
  min_child_weight: 5
  n_estimators: 200
  num_class: 3
  objective: multi:softmax
  predictor: gpu_predictor
  random_state: 42
  reg_alpha: 0.0
  reg_lambda: 3.0
  subsample: 0.6
  tree_method: gpu_hist

4. BASELINE - POIDS BALANCED STANDARD
---------------------------

In [7]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, 
    f1_score, precision_score, recall_score
)
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
import json
import os
from itertools import product
warnings.filterwarnings('ignore')

# Configuration
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# CONFIGURATION DES CHEMINS
BASE_PATH = 'C:/Users/chaym/Desktop/NasaChallenge'
DATA_PATH = f'{BASE_PATH}/data/processed'
MODEL_PATH = f'{BASE_PATH}/models/xgboost_model'

print("="*80)
print("XGBOOST - HYPERPARAMETER TUNING + FINAL TRAINING")
print("="*80)

# ====================================================================
# 1. VÉRIFICATION CUDA
# ====================================================================
print("\n1. VÉRIFICATION CUDA")
print("-"*80)

print(f"XGBoost version: {xgb.__version__}")
build_info = xgb.build_info()

if not build_info.get('USE_CUDA', False):
    print("ATTENTION: XGBoost n'a pas été compilé avec CUDA")
    print("Le modèle utilisera le CPU")
    device = 'cpu'
else:
    print("Support GPU: OUI")
    device = 'cuda'

# ====================================================================
# 2. CHARGEMENT DES DONNÉES
# ====================================================================
print("\n2. CHARGEMENT DES DONNÉES")
print("-"*80)

X_train = pd.read_csv(f'{DATA_PATH}/step6_X_train.csv')
y_train = pd.read_csv(f'{DATA_PATH}/step6_y_train.csv').squeeze()

X_validate = pd.read_csv(f'{DATA_PATH}/step6_X_val.csv')
y_validate = pd.read_csv(f'{DATA_PATH}/step6_y_val.csv').squeeze()

X_test = pd.read_csv(f'{DATA_PATH}/step6_X_test.csv')
y_test = pd.read_csv(f'{DATA_PATH}/step6_y_test.csv').squeeze()

class_names = {0: 'False Positive', 1: 'Candidate', 2: 'Confirmed'}

print(f"Train:      {X_train.shape[0]:>6} samples, {X_train.shape[1]:>3} features")
print(f"Validation: {X_validate.shape[0]:>6} samples, {X_validate.shape[1]:>3} features")
print(f"Test:       {X_test.shape[0]:>6} samples, {X_test.shape[1]:>3} features")

# Distribution des classes
print("\nDistribution des classes (Train):")
for cls in [0, 1, 2]:
    count = (y_train == cls).sum()
    pct = count / len(y_train) * 100
    print(f"   {class_names[cls]:20s}: {count:>5} ({pct:>5.2f}%)")

# ====================================================================
# 3. VÉRIFICATION DES FEATURES
# ====================================================================
print("\n3. VÉRIFICATION DES FEATURES CRITIQUES")
print("-"*80)

required_features = [
    'koi_duration', 'koi_duration_err1',
    'koi_depth', 'koi_depth_err1',
    'koi_model_snr'
]

for feat in required_features:
    if feat in X_train.columns:
        print(f"   ✓ {feat}")
    else:
        raise ValueError(f"Feature manquante: {feat}")

# ====================================================================
# 4. FEATURE ENGINEERING
# ====================================================================
print("\n4. FEATURE ENGINEERING")
print("-"*80)

def engineer_transit_features(X):
    X_enhanced = X.copy()
    
    if 'koi_depth' in X.columns and 'koi_duration' in X.columns:
        X_enhanced['transit_depth_duration_ratio'] = (
            X['koi_depth'] / (X['koi_duration'] + 1e-6)
        )
        print("   ✓ transit_depth_duration_ratio")
    
    if 'koi_model_snr' in X.columns:
        X_enhanced['snr_log'] = np.log1p(X['koi_model_snr'])
        X_enhanced['snr_squared'] = X['koi_model_snr'] ** 2
        print("   ✓ snr_log, snr_squared")
    
    return X_enhanced

X_train_enh = engineer_transit_features(X_train)
X_val_enh = engineer_transit_features(X_validate)
X_test_enh = engineer_transit_features(X_test)

print(f"\nFeatures finales: {X_train_enh.shape[1]}")

# ====================================================================
# 5. CALCUL DES POIDS DE CLASSES
# ====================================================================
print("\n5. CALCUL DES POIDS DE CLASSES")
print("-"*80)

all_classes = np.array([0, 1, 2])
auto_weights = compute_class_weight(
    class_weight='balanced', 
    classes=all_classes, 
    y=y_train
)

class_weights = {
    0: auto_weights[0] * 0.8,
    1: auto_weights[1] * 1.5,  # BOOST Candidate
    2: auto_weights[2] * 1.0
}

print("Poids calculés:")
for cls, weight in class_weights.items():
    boost = " [BOOST +50%]" if cls == 1 else ""
    print(f"   {class_names[cls]:20s}: {weight:.4f}x{boost}")

sample_weights_train = np.array([class_weights[cls] for cls in y_train])

# ====================================================================
# 6. PHASE 1: HYPERPARAMETER TUNING (Train/Val)
# ====================================================================
print("\n" + "="*80)
print("PHASE 1: HYPERPARAMETER TUNING")
print("="*80)

# Grille d'hyperparamètres à tester
param_grid = {
    'learning_rate': [0.01, 0.03, 0.05],
    'max_depth': [8, 10, 12],
    'min_child_weight': [20, 30, 40],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

print(f"\nNombre total de combinaisons: {np.prod([len(v) for v in param_grid.values()])}")
print("\nTest en cours...")

# Préparer les datasets pour tuning
dtrain_tune = xgb.DMatrix(X_train_enh, label=y_train, weight=sample_weights_train)
dval_tune = xgb.DMatrix(X_val_enh, label=y_validate)

best_score = float('inf')
best_params = None
results = []

# Grid Search
for lr, md, mcw, sub, col in product(
    param_grid['learning_rate'],
    param_grid['max_depth'],
    param_grid['min_child_weight'],
    param_grid['subsample'],
    param_grid['colsample_bytree']
):
    params = {
        'objective': 'multi:softprob',
        'num_class': 3,
        'eval_metric': 'mlogloss',
        'tree_method': 'hist' if device == 'cpu' else 'gpu_hist',
        'device': device,
        'learning_rate': lr,
        'max_depth': md,
        'min_child_weight': mcw,
        'subsample': sub,
        'colsample_bytree': col,
        'alpha': 0.1,
        'lambda': 0.1,
        'gamma': 0.001,
        'random_state': 42,
        'verbosity': 0
    }
    
    # Entraînement
    evals = [(dtrain_tune, 'train'), (dval_tune, 'valid')]
    model_temp = xgb.train(
        params,
        dtrain_tune,
        num_boost_round=1000,
        evals=evals,
        early_stopping_rounds=50,
        verbose_eval=False
    )
    
    val_score = model_temp.best_score
    
    results.append({
        'learning_rate': lr,
        'max_depth': md,
        'min_child_weight': mcw,
        'subsample': sub,
        'colsample_bytree': col,
        'val_score': val_score,
        'best_iteration': model_temp.best_iteration
    })
    
    if val_score < best_score:
        best_score = val_score
        best_params = params.copy()
    
    print(f"lr={lr:.2f}, md={md}, mcw={mcw}, sub={sub:.1f}, col={col:.1f} -> val_loss={val_score:.6f}")

# Afficher les meilleurs résultats
print("\n" + "="*80)
print("MEILLEURS HYPERPARAMÈTRES TROUVÉS")
print("="*80)
print(f"\nMeilleur score validation: {best_score:.6f}")
print("\nHyperparamètres:")
print(f"   learning_rate:     {best_params['learning_rate']}")
print(f"   max_depth:         {best_params['max_depth']}")
print(f"   min_child_weight:  {best_params['min_child_weight']}")
print(f"   subsample:         {best_params['subsample']}")
print(f"   colsample_bytree:  {best_params['colsample_bytree']}")

# Sauvegarder les résultats du tuning
results_df = pd.DataFrame(results).sort_values('val_score')
os.makedirs(MODEL_PATH, exist_ok=True)
results_df.to_csv(f'{MODEL_PATH}/hyperparameter_tuning_results.csv', index=False)
print(f"\nRésultats sauvegardés: {MODEL_PATH}/hyperparameter_tuning_results.csv")

# ====================================================================
# 7. PHASE 2: ENTRAÎNEMENT FINAL (Train+Val) / Test
# ====================================================================
print("\n" + "="*80)
print("PHASE 2: ENTRAÎNEMENT FINAL")
print("="*80)

# Combiner Train et Val pour l'entraînement final
X_train_val = pd.concat([X_train_enh, X_val_enh], axis=0)
y_train_val = pd.concat([y_train, y_validate], axis=0)

# Recalculer les poids pour le dataset combiné
sample_weights_combined = np.array([class_weights[cls] for cls in y_train_val])

print(f"\nDataset combiné (Train+Val): {X_train_val.shape[0]} samples")

# Préparer les datasets finaux
dtrain_final = xgb.DMatrix(X_train_val, label=y_train_val, weight=sample_weights_combined)
dtest_final = xgb.DMatrix(X_test_enh, label=y_test)

# Entraînement final avec meilleurs hyperparamètres
print("\nEntraînement final en cours...\n")

evals_final = [(dtrain_final, 'train'), (dtest_final, 'test')]

model_final = xgb.train(
    best_params,
    dtrain_final,
    num_boost_round=3000,
    evals=evals_final,
    early_stopping_rounds=100,
    verbose_eval=100
)

print(f"\nEntraînement terminé!")
print(f"   Best Iteration: {model_final.best_iteration}")
print(f"   Best Test Loss: {model_final.best_score:.6f}")

# ====================================================================
# 8. ÉVALUATION FINALE SUR TEST SET
# ====================================================================
print("\n" + "="*80)
print("ÉVALUATION FINALE SUR TEST SET")
print("="*80)

y_test_pred_proba = model_final.predict(dtest_final, iteration_range=(0, model_final.best_iteration))
y_test_pred = y_test_pred_proba.argmax(axis=1)

# Métriques
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred, average='weighted', zero_division=0)
test_recall = recall_score(y_test, y_test_pred, average='weighted', zero_division=0)
test_f1_macro = f1_score(y_test, y_test_pred, average='macro', zero_division=0)
test_f1_weighted = f1_score(y_test, y_test_pred, average='weighted', zero_division=0)

print("\nPerformance Metrics:")
print(f"   Accuracy:           {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print(f"   Precision (Avg):    {test_precision:.4f}")
print(f"   Recall (Avg):       {test_recall:.4f}")
print(f"   F1-Score (Macro):   {test_f1_macro:.4f}")
print(f"   F1-Score (Weighted):{test_f1_weighted:.4f}")

print("\nClassification Report:")
print(classification_report(
    y_test, 
    y_test_pred,
    target_names=['False Positive', 'Candidate', 'Confirmed'],
    digits=4,
    zero_division=0
))

# Performance classe 1
class1_mask = y_test == 1
if class1_mask.sum() > 0:
    class1_acc = accuracy_score(y_test[class1_mask], y_test_pred[class1_mask])
    print(f"\nPERFORMANCE CLASSE 1 (Candidate): {class1_acc:.4f} ({class1_acc*100:.2f}%)")

# ====================================================================
# 9. VISUALISATIONS
# ====================================================================
print("\n9. VISUALISATIONS")
print("-"*80)

# Matrice de confusion
cm = confusion_matrix(y_test, y_test_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['False Positive', 'Candidate', 'Confirmed'],
            yticklabels=['False Positive', 'Candidate', 'Confirmed'])
plt.title('Confusion Matrix - Test Set', fontsize=16, fontweight='bold')
plt.ylabel('True Class')
plt.xlabel('Predicted Class')
plt.tight_layout()
plt.savefig(f'{MODEL_PATH}/confusion_matrix_final.png', dpi=300)
print("Matrice de confusion: confusion_matrix_final.png")
plt.close()

# Feature importance
importance_dict = model_final.get_score(importance_type='gain')
feature_importance = pd.DataFrame({
    'feature': list(importance_dict.keys()),
    'importance': list(importance_dict.values())
}).sort_values('importance', ascending=False)

print("\nTOP 15 FEATURES:")
print(feature_importance.head(15).to_string(index=False))

plt.figure(figsize=(12, 8))
top_20 = feature_importance.head(20)
plt.barh(top_20['feature'], top_20['importance'], color='steelblue')
plt.xlabel('Importance (Gain)')
plt.title('Top 20 Features', fontsize=16, fontweight='bold')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig(f'{MODEL_PATH}/feature_importance_final.png', dpi=300)
print("Feature importance: feature_importance_final.png")
plt.close()

# ====================================================================
# 10. SAUVEGARDE
# ====================================================================
print("\n10. SAUVEGARDE")
print("-"*80)

model_final.save_model(f'{MODEL_PATH}/exoplanet_xgboost_final.json')
print(f"Modèle: exoplanet_xgboost_final.json")

metadata = {
    'timestamp': datetime.now().isoformat(),
    'xgboost_version': xgb.__version__,
    'device': device,
    'training_strategy': 'Train+Val combined for final model',
    'best_hyperparameters': {
        'learning_rate': best_params['learning_rate'],
        'max_depth': best_params['max_depth'],
        'min_child_weight': best_params['min_child_weight'],
        'subsample': best_params['subsample'],
        'colsample_bytree': best_params['colsample_bytree']
    },
    'best_iteration': int(model_final.best_iteration),
    'best_score': float(model_final.best_score),
    'test_metrics': {
        'accuracy': float(test_accuracy),
        'precision': float(test_precision),
        'recall': float(test_recall),
        'f1_macro': float(test_f1_macro),
        'f1_weighted': float(test_f1_weighted)
    },
    'class_weights': {str(k): float(v) for k, v in class_weights.items()},
    'features_used': [
        'koi_duration', 'koi_duration_err1',
        'koi_depth', 'koi_depth_err1',
        'koi_model_snr', 'snr_log', 'snr_squared',
        'transit_depth_duration_ratio'
    ],
    'training_samples': int(X_train_val.shape[0]),
    'test_samples': int(X_test_enh.shape[0])
}

with open(f'{MODEL_PATH}/model_metadata_final.json', 'w') as f:
    json.dump(metadata, f, indent=4)
print("Métadonnées: model_metadata_final.json")

print("\n" + "="*80)
print("PIPELINE COMPLET TERMINÉ")
print("="*80)
print(f"\nFichiers dans {MODEL_PATH}/:")
print("   - hyperparameter_tuning_results.csv")
print("   - exoplanet_xgboost_final.json")
print("   - model_metadata_final.json")
print("   - confusion_matrix_final.png")
print("   - feature_importance_final.png")

XGBOOST - HYPERPARAMETER TUNING + FINAL TRAINING

1. VÉRIFICATION CUDA
--------------------------------------------------------------------------------
XGBoost version: 2.1.4
Support GPU: OUI

2. CHARGEMENT DES DONNÉES
--------------------------------------------------------------------------------
Train:        6694 samples,  13 features
Validation:    956 samples,  13 features
Test:          957 samples,  13 features

Distribution des classes (Train):
   False Positive      :  3387 (50.60%)
   Candidate           :  1385 (20.69%)
   Confirmed           :  1922 (28.71%)

3. VÉRIFICATION DES FEATURES CRITIQUES
--------------------------------------------------------------------------------
   ✓ koi_duration
   ✓ koi_duration_err1
   ✓ koi_depth
   ✓ koi_depth_err1
   ✓ koi_model_snr

4. FEATURE ENGINEERING
--------------------------------------------------------------------------------
   ✓ transit_depth_duration_ratio
   ✓ snr_log, snr_squared
   ✓ transit_depth_duration_ratio
   ✓ sn