# Entraînement des Modèles de Détection de Fraudes

Ce notebook implémente l'entraînement des modèles Random Forest et XGBoost avec optimisation des hyperparamètres.

## Objectifs

- Charger et préparer les données avec features engineering
- Implémenter et entraîner Random Forest et XGBoost
- Optimiser les hyperparamètres avec validation croisée
- Gérer le déséquilibre des classes avec SMOTE
- Comparer les performances des modèles
- Sauvegarder le meilleur modèle

In [None]:
# Configuration and imports
import sys
import os
from pathlib import Path
import warnings
# Silence every warning category for the rest of the session. This keeps cell
# output short but also hides deprecation and convergence warnings.
warnings.filterwarnings('ignore')

# Put the parent of the current working directory on sys.path so that the
# `src.*` imports below resolve.
# NOTE(review): assumes the notebook is launched from a direct sub-directory
# of the project root - confirm.
ROOT_DIR = Path.cwd().parent
sys.path.append(str(ROOT_DIR))

# Core scientific stack
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import json

# Machine-learning imports (scikit-learn and imbalanced-learn)
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Hyperparameter-optimisation imports
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

# Project-local imports (resolved through the sys.path entry added above)
from src.data.data_loader import DataLoader
from src.data.preprocessor import Preprocessor
from src.data.feature_engineer import FeatureEngineer
from src.models.random_forest_model import RandomForestModel
from src.models.xgboost_model import XGBoostModel
from src.utils.metrics import calculate_metrics, pr_auc_score

# Plot defaults shared by every figure in the notebook
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

print("‚úÖ Environnement configur√© avec succ√®s")

## 1. Chargement des Données

In [None]:
# Load the dataset, preferring the feature-engineered CSV when it exists
print("üîÑ Chargement des donn√©es...")

# Output of the previous notebook; relative to the current working directory
features_path = "../data/processed/features_engineered.csv"
if not os.path.exists(features_path):
    # Fallback: rebuild the engineered features from the raw CSV.
    # NOTE(review): the feature engineer is fitted on the full dataset, before
    # any train/test split - check whether it learns statistics from the data.
    print("üìÇ Features engineering non trouv√©, chargement des donn√©es brutes...")
    data_loader = DataLoader()
    df = data_loader.load_data(file_path="../data/raw/creditcard.csv")

    feature_engineer = FeatureEngineer()
    df = feature_engineer.fit_transform(df)
else:
    print("üìÇ Chargement du dataset avec features engineering...")
    df = pd.read_csv(features_path)

# Quick sanity report: overall shape and relative class frequencies
print(f"üìä Donn√©es charg√©es : {df.shape}")
print(f"üìà Distribution des classes :\n{df['Class'].value_counts(normalize=True)}")

In [None]:
# Data preparation: separate the label, then carve out test and validation sets
print("üîÑ Pr√©paration des donn√©es...")

# 'Class' is the target; every other column is a feature
y = df['Class']
X = df.drop(columns='Class')

# Step 1: hold out 15% of all rows as the test set, stratified on the label
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, stratify=y, test_size=0.15, random_state=42
)

# Step 2: split the remaining 85% into train and validation.
# 0.176 of the remainder is ~15% of the full dataset (0.176 * 0.85 ~= 0.1496),
# which gives roughly a 70 / 15 / 15 train / validation / test partition.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, stratify=y_temp, test_size=0.176, random_state=42
)

print(f"üìä Train : {X_train.shape}, Validation : {X_val.shape}, Test : {X_test.shape}")

# Stratification should keep the class ratios nearly identical across splits
for split_name, split_target in (("train", y_train), ("validation", y_val), ("test", y_test)):
    class_ratios = split_target.value_counts(normalize=True).to_dict()
    print(f"üìà Ratio classes {split_name} : {class_ratios}")

## 2. Gestion du Déséquilibre des Classes

In [None]:
# Balance the training split with SMOTE. Only the training data is resampled;
# the validation and test splits keep their natural class ratio.
print("üîÑ Application de SMOTE...")

smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print(f"üìä Donn√©es avant SMOTE : {X_train.shape}")
print(f"üìä Donn√©es apr√®s SMOTE : {X_train_balanced.shape}")
print(f"üìà Distribution apr√®s SMOTE : {y_train_balanced.value_counts(normalize=True).to_dict()}")

# Side-by-side bar charts of the class counts before and after resampling
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

for ax, target, title in (
    (ax1, y_train, 'Distribution des Classes - Avant SMOTE'),
    (ax2, y_train_balanced, 'Distribution des Classes - Apr√®s SMOTE'),
):
    # value_counts() orders bars by frequency, and the order of ties (the
    # balanced case) is not specified. Sorting by class label guarantees that
    # bar 0 / bar 1 always match the hard-coded tick labels below.
    class_counts = target.value_counts().sort_index()
    class_counts.plot(kind='bar', ax=ax, color=['skyblue', 'salmon'])
    ax.set_title(title)
    ax.set_xlabel('Classe')
    ax.set_ylabel('Nombre d\'√©chantillons')
    ax.set_xticklabels(['L√©gitime (0)', 'Frauduleuse (1)'], rotation=0)

plt.tight_layout()
plt.show()

## 3. Entraînement de Base des Modèles

In [None]:
# Model evaluation helper
def evaluate_model(model, X_train, y_train, X_val, y_val, model_name):
    """Fit ``model`` on the training data and score it on train and validation.

    The model is fitted in place, so the caller's object is trained after the
    call.

    Args:
        model: Object exposing ``fit``, ``predict`` and ``predict_proba``.
        X_train: Training features.
        y_train: Training labels.
        X_val: Validation features.
        y_val: Validation labels.
        model_name: Display name used in the progress messages.

    Returns:
        A dict with keys ``'model'`` (the fitted model), ``'metrics_train'``
        and ``'metrics_val'`` (results of ``calculate_metrics``),
        ``'y_pred_val'`` (validation predictions) and ``'y_proba_val'``
        (second column of ``predict_proba`` on the validation set).
    """
    print(f"\nüîÑ √âvaluation de {model_name}...")

    model.fit(X_train, y_train)

    # Hard predictions first, then the second predict_proba column (assumed
    # to be the positive/fraud class - TODO confirm against the model wrappers)
    train_labels = model.predict(X_train)
    val_labels = model.predict(X_val)
    train_scores = model.predict_proba(X_train)[:, 1]
    val_scores = model.predict_proba(X_val)[:, 1]

    # calculate_metrics receives the scores as a single-column 2-D array
    results = {'model': model}
    results['metrics_train'] = calculate_metrics(
        y_train, train_labels, train_scores.reshape(-1, 1)
    )
    results['metrics_val'] = calculate_metrics(
        y_val, val_labels, val_scores.reshape(-1, 1)
    )

    # A large train/validation gap here is a sign of overfitting
    print(f"üìä {model_name} - Train PR-AUC: {results['metrics_train']['pr_auc']:.4f}")
    print(f"üìä {model_name} - Validation PR-AUC: {results['metrics_val']['pr_auc']:.4f}")

    results['y_pred_val'] = val_labels
    results['y_proba_val'] = val_scores
    return results

In [None]:
# Baseline models with fixed, hand-picked hyperparameters
print("üîÑ Entra√Ænement des mod√®les de base...")

# Random Forest baseline (n_jobs=-1 uses every available core)
rf_model = RandomForestModel(
    dict(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
)
rf_results = evaluate_model(
    rf_model, X_train_balanced, y_train_balanced, X_val, y_val,
    model_name="Random Forest",
)

# XGBoost baseline
xgb_model = XGBoostModel(
    dict(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)
)
xgb_results = evaluate_model(
    xgb_model, X_train_balanced, y_train_balanced, X_val, y_val,
    model_name="XGBoost",
)

In [None]:
# Side-by-side table of the baseline results (one row per model)
_baseline_runs = [('Random Forest', rf_results), ('XGBoost', xgb_results)]

models_comparison = pd.DataFrame({
    'Mod√®le': [label for label, _ in _baseline_runs],
    # Train vs. validation PR-AUC shows how much each model overfits
    'PR-AUC Train': [run['metrics_train']['pr_auc'] for _, run in _baseline_runs],
    'PR-AUC Validation': [run['metrics_val']['pr_auc'] for _, run in _baseline_runs],
    # The remaining columns are validation-set metrics
    'Precision': [run['metrics_val']['precision'] for _, run in _baseline_runs],
    'Recall': [run['metrics_val']['recall'] for _, run in _baseline_runs],
    'F1-Score': [run['metrics_val']['f1'] for _, run in _baseline_runs],
})

print("üìä Comparaison des performances de base :")
display(models_comparison.round(4))

## 4. Optimisation des Hyperparamètres

In [None]:
# Optuna objective functions
def objective_rf(trial):
    """Optuna objective for the Random Forest model.

    Samples one hyperparameter configuration from ``trial``, fits a
    ``RandomForestModel`` on the module-level ``X_train_balanced`` /
    ``y_train_balanced`` and scores it on ``X_val`` / ``y_val``.

    Args:
        trial: Optuna trial used to draw the hyperparameter values.

    Returns:
        The value of ``pr_auc_score`` on the validation set.
    """
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 500),
        max_depth=trial.suggest_int('max_depth', 10, 30),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 5),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
    )

    # Fixed settings are not part of the search: same seed, all cores
    candidate = RandomForestModel({**search_space, 'random_state': 42, 'n_jobs': -1})
    candidate.fit(X_train_balanced, y_train_balanced)

    # Score with the second predict_proba column on the untouched validation set
    fraud_scores = candidate.predict_proba(X_val)[:, 1]
    return pr_auc_score(y_val, fraud_scores)

def objective_xgb(trial):
    """Optuna objective for the XGBoost model.

    Samples one hyperparameter configuration from ``trial``, fits an
    ``XGBoostModel`` on the module-level ``X_train_balanced`` /
    ``y_train_balanced`` and scores it on ``X_val`` / ``y_val``.

    Args:
        trial: Optuna trial used to draw the hyperparameter values.

    Returns:
        The value of ``pr_auc_score`` on the validation set.
    """
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 500),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        # Learning rate is sampled on a log scale
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        subsample=trial.suggest_float('subsample', 0.6, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
        gamma=trial.suggest_float('gamma', 0, 5),
    )

    # The seed is fixed and not part of the search
    candidate = XGBoostModel({**search_space, 'random_state': 42})
    candidate.fit(X_train_balanced, y_train_balanced)

    # Score with the second predict_proba column on the untouched validation set
    fraud_scores = candidate.predict_proba(X_val)[:, 1]
    return pr_auc_score(y_val, fraud_scores)

In [None]:
# Hyperparameter search for the Random Forest
print("üîÑ Optimisation des hyperparam√®tres Random Forest...")

study_rf = optuna.create_study(
    direction='maximize',
    # Seeded sampler so the sequence of suggested configurations is repeatable
    sampler=TPESampler(seed=42),
    # NOTE: objective_rf never calls trial.report(), so this pruner has no
    # intermediate values to act on and no trial is ever pruned.
    pruner=MedianPruner()
)

# Stop after 20 trials or 300 seconds, whichever comes first
study_rf.optimize(objective_rf, n_trials=20, timeout=300)

print("‚úÖ Optimisation RF termin√©e")
# Was print(".4f"): a broken f-string that printed the literal text ".4f"
print(f"üìä Meilleur PR-AUC RF : {study_rf.best_value:.4f}")
print(f"üìä Meilleurs param√®tres RF : {study_rf.best_params}")

In [None]:
# Hyperparameter search for XGBoost
print("üîÑ Optimisation des hyperparam√®tres XGBoost...")

study_xgb = optuna.create_study(
    direction='maximize',
    # Seeded sampler so the sequence of suggested configurations is repeatable
    sampler=TPESampler(seed=42),
    # NOTE: objective_xgb never calls trial.report(), so this pruner has no
    # intermediate values to act on and no trial is ever pruned.
    pruner=MedianPruner()
)

# Stop after 20 trials or 300 seconds, whichever comes first
study_xgb.optimize(objective_xgb, n_trials=20, timeout=300)

print("‚úÖ Optimisation XGB termin√©e")
# Was print(".4f"): a broken f-string that printed the literal text ".4f"
print(f"üìä Meilleur PR-AUC XGB : {study_xgb.best_value:.4f}")
print(f"üìä Meilleurs param√®tres XGB : {study_xgb.best_params}")

In [None]:
# Retrain both models with the best hyperparameters found by Optuna
print("üîÑ Entra√Ænement des mod√®les optimis√©s...")

# study.best_params only contains the values drawn through trial.suggest_*.
# The fixed settings used inside the objective functions (seed, n_jobs) are
# added back so the refit matches the configuration of the best trial and
# stays reproducible.

# Optimised Random Forest
rf_optimized = RandomForestModel(
    {**study_rf.best_params, 'random_state': 42, 'n_jobs': -1}
)
rf_opt_results = evaluate_model(rf_optimized, X_train_balanced, y_train_balanced, X_val, y_val, "Random Forest Optimis√©")

# Optimised XGBoost
xgb_optimized = XGBoostModel({**study_xgb.best_params, 'random_state': 42})
xgb_opt_results = evaluate_model(xgb_optimized, X_train_balanced, y_train_balanced, X_val, y_val, "XGBoost Optimis√©")

In [None]:
# Final comparison: baseline vs. optimised version of each model
_candidate_runs = [
    ('RF Base', rf_results),
    ('RF Optimis√©', rf_opt_results),
    ('XGB Base', xgb_results),
    ('XGB Optimis√©', xgb_opt_results),
]

# Every column is a validation-set metric, one row per candidate
final_comparison = pd.DataFrame({
    'Mod√®le': [label for label, _ in _candidate_runs],
    'PR-AUC Validation': [run['metrics_val']['pr_auc'] for _, run in _candidate_runs],
    'Precision': [run['metrics_val']['precision'] for _, run in _candidate_runs],
    'Recall': [run['metrics_val']['recall'] for _, run in _candidate_runs],
    'F1-Score': [run['metrics_val']['f1'] for _, run in _candidate_runs],
})

print("üìä Comparaison finale des mod√®les :")
display(final_comparison.round(4))

# Pick the row with the highest validation PR-AUC.
# NOTE: the optimised models were tuned on this same validation set, so the
# comparison is optimistic in their favour.
best_model_idx = final_comparison['PR-AUC Validation'].idxmax()
best_model_name = final_comparison.loc[best_model_idx, 'Mod√®le']
best_pr_auc = final_comparison.loc[best_model_idx, 'PR-AUC Validation']

print(f"\nüèÜ MEILLEUR MOD√àLE : {best_model_name}")
# Was print(".4f"): a broken f-string that printed the literal text ".4f"
print(f"üìä PR-AUC Validation : {best_pr_auc:.4f}")

## 5. Validation Croisée Finale

In [None]:
# Cross-validation of the selected model
print("üîÑ Validation crois√©e finale...")

# Balanced training data plus the validation split. These two objects are
# kept as before because the final refit uses them.
X_train_full = pd.concat([X_train_balanced, X_val])
y_train_full = pd.concat([y_train_balanced, y_val])

# Resolve the winning row of the comparison table to its fitted model and
# hyperparameters. Every candidate is listed, so a baseline winner is no
# longer silently replaced by the optimised Random Forest. The baseline
# parameter dicts mirror the values used to build rf_model / xgb_model.
model_registry = {
    'RF Base': (
        rf_model,
        {'n_estimators': 100, 'max_depth': 10, 'random_state': 42, 'n_jobs': -1},
    ),
    'RF Optimis√©': (rf_optimized, study_rf.best_params),
    'XGB Base': (
        xgb_model,
        {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1, 'random_state': 42},
    ),
    'XGB Optimis√©': (xgb_optimized, study_xgb.best_params),
}
best_model, model_params = model_registry[best_model_name]

# Cross-validate on the ORIGINAL (non-resampled) rows and apply SMOTE inside
# the pipeline, so it is fitted on the training part of each fold only.
# Running CV on already-resampled data puts synthetic points interpolated
# from a fold's test rows into its training rows and inflates the scores.
X_cv = pd.concat([X_train, X_val])
y_cv = pd.concat([y_train, y_val])

cv_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    # cross_val_score clones the pipeline per fold, so the fitted estimator
    # held by best_model is not modified
    ('model', best_model.model),
])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    cv_pipeline, X_cv, y_cv,
    cv=cv, scoring='average_precision', n_jobs=-1
)

print("üìä Scores de validation crois√©e (PR-AUC) :")
# The prints below were print(".4f"): broken f-strings printing ".4f" verbatim
for i, score in enumerate(cv_scores, 1):
    print(f"  Fold {i} : {score:.4f}")
print(f"Moyenne : {cv_scores.mean():.4f}")
print(f"Ecart-type : {cv_scores.std():.4f}")

## 6. Évaluation Finale sur le Test Set

In [None]:
# Final evaluation on the held-out test set
print("üîÑ √âvaluation finale sur le test set...")

# Refit the selected model on the combined training + validation data
best_model.fit(X_train_full, y_train_full)

# Hard predictions and second predict_proba column for the test rows
y_pred_test = best_model.predict(X_test)
y_proba_test = best_model.predict_proba(X_test)[:, 1]

# Same metric helper (and same single-column score layout) as in evaluate_model
final_metrics = calculate_metrics(y_test, y_pred_test, y_proba_test.reshape(-1, 1))

print("üìä M√âTRIQUES FINALES SUR LE TEST SET :")
_reported_metrics = (
    ("PR-AUC", "pr_auc"),
    ("ROC-AUC", "roc_auc"),
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("F1-Score", "f1"),
    ("MCC", "mcc"),
)
for metric_label, metric_key in _reported_metrics:
    print(f"{metric_label} : {final_metrics[metric_key]:.4f}")

# Per-class precision / recall / F1 breakdown
print("\nüìã RAPPORT DE CLASSIFICATION D√âTAILL√â :")
_class_names = ['L√©gitime', 'Frauduleuse']
print(classification_report(y_test, y_pred_test, target_names=_class_names))

In [None]:
# Confusion matrix of the test-set predictions (rows: truth, columns: prediction)
cm = confusion_matrix(y_test, y_pred_test)

_class_names = ['L√©gitime', 'Frauduleuse']
_heatmap_options = dict(
    annot=True,          # write the count inside each cell
    fmt='d',             # counts are integers
    cmap='Blues',
    xticklabels=_class_names,
    yticklabels=_class_names,
)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, **_heatmap_options)
plt.title('Matrice de Confusion - Test Set')
plt.xlabel('Pr√©diction')
plt.ylabel('R√©alit√©')
plt.tight_layout()
plt.show()

In [None]:
# ROC and Precision-Recall curves for the test set
from sklearn.metrics import precision_recall_curve

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# ROC curve
fpr, tpr, _ = roc_curve(y_test, y_proba_test)
roc_auc = auc(fpr, tpr)

# The legend label was the literal '.2f' (broken f-string); show the AUC value
ax1.plot(fpr, tpr, color='darkorange', lw=2, label=f'Courbe ROC (AUC = {roc_auc:.2f})')
# Diagonal = performance of a random classifier
ax1.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax1.set_xlim([0.0, 1.0])
ax1.set_ylim([0.0, 1.05])
ax1.set_xlabel('Taux de Faux Positifs')
ax1.set_ylabel('Taux de Vrais Positifs')
ax1.set_title('Courbe ROC')
ax1.legend(loc="lower right")
ax1.grid(True)

# Precision-Recall curve
precision, recall, _ = precision_recall_curve(y_test, y_proba_test)
# Trapezoidal area under the PR curve (recall on the x axis)
pr_auc = auc(recall, precision)

# The legend label was the literal '.2f' (broken f-string); show the AUC value
ax2.plot(recall, precision, color='blue', lw=2, label=f'Courbe PR (AUC = {pr_auc:.2f})')
ax2.set_xlim([0.0, 1.0])
ax2.set_ylim([0.0, 1.05])
ax2.set_xlabel('Rappel')
ax2.set_ylabel('Pr√©cision')
ax2.set_title('Courbe Precision-Recall')
ax2.legend(loc="lower left")
ax2.grid(True)

plt.tight_layout()
plt.show()

## 7. Sauvegarde du Modèle

In [None]:
# Persist the final model together with a JSON metadata record
print("üíæ Sauvegarde du mod√®le final...")

import joblib
import os

# Make sure both output directories exist (no error if they already do)
for _output_dir in ("../models/trained", "../models/metadata"):
    os.makedirs(_output_dir, exist_ok=True)

# Serialise the whole model wrapper object
model_path = "../models/trained/best_model.pkl"
joblib.dump(best_model, model_path)
print(f"‚úÖ Mod√®le sauvegard√© : {model_path}")

# Summary of both Optuna studies, nested under its own key
_optuna_summary = {
    'rf_best_score': study_rf.best_value,
    'xgb_best_score': study_xgb.best_value,
    'rf_params': study_rf.best_params,
    'xgb_params': study_xgb.best_params,
}

# Metadata record describing the selected model and how it was evaluated
metadata = {
    'model_name': best_model_name,
    'model_type': type(best_model).__name__,
    'best_params': model_params,
    'final_metrics': final_metrics,
    'cv_scores': cv_scores.tolist(),
    'cv_mean': cv_scores.mean(),
    'cv_std': cv_scores.std(),
    'training_date': datetime.now().isoformat(),
    'data_shape': X_train_full.shape,
    'feature_names': X_train_full.columns.tolist(),
    'optuna_study': _optuna_summary,
}

metadata_path = "../models/metadata/model_metadata.json"
with open(metadata_path, 'w') as f:
    # default=str turns anything json cannot encode natively into its string
    # form, so such values are stored as text rather than numbers
    json.dump(metadata, f, indent=2, default=str)

print(f"‚úÖ M√©tadonn√©es sauvegard√©es : {metadata_path}")

## 8. Résumé et Conclusions

In [None]:
# Final recap of the training run
print("üéØ R√âSUM√â DE L'ENTRA√éNEMENT DES MOD√àLES")
print("=" * 60)

print(f"üèÜ MEILLEUR MOD√àLE : {best_model_name}")
print(f"üìä PR-AUC Validation Crois√©e : {cv_scores.mean():.4f} ¬± {cv_scores.std():.4f}")
print(f"üìä PR-AUC Test Final : {final_metrics['pr_auc']:.4f}")
print(f"üìä ROC-AUC Test Final : {final_metrics['roc_auc']:.4f}")

print("\nüîß TECHNIQUES UTILIS√âES :")
print("‚Ä¢ Feature Engineering avanc√©")
print("‚Ä¢ Gestion du d√©s√©quilibre avec SMOTE")
print("‚Ä¢ Optimisation des hyperparam√®tres avec Optuna")
print("‚Ä¢ Validation crois√©e stratifi√©e")
print("‚Ä¢ M√©triques adapt√©es aux classes d√©s√©quilibr√©es")

print("\nüìà AM√âLIORATIONS APPORT√âES :")
# These three lines were print(".4f"): broken f-strings that printed ".4f"
# verbatim. Report the validation PR-AUC gain of each optimised model over
# its baseline, then the best validation PR-AUC.
rf_gain = rf_opt_results['metrics_val']['pr_auc'] - rf_results['metrics_val']['pr_auc']
xgb_gain = xgb_opt_results['metrics_val']['pr_auc'] - xgb_results['metrics_val']['pr_auc']
print(f"‚Ä¢ Gain PR-AUC Random Forest (optimisation) : {rf_gain:+.4f}")
print(f"‚Ä¢ Gain PR-AUC XGBoost (optimisation) : {xgb_gain:+.4f}")
print(f"‚Ä¢ Meilleur PR-AUC Validation : {best_pr_auc:.4f}")

print("\nüíæ FICHIERS SAUVEGARD√âS :")
print(f"‚Ä¢ Mod√®le : {model_path}")
print(f"‚Ä¢ M√©tadonn√©es : {metadata_path}")
# NOTE(review): no code visible in this notebook writes feature_engineer.pkl;
# presumably it comes from the feature-engineering notebook - verify before
# relying on this line.
print("‚Ä¢ Features Engineer : ../models/feature_engineer.pkl")

print("\nüöÄ MOD√àLE PR√äT POUR LA PRODUCTION !")