ML 2 — Entraînement d’un modèle prédictif sur le diabète

🩺 1. Chargement et préparation des données

In [1]:
# Notebook setup: data handling, model persistence, scikit-learn building
# blocks, evaluation metrics and plotting configuration.
import pandas as pd
import numpy as np
import joblib

# Modelling and data preparation
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Performance evaluation
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay

import matplotlib.pyplot as plt
import seaborn as sns
# Global plot styling. The two calls are order-dependent: the matplotlib style
# sheet is applied after the seaborn theme, so it presumably takes precedence
# wherever both define the same setting (confirm if the look matters).
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

⚙️ 2. Préparation du jeu d'entraînement et de test

In [None]:
# Cell purpose: declare output paths, load the train/test CSV files, split the
# training frame into features/target, and binary-encode the text columns of
# the test set.
print("--- 1. Configuration des chemins de fichiers ---")
OUTPUT_PREDICTIONS_FILE = 'predictions_test.csv'
MODEL_ARTIFACT = 'modele_diabete_XX.pkl'  # <-- Replace XX with your initials

print("\n--- 2. Chargement des ensembles Train et Test ---")

try:
    df_train = pd.read_csv('data/diabetes_clean.csv')
    df_test_raw = pd.read_csv('data/test_without_class.csv')
except FileNotFoundError as e:
    print(f"ERREUR : Fichier non trouvé - {e.filename}")
    # Re-raise rather than calling exit(): execution still stops here, but the
    # traceback is kept, a script run ends with a failure status, and a
    # notebook session is not torn down.
    raise

# Training set: every column except 'class' is a feature, 'class' is the target.
X_train_full = df_train.drop('class', axis=1)
y_train_full = df_train['class']

# Test set: keep the identifiers aside and drop them from the features.
test_ids = df_test_raw['ID']
X_test = df_test_raw.drop('ID', axis=1).copy()

# Normalise the test column names to lower snake_case.
# NOTE(review): presumably this matches the column names of the cleaned
# training file -- confirm against data/diabetes_clean.csv.
X_test.columns = [col.lower().replace(' ', '_') for col in X_test.columns]

GENDER_MAPPING = {'Female': 0, 'Male': 1}
YES_NO_MAPPING = {'No': 0, 'Yes': 1}


def _is_text_column(series):
    """Return True when *series* holds text (object or string dtype)."""
    return (
        pd.api.types.is_object_dtype(series)
        or pd.api.types.is_string_dtype(series)
    )


def _normalize_label(value):
    """Strip and capitalize string labels; leave any other value untouched."""
    return value.strip().capitalize() if isinstance(value, str) else value


for col in X_test.columns:
    if not _is_text_column(X_test[col]):
        continue

    mapping = GENDER_MAPPING if col == 'gender' else YES_NO_MAPPING

    # Tolerate case/whitespace variants ('male', ' Yes') so they are not
    # silently encoded as 0.
    encoded = X_test[col].map(_normalize_label).map(mapping)

    # Missing or unrecognised labels still fall back to 0 (best effort, as
    # before), but the fallback is now reported instead of being silent.
    unmapped = encoded.isna()
    if unmapped.any():
        print(
            f"ATTENTION : {int(unmapped.sum())} valeur(s) manquante(s) ou non "
            f"reconnue(s) dans '{col}' encodée(s) à 0."
        )
    X_test[col] = encoded.fillna(0).astype(int)

# Explicit check instead of `assert`, which is removed when Python runs with -O.
remaining_text_columns = [col for col in X_test.columns if _is_text_column(X_test[col])]
if remaining_text_columns:
    raise ValueError(
        "Certaines colonnes du test ne sont pas numériques : "
        f"{remaining_text_columns}"
    )
print("Encodage du test terminé ✅")

--- 1. Configuration des chemins de fichiers ---

--- 2. Chargement des ensembles Train et Test ---
-> Encodage binaire (0/1) des colonnes textuelles du fichier de test...
Aperçu des features encodées de Test :
   age  gender  polyuria  polydipsia  sudden_weight_loss  weakness  \
0   50       0         0           0                   0         1   
1   55       1         0           1                   0         1   
2   67       1         1           1                   0         1   
3   45       1         0           0                   0         0   
4   37       1         0           0                   0         0   

   polyphagia  genital_thrush  visual_blurring  itching  irritability  \
0           0               0                1        1             0   
1           0               1                0        0             1   
2           1               1                0        1             1   
3           1               1                0        0             0   
4

# ============================================================
# 3. PIPELINE DE PRÉTRAITEMENT ET ENTRAÎNEMENT FINAL
# ============================================================

In [3]:
# Cell purpose: build the preprocessing + Random Forest pipeline, estimate its
# performance with 5-fold cross-validation, then fit it on the full training set.
print("\n--- 3. Création et Entraînement du Pipeline Final ---")

# Standardisation of every numeric column. 'number' selects all numeric dtypes
# (int32, float32, ...), not only int64/float64, so no numeric feature slips
# past the scaler through the passthrough remainder.
numerical_features = X_train_full.select_dtypes(include='number').columns
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), numerical_features),
    ],
    remainder='passthrough',  # non-numeric columns, if any, are forwarded unchanged
)

# Random Forest classifier; random_state is fixed so runs are reproducible.
best_model = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight='balanced',
    random_state=42,
)

# Chaining preprocessing and model in one pipeline means the scaler is re-fit
# on the training part of each fold during cross-validation.
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', best_model),
])

# Cross-validation: the mean gives the expected score, the standard deviation
# across folds shows how stable the model is.
scores = cross_val_score(final_pipeline, X_train_full, y_train_full, cv=5)
print(f"Score moyen (CV=5) : {scores.mean():.3f}")
print(f"Écart-type (CV=5) : {scores.std():.3f}")

# Final fit on the whole training set.
final_pipeline.fit(X_train_full, y_train_full)
print("✅ Entraînement final terminé sur l'ensemble complet.")


--- 3. Création et Entraînement du Pipeline Final ---
Score moyen (CV=5) : 0.981
✅ Entraînement final terminé sur l'ensemble complet.
