In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# !pip install catboost (catboost-1.2.7)
from catboost import CatBoostRegressor

from sklearn.metrics import (
    classification_report, accuracy_score, roc_auc_score, mean_squared_error, confusion_matrix, r2_score
)
# import tensorflow as tf
# from tensorflow.models import Sequential
# from tensorflow.layers import Dense, Dropout
# from tensorflow.optimizers import Adam
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, XGBRegressor

In [42]:
dataset = pd.read_parquet('../data/dataset_features.parquet')
target = 'total_cout_accident'

In [43]:
X_train, X_test, y_train, y_test = train_test_split(dataset.drop(target, axis=1), 
                                                    dataset[target], 
                                                    test_size=0.2, 
                                                    random_state=111)

In [44]:
def output_performance(model):
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f'Mean Squared Error (MSE): {mse:.4f}')
    print(f'Root Mean Squared Error (RMSE): {rmse:.4f}')
    print(f'R-squared (R²): {r2:.4f}')

In [45]:
mod_hist = HistGradientBoostingRegressor()
mod_hist.fit(X_train, y_train)

output_performance(mod_hist)

Mean Squared Error (MSE): 10.8252
Root Mean Squared Error (RMSE): 3.2902
R-squared (R²): 0.1751


In [46]:
xgb = XGBRegressor(verbose=2) 
xgb.fit(X_train, y_train)

output_performance(xgb)

Parameters: { "verbose" } are not used.



Mean Squared Error (MSE): 10.8541
Root Mean Squared Error (RMSE): 3.2946
R-squared (R²): 0.1729


In [48]:
cat = CatBoostRegressor(verbose=0)
cat.fit(X_train, y_train)

output_performance(cat)

Mean Squared Error (MSE): 10.8344
Root Mean Squared Error (RMSE): 3.2916
R-squared (R²): 0.1744


In [None]:

# Séparer les colonnes numériques et non numériques
numerical_columns, non_numerical_columns = split_columns(dataset)
 
# Exclure la colonne cible des colonnes d'entrée
numerical_columns = [col for col in numerical_columns if col != 'gravite_blessure']
 
# Définir les variables d'entrée et la cible
X = dataset.drop(columns=['gravite_blessure'])
y = dataset['gravite_blessure']
 
# Diviser les ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 
# Vérifier les colonnes disponibles dans X_train
available_columns = X_train.columns.tolist()
 
# Filtrer les colonnes disponibles
numerical_columns = [col for col in numerical_columns if col in available_columns]
non_numerical_columns = [col for col in non_numerical_columns if col in available_columns]
 
# Vérifier les colonnes
print("Colonnes numériques  :", numerical_columns)
print("Colonnes non numériques  :", non_numerical_columns)
 
# Pipeline de prétraitement
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
 
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_columns),
        ('cat', categorical_transformer, non_numerical_columns)
    ]
)
 
# Transformation des données
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
 
# Modèles de classification supervisée utiliser
models = {
    "Logistic Regression": LogisticRegression(max_iter=500, random_state=42, verbose=2),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    #"Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, verbose=2),
    "Support Vector Machine": SVC(kernel='linear', probability=True, random_state=42,verbose=2),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    #"XGBoost": XGBClassifier(objective='multi:softmax', num_class=5, random_state=42)
}
 
# Initialisation pour stocker les résultats
results = {}
 
for model_name, model in models.items():
    print(f"\n--- Modèle : {model_name} ---")
   
    # Entraînement
    model.fit(X_train_transformed, y_train)
   
    # Prédictions
    y_pred = model.predict(X_test_transformed)
    y_pred_prob = model.predict_proba(X_test_transformed) if hasattr(model, "predict_proba") else None
   
    # Métriques d'évaluation
    accuracy = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
   
    # AUC
    if y_pred_prob is not None:
        y_test_onehot = pd.get_dummies(y_test).values  # Encodage one-hot pour le calcul de l'AUC
        auc = roc_auc_score(y_test_onehot, y_pred_prob, multi_class='ovr', average='weighted')
    else:
        auc = None
   
    # RMSE
    if y_pred_prob is not None:
        rmse = np.sqrt(mean_squared_error(y_test_onehot, y_pred_prob))
    else:
        rmse = None  
   
    # Rapport de classification
    print("Rapport de classification :")
    print(classification_report(y_test, y_pred))
   
    # Résultats
    results[model_name] = {
        "Accuracy": accuracy,
        "AUC": auc,
        "RMSE": rmse,
        "Confusion Matrix": cm
    }
 
# Réseau de neurones pour la classification multi-classes
y_train_encoded = pd.get_dummies(y_train).values  # Encodage OneHot pour y_train
y_test_encoded = pd.get_dummies(y_test).values    # Encodage OneHot pour y_test
 
nn_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_transformed.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(len(y.unique()), activation='softmax')  # Nombre de classes = len(y.unique())
])
nn_model.compile(optimizer=Adam(learning_rate=0.001),
                 loss='categorical_crossentropy',
                 metrics=['accuracy'])
 
print("\n--- Réseau de Neurones ---")
nn_model.fit(X_train_transformed, y_train_encoded, epochs=20, batch_size=32, verbose=1)
 
# Évaluation du réseau de neurones
nn_loss, nn_accuracy = nn_model.evaluate(X_test_transformed, y_test_encoded, verbose=0)
y_pred_prob_nn = nn_model.predict(X_test_transformed)
nn_auc = roc_auc_score(y_test_encoded, y_pred_prob_nn, multi_class='ovr', average='weighted')
nn_rmse = np.sqrt(mean_squared_error(y_test_encoded, y_pred_prob_nn))
 
print("\nRésultats Réseau de Neurones:")
print(f"Accuracy: {nn_accuracy:.4f}")
print(f"AUC: {nn_auc:.4f}")
print(f"RMSE: {nn_rmse:.4f}")
 
# Résumé final des performances
print("\n--- Résumé des performances ---")
for model_name, metrics in results.items():
    print(f"{model_name}:")
    for metric, value in metrics.items():
        if metric == "Confusion Matrix":
            print(f"  {metric}:\n{value}")
        else:
            print(f"  {metric}: {value:.4f}" if value is not None else f"  {metric}: N/A")
print(f"Neural Network:")
print(f"  Accuracy: {nn_accuracy:.4f}")
print(f"  AUC: {nn_auc:.4f}")
print(f"  RMSE: {nn_rmse:.4f}")