In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# !pip install catboost (catboost-1.2.7)
from catboost import CatBoostRegressor

from sklearn.metrics import (
    classification_report, accuracy_score, roc_auc_score, mean_squared_error, confusion_matrix, r2_score
)

from xgboost import XGBRegressor

In [17]:
dataset = pd.read_parquet('../data/dataset_features.parquet')
target = 'total_cout_accident'

In [23]:
dataset.dtypes

conducteur_vieux                           float64
jeune_conducteur                           float64
est_une_femme                              float64
est_equipe_securite                        float64
est_equipe_securite_2                      float64
est_un_conducteur                            int64
limite_de_vitesse_elevee                     int64
limite_de_vitesse_faible                     int64
est_une_route_principale                     int64
est_en_agglomeration                         int64
conditions_lumineuses_defavorables         float64
conditions_meteorologiques_defavorables    float64
accident_pendant_la_journee                  int64
a_hurte_un_obstacle_fixe                   float64
a_hurte_un_obstacle_mobile                 float64
est_une_voie_a_sens_unique                 float64
est_une_route_etroite                      float64
est_une_route_condition_normale            float64
est_un_vehicule_leger                      float64
est_une_voiture_personelle     

In [18]:
X_train, X_test, y_train, y_test = train_test_split(dataset.drop(target, axis=1), 
                                                    dataset[target], 
                                                    test_size=0.2, 
                                                    random_state=111)

In [19]:
def output_performance(model):
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f'Mean Squared Error (MSE): {mse:.4f}')
    print(f'Root Mean Squared Error (RMSE): {rmse:.4f}')
    print(f'R-squared (R²): {r2:.4f}')

In [25]:
y_train.describe()

count    223148.000000
mean          3.187907
std           3.555951
min           1.000000
25%           1.000000
50%           1.000000
75%           5.000000
max          80.000000
Name: total_cout_accident, dtype: float64

In [None]:
mod_hist = HistGradientBoostingRegressor(max_iter=10000, random_state=111)
mod_hist.fit(X_train, y_train)
#le modele se trompe de 3 points en moyenne
output_performance(mod_hist)

Mean Squared Error (MSE): 7.4954
Root Mean Squared Error (RMSE): 2.7378
R-squared (R²): 0.4288


In [16]:
xgb = XGBRegressor(max_depth=10, n_estimators=800, random_state=111, min_child_weight=3, learning_rate=0.01) 
xgb.fit(X_train, y_train)

output_performance(xgb)

Mean Squared Error (MSE): 7.5656
Root Mean Squared Error (RMSE): 2.7506
R-squared (R²): 0.4235


In [14]:
cat = CatBoostRegressor(iterations=1000, learning_rate=0.1, depth=10, random_state=111)
cat.fit(X_train, y_train)

output_performance(cat)

0:	learn: 3.4157906	total: 208ms	remaining: 3m 27s
1:	learn: 3.2965464	total: 261ms	remaining: 2m 10s
2:	learn: 3.1972156	total: 313ms	remaining: 1m 44s
3:	learn: 3.1130755	total: 369ms	remaining: 1m 31s
4:	learn: 3.0414539	total: 425ms	remaining: 1m 24s
5:	learn: 2.9831451	total: 502ms	remaining: 1m 23s
6:	learn: 2.9329780	total: 556ms	remaining: 1m 18s
7:	learn: 2.8921976	total: 616ms	remaining: 1m 16s
8:	learn: 2.8579142	total: 675ms	remaining: 1m 14s
9:	learn: 2.8295565	total: 742ms	remaining: 1m 13s
10:	learn: 2.8070818	total: 792ms	remaining: 1m 11s
11:	learn: 2.7872561	total: 837ms	remaining: 1m 8s
12:	learn: 2.7712297	total: 885ms	remaining: 1m 7s
13:	learn: 2.7575161	total: 946ms	remaining: 1m 6s
14:	learn: 2.7458774	total: 994ms	remaining: 1m 5s
15:	learn: 2.7363281	total: 1.04s	remaining: 1m 3s
16:	learn: 2.7280875	total: 1.08s	remaining: 1m 2s
17:	learn: 2.7214826	total: 1.12s	remaining: 1m 1s
18:	learn: 2.7160474	total: 1.17s	remaining: 1m
19:	learn: 2.7105388	total: 1.22s

In [None]:

# Séparer les colonnes numériques et non numériques
numerical_columns, non_numerical_columns = split_columns(dataset)
 
# Exclure la colonne cible des colonnes d'entrée
numerical_columns = [col for col in numerical_columns if col != 'gravite_blessure']
 
# Définir les variables d'entrée et la cible
X = dataset.drop(columns=['gravite_blessure'])
y = dataset['gravite_blessure']
 
# Diviser les ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 
# Vérifier les colonnes disponibles dans X_train
available_columns = X_train.columns.tolist()
 
# Filtrer les colonnes disponibles
numerical_columns = [col for col in numerical_columns if col in available_columns]
non_numerical_columns = [col for col in non_numerical_columns if col in available_columns]
 
# Vérifier les colonnes
print("Colonnes numériques  :", numerical_columns)
print("Colonnes non numériques  :", non_numerical_columns)
 
# Pipeline de prétraitement
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
 
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_columns),
        ('cat', categorical_transformer, non_numerical_columns)
    ]
)
 
# Transformation des données
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
 
# Modèles de classification supervisée utiliser
models = {
    "Logistic Regression": LogisticRegression(max_iter=500, random_state=42, verbose=2),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    #"Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, verbose=2),
    "Support Vector Machine": SVC(kernel='linear', probability=True, random_state=42,verbose=2),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    #"XGBoost": XGBClassifier(objective='multi:softmax', num_class=5, random_state=42)
}
 
# Initialisation pour stocker les résultats
results = {}
 
for model_name, model in models.items():
    print(f"\n--- Modèle : {model_name} ---")
   
    # Entraînement
    model.fit(X_train_transformed, y_train)
   
    # Prédictions
    y_pred = model.predict(X_test_transformed)
    y_pred_prob = model.predict_proba(X_test_transformed) if hasattr(model, "predict_proba") else None
   
    # Métriques d'évaluation
    accuracy = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
   
    # AUC
    if y_pred_prob is not None:
        y_test_onehot = pd.get_dummies(y_test).values  # Encodage one-hot pour le calcul de l'AUC
        auc = roc_auc_score(y_test_onehot, y_pred_prob, multi_class='ovr', average='weighted')
    else:
        auc = None
   
    # RMSE
    if y_pred_prob is not None:
        rmse = np.sqrt(mean_squared_error(y_test_onehot, y_pred_prob))
    else:
        rmse = None  
   
    # Rapport de classification
    print("Rapport de classification :")
    print(classification_report(y_test, y_pred))
   
    # Résultats
    results[model_name] = {
        "Accuracy": accuracy,
        "AUC": auc,
        "RMSE": rmse,
        "Confusion Matrix": cm
    }
 
# Réseau de neurones pour la classification multi-classes
y_train_encoded = pd.get_dummies(y_train).values  # Encodage OneHot pour y_train
y_test_encoded = pd.get_dummies(y_test).values    # Encodage OneHot pour y_test
 
nn_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_transformed.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(len(y.unique()), activation='softmax')  # Nombre de classes = len(y.unique())
])
nn_model.compile(optimizer=Adam(learning_rate=0.001),
                 loss='categorical_crossentropy',
                 metrics=['accuracy'])
 
print("\n--- Réseau de Neurones ---")
nn_model.fit(X_train_transformed, y_train_encoded, epochs=20, batch_size=32, verbose=1)
 
# Évaluation du réseau de neurones
nn_loss, nn_accuracy = nn_model.evaluate(X_test_transformed, y_test_encoded, verbose=0)
y_pred_prob_nn = nn_model.predict(X_test_transformed)
nn_auc = roc_auc_score(y_test_encoded, y_pred_prob_nn, multi_class='ovr', average='weighted')
nn_rmse = np.sqrt(mean_squared_error(y_test_encoded, y_pred_prob_nn))
 
print("\nRésultats Réseau de Neurones:")
print(f"Accuracy: {nn_accuracy:.4f}")
print(f"AUC: {nn_auc:.4f}")
print(f"RMSE: {nn_rmse:.4f}")
 
# Résumé final des performances
print("\n--- Résumé des performances ---")
for model_name, metrics in results.items():
    print(f"{model_name}:")
    for metric, value in metrics.items():
        if metric == "Confusion Matrix":
            print(f"  {metric}:\n{value}")
        else:
            print(f"  {metric}: {value:.4f}" if value is not None else f"  {metric}: N/A")
print(f"Neural Network:")
print(f"  Accuracy: {nn_accuracy:.4f}")
print(f"  AUC: {nn_auc:.4f}")
print(f"  RMSE: {nn_rmse:.4f}")