In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
import statsmodels.api as sm 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

import warnings
warnings.filterwarnings("ignore")

print("Début du notebook.")

Début du notebook.


In [None]:
X_train = pd.read_csv("../data/processed/X_train.csv")
X_test = pd.read_csv("../data/processed/X_test.csv")
y_train = pd.read_csv("../data/processed/y_train.csv").squeeze()
y_test = pd.read_csv("../data/processed/y_test.csv").squeeze()

X_train = X_train.drop(columns=["z (Wh/km)", "Erwltp (g/km)", "Fuel consumption", "Electric range (km)"])
X_test = X_test.drop(columns=["z (Wh/km)", "Erwltp (g/km)", "Fuel consumption", "Electric range (km)"])

In [3]:
models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso":Lasso(),
    "Elastic Net":ElasticNet(),
    "Random Forest": RandomForestRegressor(),
    "KNN": KNeighborsRegressor()
}

results = [] 

for name, model in models.items():
    
    model.fit(X_train, y_train)
    y_pred=model.predict(X_test)
    
    r2_cross_val = cross_val_score(model,X_train,y_train).mean()
    r2_train = model.score(X_train, y_train) 
    r2_test = model.score(X_test, y_test) 
    mae = mean_absolute_error(y_test, y_pred) 
    mse = mean_squared_error(y_test, y_pred) 
    rmse = np.sqrt(mse)
    
    results.append({ 
        'Modèle': name,
        'R² Cross Validation': r2_cross_val,
        'R² Train': r2_train,
        'R² Test': r2_test, 
        'MAE': mae, 
        'MSE': mse, 
        'RMSE': rmse 
    })

results_df = pd.DataFrame(results).sort_values(by='R² Test', ascending=False)
print("\nRésumé des performances :") 
print(results_df)


Résumé des performances :
              Modèle  R² Cross Validation  R² Train   R² Test        MAE  \
4      Random Forest         9.949226e-01  0.998012  0.993935   2.426078   
5                KNN         9.898879e-01  0.994041  0.989878   3.294370   
1              Ridge         9.344312e-01  0.934468  0.935798  12.855034   
0  Linear Regression        -2.001800e+21  0.934468  0.935797  12.855326   
2              Lasso         9.266005e-01  0.926648  0.927446  13.687712   
3        Elastic Net         8.777564e-01  0.877803  0.877820  17.637387   

          MSE       RMSE  
4   28.753167   5.362198  
5   47.989274   6.927429  
1  304.390133  17.446780  
0  304.395216  17.446926  
2  343.986807  18.546881  
3  579.268673  24.068001  


In [4]:
#Feature selection by Backward Elimination 

X_train_be = sm.add_constant(X_train) # Ajoute l’intercept (constante) 
model_be = sm.OLS(y_train, X_train_be).fit() 
print("\n [Backward Elimination - p-values]") 
print(model_be.summary()) 
#On sélectionne uniquement les variables avec p-value < 0.05 
selected_features = model_be.pvalues[model_be.pvalues < 0.05].index.tolist() 
selected_features = [f for f in selected_features if f != 'const'] # On enlève l'intercept 
print('Nombre de variables initiales :', len(X_train.columns)) 
print('Nombre de variables retenues :', len(selected_features)) 
print(f"\nVariables retenues après backward elimination : {selected_features}") 
#On garde uniquement les variables sélectionnées 
X_train_selected = X_train[selected_features] 
X_test_selected = X_test[selected_features] 


 [Backward Elimination - p-values]
                            OLS Regression Results                            
Dep. Variable:           Ewltp (g/km)   R-squared:                       0.934
Model:                            OLS   Adj. R-squared:                  0.934
Method:                 Least Squares   F-statistic:                 1.061e+05
Date:                Wed, 17 Dec 2025   Prob (F-statistic):               0.00
Time:                        12:44:16   Log-Likelihood:            -1.8248e+06
No. Observations:              424827   AIC:                         3.650e+06
Df Residuals:                  424769   BIC:                         3.650e+06
Df Model:                          57                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
co

In [5]:
models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso":Lasso(),
    "Elastic Net":ElasticNet(),
    "Random Forest": RandomForestRegressor(),
    "KNN": KNeighborsRegressor()
}

results_r = [] 

for name, model in models.items():
     
    model.fit(X_train_selected, y_train)
    y_pred=model.predict(X_test_selected)
    
    r2_cross_val = cross_val_score(model,X_train_selected,y_train).mean()
    r2_train = model.score(X_train_selected, y_train) 
    r2_test = model.score(X_test_selected, y_test) 
    mae = mean_absolute_error(y_test, y_pred) 
    mse = mean_squared_error(y_test, y_pred) 
    rmse = np.sqrt(mse)
    
    results_r.append({ 
        'Modèle': name,
        'R² Cross Validation': r2_cross_val,
        'R² Train': r2_train,
        'R² Test': r2_test, 
        'MAE': mae, 
        'MSE': mse, 
        'RMSE': rmse 
    })

results_r_df = pd.DataFrame(results_r).sort_values(by='R² Test', ascending=False) 
print("\nRésumé des performances sur les données réduites :") 
print(results_r_df)


Résumé des performances sur les données réduites :
              Modèle  R² Cross Validation  R² Train   R² Test        MAE  \
4      Random Forest             0.994104  0.997681  0.992326   2.524182   
5                KNN             0.986014  0.992072  0.985833   3.320510   
0  Linear Regression             0.792661  0.792732  0.792315  22.646579   
1              Ridge             0.792661  0.792732  0.792315  22.646586   
2              Lasso             0.785689  0.785729  0.785227  23.006620   
3        Elastic Net             0.722661  0.722703  0.722426  27.506783   

           MSE       RMSE  
4    36.384180   6.031930  
5    67.165903   8.195481  
0   984.655530  31.379221  
1   984.655536  31.379221  
2  1018.259356  31.910176  
3  1316.005301  36.276787  


In [None]:
#Optimisation des modèles ayant les meilleurs resultats 
#on continue avec toutes les features  
 
X_sub, _, y_sub, _ = train_test_split(X_train, y_train, train_size=0.4, random_state=42)

param_grids = { 
    "Random Forest": { 'model': RandomForestRegressor(random_state=42), 'params': { 'n_estimators': [100, 200], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5] } }, 
    "KNN": { 'model': KNeighborsRegressor(), 'params': { 'n_neighbors': list(range(3, 21, 2)), 'weights': ['uniform', 'distance'] } }}


#Pour stocker les résultats 
results_opt = [] 


for name, cfg in param_grids.items():  

    print(f"\nEntraînement du modèle {name}...") 

    grid = GridSearchCV(cfg['model'], cfg['params'], cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2) 
    grid.fit(X_train, y_train) 
 
    print(f"Meilleurs paramètres pour {name} : {grid.best_params_ if grid.best_params_ else '---'}") 
    print(f"Meilleur MSE : {-grid.best_score_:.4f}") 
 
    # Prédictions 
    y_pred = grid.predict(X_test) 
 
    # Métriques 
    r2 = r2_score(y_test, y_pred) 
    mae = mean_absolute_error(y_test, y_pred) 
    mse = mean_squared_error(y_test, y_pred) 
    rmse = np.sqrt(mse) 
 
    # Stockage dans un tableau 
    results_opt.append({ 
        'Modèle': name, 
        'R²': r2, 
        'MAE': mae, 
        'MSE': mse, 
        'RMSE': rmse 
    }) 

#Affichage des résultats sous forme de DataFrame 

results_opt_df = pd.DataFrame(results_opt).sort_values(by='R²', ascending=False) 
print("\nRésumé des performances des modèles optimisés:") 
print(results_opt_df) 


Entraînement du modèle Random Forest...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time= 7.8min
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=14.9min
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=15.1min
[CV] END max_depth=None, min_samples_split=5, n_estimators=200; total time=15.2min
[CV] END max_depth=10, min_samples_split=2, n_estimators=100; total time= 4.4min
[CV] END max_depth=10, min_samples_split=2, n_estimators=100; total time= 4.4min
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time= 8.7min
[CV] END max_depth=10, min_samples_split=5, n_estimators=100; total time= 4.4min
[CV] END max_depth=10, min_samples_split=5, n_estimators=100; total time= 4.4min
[CV] END max_depth=10, min_samples_split=5, n_estimators=200; total time= 8.7min
[CV] END max_depth=20, min_samples_split=2, n_estimators=100; total time= 6.8min

In [17]:
#Meilleur MSE : 24.2485
final_model = RandomForestRegressor(max_depth= None, min_samples_split= 5, n_estimators= 200)
final_model.fit(X_train, y_train)

    
#print('R² Cross Validation:',cross_val_score(final_model,X_train,y_train).mean())
y_pred=final_model.predict(X_test)
print('R² Train:',final_model.score(X_train, y_train) )
print('R² Test:',final_model.score(X_test, y_test) )
print('MAE:',mean_absolute_error(y_test, y_pred) )
print('MSE:',mean_squared_error(y_test, y_pred) )
print('RMSE:',np.sqrt(mean_squared_error(y_test, y_pred)))

R² Train: 0.9977759507337463
R² Test: 0.9938984243698631
MAE: 2.432486634179769
MSE: 28.928180638626962
RMSE: 5.37849241317927


In [8]:

final_model = RandomForestRegressor(max_depth= None, min_samples_split= 5, n_estimators= 200)
final_model.fit(X_train, y_train)
print("Random Forest Regressor")
feats = {}  
for feature, importance in zip(X_train.columns, final_model.feature_importances_):  
    feats[feature] = importance  
importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Importance'})  
importances.sort_values(by='Importance', ascending=False).head(10)


Random Forest Regressor


Unnamed: 0,Importance
ec (cm3),0.533727
Ft_PETROLPHEV,0.190991
At1 (mm),0.126229
ep (KW),0.047752
m (kg),0.038742
Ft_PETROLELECTRIC,0.026109
W (mm),0.012487
Ft_DIESELPHEV,0.009461
Fm_P,0.005868
At2 (mm),0.002644
