In [17]:
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error
from sklearn.preprocessing import StandardScaler

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Load the PCA-transformed dataset
df = pd.read_csv('Transformed data/pca_transformed_dataset_Yield_strength.csv')

# Split the target column off from the feature columns
y = df['Yield strength (MPa)']
X = df.drop('Yield strength (MPa)', axis=1)

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)




In [11]:
# Candidate regressors, keyed by the name that is also used to look up each
# model's hyperparameter grid.
# The tree-based estimators are randomized (feature/sample selection while
# growing trees), so their seed is pinned to the same value as the train/test
# split; without it the selected hyperparameters and reported metrics change
# from one run to the next.
models = {
    'LinearRegression': LinearRegression(),
    'RidgeRegression': Ridge(),
    'LassoRegression': Lasso(),
    'ElasticNetRegression': ElasticNet(),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=42),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42)
}


In [19]:
# Hyperparameter search spaces for the grid search, one entry per model name.
# An empty grid means the model is fitted once with its default settings.
param_grids = dict(
    LinearRegression=dict(),
    RidgeRegression=dict(alpha=[0.1, 1.0, 10.0, 100.0]),
    LassoRegression=dict(alpha=[0.1, 1.0, 10.0, 100.0]),
    ElasticNetRegression=dict(
        alpha=[0.1, 1.0, 10.0],
        l1_ratio=[0.2, 0.5, 0.8],
    ),
    DecisionTreeRegressor=dict(
        max_depth=[5, 10, 20, None],
        min_samples_split=[2, 10, 20],
    ),
    RandomForestRegressor=dict(
        n_estimators=[100, 200],
        max_depth=[10, 20, None],
    ),
    GradientBoostingRegressor=dict(
        n_estimators=[100, 200],
        learning_rate=[0.01, 0.1, 0.2],
        max_depth=[3, 5, 10],
    ),
)


In [20]:
# Scorers evaluated for every grid-search candidate.
# The three error metrics are wrapped with greater_is_better=False, which marks
# them as losses (lower is better); R2 uses scikit-learn's built-in 'r2' scorer.
scoring = {
    **{
        label: make_scorer(metric, greater_is_better=False)
        for label, metric in (
            ('MSE', mean_squared_error),
            ('RMSE', root_mean_squared_error),
            ('MAE', mean_absolute_error),
        )
    },
    'R2': 'r2',
}


Quelle est la meilleure métrique pour refit dans notre cas ?

Pour la prédiction de la Yield strength (MPa), une métrique comme le RMSE est souvent utile, car elle s'exprime dans la même unité que la variable cible (MPa), ce qui permet de comprendre facilement l'ampleur des erreurs.

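Le RMSE est donc un bon choix pour `refit` : il donne une mesure intuitive de l'erreur typique en MPa, tout en pénalisant davantage les grosses erreurs que le MAE. Cela facilitera l'interprétation de la performance du modèle. Comme le scorer est défini avec `greater_is_better=False`, GridSearchCV maximise l'opposé du RMSE, ce qui revient à retenir la configuration dont le RMSE est le plus faible.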

In [21]:
# Tune each candidate model with a 5-fold grid search, keep the refit winner,
# and evaluate it on the held-out test set.
best_models = {}
test_results = []  # one row of metrics per model, for a side-by-side comparison

for model_name, model in models.items():
    print(f"Training {model_name}...")

    # n_jobs=-1 spreads the (parameter combination x fold) fits over all CPU
    # cores; run sequentially, the larger grids take long enough that the
    # recorded run was interrupted before finishing.
    # The error scorers are negated, so refit='RMSE' keeps the lowest-RMSE candidate.
    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        scoring=scoring,
        refit='RMSE',
        cv=5,
        return_train_score=True,
        n_jobs=-1,
    )
    grid.fit(X_train, y_train)

    best_models[model_name] = grid.best_estimator_

    print(f"Best parameters for {model_name}: {grid.best_params_}")

    # Predict on the test set with the refit best estimator
    y_pred = grid.predict(X_test)

    # Compute and print the test metrics for this model
    mse = mean_squared_error(y_test, y_pred)
    rmse_val = root_mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"MSE for {model_name}: {mse}")
    print(f"RMSE for {model_name}: {rmse_val}")
    print(f"MAE for {model_name}: {mae}")
    print(f"R² for {model_name}: {r2}")
    print("\n")

    # Keep the numbers instead of only printing them, so the models can be
    # ranked after the loop. best_score_ is the mean cross-validated value of
    # the refit scorer (negated RMSE), hence the sign flip.
    test_results.append({
        'model': model_name,
        'CV RMSE': -grid.best_score_,
        'MSE': mse,
        'RMSE': rmse_val,
        'MAE': mae,
        'R2': r2,
    })

# Summary table, best (lowest) test RMSE first
results_df = pd.DataFrame(test_results).set_index('model').sort_values('RMSE')
results_df


Training LinearRegression...
Best parameters for LinearRegression: {}
MSE for LinearRegression: 5065.129082509918
RMSE for LinearRegression: 71.16972026437871
MAE for LinearRegression: 55.64773116015236
R² for LinearRegression: 0.43951421574595717


Training RidgeRegression...
Best parameters for RidgeRegression: {'alpha': 100.0}
MSE for RidgeRegression: 5141.937372424805
RMSE for RidgeRegression: 71.70730348036248
MAE for RidgeRegression: 55.5529556863696
R² for RidgeRegression: 0.4310149348966673


Training LassoRegression...
Best parameters for LassoRegression: {'alpha': 1.0}
MSE for LassoRegression: 5063.329078047133
RMSE for LassoRegression: 71.15707328191016
MAE for LassoRegression: 55.65621083466622
R² for LassoRegression: 0.43971339663484477


Training ElasticNetRegression...
Best parameters for ElasticNetRegression: {'alpha': 0.1, 'l1_ratio': 0.2}
MSE for ElasticNetRegression: 5084.195399717164
RMSE for ElasticNetRegression: 71.30354408945718
MAE for ElasticNetRegression: 55.5

KeyboardInterrupt: 