# Libraries

In [1]:
import sys
import os

# Extend the import path so that the project-local `data_preprocessing` package can be found.
# os.path.dirname('weld_quality_prediction') is '' (the string has no path separator), so the
# expression below resolves to the parent of the current working directory.
# NOTE(review): this presumably expects the notebook to be run from a direct sub-folder of the
# project root — confirm. Re-running the cell appends the same entry again.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('weld_quality_prediction'), '..')))

# NOTE(review): the star import hides which names it provides; replace_data, choose_labels and
# pipeline_training_set (used further down) presumably come from here — confirm.
# Names imported on the following lines take precedence over any same-named star-imported ones.
from data_preprocessing.functions import *
from sklearn.model_selection import train_test_split
import warnings
import pandas as pd
import numpy as np

from itertools import product
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


# Suppress all FutureWarnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Data Extraction

In [2]:
# Load the raw weld table: single-space-separated values, no header row in the file.
data = pd.read_csv(
    '../data/welddb.data',
    header=None,
    sep=' ',
)

# Names for the 44 columns of the raw table, in file order (applied to `data` further down).
column_names = [
    # Element concentrations.
    *(
        f"{element}_concentration"
        for element in (
            "Carbon", "Silicon", "Manganese", "Sulphur", "Phosphorus",
            "Nickel", "Chromium", "Molybdenum", "Vanadium", "Copper",
            "Cobalt", "Tungsten", "Oxygen", "Titanium", "Nitrogen",
            "Aluminium", "Boron", "Niobium", "Tin", "Arsenic", "Antimony",
        )
    ),
    # Process settings (grouped again in the physical_* lists below).
    "Current", "Voltage", "AC_or_DC", "Electrode_positive_or_negative",
    "Heat_input", "Interpass_temperature", "Type_of_weld",
    "Post_weld_heat_treatment_temperature", "Post_weld_heat_treatment_time",
    # Measured quantities (the candidate labels, see label_names).
    "Yield_strength", "Ultimate_tensile_strength", "Elongation", "Reduction_of_Area",
    "Charpy_temperature", "Charpy_impact_toughness", "Hardness", "50%_FATT",
    "Primary_ferrite_in_microstructure", "Ferrite_with_second_phase",
    "Acicular_ferrite", "Martensite", "Ferrite_with_carbide_aggregate",
    # Identifier.
    "Weld_ID",
]

# Sulphur and phosphorus are kept apart from the other concentration columns.
sulphur_and_phosphorus_columns = [
    f"{element}_concentration" for element in ("Sulphur", "Phosphorus")
]

# Every remaining concentration column.
# NOTE(review): 'Nitrogen_concentration_residual' is not one of the raw column_names entries;
# presumably it is created during preprocessing — confirm.
other_concentration_columns = [
    "Carbon_concentration", "Silicon_concentration", "Manganese_concentration",
    "Nickel_concentration", "Chromium_concentration", "Molybdenum_concentration",
    "Vanadium_concentration", "Copper_concentration", "Cobalt_concentration",
    "Tungsten_concentration", "Oxygen_concentration", "Titanium_concentration",
    "Nitrogen_concentration", 'Nitrogen_concentration_residual',
    "Aluminium_concentration", "Boron_concentration", "Niobium_concentration",
    "Tin_concentration", "Arsenic_concentration", "Antimony_concentration",
]

# Columns that can serve as prediction targets.
# NOTE(review): 'Hardness_load' is not a raw column either; presumably derived later — confirm.
label_names = [
    'Yield_strength', 'Ultimate_tensile_strength', 'Elongation', 'Reduction_of_Area',
    'Charpy_temperature', 'Charpy_impact_toughness', 'Hardness', '50%_FATT',
    'Primary_ferrite_in_microstructure', 'Ferrite_with_second_phase',
    'Acicular_ferrite', 'Martensite', 'Ferrite_with_carbide_aggregate',
    'Hardness_load',
]

# Process settings, split into an "ordinal" group and a "categorical" group.
physical_ordinal_properties_columns = [
    'Current', 'Voltage', 'Heat_input', 'Interpass_temperature',
    'Post_weld_heat_treatment_temperature', 'Post_weld_heat_treatment_time',
]

physical_categorical_properties_columns = ['AC_or_DC', 'Electrode_positive_or_negative', 'Type_of_weld']

# The frame was read with header=None, so attach the column names listed above;
# pandas raises if the number of names differs from the number of columns.
data.columns = column_names

# Model Training

# K-Fold Cross-Validation

In [3]:
from itertools import product
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

class ModelSelector:
    """Pick data-pipeline and model hyperparameters by K-fold cross-validation.

    Intended call order: ``extract_x_y()`` -> ``split()`` -> ``cross_validation()`` ->
    ``score()``. Each step reads attributes stored by the previous one.

    The helpers ``replace_data``, ``choose_labels`` and ``pipeline_training_set`` are not
    defined here; they must exist in the enclosing namespace (presumably provided by
    ``data_preprocessing.functions`` — confirm).

    Parameters (all keyword-only)
    -----------------------------
    model_class : callable
        Called with the model hyperparameters as keyword arguments; the result must offer
        ``fit(X, y)`` and ``predict(X)``.
    labels_selected : list
        Label column names; rows lacking any of them are dropped in ``extract_x_y``.
    data
        Raw table handed to ``replace_data``.
    param_grid_data : dict
        Maps each data-pipeline option to its candidate values. The keys
        'CategoricalStrategies', 'OrdinalStrategies', 'ScalerStrategy', 'PcaColumns' and
        'LessThanStrategy' are required. 'pca_percent_explained_variance' is optional and
        falls back to ``DEFAULT_PCA_EXPLAINED_VARIANCE`` when absent.
    param_grid_model : dict
        Maps each ``model_class`` keyword argument to its candidate values.
    kfold_splits : int
        Number of folds used by ``cross_validation``.

    Attributes set by the methods
    -----------------------------
    X, y : set by ``extract_x_y``.
    X_train, X_test, y_train, y_test : set by ``split``.
    best_params, best_rmse : set by ``cross_validation``.
    test_rmse : set by ``score``.
    """

    # Value that used to be hard-coded in the pipeline calls; kept as the fallback so grids
    # without a 'pca_percent_explained_variance' entry behave as before.
    DEFAULT_PCA_EXPLAINED_VARIANCE = 0.85

    def __init__(self, *, model_class, labels_selected : list, data, param_grid_data, param_grid_model, kfold_splits):
        self.model_class = model_class
        self.labels_selected = labels_selected
        self.data = data
        self.param_grid_data = param_grid_data
        self.param_grid_model = param_grid_model
        self.kfold_splits = kfold_splits

    def extract_x_y(self):
        """Clean the raw data, keep fully-labelled rows and store features/labels in ``X``/``y``."""
        # Per the original author's note, replace_data turns 'N' placeholders into NaN.
        data = replace_data(self.data)

        # Supervised learning needs every selected label to be present.
        has_all_labels = data[self.labels_selected].notna().all(axis=1)
        data_with_label = data[has_all_labels]

        # Separate features from labels.
        self.X, self.y = choose_labels(data_with_label, labels_chosen=self.labels_selected)

    def split(self, test_size=0.2, random_state=42):
        """Shuffle and split ``X``/``y`` into train and test parts.

        ``test_size`` and ``random_state`` are forwarded to ``train_test_split``
        (``test_size`` was previously ignored in favour of a hard-coded 0.2).
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=test_size, shuffle=True, random_state=random_state
        )

    def _model_kwargs(self, params):
        """Return the entries of ``params`` that belong to the model grid."""
        return {name: value for name, value in params.items() if name in self.param_grid_model}

    def _apply_pipeline(self, params, X_fit, y_fit, X_eval, y_eval):
        """Run ``pipeline_training_set`` with the data options from ``params``."""
        return pipeline_training_set(
            training_set=X_fit, training_labels=y_fit,
            testing_set=X_eval, testing_labels=y_eval,
            labels_chosen=self.labels_selected,
            categorical_strategy=params['CategoricalStrategies'],
            ordinal_strategy=params['OrdinalStrategies'],
            is_PCA=True,
            # Honour the grid value instead of silently overriding it.
            pca_percent_explained_variance=params.get(
                'pca_percent_explained_variance', self.DEFAULT_PCA_EXPLAINED_VARIANCE
            ),
            scaler_strategy=params['ScalerStrategy'],
            pca_columns=params['PcaColumns'],
            less_than_strategy=params['LessThanStrategy'],
        )

    def _fit_and_rmse(self, params, X_fit, y_fit, X_eval, y_eval):
        """Preprocess, fit a fresh model on the fit part and return its RMSE on the eval part."""
        X_fit, X_eval, y_fit, y_eval = self._apply_pipeline(params, X_fit, y_fit, X_eval, y_eval)
        model = self.model_class(**self._model_kwargs(params))
        model.fit(X_fit, y_fit)
        y_pred = model.predict(X_eval)
        # scikit-learn's signature is (y_true, y_pred).
        return np.sqrt(mean_squared_error(y_eval, y_pred))

    def cross_validation(self):
        """Evaluate every hyperparameter combination with K-fold CV on the training split.

        Stores the combination with the lowest mean RMSE in ``best_params`` and that RMSE in
        ``best_rmse``, then prints both. The first combination wins on ties.

        Raises
        ------
        ValueError
            If the grids yield no combination (some option has no candidate values).
        """
        # The preprocessing is re-run inside every fold, on that fold's data only.
        kfolds = KFold(n_splits=self.kfold_splits)

        # Merge both grids; a key present in both takes the model grid's candidates.
        param_grid_total = {**self.param_grid_data, **self.param_grid_model}
        keys = list(param_grid_total)

        # (mean RMSE, params) per combination; a list avoids requiring hashable grid values.
        results = []
        for combo in product(*param_grid_total.values()):
            params = dict(zip(keys, combo))
            fold_rmses = [
                self._fit_and_rmse(
                    params,
                    self.X_train.iloc[train_index, :], self.y_train.iloc[train_index],
                    self.X_train.iloc[val_index, :], self.y_train.iloc[val_index],
                )
                for train_index, val_index in kfolds.split(self.X_train, self.y_train)
            ]
            results.append((np.mean(fold_rmses), params))

        if not results:
            raise ValueError("The hyperparameter grids produce no combination to evaluate")

        # Keep the combination with the lowest mean RMSE.
        self.best_rmse, self.best_params = min(results, key=lambda entry: entry[0])

        print(f"Best hyperparameters with cross-validation : {self.best_params}")
        print(f"Best Mean RMSE on cross-validation : {self.best_rmse}")

    def score(self):
        """Refit with ``best_params`` on the training split and print the test-set RMSE.

        Requires ``cross_validation`` to have run. The value is also kept in ``test_rmse``.
        """
        rmse = self._fit_and_rmse(
            self.best_params, self.X_train, self.y_train, self.X_test, self.y_test
        )
        self.test_rmse = rmse
        print(f"Final RMSE on test set : {rmse}")


In [4]:
# Hyperparameter grid for the data-preparation step: each option maps to its candidate values.
param_grid_data = dict(
    OrdinalStrategies=["mean"],
    CategoricalStrategies=["most_frequent"],
    ScalerStrategy=["standard"],
    pca_percent_explained_variance=[0.80],
    PcaColumns=['concentration'],
    LessThanStrategy=['max', 'mean'],
)

# Hyperparameter grid for the model itself.
# Candidates currently not enabled:
#   'Regularization': [None, 'l1', 'l2', 'elasticnet']
#   'Alpha': [0.01, 0.1, 1, 10]   (only applies when 'l1' or 'l2' is selected)
param_grid_linear = dict(fit_intercept=[True, False])


In [5]:
# Grid-search a linear regression that predicts the yield strength.
model = ModelSelector(
    model_class=LinearRegression,
    labels_selected=["Yield_strength"],
    data=data,
    param_grid_data=param_grid_data,
    param_grid_model=param_grid_linear,
    kfold_splits=5,
)

# The steps depend on each other and must run in this order.
model.extract_x_y()        # build X / y from the fully-labelled rows
model.split()              # hold out a test set
model.cross_validation()   # choose the best hyperparameters on the training part
model.score()              # report the RMSE on the held-out test set

NameError: name 'labels_selected' is not defined