In [1]:
import pandas as pd

In [2]:
# Location of the cleaned weld dataset (relative to the notebook's working directory)
welddb_csv_path = './welddb/welddb_cleaned_for_pca.csv'
df = pd.read_csv(welddb_csv_path)

### Handling missing values for the yield strength target with iterative imputation using the Bayesian ridge estimator

In [3]:
# Every target column other than 'Yield strength (MPa)'; these are removed so
# that only the yield-strength target remains alongside the features.
other_target_columns = [
    'Ultimate tensile strength (MPa)',
    'Elongation (%)',
    'Reduction of Area (%)',
    'Charpy temperature (deg C)',
    'Charpy impact toughness (J)',
    'Hardness (kgmm-2)',
    '50 % FATT',
    'Primary ferrite in microstructure (%)',
    'Ferrite with second phase (%)',
    'Acicular ferrite (%)',
    'Martensite (%)',
    'Ferrite with carbide aggregate (%)',
]

# NOTE: `df` is reassigned here, so re-running this cell without reloading the
# CSV raises a KeyError (the columns are already gone).
df = df.drop(columns=other_target_columns)


In [4]:
# Boolean mask: True for rows that have a value in 'Yield strength (MPa)'
has_yield_strength = df['Yield strength (MPa)'].notna()

# Keep only those rows; rows with a missing target are left out
df_yield_strength = df.loc[has_yield_strength]

# (rows, columns) of the filtered frame
df_yield_strength.shape

(780, 40)

In [5]:
# Per-column number of missing entries in the filtered frame, before imputation
null_counts = df_yield_strength.isna().sum()

# Retain only the columns that actually have at least one missing entry
missing_values_before_imputation = null_counts[null_counts > 0]

# Show the affected columns with their missing-value counts
missing_values_before_imputation

Post weld heat treatment temperature (deg C)     13
Post weld heat treatment time (hours)            13
Puissance (W)                                   113
dtype: int64

In [6]:
from sklearn.experimental import enable_iterative_imputer  # Enables the experimental IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge  # Estimator passed to the imputer

# Split off the feature columns so the target itself is not fed to the imputer
df_features = df_yield_strength.drop(columns=['Yield strength (MPa)'])

# Iterative imputer driven by a Bayesian ridge regressor; the seed is fixed
# so repeated runs produce the same imputed values.
iterative_imputer = IterativeImputer(
    estimator=BayesianRidge(),
    n_nearest_features=None,
    imputation_order='ascending',
    random_state=42,
)

# Fit on the features and fill their missing entries, then wrap the returned
# array in a DataFrame that carries the original feature column names.
# No index is passed, so the result gets a fresh default index.
imputed_feature_values = iterative_imputer.fit_transform(df_features)
df_imputed_iterative = pd.DataFrame(imputed_feature_values, columns=df_features.columns)

# Re-attach the target by position: `.values` strips the filtered frame's
# index so the assignment does not try to align on index labels.
df_imputed_iterative['Yield strength (MPa)'] = df_yield_strength['Yield strength (MPa)'].values

# Preview the first rows of the imputed dataset
df_imputed_iterative.head()


Unnamed: 0,Carbon concentration (weight%),Silicon concentration (weight%),Manganese concentration (weight%),Sulphur concentration (weight%),Phosphorus concentration (weight%),Nickel concentration (weight%),Chromium concentration (weight%),Molybdenum concentration (weight%),Vanadium concentration (weight%),Copper concentration (weight%),...,Type of weld_GTAA,Type of weld_MMA,Type of weld_NGGMA,Type of weld_NGSAW,Type of weld_SA,Type of weld_SAA,Type of weld_ShMA,Type of weld_TSA,Puissance (W),Yield strength (MPa)
0,-1.612394,-0.2542,-1.447095,-0.136415,-0.048693,-0.318155,-0.427452,-0.56357,-0.146903,-0.289924,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.527791,392.0
1,-1.612394,-0.2542,-1.447095,-0.136415,-0.048693,-0.318155,-0.427452,-0.56357,-0.146903,-0.289924,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.527791,370.0
2,-1.612394,-0.165249,-0.452386,-0.225627,0.05355,-0.318155,-0.427452,-0.56357,-0.146903,-0.289924,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.527791,413.0
3,-1.612394,-0.165249,-0.452386,-0.225627,0.05355,-0.318155,-0.427452,-0.56357,-0.146903,-0.289924,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.527791,402.0
4,-1.319395,0.190556,0.594677,-0.225627,0.05355,-0.318155,-0.427452,-0.56357,-0.146903,-0.289924,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.527791,468.0


### XGBoost training