# Preprocessing

<span style="color: cyan;">Load data</span>

In [72]:
# Define column names based on welddb.info
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

# Names of the 44 columns of welddb.data, in file order. The numbers in the
# comments are the 1-based column positions.
column_names = [
    # 1-12: chemical composition given in weight percent
    *(f'{element}_wt_pct' for element in (
        'carbon', 'silicon', 'manganese', 'sulphur', 'phosphorus', 'nickel',
        'chromium', 'molybdenum', 'vanadium', 'copper', 'cobalt', 'tungsten',
    )),
    # 13-21: chemical composition given in parts per million
    *(f'{element}_ppm' for element in (
        'oxygen', 'titanium', 'nitrogen', 'aluminium', 'boron', 'niobium',
        'tin', 'arsenic', 'antimony',
    )),
    # 22-30: welding process and post-weld heat treatment (PWHT) parameters
    'current_a', 'voltage_v', 'ac_dc', 'electrode_polarity',
    'heat_input_kj_per_mm', 'interpass_temp_c', 'weld_type',
    'pwht_temp_c', 'pwht_time_h',
    # 31-38: mechanical properties
    # NOTE(review): 'fatt50_pct' carries a percent suffix; confirm the unit
    # of this column against welddb.info.
    'yield_strength_mpa', 'ultimate_tensile_strength_mpa', 'elongation_pct',
    'reduction_area_pct', 'charpy_temp_c', 'charpy_toughness_j',
    'hardness_kg_per_mm2', 'fatt50_pct',
    # 39-43: microstructure fractions
    'primary_ferrite_pct', 'ferrite_second_phase_pct', 'acicular_ferrite_pct',
    'martensite_pct', 'ferrite_carbide_aggregate_pct',
    # 44: weld identifier
    'weld_id',
]

# Read the whitespace-delimited data file. It has no header row, so the
# column names are supplied explicitly; a bare 'N' marks a missing value.
data = pd.read_csv(
    'welddb/welddb.data',
    names=column_names,
    sep=r'\s+',
    na_values='N',
)
# Print the column summary (non-null counts and dtypes).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1652 entries, 0 to 1651
Data columns (total 44 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   carbon_wt_pct                  1652 non-null   float64
 1   silicon_wt_pct                 1652 non-null   float64
 2   manganese_wt_pct               1652 non-null   float64
 3   sulphur_wt_pct                 1648 non-null   object 
 4   phosphorus_wt_pct              1642 non-null   float64
 5   nickel_wt_pct                  697 non-null    float64
 6   chromium_wt_pct                784 non-null    float64
 7   molybdenum_wt_pct              793 non-null    object 
 8   vanadium_wt_pct                928 non-null    object 
 9   copper_wt_pct                  578 non-null    object 
 10  cobalt_wt_pct                  129 non-null    object 
 11  tungsten_wt_pct                75 non-null     object 
 12  oxygen_ppm                     1256 non-null   f

## Cleaning

<span style="color: cyan;">Handling columns with ambiguous numeric values</span>

In [73]:
def clean_numeric(val):
    """Convert censored strings like '<0.002' or '>10' to numeric values.

    Args:
        val: A single cell value (number, string, or missing).

    Returns:
        ``np.nan`` if ``val`` is missing, or if it is a '<'/'>' string whose
        remainder cannot be parsed as a number; half the bound for
        '<bound'; 1.5 times the bound for '>bound'; otherwise ``val``
        unchanged.
    """
    if pd.isna(val):
        return np.nan
    if not isinstance(val, str):
        return val

    # Tolerate surrounding whitespace so ' <0.002' is handled like '<0.002'.
    text = val.strip()
    if text.startswith('<'):
        factor = 0.5  # Use half the detection limit
    elif text.startswith('>'):
        factor = 1.5  # Use 1.5x the upper limit
    else:
        return val

    try:
        return float(text[1:]) * factor
    except ValueError:
        # Malformed bound (e.g. '<' or '<n/a'): report it as missing rather
        # than aborting the conversion of the whole column.
        return np.nan

# Separate numeric and categorical features: every column that is neither
# categorical nor the 'weld_id' identifier is treated as numeric.
categorical_features = ['ac_dc', 'electrode_polarity', 'weld_type']
numeric_feature_cols = [
    col for col in column_names
    if col not in categorical_features and col != 'weld_id'
]

# Apply cleaning to all numeric columns (features and targets)
for col in numeric_feature_cols:
    if col in data.columns:
        # Resolve '<x' / '>x' readings first, then coerce whatever is still
        # non-numeric to NaN.
        data[col] = pd.to_numeric(data[col].apply(clean_numeric), errors='coerce')

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1652 entries, 0 to 1651
Data columns (total 44 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   carbon_wt_pct                  1652 non-null   float64
 1   silicon_wt_pct                 1652 non-null   float64
 2   manganese_wt_pct               1652 non-null   float64
 3   sulphur_wt_pct                 1648 non-null   float64
 4   phosphorus_wt_pct              1642 non-null   float64
 5   nickel_wt_pct                  697 non-null    float64
 6   chromium_wt_pct                784 non-null    float64
 7   molybdenum_wt_pct              793 non-null    float64
 8   vanadium_wt_pct                928 non-null    float64
 9   copper_wt_pct                  578 non-null    float64
 10  cobalt_wt_pct                  129 non-null    float64
 11  tungsten_wt_pct                75 non-null     float64
 12  oxygen_ppm                     1256 non-null   f

<span style="color: cyan;">Dummy variables</span>

In [74]:
# Convert categorical features to dummy variables
print("\nConverting categorical features to dummy variables...")
categorical_dummies = []
for cat_col in categorical_features:
    if cat_col in data.columns:
        # Create dummy variables, drop first to avoid multicollinearity.
        # NOTE: with the pandas default dummy_na=False, a row whose category
        # is missing gets 0 in every dummy column, i.e. the same encoding as
        # the dropped first level.
        dummies = pd.get_dummies(data[cat_col], prefix=cat_col, drop_first=True, dtype=float)
        categorical_dummies.append(dummies)
        print(f"  {cat_col}: {len(dummies.columns)} dummy variable(s) created")

# Combine all dummies into one DataFrame
if categorical_dummies:
    categorical_df = pd.concat(categorical_dummies, axis=1)
    dummy_feature_names = categorical_df.columns.tolist()
else:
    # Keep the row index of `data` so the column-wise concat below stays
    # aligned even when no dummy column was produced.
    categorical_df = pd.DataFrame(index=data.index)
    dummy_feature_names = []

print(f"\nNumeric features ({len(numeric_feature_cols)}):")
print(f"{numeric_feature_cols}")
print(f"\nCategorical dummy features ({len(dummy_feature_names)}):")
print(f"{dummy_feature_names}\n\n")

# Complete dataframe: `data` without the raw categorical columns, plus the
# dummy columns. DataFrame.drop returns a new frame, so `data` is untouched.
with_dummies = pd.concat(
    [data.drop(columns=categorical_features), categorical_df],
    axis=1,
)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
with_dummies.info()




Converting categorical features to dummy variables...
  ac_dc: 1 dummy variable(s) created
  electrode_polarity: 2 dummy variable(s) created
  weld_type: 9 dummy variable(s) created

Numeric features (40):
['carbon_wt_pct', 'silicon_wt_pct', 'manganese_wt_pct', 'sulphur_wt_pct', 'phosphorus_wt_pct', 'nickel_wt_pct', 'chromium_wt_pct', 'molybdenum_wt_pct', 'vanadium_wt_pct', 'copper_wt_pct', 'cobalt_wt_pct', 'tungsten_wt_pct', 'oxygen_ppm', 'titanium_ppm', 'nitrogen_ppm', 'aluminium_ppm', 'boron_ppm', 'niobium_ppm', 'tin_ppm', 'arsenic_ppm', 'antimony_ppm', 'current_a', 'voltage_v', 'heat_input_kj_per_mm', 'interpass_temp_c', 'pwht_temp_c', 'pwht_time_h', 'yield_strength_mpa', 'ultimate_tensile_strength_mpa', 'elongation_pct', 'reduction_area_pct', 'charpy_temp_c', 'charpy_toughness_j', 'hardness_kg_per_mm2', 'fatt50_pct', 'primary_ferrite_pct', 'ferrite_second_phase_pct', 'acicular_ferrite_pct', 'martensite_pct', 'ferrite_carbide_aggregate_pct']

Categorical dummy features (12):
['ac_

<span style="color: cyan;">Delete columns with more than 50% missing values  <br/>
⚠️ All categorical features are kept since none of them have more than 50% missing values ⚠️</span>


In [75]:
# --------- Remove features with too many missing values (>50%) ---------
# Fraction of missing values per column.
missing_ratio = with_dummies.isna().mean()

# Keep a column when at most half of its values are missing; dummy columns
# are always kept.
cols = [
    c for c in with_dummies.columns
    if (missing_ratio[c] <= 0.5) or (c in dummy_feature_names)
]
ready_dataset = with_dummies[cols].copy()

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
ready_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1652 entries, 0 to 1651
Data columns (total 31 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   carbon_wt_pct         1652 non-null   float64
 1   silicon_wt_pct        1652 non-null   float64
 2   manganese_wt_pct      1652 non-null   float64
 3   sulphur_wt_pct        1648 non-null   float64
 4   phosphorus_wt_pct     1642 non-null   float64
 5   vanadium_wt_pct       928 non-null    float64
 6   oxygen_ppm            1256 non-null   float64
 7   titanium_ppm          935 non-null    float64
 8   nitrogen_ppm          1183 non-null   float64
 9   aluminium_ppm         905 non-null    float64
 10  current_a             1404 non-null   float64
 11  voltage_v             1404 non-null   float64
 12  heat_input_kj_per_mm  1652 non-null   float64
 13  interpass_temp_c      1614 non-null   float64
 14  pwht_temp_c           1639 non-null   float64
 15  pwht_time_h          

<span style="color: cyan;">Delete rows with more than 30% missing values</span>

In [76]:
# Count the missing values in each row
missing_per_row = ready_dataset.isnull().sum(axis=1)

# Keep only the rows with at most 30% missing values
threshold_pct = 30
threshold_count = (threshold_pct / 100) * ready_dataset.shape[1]

# Cleaned dataframe. It is built from `ready_dataset` (the result of the
# column filter) rather than from `df_clean` itself, so the cell does not
# depend on a previously defined `df_clean` and gives the same result when
# it is re-run. The copy makes `df_clean` independent of `ready_dataset`.
df_clean = ready_dataset[missing_per_row <= threshold_count].copy()

# Print the summary of the new dataframe. DataFrame.info() prints its report
# itself and returns None, so it is not wrapped in print().
df_clean.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1652 entries, 0 to 1651
Data columns (total 31 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   carbon_wt_pct         1652 non-null   float64
 1   silicon_wt_pct        1652 non-null   float64
 2   manganese_wt_pct      1652 non-null   float64
 3   sulphur_wt_pct        1648 non-null   float64
 4   phosphorus_wt_pct     1642 non-null   float64
 5   vanadium_wt_pct       928 non-null    float64
 6   oxygen_ppm            1256 non-null   float64
 7   titanium_ppm          935 non-null    float64
 8   nitrogen_ppm          1183 non-null   float64
 9   aluminium_ppm         905 non-null    float64
 10  current_a             1404 non-null   float64
 11  voltage_v             1404 non-null   float64
 12  heat_input_kj_per_mm  1652 non-null   float64
 13  interpass_temp_c      1614 non-null   float64
 14  pwht_temp_c           1639 non-null   float64
 15  pwht_time_h          

## Feature analysis

## PCA

## Data analysis

## Justification of target variables