# Standardisation of the data: scaling
This notebook also handles encoding for categorical values and creates the train and test datasets.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Load the imputed dataset (path is relative to the notebook's working directory).
df = pd.read_csv("../data/data_imputed_2025_11_05.csv")
# Bare last expression: the notebook renders the frame with its rich display
# instead of the wrapped plain-text dump produced by print().
df.head()

       C    Si    Mn      S      P   Ni        Cr        Mo         V  \
0  0.037  0.30  0.65  0.008  0.012  0.0  1.346813  0.506516  0.060599   
1  0.037  0.30  0.65  0.008  0.012  0.0  1.346813  0.506516  0.060599   
2  0.037  0.30  0.65  0.008  0.012  0.0  1.346813  0.506516  0.060599   
3  0.037  0.31  1.03  0.007  0.014  0.0  1.124183  0.404005  0.099095   
4  0.037  0.31  1.03  0.007  0.014  0.0  1.124183  0.404005  0.099095   

         Cu  ...  AcicularFerrite  Martensite  FerriteCarbide  \
0  0.202601  ...              NaN         NaN             NaN   
1  0.202601  ...              NaN         NaN             NaN   
2  0.202601  ...              NaN         NaN             NaN   
3  0.185464  ...              NaN         NaN             NaN   
4  0.185464  ...             40.0         0.0             0.0   

                          WeldID  MechanicalTestDone  PrimaryFerrite_missing  \
0    Evans-Ni/CMn-1990/1991-0Aaw                   1                       1   
1  Evans-N

In [3]:
# Complete list of dataset columns, assembled group by group.
columns = (
    # Chemical composition
    ["C", "Si", "Mn", "S", "P", "Ni", "Cr", "Mo", "V", "Cu", "Co", "W",
     "O", "Ti", "N", "Al", "B", "Nb", "Sn", "As", "Sb"]
    # Process parameters
    + ["Current", "Voltage", "AC_DC", "ElectrodePolarity", "HeatInput",
       "InterpassTemp", "WeldType", "PWHT_Temp", "PWHT_Time"]
    # Mechanical properties
    + ["YieldStrength", "UTS", "Elongation", "ReductionArea",
       "CharpyTemp", "CharpyImpact", "Hardness", "FATT50"]
    # Microstructure
    + ["PrimaryFerrite", "Ferrite2ndPhase", "AcicularFerrite",
       "Martensite", "FerriteCarbide"]
    # Identifier
    + ["WeldID"]
)

In [4]:
# Process-parameter columns (numerical and categorical together).
process_param_columns = [
    "Current",
    "Voltage",
    "AC_DC",
    "ElectrodePolarity",
    "HeatInput",
    "InterpassTemp",
    "WeldType",
    "PWHT_Temp",
    "PWHT_Time",
]

In [5]:
# Chemical-composition columns, one per element symbol.
chem_cols = "C Si Mn S P Ni Cr Mo V Cu Co W O Ti N Al B Nb Sn As Sb".split()

In [6]:
# Mechanical-property columns.
mech_cols = [
    "YieldStrength",
    "UTS",
    "Elongation",
    "ReductionArea",
    "CharpyTemp",
    "CharpyImpact",
    "Hardness",
    "FATT50",
]

In [7]:
# Microstructure columns.
micro_cols = [
    "PrimaryFerrite",
    "Ferrite2ndPhase",
    "AcicularFerrite",
    "Martensite",
    "FerriteCarbide",
]

Definition of the column groups: the categorical and numerical process parameters are separated because each group gets a different transformation (scaling or encoding).    
Definition of the features and of the target, so that the target is excluded from the scaling step.

In [9]:
# Process parameters split by kind: numerical ones are scaled, categorical ones are encoded.
process_num = ["Current", "Voltage", "HeatInput", "InterpassTemp"]
process_cat = ["AC_DC", "ElectrodePolarity", "WeldType"]

# Model inputs, in block order: chemistry, numerical process, categorical process, microstructure.
features = [*chem_cols, *process_num, *process_cat, *micro_cols]
# Name of the single target column (a string, despite the plural variable name).
targets = "YieldStrength"

# Independent copies, so later edits do not write back into `df`.
x, y = df[features].copy(), df[targets].copy()

Performing a train/test split before fitting the scalers, so that no information from the test set leaks into them. Parameters: 
* test_size = 0.2 to keep 80% of the labelled data for learning, while still holding out enough data to evaluate the performance of the model in a safe and reliable way. 
* random_state = 42 to fix the seed of the random shuffling, so that the split is identical at each execution.

In [10]:
# Separate rows that have a measured target from rows that do not.
has_target = df[targets].notna()
df_labeled = df[has_target].copy()
df_unlabeled = df[~has_target].copy()

X_labeled = df_labeled.drop(columns=[targets])
y_labeled = df_labeled[targets]

# Train/test split on the labelled rows only; the seed is fixed so the split is reproducible.
X_train_lab, X_test, y_train_lab, y_test = train_test_split(
    X_labeled,
    y_labeled,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Full training set = labelled training rows + every unlabelled row
# (the test set contains labelled rows only).
X_train = pd.concat(
    [X_train_lab, df_unlabeled.drop(columns=[targets])],
    axis=0,
).reset_index(drop=True)

# The unlabelled targets are taken straight from the frame (all missing by
# construction). Building them as a list of None would create an object-dtype
# Series and turn the whole concatenated y_train into object dtype.
y_train = pd.concat([y_train_lab, df_unlabeled[targets]]).reset_index(drop=True)

# Fresh 0..n-1 indices; reassignment instead of inplace=True keeps the cell idempotent.
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

X_train.shape, y_train.shape, X_test.shape, y_test.shape


((1500, 47), (1500,), (152, 47), (152,))

Definition of the different scaling transformation to perform on each block:
* **Chemical composition columns**: MinMaxScaler: the data is bounded and skewed, with a majority of small values (concentrations). MinMaxScaler maps those values onto the [0, 1] interval without altering their proportions.  
* **Process parameters, numerical**: RobustScaler: these columns contain the most outliers; RobustScaler centers on the median, which makes it less sensitive to them.  
* **Process parameters, categorical**: OneHotEncoder: categorical data is not scaled but encoded, here with one binary column per category.  
* **Microstructure**: MinMaxScaler, for the same reasons as the chemical block: the data is bounded.  
* **Mechanical data**: StandardScaler would be the natural choice, since the data derives from controlled physical processes, which tend to disperse the data following a normal distribution. Note that no such step is defined in the `ColumnTransformer` below: the target `YieldStrength` is kept unscaled and the other mechanical columns are dropped (`remainder="drop"`).

In [11]:
def _present(cols):
    """Return the entries of `cols` that are columns of X_train, in their original order."""
    return [col for col in cols if col in X_train.columns]


# Numerical blocks
num_chem = _present(chem_cols)
num_proc = _present(process_num)
num_micro = _present(micro_cols)

# Categorical block
cat_proc = _present(process_cat)

# One transformer per block; any column not listed in a block is dropped.
scaling_blocks = [
    ("chem_minmax", MinMaxScaler(), num_chem),
    ("proc_robust", RobustScaler(), num_proc),
    ("micro_minmax", MinMaxScaler(), num_micro),
    (
        "proc_ohe",
        # Categories unseen during fit are encoded as all-zero rows instead of raising.
        OneHotEncoder(handle_unknown="ignore", sparse_output=False),
        cat_proc,
    ),
]
preprocess = ColumnTransformer(transformers=scaling_blocks, remainder="drop")


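Following errors raised when fitting the pipeline, some malformed values had to be re-treated. In the following cell, I look for the columns that contain the problematic values I found, and replace them. 
* I found one value given as an interval, and decided to take its midpoint. 
* I found 2 values '<1', that I replaced by 0.5 for scaling.

**TODO:** the clean-up cell described above is not present below (the next cell builds the pipeline) — restore it, or state where the replacement is now done.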

Creation of a pipeline that is fitted on the training set only and then used to transform both the train and test sets, in a reproducible way.

In [12]:
# Single-step pipeline around the column transformer; fit() returns the
# pipeline itself, so construction and fitting can be chained.
# The fit uses the training set only.
pipeline = Pipeline(steps=[("preprocess", preprocess)]).fit(X_train)

# Apply the train-fitted transformation to both sets.
X_train_scaled = pipeline.transform(X_train)
X_test_scaled = pipeline.transform(X_test)

X_train_scaled.shape, X_test_scaled.shape

((1500, 43), (152, 43))

Converting the scaled arrays back to DataFrames, with explicit column names. 

In [13]:
# Names generated by the fitted one-hot encoder (none if there is no categorical column).
if cat_proc:
    onehot = pipeline.named_steps["preprocess"].named_transformers_["proc_ohe"]
    onehot_feature_names = onehot.get_feature_names_out(cat_proc).tolist()
else:
    onehot_feature_names = []

# Column names of the transformed matrix, in the order in which the blocks are
# declared in the ColumnTransformer: chemistry, numerical process, microstructure, one-hot.
output_feature_names = [*num_chem, *num_proc, *num_micro, *onehot_feature_names]

# Explicit sanity check: a bare expression in the middle of a cell is never
# displayed, so the comparison has to be asserted to have any effect.
assert len(output_feature_names) == X_train_scaled.shape[1], (
    f"{len(output_feature_names)} feature names for "
    f"{X_train_scaled.shape[1]} transformed columns"
)

X_train_scaled_df = pd.DataFrame(X_train_scaled, index=X_train.index, columns=output_feature_names)
X_test_scaled_df = pd.DataFrame(X_test_scaled, index=X_test.index, columns=output_feature_names)

X_train_scaled_df.head()

Unnamed: 0,C,Si,Mn,S,P,Ni,Cr,Mo,V,Cu,...,ElectrodePolarity_-,ElectrodePolarity_0,WeldType_FCA,WeldType_GMAA,WeldType_GTAA,WeldType_MMA,WeldType_NGGMA,WeldType_NGSAW,WeldType_SA,WeldType_TSA
0,0.373913,0.687873,0.384615,0.043165,0.098775,0.392871,0.556092,0.299208,0.048866,0.336307,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.130435,0.665421,0.353846,0.064748,0.198586,0.28244,0.554067,0.100018,0.041814,0.232243,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.347826,0.687873,0.2,0.057554,0.11541,0.425454,0.554118,0.362732,0.048885,0.312851,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.017391,0.643283,0.194872,0.05036,0.156998,0.535931,0.5523,0.336926,0.067934,0.319796,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.321739,0.473842,0.297436,0.100719,0.198586,0.316028,0.552511,0.100018,0.050693,0.287444,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [14]:
# Normalised train/test sets with the (unscaled) target re-attached.
# assign() returns a new frame, so the scaled frames themselves are left untouched.
# The former `if y_train is not None` guard was always true (y_train is always
# built by the split cell) and has been removed.
# Unlabelled training rows keep a missing target.
train_out = X_train_scaled_df.assign(**{targets: y_train})
test_out = X_test_scaled_df.assign(**{targets: y_test})

# Saving the datasets (row index not written)
train_out.to_csv("../data/train_normalised.csv", index=False)
test_out.to_csv("../data/test_normalised.csv", index=False)
