In [1]:
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.semi_supervised import LabelPropagation, SelfTrainingClassifier
from sklearn.svm import SVC

In [2]:
# Read the pre-normalised train and test splits from disk.
train, test = map(
    pd.read_csv,
    ('../data/train_normalised.csv', '../data/test_normalised.csv'),
)

# Report (rows, columns) for each split as a quick sanity check.
for caption, frame in (('Train shape:', train), ('Test shape:', test)):
    print(caption, frame.shape)

Train shape: (1500, 44)
Test shape: (152, 44)


In [3]:
# Preview the first five rows of the test split.
test.head(n=5)

Unnamed: 0,C,Si,Mn,S,P,Ni,Cr,Mo,V,Cu,...,ElectrodePolarity_0,WeldType_FCA,WeldType_GMAA,WeldType_GTAA,WeldType_MMA,WeldType_NGGMA,WeldType_NGSAW,WeldType_SA,WeldType_TSA,YieldStrength
0,0.130435,0.696791,0.297436,0.043165,0.156998,0.443777,0.55396,0.281222,0.037683,0.632455,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,601.0
1,0.06087,0.598693,0.14359,0.05036,0.140363,0.484281,0.556106,0.415514,0.053522,0.329641,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,371.0
2,0.026087,0.643283,0.194872,0.043165,0.173633,0.538967,0.552964,0.352073,0.065024,0.318907,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,422.0
3,0.017391,0.625447,0.169231,0.05036,0.140363,0.481524,0.556017,0.405258,0.054064,0.322622,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,390.0
4,0.104348,0.670037,0.589744,0.035971,0.140363,0.437052,0.5539,0.254775,0.075633,0.298206,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,543.0


In [4]:
# List every column name in the test split (feature columns plus the YieldStrength target).
test.columns

Index(['C', 'Si', 'Mn', 'S', 'P', 'Ni', 'Cr', 'Mo', 'V', 'Cu', 'Co', 'W', 'O',
       'Ti', 'N', 'Al', 'B', 'Nb', 'Sn', 'As', 'Sb', 'Current', 'Voltage',
       'HeatInput', 'InterpassTemp', 'PrimaryFerrite', 'Ferrite2ndPhase',
       'AcicularFerrite', 'Martensite', 'FerriteCarbide', 'AC_DC_AC',
       'AC_DC_DC', 'ElectrodePolarity_+', 'ElectrodePolarity_-',
       'ElectrodePolarity_0', 'WeldType_FCA', 'WeldType_GMAA', 'WeldType_GTAA',
       'WeldType_MMA', 'WeldType_NGGMA', 'WeldType_NGSAW', 'WeldType_SA',
       'WeldType_TSA', 'YieldStrength'],
      dtype='object')

In [5]:
# Split each frame into a feature matrix (X) and the YieldStrength target (y).
# The list of non-feature columns is declared once so the train and test splits
# can never drift apart through a copy/paste edit.
TARGET_COLUMN = 'YieldStrength'
# Columns that are deliberately kept out of the feature matrix.
# NOTE(review): presumably these are measured outcomes rather than inputs — confirm.
EXCLUDED_FEATURES = ['Ferrite2ndPhase', 'Martensite', 'FerriteCarbide', 'AcicularFerrite', 'PrimaryFerrite']
NON_FEATURE_COLUMNS = [TARGET_COLUMN, *EXCLUDED_FEATURES]

X_train = train.drop(columns=NON_FEATURE_COLUMNS)
y_train = train[TARGET_COLUMN]

X_test = test.drop(columns=NON_FEATURE_COLUMNS)
y_test = test[TARGET_COLUMN]

# The models are fitted on X_train and applied to X_test, so both must expose
# exactly the same feature columns in the same order.
assert list(X_train.columns) == list(X_test.columns), 'train/test feature columns differ'

In [6]:
# Inspect which columns in X_train contain NaN values.
# (`display` is imported with the other dependencies in the first cell.)

# Count NaNs per column and keep only columns with at least one NaN,
# most-affected column first.
nan_counts = X_train.isna().sum()
nan_counts = nan_counts[nan_counts > 0].sort_values(ascending=False)
cols_with_nans = nan_counts.index.tolist()

print('Columns with NaNs and counts:')
print(nan_counts)
print('\nFraction missing per column:')
print((nan_counts / len(X_train)).round(4))
print('\nColumns with any NaNs:', cols_with_nans)

# Row-level detail is only meaningful when at least one column has gaps.
if cols_with_nans:
    # Boolean mask of rows with a NaN in any affected column; computed once
    # and reused for both the sample display and the row count.
    rows_with_nan = X_train[cols_with_nans].isna().any(axis=1)

    print('\nSample rows containing NaNs (first 20):')
    display(X_train.loc[rows_with_nan, cols_with_nans].head(20))

    n_rows_with_nan = rows_with_nan.sum()
    print(f'\nNumber of rows with any NaN in these columns: {n_rows_with_nan} / {len(X_train)}')

Columns with NaNs and counts:
Series([], dtype: int64)

Fraction missing per column:
Series([], dtype: float64)

Columns with any NaNs: []


In [7]:
# Prepare the target for semi-supervised classification.
#
# YieldStrength is binned into three classes using tercile edges learned from
# the labelled training rows. Training rows without a target are encoded as -1,
# the marker scikit-learn's semi-supervised estimators use for "unlabelled".
CLASS_LABELS = ['low', 'medium', 'high']

# Tercile edges come from labelled training rows only (no test leakage).
y_train_no_na = y_train.dropna()
bins = pd.qcut(y_train_no_na, q=3, retbins=True)[1]

# Keep the interior edges but open the outer ones to +/- infinity. With the raw
# edges, a test value below the training minimum or above the training maximum
# falls outside every bin, becomes NaN, and would then be relabelled as the
# training mode further down. Training labels are unaffected by this change.
cut_edges = np.concatenate(([-np.inf], bins[1:-1], [np.inf]))

# Apply the same edges to train and test.
y_train_binned = pd.cut(y_train, bins=cut_edges, labels=CLASS_LABELS)
y_test_binned = pd.cut(y_test, bins=cut_edges, labels=CLASS_LABELS)

# Fit the encoder on the real classes only. LabelEncoder sorts its classes, so
# the codes are high=0, low=1, medium=2.
le = LabelEncoder()
le.fit(CLASS_LABELS)

# Encode training labels; rows with no target stay at -1 (unlabelled).
train_labeled_mask = y_train_binned.notna().to_numpy()
y_train_encoded = np.full(len(y_train_binned), -1, dtype=np.int64)
y_train_encoded[train_labeled_mask] = le.transform(
    y_train_binned[train_labeled_mask].astype(str)
)

# Test labels: y_test_encoded keeps one entry per test row so it stays aligned
# with model predictions. Rows with no ground truth are filled with the most
# frequent training class purely as a placeholder; `test_labeled_mask` marks
# the rows that carry a real label and should be the only ones scored.
test_labeled_mask = y_test_binned.notna().to_numpy()
mode_values = y_train_binned.mode()
train_mode = mode_values.iloc[0] if not mode_values.empty else 'medium'
y_test_binned_filled = y_test_binned.fillna(train_mode)

# Final test encoding.
y_test_encoded = le.transform(y_test_binned_filled.astype(str))

n_test_unlabeled = int((~test_labeled_mask).sum())
if n_test_unlabeled:
    print(
        f'Note: {n_test_unlabeled} of {len(y_test_binned)} test rows have no YieldStrength; '
        f'they are filled with {train_mode!r} and should be excluded from scoring.'
    )

In [8]:
# Model 1: Label Propagation
# With kernel='rbf' only `gamma` shapes the affinity between samples;
# `n_neighbors` is consulted solely by the 'knn' kernel, so it is not passed.
lp_model = LabelPropagation(kernel='rbf', gamma=20, max_iter=1000)
lp_model.fit(X_train, y_train_encoded)  # entries equal to -1 are treated as unlabelled
y_pred_lp = lp_model.predict(X_test)

# Score only the test rows that carry a real YieldStrength value. Missing test
# targets are filled with the training mode during label preparation, and
# counting those placeholders would distort the accuracy.
test_has_label = y_test.notna().to_numpy()
accuracy_lp = accuracy_score(y_test_encoded[test_has_label], y_pred_lp[test_has_label])
print(f'Label Propagation Accuracy: {accuracy_lp:.4f}')

Label Propagation Accuracy: 0.6382


In [None]:
# Model 2: Self-Training Classifier
# SVC(probability=True) calibrates its probabilities with an internal,
# randomly shuffled cross-validation. Self-training pseudo-labels samples by
# thresholding those probabilities, so the seed is fixed to make the result
# reproducible across Restart & Run All.
base_classifier = SVC(probability=True, kernel='rbf', gamma='scale', C=1.0, random_state=42)
st_model = SelfTrainingClassifier(base_classifier, threshold=0.9, criterion='threshold', max_iter=1000)
st_model.fit(X_train, y_train_encoded)  # entries equal to -1 are treated as unlabelled
y_pred_st = st_model.predict(X_test)

# Score only the test rows that carry a real YieldStrength value. Missing test
# targets are filled with the training mode during label preparation, and
# counting those placeholders would distort the accuracy.
# NOTE(review): an accuracy near 1/3 on three classes is chance level — inspect
# the predicted class distribution before trusting this model.
test_has_label = y_test.notna().to_numpy()
accuracy_st = accuracy_score(y_test_encoded[test_has_label], y_pred_st[test_has_label])
print(f'Self-Training Classifier Accuracy: {accuracy_st:.4f}')

Self-Training Classifier Accuracy: 0.3487


In [10]:
# Summarise both accuracies side by side and name the better model.
summary_lines = [
    '\nComparison:',
    f'Label Propagation: {accuracy_lp:.4f}',
    f'Self-Training Classifier: {accuracy_st:.4f}',
]

# Evaluate both strict comparisons so a tie (or an incomparable value such as
# NaN) falls through to the "equally" verdict.
lp_wins = accuracy_lp > accuracy_st
st_wins = accuracy_st > accuracy_lp
if lp_wins:
    verdict = 'Label Propagation performed better.'
elif st_wins:
    verdict = 'Self-Training Classifier performed better.'
else:
    verdict = 'Both models performed equally.'

summary_lines.append(verdict)
print('\n'.join(summary_lines))


Comparison:
Label Propagation: 0.6382
Self-Training Classifier: 0.3487
Label Propagation performed better.
