In [None]:
!pip install feature-engine
!pip install autofeat
!pip install scikit-learn
!pip install pandas numpy scikit-learn matplotlib seaborn
!pip install lightgbm xgboost shap
!pip install networkx scipy scikit-image opencv-python


In [None]:
# Force-reinstall NumPy, then send signal 9 to this kernel's own process.
# NOTE(review): presumably the kill is there to restart the runtime so the new
# NumPy build is the one that gets imported — confirm. Nothing after the
# os.kill line runs in the same kernel session.
!pip install --upgrade --force-reinstall numpy
import os
os.kill(os.getpid(), 9)

In [None]:
# Force-reinstall NumPy pinned to 1.24.4 together with shap, then send signal 9
# to this kernel's own process.
# NOTE(review): presumably the pin is a shap compatibility workaround and the
# kill restarts the runtime so the pinned build is loaded — confirm.
!pip install numpy==1.24.4 shap --force-reinstall
import os
os.kill(os.getpid(), 9)

In [None]:
# ============================
# 📦 Standard Libraries
# ============================
import os
import math
import gc
import zipfile
import joblib
import warnings
from time import time
from datetime import datetime

# ============================
# 📊 Data & Processing
# ============================
import pandas as pd
import numpy as np
from scipy import ndimage
from scipy.spatial import distance

# ============================
# 📈 Visualization
# ============================
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline  # For Colab notebook inline plots

# ============================
# 🧠 Machine Learning Models
# ============================
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    accuracy_score
)

# ============================
# ⚡ Boosting Models
# ============================
import lightgbm as lgb
import xgboost as xgb

# ============================
# 🔍 Model Interpretation
# ============================
import shap

# ============================
# 🧩 Graph Analysis (optional)
# ============================
import networkx as nx

# ============================
# 🖼️ Image Processing (optional)
# ============================
from skimage import measure
import cv2

# ============================
# ⚠️ Warning Settings
# ============================
warnings.filterwarnings("ignore")


In [None]:

# Read the train and test CSVs straight out of data.zip. The context managers
# close the archive and its member streams once both frames are loaded.
with zipfile.ZipFile('data.zip') as zf:
    # load train data
    with zf.open('wafers_train.csv') as train_file:
        df_wafers = pd.read_csv(train_file)
    # load test data
    with zf.open('wafers_test.csv') as test_file:
        df_wafers_test = pd.read_csv(test_file)

# Only the last expression of a cell is rendered automatically, so the
# intermediate summaries are shown explicitly with display().
df_wafers.info()
display(df_wafers.nunique())
display(df_wafers_test.head())
df_wafers_test.nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10415400 entries, 0 to 10415399
Data columns (total 5 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   WaferName     object
 1   DieX          int64 
 2   DieY          int64 
 3   IsGoodDie     bool  
 4   IsScratchDie  bool  
dtypes: bool(2), int64(2), object(1)
memory usage: 258.3+ MB


Unnamed: 0,0
WaferName,455
DieX,71
DieY,71
IsGoodDie,2


# Merging Multi-Segment Feature Sets

In [None]:
# Load the five pre-computed feature tables, in the same order as the names
# they are bound to.
# NOTE(review): df_6_10 and df_10_16 are read from files whose names say
# '10-0' and '16-11' — confirm each file is bound to the intended variable.
_segment_paths = (
    'selected_features_with_raw_0-6.parquet',
    'selected_features_with_raw_10-0.parquet',
    'selected_features_with_raw_16-11.parquet',
    'selected_features_with_raw_16.parquet',
    'selected_features_with_raw_17.parquet',
)
df_0_6, df_6_10, df_10_16, df_16, df_17 = map(pd.read_parquet, _segment_paths)




In [None]:
def smart_merge(df_base, df_new, on):
    """Left-merge onto `df_base` only the columns of `df_new` it does not have yet.

    Args:
        df_base: Frame to extend; every one of its rows is kept (left join).
        df_new: Frame providing additional columns.
        on: Key column name, or a list/tuple of key column names.

    Returns:
        A new DataFrame: `df_base` plus the columns of `df_new` that are not
        already present in `df_base`, matched on the key column(s).

    Raises:
        KeyError: propagated from pandas when a key column is missing from
            either frame.
    """
    # A bare string must become a one-element list: `col in "WaferName"` would
    # be a substring test and would wrongly treat e.g. "Name" as a key column.
    keys = [on] if isinstance(on, str) else list(on)
    already_present = set(df_base.columns)
    new_cols = [col for col in df_new.columns if col in keys or col not in already_present]
    return df_base.merge(df_new[new_cols], on=keys, how='left')

# Key columns shared by the raw wafer table and every feature table.
merge_cols = ['WaferName', 'DieX', 'DieY', 'IsGoodDie', 'IsScratchDie']

# Start from a copy of the raw table and fold the feature tables in one by one,
# in the same order as before.
df_full = df_wafers.copy()
for _segment in (df_0_6, df_6_10, df_10_16, df_16, df_17):
    df_full = smart_merge(df_full, _segment, merge_cols)
# Drop the loop variable so it does not keep the last feature table alive.
del _segment

# Feature reduction

In [None]:
# Separate the merged table into a feature matrix X and the scratch label y.
# id_cols holds the key columns plus both label columns, so none of them
# remain in X.
id_cols = ['WaferName', 'DieX', 'DieY', 'IsGoodDie', 'IsScratchDie']
X = df_full.drop(columns=id_cols)
y = df_full['IsScratchDie']

# Replace missing feature values with 0; this modifies X in place.
X.fillna(0, inplace=True)



In [None]:
# Keep the identifier/label columns (id_cols) aside so they can be re-attached
# to the reduced feature matrix later.
ids = df_full[id_cols]

In [None]:

# Remove features whose variance is below the threshold.
selector = VarianceThreshold(threshold=1e-5)
X_reduced = selector.fit_transform(X)
selected_columns_var = X.columns[selector.get_support()]
# fit_transform returns a bare array, so the row labels are passed back in
# explicitly: later cells align y and ids with X through `.loc[X.index]`.
X = pd.DataFrame(X_reduced, columns=selected_columns_var, index=X.index)


In [None]:
# Display the feature matrix that is left after the variance filter.
X

In [None]:


# Rank features with a LightGBM model fitted on a 2,000,000-row random sample,
# then keep the 50 with the highest importance.
# NOTE(review): the ranking uses rows from the whole table, i.e. it runs before
# the train/validation/test split further down — confirm this is acceptable.
X_sample = X.sample(2000_000, random_state=42)
y_sample = y.loc[X_sample.index]

model = lgb.LGBMClassifier(n_estimators=100, class_weight='balanced', random_state=42)
model.fit(X_sample, y_sample)

importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
})
importance_df = importance_df.sort_values(by='importance', ascending=False)

top_k = 50
selected_columns_lgb = list(importance_df['feature'].iloc[:top_k])

X = X.loc[:, selected_columns_lgb]



[LightGBM] [Info] Number of positive: 15372, number of negative: 1984628
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.172334 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6179
[LightGBM] [Info] Number of data points in the train set: 2000000, number of used features: 59
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


In [None]:
# Align the identifier columns and the label with the rows kept in X.
ids = ids.loc[X.index]
y = y.loc[X.index]

# `ids` was built from id_cols, which already contains IsScratchDie, so the
# label is not concatenated a second time (doing so produced a duplicate
# 'IsScratchDie' column in the assembled frame).
df_reduced_full = pd.concat([ids.reset_index(drop=True), X.reset_index(drop=True)], axis=1)

In [None]:
# Display the assembled table (identifier/label columns followed by the selected features).
df_reduced_full

Unnamed: 0,WaferName,DieX,DieY,IsGoodDie,IsScratchDie,IsScratchDie.1,LocalGoodDensity_KDE_Norm,BadDensity_X_Std,LocalDefectToGoodDensityRatio,MaxBadStreak,...,MaxStreakDirection_5x5,GoodWeightedSum_3x3,DefectClusterAspectRatio,BadRatio_5x5,BadRatio_3x3,BadNeighbors_5x5,GoodNeighborRadiusCount_r3,DefectClusterCircularity,WaferSize,BadNeighborRadiusCount_r3
0,8PFrAN,0,27,True,False,False,0.475173,1.766671,0.404109,0.0,...,0.0,3.414214,0.0,0.0,0.0,0.0,14.0,0.0,4049.0,0.0
1,8PFrAN,0,28,True,False,False,0.491233,1.766671,0.396970,0.0,...,0.0,4.414214,0.0,0.0,0.0,0.0,15.0,0.0,4049.0,0.0
2,8PFrAN,0,29,True,False,False,0.504673,1.766671,0.390696,0.0,...,0.0,4.414214,0.0,0.0,0.0,0.0,16.0,0.0,4049.0,0.0
3,8PFrAN,0,30,True,False,False,0.515619,1.766671,0.385012,0.0,...,0.0,4.414214,0.0,0.0,0.0,0.0,17.0,0.0,4049.0,0.0
4,8PFrAN,0,31,True,False,False,0.524270,1.766671,0.379631,0.0,...,0.0,4.414214,0.0,0.0,0.0,0.0,17.0,0.0,4049.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10415395,CofPVD,64,36,True,False,False,0.530722,1.541415,0.158881,0.0,...,0.0,4.414214,0.0,0.0,0.0,0.0,17.0,0.0,3405.0,0.0
10415396,CofPVD,64,37,True,False,False,0.520924,1.541415,0.170272,0.0,...,0.0,4.414214,0.0,0.0,0.0,0.0,17.0,0.0,3405.0,0.0
10415397,CofPVD,64,38,True,False,False,0.508081,1.541415,0.181597,0.0,...,0.0,4.414214,0.0,0.0,0.0,0.0,16.0,0.0,3405.0,0.0
10415398,CofPVD,64,39,True,False,False,0.491932,1.541415,0.192854,0.0,...,0.0,4.414214,0.0,0.0,0.0,0.0,15.0,0.0,3405.0,0.0


In [None]:
# Keep only the first occurrence of every column name, then persist the table.
_is_repeated_column = df_reduced_full.columns.duplicated()
df_reduced_full = df_reduced_full.loc[:, ~_is_repeated_column]
df_reduced_full.to_parquet('df_reduced_full.parquet')


# Memory cleanup

In [None]:


# Release the large intermediates; everything needed from here on is reloaded
# from df_reduced_full.parquet.
del df_full
# df_16 is released together with the other feature tables.
del df_0_6, df_6_10, df_10_16, df_16, df_17
del df_wafers
del df_reduced_full
# X_reduced (the variance-filtered array) and the LightGBM training sample are
# no longer referenced by any later cell.
del X, y, X_reduced, X_sample, y_sample
del ids
gc.collect()

34

#Stratified Data Splitting & Set Preparation

In [None]:
# Reload the reduced table from the parquet file written earlier in this notebook.
df_reduced_full = pd.read_parquet('df_reduced_full.parquet')


In [None]:
target_col = 'IsScratchDie'

# Partition the rows by the scratch label.
_label = df_reduced_full['IsScratchDie']
df_pos = df_reduced_full.loc[_label == True]
df_neg = df_reduced_full.loc[_label == False]

print(f"Positive samples: {len(df_pos)}")
print(f"Negative samples: {len(df_neg)}")


Positive samples: 80319
Negative samples: 10335081


In [None]:
def _holdout_split(frame, n_holdout, val_seed, test_seed):
    """Carve two disjoint random samples of `n_holdout` rows off `frame`.

    Returns a (train, val, test) tuple, where train is whatever is left.
    """
    val_part = frame.sample(n_holdout, random_state=val_seed)
    remainder = frame.drop(val_part.index)
    test_part = remainder.sample(n_holdout, random_state=test_seed)
    return remainder.drop(test_part.index), val_part, test_part


def _stack_and_shuffle(pos_part, neg_part):
    """Concatenate both classes with a fresh index and shuffle the rows (seed 42)."""
    return pd.concat([pos_part, neg_part], ignore_index=True).sample(frac=1, random_state=42)


# 5000 positives and 20x as many negatives go to each of validation and test;
# all remaining rows form the training set.
# NOTE(review): rows are sampled individually, so dies of the same WaferName
# can end up in different splits — confirm this is intended.
df_pos_train, df_pos_val, df_pos_test = _holdout_split(df_pos, 5000, 1, 2)
df_neg_train, df_neg_val, df_neg_test = _holdout_split(df_neg, 5000 * 20, 3, 4)

df_train = _stack_and_shuffle(df_pos_train, df_neg_train)
df_val = _stack_and_shuffle(df_pos_val, df_neg_val)
df_test = _stack_and_shuffle(df_pos_test, df_neg_test)


In [None]:
target_col = 'IsScratchDie'
id_cols = ['WaferName', 'DieX', 'DieY', 'IsGoodDie']


def _features_and_target(frame):
    """Return (features, target): identifier and target columns dropped, NaN replaced by 0."""
    features = frame.drop(columns=id_cols + [target_col])
    features.fillna(0, inplace=True)
    return features, frame[target_col]


X_train, y_train = _features_and_target(df_train)
X_val, y_val = _features_and_target(df_val)
X_test, y_test = _features_and_target(df_test)


In [None]:
# from imblearn.over_sampling import RandomOverSampler

# ros = RandomOverSampler(random_state=42)
# X_train_res, y_train_res = ros.fit_resample(X_train, y_train)

In [None]:


# Comparison table shared by every evaluate_model call; each call appends one row.
results = pd.DataFrame(columns=['Model', 'Precision', 'Recall', 'F1', 'Accuracy', 'Train Time (s)'])


def evaluate_model(model, model_name, X_train, y_train, X_val, y_val):
    """Fit `model`, score it on the validation data and record the outcome.

    Appends one row to the module-level `results` frame holding the model
    name, precision, recall, F1, accuracy and the fit time in seconds
    (rounded to two decimals). Returns nothing.
    """
    fit_started = time()
    model.fit(X_train, y_train)
    elapsed = time() - fit_started

    val_pred = model.predict(X_val)
    scores = [
        scorer(y_val, val_pred)
        for scorer in (precision_score, recall_score, f1_score, accuracy_score)
    ]

    results.loc[len(results)] = [model_name, *scores, round(elapsed, 2)]


# Models

##RandomForestClassifier

In [None]:

# Baseline: 20-tree random forest with balanced class weights.
evaluate_model(
    model=RandomForestClassifier(n_estimators=20, class_weight='balanced', random_state=42),
    model_name='Random Forest',
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
)

##LogisticRegression

In [None]:

# Linear baseline: logistic regression with balanced class weights.
evaluate_model(
    model=LogisticRegression(max_iter=100, class_weight='balanced', random_state=42),
    model_name='Logistic Regression',
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
)

##LGBMClassifier

In [None]:

# Gradient boosting (LightGBM), 10 estimators, balanced class weights.
evaluate_model(
    model=lgb.LGBMClassifier(n_estimators=10, class_weight='balanced', random_state=42),
    model_name='LightGBM',
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
)

[LightGBM] [Info] Number of positive: 70319, number of negative: 10135081
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.860077 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6011
[LightGBM] [Info] Number of data points in the train set: 10205400, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


##xgboost

In [None]:

# Gradient boosting (XGBoost), 10 estimators.
# - `use_label_encoder` is no longer passed: it is deprecated and newer XGBoost
#   releases only report it as an unused parameter.
# - `random_state=42` is set like for every other model in this comparison.
# NOTE(review): scale_pos_weight=20 is well below the negative/positive ratio
# of the training set printed by LightGBM (10135081 / 70319), while the other
# models use class_weight='balanced' — confirm the value is intentional.
evaluate_model(
    xgb.XGBClassifier(n_estimators=10, scale_pos_weight=20, eval_metric='logloss', random_state=42),
    'XGBoost', X_train, y_train, X_val, y_val
)


##Ridge Classifier

In [None]:

# Linear baseline: ridge classifier with balanced class weights.
evaluate_model(
    model=RidgeClassifier(class_weight='balanced', random_state=42),
    model_name='Ridge Classifier',
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
)


##SGDClassifier

In [None]:

# Linear model trained with SGD on the logistic loss, limited to 10 iterations.
evaluate_model(
    model=SGDClassifier(loss='log_loss', class_weight='balanced', max_iter=10, random_state=42),
    model_name='SGD Classifier',
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
)

## Extra Trees Classifier

In [None]:

# Tree ensemble: 20 extremely randomized trees with balanced class weights.
evaluate_model(
    model=ExtraTreesClassifier(n_estimators=20, class_weight='balanced', random_state=42),
    model_name='Extra Trees',
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
)


In [None]:
# Validation metrics and fit time for every model evaluated above.
results

Unnamed: 0,Model,Precision,Recall,F1,Accuracy,Train Time (s)
0,Random Forest,0.988735,0.7548,0.856073,0.987914,556.15
1,Logistic Regression,0.505304,0.9526,0.660336,0.953333,157.26
2,LightGBM,0.56983,0.9882,0.722844,0.963914,21.34
3,XGBoost,0.87217,0.917,0.894024,0.989648,23.26
4,Ridge Classifier,0.572295,0.9626,0.717823,0.963962,13.38
5,SGD Classifier,0.048751,1.0,0.092971,0.070848,37.98
6,Extra Trees,0.988994,0.7548,0.856171,0.987924,292.45


# The winner is XGBoost (highest validation F1)

In [None]:

# Final model: a larger XGBoost configuration fitted on the training split.
# `use_label_encoder` is no longer passed: it is deprecated and newer XGBoost
# releases only report it as an unused parameter.
best_model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=20,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)

best_model.fit(X_train, y_train)

# joblib is already imported in the notebook's import cell, so it is not
# imported again here. Persist the fitted model for the prediction cells below.
joblib.dump(best_model, 'xgboost_best_model.joblib')

['xgboost_best_model.joblib']

In [None]:

# Score the final model on the held-out test split.
y_test_pred = best_model.predict(X_test)

_test_report = classification_report(y_test, y_test_pred, digits=4)
_test_confusion = confusion_matrix(y_test, y_test_pred)
print(_test_report)
print(_test_confusion)

              precision    recall  f1-score   support

       False     0.9969    0.9942    0.9955    100000
        True     0.8902    0.9376    0.9133      5000

    accuracy                         0.9915    105000
   macro avg     0.9436    0.9659    0.9544    105000
weighted avg     0.9918    0.9915    0.9916    105000

[[99422   578]
 [  312  4688]]


In [None]:
# Load the feature table used for the final predictions.
# NOTE(review): presumably these are the same engineered features computed for
# the wafers in wafers_test.csv — confirm where test_features.parquet comes from.
df_real_test = pd.read_parquet('test_features.parquet')


In [None]:
# List the columns available in the test feature table.
df_real_test.columns

Index(['WaferName', 'DieX', 'DieY', 'IsGoodDie', 'DefectDirection',
       'DefectDirectionEncoded', 'NormDieX', 'NormDieY', 'DistanceFromCenter',
       'AngleFromCenter',
       ...
       'IsInEdgeBand_r1', 'IsInEdgeBand_r2', 'IsInEdgeBand',
       'EdgeBand_BadDensity', 'EdgeBand_GoodDensity', 'EdgeBand_BadGoodRatio',
       'IsBadInEdgeBand', 'LocalEntropy', 'LocalBadRatio_3x3',
       'LocalBadStd_3x3'],
      dtype='object', length=154)

In [None]:
# Feature columns of the reduced training table (identifier/label columns removed).
all_features = df_reduced_full.drop(columns=['WaferName', 'DieX', 'DieY', 'IsGoodDie', 'IsScratchDie']).columns

# Strip only a trailing '_x' (presumably the suffix pandas.merge adds to
# clashing left-hand columns). str.replace('_x', '') would also delete '_x'
# from the middle of a feature name and silently change it.
cleaned_features = [col[:-2] if col.endswith('_x') else col for col in all_features]

In [None]:
# Every feature the model was trained on must exist in the test table.
# Silently skipping missing ones would hand the model a matrix with a
# different set of columns than it was fitted on, so fail with a clear message.
missing_features = [col for col in cleaned_features if col not in df_real_test.columns]
if missing_features:
    raise KeyError(f"test_features.parquet is missing model features: {missing_features}")

valid_features = list(cleaned_features)
X_real_test = df_real_test[valid_features].copy()
X_real_test.fillna(0, inplace=True)
# Restore the exact training-time column names (same order as all_features),
# in case the name cleanup above changed any of them.
X_real_test.columns = list(all_features)

In [None]:


# Reload the persisted model and predict the scratch label for every test die.
model = joblib.load('xgboost_best_model.joblib')
predictions = model.predict(X_real_test)

# Submission: die identifiers plus the predicted label, written without the index.
df_submission = df_real_test.loc[:, ['WaferName', 'DieX', 'DieY']].assign(IsScratchDie=predictions)
df_submission.to_csv('submission.csv', index=False)

In [None]:
# Display the submission table that was written to submission.csv.
df_submission

Unnamed: 0,WaferName,DieX,DieY,IsScratchDie
0,06svz3,0,24,0
1,06svz3,0,25,0
2,06svz3,0,26,0
3,06svz3,0,27,0
4,06svz3,0,28,0
...,...,...,...,...
1128330,zrafYP,60,33,0
1128331,zrafYP,60,34,0
1128332,zrafYP,60,35,0
1128333,zrafYP,60,36,0


