In [1]:
import pandas as pd
# Let wide/long frames render in full (up to 1000 columns / rows) in cell output.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

import optuna

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.base import clone

from lightgbm import LGBMClassifier

In [2]:
# Read the competition's train and test splits from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/playground-series-s4e3/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e3/test.csv')

# Report (rows, columns) of each split.
print(f'The dimension of the train dataset is: {train.shape}')
print(f'The dimension of the test dataset is: {test.shape}')

The dimension of the train dataset is: (19219, 35)
The dimension of the test dataset is: (12814, 28)


In [3]:
# Target columns of train.csv that the models below predict.
# 'K_Scatch' is spelled exactly as the column appears in the data; do not "correct" it.
TARGET = [
    'Pastry',
    'Z_Scratch',
    'K_Scatch',
    'Stains',
    'Dirtiness',
    'Bumps',
    'Other_Faults',
]

In [4]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,0,584,590,909972,909977,16,8,5,2274,113,140,1358,0,1,50,0.7393,0.4,0.5,0.0059,1.0,1.0,0.0,1.2041,0.9031,0.699,-0.5,-0.0104,0.1417,0,0,0,1,0,0,0
1,1,808,816,728350,728372,433,20,54,44478,70,111,1687,1,0,80,0.7772,0.2878,0.2581,0.0044,0.25,1.0,1.0,2.6365,0.7782,1.7324,0.7419,-0.2997,0.9491,0,0,0,0,0,0,1
2,2,39,192,2212076,2212144,11388,705,420,1311391,29,141,1400,0,1,40,0.0557,0.5282,0.9895,0.1077,0.2363,0.3857,0.0,4.0564,2.179,2.2095,-0.0105,-0.0944,1.0,0,0,1,0,0,0,0
3,3,781,789,3353146,3353173,210,16,29,3202,114,134,1387,0,1,40,0.7202,0.3333,0.3333,0.0044,0.375,0.931,1.0,2.3222,0.7782,1.4314,0.6667,-0.0402,0.4025,0,0,1,0,0,0,0
4,4,1540,1560,618457,618502,521,72,67,48231,82,111,1692,0,1,300,0.1211,0.5347,0.0842,0.0192,0.2105,0.9861,1.0,2.7694,1.415,1.8808,0.9158,-0.2455,0.9998,0,0,0,0,0,0,1


# Preprocessing

In [5]:
# Work on copies so the frames read from disk stay untouched.
prep_train, prep_test = train.copy(), test.copy()

In [6]:
def preprocess_data(df):
    """Return a copy of ``df`` with eight engineered feature columns appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``X_Minimum``, ``X_Maximum``, ``Y_Minimum``,
        ``Y_Maximum``, ``X_Perimeter``, ``Y_Perimeter``, ``Pixels_Areas``,
        ``Minimum_of_Luminosity``, ``Maximum_of_Luminosity`` and
        ``Luminosity_Index``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with these columns added, in
        this order: ``Max_Ratio``, ``Min_Ratio``, ``XY_Perimeter_Ratio``,
        ``Luminosity_Range``, ``Perimeter_Density``, ``Luminosity_Area``,
        ``Compactness``, ``Area_Square_Root``.

    Notes
    -----
    * ``Max_Ratio`` and ``Min_Ratio`` are *differences* (Y minus X) despite
      their names; the names are kept so existing column lists stay valid.
    * In the three quotient features a zero denominator would yield
      ``+/-inf``; such values are replaced by ``NaN`` so the result never
      contains infinities.
    """
    data = df.copy()

    # Shared sub-expression of Perimeter_Density and Compactness.
    total_perimeter = data['X_Perimeter'] + data['Y_Perimeter']

    # Difference between the maximum Y and maximum X coordinates
    data['Max_Ratio'] = data['Y_Maximum'] - data['X_Maximum']

    # Difference between the minimum Y and minimum X coordinates
    data['Min_Ratio'] = data['Y_Minimum'] - data['X_Minimum']

    # Ratio of X perimeter to Y perimeter
    data['XY_Perimeter_Ratio'] = data['X_Perimeter'] / data['Y_Perimeter']

    # Range of luminosity values
    data['Luminosity_Range'] = data['Maximum_of_Luminosity'] - data['Minimum_of_Luminosity']

    # Total perimeter per pixel of defect area
    data['Perimeter_Density'] = total_perimeter / data['Pixels_Areas']

    # Luminosity index scaled by defect area
    data['Luminosity_Area'] = data['Luminosity_Index'] * data['Pixels_Areas']

    # Area^2 / (4 * pi * perimeter^2)
    data['Compactness'] = (data['Pixels_Areas'] ** 2) / (4 * np.pi * total_perimeter ** 2)

    # Square root of the defect area
    data['Area_Square_Root'] = np.sqrt(data['Pixels_Areas'])

    # Division by a zero perimeter/area gives +/-inf; store NaN instead.
    quotient_cols = ['XY_Perimeter_Ratio', 'Perimeter_Density', 'Compactness']
    data[quotient_cols] = data[quotient_cols].replace([np.inf, -np.inf], np.nan)

    return data

# Derive the engineered features for both splits with the same function.
prep_train, prep_test = preprocess_data(prep_train), preprocess_data(prep_test)

In [7]:
# Every column of the preprocessed test frame except the row identifier.
feature_col = prep_test.columns.drop('id').to_list()

In [8]:
# Standardise every feature column (``id`` is already excluded from feature_col).
col_transformer = ColumnTransformer(
    transformers=[('num', StandardScaler(), feature_col)]
)

pipe = Pipeline([
    ('preproc', col_transformer),
])

# Fit the scaling statistics on the TRAINING split only, then apply the same
# transform to both splits. (Previously the scaler was fitted on the test
# split, so the training features were scaled with test-set statistics.)
pipe = pipe.fit(prep_train[feature_col])
X = pd.DataFrame(pipe.transform(prep_train[feature_col]), columns=pipe.get_feature_names_out())
X_test = pd.DataFrame(pipe.transform(prep_test[feature_col]), columns=pipe.get_feature_names_out())

# One label column per target, taken from the raw training frame.
y = train[TARGET]

# Model

The per-target parameters below were selected by cross-validation (see the Optuna section at the end of the notebook).

In [10]:
# LightGBM hyperparameters, one set per target column.
parameters = {
    'Pastry': dict(
        learning_rate=0.010710262403597084,
        n_estimators=504,
        colsample_bytree=0.3140895459420173,
        subsample=0.5939057452444504,
        min_child_samples=31,
    ),
    'Z_Scratch': dict(
        learning_rate=0.01569433512213428,
        n_estimators=492,
        colsample_bytree=0.4968444677073035,
        subsample=0.606995044280757,
        min_child_samples=31,
    ),
    'K_Scatch': dict(
        learning_rate=0.014264221889723944,
        n_estimators=400,
        colsample_bytree=0.30223557188149325,
        subsample=0.6949748360357105,
        min_child_samples=13,
    ),
    'Stains': dict(
        learning_rate=0.028478116705565524,
        n_estimators=440,
        colsample_bytree=0.5066658895490173,
        subsample=0.67936834049078,
        min_child_samples=10,
    ),
    'Dirtiness': dict(
        learning_rate=0.011882764971590956,
        n_estimators=452,
        colsample_bytree=0.3279727914611652,
        subsample=0.6965919903497487,
        min_child_samples=42,
    ),
    'Bumps': dict(
        learning_rate=0.01638639931249338,
        n_estimators=456,
        colsample_bytree=0.3176377326244274,
        subsample=0.6454430401759436,
        min_child_samples=33,
    ),
    'Other_Faults': dict(
        learning_rate=0.010726273561901629,
        n_estimators=523,
        colsample_bytree=0.3980300421198631,
        subsample=0.9003748316673983,
        min_child_samples=47,
    ),
}

In [11]:
# One LightGBM classifier per target, configured with that target's hyperparameters.
# NOTE(review): LightGBM documents that row subsampling is only applied when
# `subsample_freq` > 0 (its default is 0), so the `subsample` values passed here
# probably have no effect as written - confirm before relying on them.
models = {
    target: LGBMClassifier(random_state=42, verbose=0, **parameters[target])
    for target in TARGET
}

In [12]:
def cross_val_models(estimators, n_splits=5, verbose=True):
    """Fit a fresh copy of every estimator on each fold of a stratified K-fold split.

    Reads the notebook-level globals ``X`` (feature frame), ``y`` (frame with
    one label column per target) and ``TARGET`` (list of target names).

    Parameters
    ----------
    estimators : dict
        Maps a target column name of ``y`` to a scikit-learn compatible
        classifier. The objects passed in are never fitted themselves: every
        fold trains its own ``clone`` of each one, so the models of different
        folds are independent.
    n_splits : int, default 5
        Number of folds.
    verbose : bool, default True
        If true, print the train and validation ROC AUC of every fold
        (``roc_auc_score`` over all target columns at once) and the mean
        validation score.

    Returns
    -------
    list of dict
        One ``{target name: fitted estimator}`` dict per fold, in fold order.
    """
    targets = list(estimators)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=5)
    all_models = []
    valid_score = []

    # Folds are stratified on the first target column only.
    for fold, (train_ind, valid_ind) in enumerate(skf.split(X, y[TARGET[0]])):
        X_train, y_train = X.iloc[train_ind], y.iloc[train_ind]
        X_valid, y_valid = X.iloc[valid_ind], y.iloc[valid_ind]

        # dict.copy() would be shallow and share the estimator objects between
        # folds, so each fold would overwrite the previous fold's fit. Clone
        # instead to get unfitted estimators with the same parameters.
        fold_models = {key: clone(estimator) for key, estimator in estimators.items()}
        for key, model in fold_models.items():
            model.fit(X_train, y_train[key])

        # Keep this fold's own fitted models.
        all_models.append(fold_models)

        if verbose:
            fold_train_predict = pd.DataFrame(
                {key: model.predict_proba(X_train)[:, 1] for key, model in fold_models.items()}
            )
            fold_valid_predict = pd.DataFrame(
                {key: model.predict_proba(X_valid)[:, 1] for key, model in fold_models.items()}
            )
            # Select label columns in the same order as the prediction columns.
            train_auc = roc_auc_score(y_train[targets], fold_train_predict[targets])
            valid_score.append(roc_auc_score(y_valid[targets], fold_valid_predict[targets]))

            print(f"Fold: {fold} Train AUC: {train_auc} Valid AUC: {valid_score[fold]}")

    if verbose:
        print(f'Mean valid score: {np.mean(valid_score)}\n')

    return all_models

n_splits = 5
# `models` goes in as a {target: estimator} dict and is rebound to the list
# returned by cross_val_models, so this cell cannot simply be re-run on its
# own: re-create the `models` dict first.
models = cross_val_models(models, n_splits=n_splits)

Fold: 0 Train Acc: 0.9565570815116153 Valid Acc: 0.8884692019577455
Fold: 1 Train Acc: 0.9561726777509726 Valid Acc: 0.8929948524469652
Fold: 2 Train Acc: 0.9556409427753574 Valid Acc: 0.8902072599426647
Fold: 3 Train Acc: 0.9566214002802506 Valid Acc: 0.8817454244606484
Fold: 4 Train Acc: 0.9562151422009955 Valid Acc: 0.8897253527962119
Mean valid score: 0.8886284183208473



In [13]:
# Zero-filled accumulator: one row per test sample, one column per target.
fold_test_predict = pd.DataFrame(0, index=range(len(X_test)), columns=TARGET)

In [14]:
# Predict the test set for submission: sum column 1 of predict_proba over the
# entries of `models` for each target, then divide by the number of folds.
for per_fold in models:
    for target, estimator in per_fold.items():
        fold_test_predict[target] += estimator.predict_proba(X_test)[:, 1]

fold_test_predict /= n_splits

In [15]:
# Load the sample submission file to reuse its layout.
submission=pd.read_csv('/kaggle/input/playground-series-s4e3/sample_submission.csv')

# Overwrite the target columns with the averaged test predictions and save
# the result without the DataFrame index.
submission[TARGET] = fold_test_predict
submission.to_csv("submission_lgbm.csv",index=False)

# Optuna
Hyperparameter selection with Optuna (the search code is left commented out below).

In [16]:
# def LGBMobjective(trial):
    
#     param = {
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.03),
#         "n_estimators": trial.suggest_int("n_estimators", 400, 600),
# #         'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10),
# #         'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10),
# #         "max_depth": trial.suggest_int("max_depth", 6, 14),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 0.9),
#         "subsample": trial.suggest_float("subsample", 0.5, 1.0),
#         "min_child_samples": trial.suggest_int("min_child_samples", 10, 50),
#         "random_state": 42, 
#         "force_col_wise":True, 
#         "verbose":0,
#     }

#     # Initialize and train the model
#     model = LGBMClassifier(**param)

#     # Perform cross-validation
#     auc_mean = cross_val_score(model, np.array(X), y_target, scoring='roc_auc', cv=5).mean()
    
#     return 1 - auc_mean

In [17]:
# studies = {}
# for key in TARGET:
#     y_target = np.array(y[key])
#     # Optuna optimization loop
#     studies[key] = optuna.create_study(direction='minimize')  # Invert direction to minimize 1 - ROC AUC
#     studies[key].optimize(LGBMobjective, n_trials=100)

In [18]:
# for key in TARGET:
#     print(f"{key} = {studies[key].best_params}")