**Changes:** 

1. Using LGBM in Imbalanced Settings | Did not affect performance much

2. Fill Null Values | Improved performance a bit (inspired by [this notebook](https://www.kaggle.com/code/gingerz/s4e8-binary-prediction-poisonous-mushrooms#Handling-Missing-Values-for-Numerical-Features))

> * Constant string `"None"` for categorical features
> * Median for numeric features

3. **Automatic** LGBM Optimization with Optuna | Improved performance significantly




In [1]:
#! pip install optuna-integration[lightgbm]

In [2]:
import numpy as np
import pandas as pd

import lightgbm as lgb
# import optuna.integration.lightgbm as lgbm
from lightgbm import early_stopping,log_evaluation

import optuna

from sklearn.impute import SimpleImputer
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.preprocessing import LabelEncoder

In [3]:
# Read the competition tables; the "id" column becomes the row index of each frame.
train, test = (
    pd.read_csv(csv_path, index_col="id")
    for csv_path in (
        "/kaggle/input/playground-series-s4e8/train.csv",
        "/kaggle/input/playground-series-s4e8/test.csv",
    )
)

In [4]:
# --- Features / target --------------------------------------------------------
X = train.drop("class", axis=1)

# The LightGBM training (and the Optuna tuning it came from) needs numerically
# encoded labels. The fitted encoder is kept so predictions can be mapped back
# to the original class labels later. The "id" index is preserved so that `y`
# stays aligned with `X` by label as well as by position.
label_encoder = LabelEncoder()
y = pd.Series(label_encoder.fit_transform(train["class"]), index=X.index, name="class")

# --- Column groups ------------------------------------------------------------
cat_cols = list(X.select_dtypes(include="object").columns)
# "number" instead of "float64": integer columns, if any, are imputed too.
numeric_cols = list(X.select_dtypes(include="number").columns)

# --- Missing values -----------------------------------------------------------
# Numeric columns get the training median; categorical columns get the constant
# string "None", which then acts as its own category.
# NOTE(review): the imputers are fit on the full training set before the CV
# split, so fold validation scores see slightly leaked statistics.
num_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="constant", fill_value="None")

X[numeric_cols] = num_imputer.fit_transform(X[numeric_cols])
X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])

# The test set is only transformed, using the statistics learned on train.
test[numeric_cols] = num_imputer.transform(test[numeric_cols])
test[cat_cols] = cat_imputer.transform(test[cat_cols])

# --- Categorical dtype --------------------------------------------------------
X[cat_cols] = X[cat_cols].astype("category")
# Reuse the training category sets for the test frame so both frames share one
# category -> code mapping; levels never seen in training become NaN.
test[cat_cols] = test[cat_cols].astype(X[cat_cols].dtypes.to_dict())

```python
#automatic optuna lgbm optimization
# Needs the `optuna-integration[lightgbm]` package. `lgbm` is Optuna's LightGBM
# integration module; the plain LightGBM API stays available as `lgb`.
import optuna.integration.lightgbm as lgbm

# Fixed (non-tuned) parameters handed to the tuner.
gbdt_params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "boosting_type": "gbdt",
        "verbosity": -1,
        "random_state": 42
    }

# Single stratified hold-out split; the `*_test` part serves as the validation
# set for tuning (it is not the competition test set).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

dtrain = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_cols)
dvalid = lgb.Dataset(X_test, label=y_test, categorical_feature=cat_cols)

# Same call shape as `lgb.train`, but routed through Optuna's integration
# module, which searches the LightGBM hyperparameters.
tuner = lgbm.train(
    gbdt_params,
    dtrain,
    valid_sets=[dvalid],
    num_boost_round=1000,
    callbacks=[early_stopping(100), #early stopping rounds
               log_evaluation(100)] #displays the performance per 100 iterations
)
# Parameters of the returned model -- presumably the source of the hard-coded
# `gbdt_params` used in the cross-validation cell.
tuner.params
```

In [5]:
# --- Cross-validation setup ----------------------------------------------------
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS,random_state = 0, shuffle = True)


# ratio = len(y) / (2 * y.value_counts())
# class_weight = {"p": ratio.values[0],
#                "e": ratio.values[1]}

# Hard-coded LightGBM parameters (presumably the `tuner.params` result of the
# Optuna tuning snippet). 'num_iterations' caps the boosting rounds at 1000;
# early stopping below usually ends training sooner.
gbdt_params = {'objective': 'binary',
 'metric': 'binary_logloss',
 'boosting_type': 'gbdt',
 'verbosity': -1,
 'random_state': 42,
 'feature_pre_filter': False,
 'lambda_l1': 1.0599778843158775e-08,
 'lambda_l2': 7.045232732025109,
 'num_leaves': 250,
 'feature_fraction': 0.4,
 'bagging_fraction': 1.0,
 'bagging_freq': 0,
 'min_child_samples': 100,
 'num_iterations': 1000}

scores = []            # Matthews correlation coefficient on each fold's held-out part
test_predictions = []  # rounded (0/1) test-set predictions, one array per fold

for fold,(train_idx,val_idx) in enumerate(skf.split(X,y)):
    X_train,X_val = X.iloc[train_idx],X.iloc[val_idx]
    y_train,y_val = y.iloc[train_idx],y.iloc[val_idx]

    dtrain = lgb.Dataset(X_train,label = y_train,categorical_feature = cat_cols)
    dval = lgb.Dataset(X_val,label = y_val,categorical_feature = cat_cols)

    # The categorical columns are already declared on both Datasets, so they are
    # not repeated here; the train()-level `categorical_feature` argument is
    # deprecated/removed in recent LightGBM releases.
    tuned_model = lgb.train(gbdt_params,
                          dtrain, valid_sets= [dval],
                          callbacks = [early_stopping(100),   # stop after 100 rounds without improvement
                                       log_evaluation(100)]   # log the validation metric every 100 rounds
                          )

    # Predictions are continuous scores; rounding turns them into 0/1 labels.
    preds = tuned_model.predict(X_val)
    score = matthews_corrcoef(y_val,np.round(preds))
    scores.append(score)

    print(f"LGBM Tuned Score Fold {fold+1}:", score)

    # Keep this fold's hard test predictions for the majority vote later on.
    test_preds = np.round(tuned_model.predict(test))
    test_predictions.append(test_preds)

print("LGBM Tuned Average Score:", np.mean(scores))



Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.0366812
[200]	valid_0's binary_logloss: 0.0360994
[300]	valid_0's binary_logloss: 0.0361155
Early stopping, best iteration is:
[245]	valid_0's binary_logloss: 0.0360759
LGBM Tuned Score Fold 1: 0.9847151073703864




Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.0367796
[200]	valid_0's binary_logloss: 0.0362067
[300]	valid_0's binary_logloss: 0.0362039
Early stopping, best iteration is:
[273]	valid_0's binary_logloss: 0.0361875
LGBM Tuned Score Fold 2: 0.9846134235080479




Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.0363113
[200]	valid_0's binary_logloss: 0.0357683
[300]	valid_0's binary_logloss: 0.0357984
Early stopping, best iteration is:
[211]	valid_0's binary_logloss: 0.0357579
LGBM Tuned Score Fold 3: 0.9848067746758655




Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.0364632
[200]	valid_0's binary_logloss: 0.0358557
[300]	valid_0's binary_logloss: 0.0358733
Early stopping, best iteration is:
[234]	valid_0's binary_logloss: 0.0358331
LGBM Tuned Score Fold 4: 0.9847609338258563




Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.0366104
[200]	valid_0's binary_logloss: 0.0360365
[300]	valid_0's binary_logloss: 0.0360444
Early stopping, best iteration is:
[237]	valid_0's binary_logloss: 0.036015
LGBM Tuned Score Fold 5: 0.9846456387739867
LGBM Tuned Average Score: 0.9847083756308287


In [6]:
# One row per test sample, one column per CV fold. Column names are derived from
# the number of collected fold predictions instead of being hard-coded for five
# folds, so the cell keeps working when the number of splits changes.
fold_columns = [f"Fold{fold_no}" for fold_no in range(1, len(test_predictions) + 1)]
test_results = pd.DataFrame(np.array(test_predictions).T, columns=fold_columns)
test_results.head()

Unnamed: 0,Fold1,Fold2,Fold3,Fold4,Fold5
0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0
4,0.0,0.0,0.0,0.0,0.0


In [7]:
# Majority vote across folds: the row-wise mode of the per-fold 0/1 predictions
# (column 0 of the mode frame), cast to int for the label encoder.
majority_vote = test_results.mode(axis = 1)[0].values.astype(int)

# Map the encoded votes back to the original class labels, place them in the
# sample-submission frame and write the result to disk.
submission = pd.read_csv("/kaggle/input/playground-series-s4e8/sample_submission.csv")
submission["class"] = label_encoder.inverse_transform(majority_vote)
submission.to_csv("submission.csv", index=False)