In [1]:
# ! pip install optuna-integration[lightgbm]
import numpy as np
import pandas as pd

import lightgbm as lgb
from lightgbm import early_stopping,log_evaluation
# import optuna
# from optuna.samplers import TPESampler

from sklearn.impute import SimpleImputer
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# --- Load data ---------------------------------------------------------------
# Only a fraction of each file is used because of the notebook's time budget.
# The draw is seeded so that Restart & Run All picks the same rows every time;
# without a seed the scores printed further down cannot be reproduced.
SAMPLE_FRAC = 0.1
SAMPLE_SEED = 0

# NOTE: reset_index(drop=True) discards the original `id` index of both frames.
train = (
    pd.read_csv("/kaggle/input/playground-series-s4e8/train.csv", index_col="id")
    .sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)
test = (
    pd.read_csv("/kaggle/input/playground-series-s4e8/test.csv", index_col="id")
    .sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)

# --- Features / target -------------------------------------------------------
X = train.drop("class", axis=1)

# Automatic optimization needs labels to be encoded, so the class column is
# mapped to integer codes. `label_encoder` is kept so the codes can be mapped
# back to the original labels later.
label_encoder = LabelEncoder()
y = pd.Series(label_encoder.fit_transform(train["class"]))

# --- Column groups -----------------------------------------------------------
# Only float64 columns are treated as numeric; object columns as categorical.
cat_cols = list(X.select_dtypes(include="object").columns)
numeric_cols = list(X.select_dtypes(include="float64").columns)

# --- Imputation --------------------------------------------------------------
# Imputers are fitted on the training features only and then re-used on the
# test frame, so test statistics never leak into the fitted values.
num_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="constant", fill_value="None")

X[numeric_cols] = num_imputer.fit_transform(X[numeric_cols])
X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])

test[numeric_cols] = num_imputer.transform(test[numeric_cols])
test[cat_cols] = cat_imputer.transform(test[cat_cols])

# --- Categorical dtype -------------------------------------------------------
# Cast the string columns to pandas `category` in both frames.
X[cat_cols] = X[cat_cols].astype("category")
test[cat_cols] = test[cat_cols].astype("category")

```python
sample_size = 10000  # number of rows drawn from (X, y) for every trial


def objective(trial):
    """Optuna objective: mean 5-fold MCC of a DART LightGBM model.

    Each call draws a fresh stratified subsample of `sample_size` rows from
    the notebook-level `X` / `y` (the split is not seeded), trains one booster
    per StratifiedKFold fold for 500 rounds, and scores the rounded
    validation predictions with the Matthews correlation coefficient.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample the hyperparameters listed in `searched_params`.

    Returns
    -------
    The mean of the five per-fold MCC scores.
    """
    # Settings that are identical for every trial.
    fixed_params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "boosting_type": "dart",
        "verbosity": -1,
        "random_state": 42,
    }
    # Hyperparameters proposed by the sampler for this trial.
    searched_params = {
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 0.8),
        "num_leaves": trial.suggest_int("num_leaves", 30, 100),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }
    dart_params = {**fixed_params, **searched_params}

    # train_test_split returns (X_kept, X_rest, y_kept, y_rest); only the
    # stratified `sample_size`-row part is used for the search.
    split_parts = train_test_split(X, y, train_size=sample_size, stratify=y)
    X_sample, y_sample = split_parts[0], split_parts[2]

    fold_scores = []
    folds = StratifiedKFold(n_splits=5)
    for fit_idx, eval_idx in folds.split(X_sample, y_sample):
        X_fit, y_fit = X_sample.iloc[fit_idx], y_sample.iloc[fit_idx]
        X_eval, y_eval = X_sample.iloc[eval_idx], y_sample.iloc[eval_idx]

        booster = lgb.train(
            dart_params,
            train_set=lgb.Dataset(X_fit, label=y_fit),
            valid_sets=[lgb.Dataset(X_eval, label=y_eval)],
            num_boost_round=500,
            callbacks=[lgb.callback.log_evaluation(100)],
        )

        # Round the predicted probabilities to hard 0/1 labels before scoring.
        hard_labels = np.round(booster.predict(X_eval)).astype(int)
        fold_scores.append(matthews_corrcoef(y_eval, hard_labels))

    return np.mean(fold_scores)

# Run a 10-trial TPE search that maximises the value returned by `objective`.
study = optuna.create_study(sampler=TPESampler(), direction="maximize")
study.optimize(objective, n_trials=10)

# Report the best trial: its objective value and the sampled hyperparameters.
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}", "  Params: ", sep="\n")
for key, value in trial.params.items():
    print(f"    {key}: {value}")
```

In [3]:
# --- Cross-validated training of the tuned DART model --------------------------
N_SPLITS = 5
NUM_BOOST_ROUND = 500   # boosting rounds per fold
LOG_PERIOD = 100        # print the validation metric every LOG_PERIOD rounds
skf = StratifiedKFold(n_splits=N_SPLITS, random_state=0, shuffle=True)

# Tuned hyperparameters (presumably the best trial of the Optuna search kept
# in the markdown cell above -- TODO confirm) plus the fixed model settings.
dart_params = {
    "feature_fraction": 0.5422790925276052,
    "num_leaves": 89,
    "lambda_l1": 3.0085938483559657e-07,
    "lambda_l2": 6.309353125820099e-05,
    "bagging_fraction": 0.5663819462275271,
    "bagging_freq": 6,
    "min_child_samples": 42,
    "objective": "binary",
    "metric": "binary_logloss",
    "boosting_type": "dart",
}

scores = []            # validation MCC of each fold
test_predictions = []  # rounded test-set predictions of each fold's model

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # The categorical columns are declared on the Datasets themselves; the
    # validation set names the training set as its reference.
    dtrain = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_cols)
    dval = lgb.Dataset(
        X_val, label=y_val, categorical_feature=cat_cols, reference=dtrain
    )

    # `categorical_feature` is intentionally not passed to lgb.train(): it is
    # already set on the Datasets, and the train()-level argument is deprecated
    # in recent LightGBM releases (TODO confirm against the installed version).
    tuned_model = lgb.train(
        dart_params,
        dtrain,
        valid_sets=[dval],
        num_boost_round=NUM_BOOST_ROUND,
        callbacks=[log_evaluation(LOG_PERIOD)],
    )

    # Score the fold on hard labels obtained by rounding the probabilities.
    preds = tuned_model.predict(X_val)
    score = matthews_corrcoef(y_val, np.round(preds))
    scores.append(score)

    print(f"LGBM Tuned Score Fold {fold+1}:", score)

    # Keep this fold's rounded predictions for the test frame.
    test_preds = np.round(tuned_model.predict(test))
    test_predictions.append(test_preds)

print("LGBM Tuned Average Score:", np.mean(scores))

[LightGBM] [Info] Number of positive: 136577, number of negative: 112778
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021010 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 942
[LightGBM] [Info] Number of data points in the train set: 249355, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.547721 -> initscore=0.191467
[LightGBM] [Info] Start training from score 0.191467
[100]	valid_0's binary_logloss: 0.062445
[200]	valid_0's binary_logloss: 0.0446146
[300]	valid_0's binary_logloss: 0.0407489
[400]	valid_0's binary_logloss: 0.0397888
[500]	valid_0's binary_logloss: 0.0396381
LGBM Tuned Score Fold 1: 0.9835630850297004
[LightGBM] [Info] Number of positive: 136577, number of negative: 112778
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014963 seconds.
You can set 