In [1]:
import pandas as pd
import numpy as np

In [2]:
import warnings
warnings.filterwarnings('ignore')

<br>
<br>
<br>

### Data Collection

In [3]:
# read the preprocessed, feature-selected training set and preview its first rows
train_data = pd.read_csv(
    filepath_or_buffer="../pipeline_data/6_train_feature_selected.csv"
)
train_data.head(n=3)

Unnamed: 0,C6,C1_2,C1_3,C2_0,C2_3,C3_0,C4_1,C4_41,C5_1,C5_4,...,N15,N17,N18,N19,N22,N23,N24,N33,N35,Dependent_Variable
0,0,0,0,0,0,0,1,0,1,0,...,0.0,0.026969,0.735849,-1.075934,-1.0,0.6875,-0.317073,-0.471698,-0.333333,0
1,0,1,0,0,1,0,0,0,1,0,...,0.0,-4.699231,-1.056604,-0.473452,-1.0,-0.25,-0.731707,1.037736,-1.555556,1
2,0,0,0,1,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,-0.4375,-0.780488,-0.45283,-0.444444,1


In [4]:
# dimensions of the loaded frame as (rows, columns)
train_data.shape

(28050, 30)

In [5]:
# separating features: every column except the last one
X = train_data.iloc[:, slice(None, -1)]
X.head(n=5)

Unnamed: 0,C6,C1_2,C1_3,C2_0,C2_3,C3_0,C4_1,C4_41,C5_1,C5_4,...,N12,N15,N17,N18,N19,N22,N23,N24,N33,N35
0,0,0,0,0,0,0,1,0,1,0,...,0.0,0.0,0.026969,0.735849,-1.075934,-1.0,0.6875,-0.317073,-0.471698,-0.333333
1,0,1,0,0,1,0,0,0,1,0,...,0.0,0.0,-4.699231,-1.056604,-0.473452,-1.0,-0.25,-0.731707,1.037736,-1.555556
2,0,0,0,1,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.4375,-0.780488,-0.45283,-0.444444
3,0,0,0,0,0,0,0,0,0,0,...,0.0,1.0,-0.013778,-0.773585,0.894816,1.0,-0.3125,0.121951,0.283019,0.0
4,0,0,0,0,0,0,1,0,0,1,...,0.0,0.0,-0.417206,0.0,-0.180067,-1.0,0.0,0.0,-0.09434,-0.666667


In [6]:
# separating target: the last column, flattened to a 1-D NumPy array
y = np.ravel(train_data.iloc[:, -1])
print(y.shape)
y

(28050,)


array([0, 1, 1, ..., 0, 0, 1], dtype=int64)

<br>
<br>
<br>

### LightGBM

Hyperparameter Tuning

In [7]:
import optuna
from lightgbm import LGBMClassifier, early_stopping
from optuna.integration import LightGBMPruningCallback
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [17]:
# optuna objective function
def objective(trial, X, y, n_splits=5, seed=42):
    """Return the mean cross-validated ROC AUC of an LGBMClassifier for one trial.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters and handed to the pruning
        callback.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : array-like of shape (n_samples,)
        Binary target aligned row-by-row with ``X``.
    n_splits : int, default=5
        Number of stratified folds.
    seed : int or None, default=42
        Seed for the fold shuffling. A fixed seed scores every trial on the
        same splits, so trial values are comparable and reproducible.
        Pass ``None`` for a fresh random split on every call.

    Returns
    -------
    float
        Mean ROC AUC over the ``n_splits`` validation folds.

    Raises
    ------
    optuna.TrialPruned
        May be raised by ``LightGBMPruningCallback`` during fitting when the
        study's pruner decides to stop the trial early.
    """
    # hyperparameter search space
    param_grid = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 2000, step=100),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        # Upper bound is 0.9, not 0.95: with step=0.1 starting at 0.2 the value
        # 0.95 is unreachable and Optuna truncates the range to 0.9 (emitting a
        # warning), so the effective search space is unchanged.
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.9, step=0.1
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.9, step=0.1
        ),
    }

    # Positional fold indices must index y positionally as well; a pandas
    # Series would otherwise be indexed by label.
    y = np.asarray(y)

    # stratified, shuffled k-fold with a fixed seed
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    # will hold one score per fold
    cv_scores = np.empty(n_splits)

    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # training model
        model = LGBMClassifier(objective="binary", verbose=-1, **param_grid)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="auc",
            callbacks=[
                # `early_stopping_rounds=` was removed from fit() in
                # LightGBM 4.0; the callback is the supported equivalent.
                early_stopping(stopping_rounds=100, verbose=False),
                # NOTE(review): the same boosting steps are reported again in
                # every fold; Optuna ignores a step that was already reported,
                # so pruning is effectively driven by the first fold.
                LightGBMPruningCallback(trial, "auc"),
            ],
        )

        # getting auc score on the held-out fold
        preds = model.predict_proba(X_test)[:, 1]
        cv_scores[idx] = roc_auc_score(y_test, preds)

    # return mean score
    return np.mean(cv_scores)

In [None]:
# optuna study that maximises the value returned by `objective`
study = optuna.create_study(study_name="LGBM Classifier", direction="maximize")


def func(trial):
    """Bind the training data to `objective` so Optuna only has to pass the trial."""
    return objective(trial, X, y)


# keep starting new trials until the 180-second budget is used up
study.optimize(func, timeout=180)

In [None]:
# [I 2023-03-24 12:32:15,524] Trial 321 finished with value: 0.7520746168126591 and parameters: {'n_estimators': 700, 'learning_rate': 0.1921286133667085, 'num_leaves': 2360, 'max_depth': 9, 'min_data_in_leaf': 1000, 'lambda_l1': 30, 'lambda_l2': 20, 'min_gain_to_split': 2.993720055467331, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.5}. Best is trial 233 with value: 0.756908868345348.

In [45]:
# hard-coded hyperparameters copied from the tuning log
# NOTE(review): the log line kept in the previous cell attributes exactly these
# values to trial 321 (value 0.7521) and states "Best is trial 233 with value
# 0.7569" -- confirm whether trial 233's parameters were meant to be used here.
best_params = dict(
    n_estimators=700,
    learning_rate=0.1921286133667085,
    num_leaves=2360,
    max_depth=9,
    min_data_in_leaf=1000,
    lambda_l1=30,
    lambda_l2=20,
    min_gain_to_split=2.993720055467331,
    bagging_fraction=0.9,
    bagging_freq=1,
    feature_fraction=0.5,
)

<br>
<br>
<br>

Model performance with best parameters

In [46]:
# classifier configured with the hard-coded `best_params` dictionary
# NOTE(review): the models built inside `objective` also pass objective="binary"
# and verbose=-1; neither is passed here -- confirm this is intended.
clf = LGBMClassifier(**best_params)

In [47]:
# fit on the complete training set (all rows of X / y, no hold-out split)
clf.fit(X, y)



In [48]:
# in-sample ROC AUC: score the model on the same rows it was fitted on
predictions = clf.predict_proba(X)[:, 1]  # column 1 of the probability matrix
roc_auc_score(y_true=y, y_score=predictions)

0.7649881246064937

In [49]:
# 10-fold cross-validated ROC AUC, averaged over the folds
scores = cross_val_score(
    estimator=clf,
    X=X,
    y=y,
    scoring="roc_auc",
    cv=10,
    n_jobs=-1,
)
np.mean(scores)

0.7532991712805323

In [51]:
# ten features with the largest importance values, highest first
(
    pd.Series(data=clf.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
    .head(10)
)

N33      53
N1       44
N24      37
N11      25
N3       22
N10.1    21
N17      19
N8       17
N23      16
C3_0     16
dtype: int32