# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import optuna
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_contour
from lightgbm import LGBMClassifier

In [2]:
import warnings
# Hide every warning of category UserWarning for the rest of the session,
# so that library chatter does not flood the cell outputs below.
warnings.filterwarnings("ignore", category=UserWarning)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
#from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from  sklearn  import  set_config
# Ask scikit-learn to use its 'diagram' representation when an estimator is displayed.
set_config(display='diagram')

# Loading data

In [4]:
# Read the pre-processed dataset back from disk.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
with open('airline_dataset.pkl', 'rb') as dataset_file:
    data = pickle.load(dataset_file)

In [5]:
# Split the loaded object into the feature matrix X and the target y
# (assumes the pickle holds exactly a two-item (X, y) pair — TODO confirm).
X, y = data

# Prepare dataset for cross-validation

In [6]:
# Hold out 30% of the samples as a final test set; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=13,
)

In [7]:
# 5-fold splitter; rows are shuffled with a fixed seed before being split.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [8]:
# Per-fold containers: one entry per cross-validation fold is appended later.
# Four independent empty lists (not aliases of one another).
X_train_cv, X_test_cv, y_train_cv, y_test_cv = [], [], [], []

In [9]:
# Start from empty lists so that re-running this cell does not append a
# second copy of every fold on top of the existing ones.
for fold_list in (X_train_cv, X_test_cv, y_train_cv, y_test_cv):
    fold_list.clear()

# Materialise the train / validation part of each fold generated by `kf`.
# The index arrays are used for direct `[...]` indexing, which is positional
# for NumPy arrays — assumes X_train / y_train are ndarrays; TODO confirm.
# `idx` remains a module-level name holding the last fold index after the loop.
for idx, (train_index, test_index) in enumerate(kf.split(X_train)):
    X_train_cv.append(X_train[train_index])
    X_test_cv.append(X_train[test_index])
    y_train_cv.append(y_train[train_index])
    y_test_cv.append(y_train[test_index])

In [10]:
# Shapes of the first fold, in the order:
# (X train, y train, X validation, y validation).
tuple(
    fold[0].shape
    for fold in (X_train_cv, y_train_cv, X_test_cv, y_test_cv)
)

((72512, 27), (72512,), (18128, 27), (18128,))

In [11]:
def objective(trial):
    """Optuna objective: mean F1 score of a LightGBM classifier over the CV folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample one hyperparameter configuration.

    Returns
    -------
    float
        Mean of the per-fold F1 scores (the study is expected to maximise it).

    Notes
    -----
    Reads the module-level fold lists ``X_train_cv``, ``y_train_cv``,
    ``X_test_cv`` and ``y_test_cv``; a fresh model is fitted for every fold.
    """
    param_grid = {
        "boosting_type": trial.suggest_categorical("boosting_type", ['gbdt', 'dart']),
        "n_estimators": trial.suggest_int("n_estimators", 100, 2000),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 50),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "num_leaves": trial.suggest_int("num_leaves", 10, 1000),
        # NOTE(review): LightGBM documents this fraction as 0 < value <= 1;
        # the lower bound 0 itself looks out of range — consider a small
        # positive lower bound. Range left unchanged here.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0, 1),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 0.1),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 0.1),
        "min_split_gain": trial.suggest_float("min_split_gain", 0, 1),
        "min_child_weight": trial.suggest_float("min_child_weight", 0, 1),
    }

    # Take the fold count from the fold lists themselves instead of relying on
    # a loop variable leaked from another cell and a hard-coded buffer size:
    # a mismatch would either average uninitialised values or raise IndexError.
    n_folds = len(X_train_cv)
    fold_f1 = np.empty(n_folds)

    for i in range(n_folds):
        # Create a LightGBM classifier with the sampled hyperparameters
        model = LGBMClassifier(**param_grid)

        # Train the model on this fold's training part
        model.fit(X_train_cv[i], y_train_cv[i])

        # Predict on this fold's held-out part
        y_pred = model.predict(X_test_cv[i])

        # Score the fold with F1 (not accuracy)
        fold_f1[i] = f1_score(y_test_cv[i], y_pred)

    return np.mean(fold_f1)

In [12]:
# In-memory study; larger objective values are treated as better.
study = optuna.create_study(
    study_name="LightGBM Classifier",
    direction="maximize",
)

[I 2023-04-02 23:04:55,824] A new study created in memory with name: LightGBM Classifier


In [None]:
# Run the hyperparameter search: 1024 trials, executed in parallel
# (n_jobs=-1), with a progress bar.
study.optimize(
    objective,
    n_trials=1024,
    n_jobs=-1,
    show_progress_bar=True,
)

In [14]:
# Hyperparameter values of the best trial in the study.
study.best_params

{'boosting_type': 'dart',
 'n_estimators': 729,
 'min_child_samples': 5,
 'learning_rate': 0.09058025591953338,
 'num_leaves': 146,
 'colsample_bytree': 0.9773512265322472,
 'reg_alpha': 0.04836405507163106,
 'reg_lambda': 0.08467358653637265,
 'min_split_gain': 0.6060617236332217,
 'min_child_weight': 0.19214701592888236}

In [15]:
# Objective value of the best trial, i.e. the value returned by `objective`.
study.best_value

0.9593524038016085