# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import optuna
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_contour
from lightgbm import LGBMClassifier

In [2]:
import warnings
# Ignore every warning whose category is UserWarning (other categories are still shown).
warnings.filterwarnings("ignore", category=UserWarning)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
#from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from  sklearn  import  set_config
# Ask scikit-learn to display estimators as HTML diagrams in notebook output.
set_config(display='diagram')

# Loading data

In [4]:
# Load the prepared dataset from a pickle file in the working directory.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only open this file if it comes from a trusted source.
with open('airline_dataset.pkl', 'rb') as f:
    data = pickle.load(f)

In [5]:
# Unpack the loaded object into features (X) and target (y); this fails unless
# `data` holds exactly two items.
X, y = data

# Prepare dataset for cross-validation

In [6]:
# Hold out 30% of the samples as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=13,
)

In [7]:
# Shuffled 5-fold splitter with a fixed seed, used to cut the training set into CV folds.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [8]:
# One independent, initially empty list per fold component (features / labels, train / validation).
X_train_cv, X_test_cv, y_train_cv, y_test_cv = [], [], [], []

In [9]:
# Materialize the train/validation arrays of every fold once, so they can be
# reused without re-splitting.
# The containers are rebuilt here so that re-running this cell does not append
# a second copy of every fold to the lists.
X_train_cv, X_test_cv, y_train_cv, y_test_cv = [], [], [], []

# NOTE(review): the positional indexing below assumes X_train / y_train are
# NumPy arrays (pandas objects would need .iloc) - confirm against the pickle.
# `idx` stays a module-level name (index of the last fold) in case other cells
# read it.
for idx, (train_index, test_index) in enumerate(kf.split(X_train)):
    X_train_cv.append(X_train[train_index])
    X_test_cv.append(X_train[test_index])
    y_train_cv.append(y_train[train_index])
    y_test_cv.append(y_train[test_index])

In [10]:
# Sanity check: shapes of the first fold (train features, train labels, validation features, validation labels).
(
    X_train_cv[0].shape,
    y_train_cv[0].shape,
    X_test_cv[0].shape,
    y_test_cv[0].shape,
)

((72512, 27), (72512,), (18128, 27), (18128,))

In [11]:
def objective(trial):
    """Optuna objective: mean cross-validated F1 score of an LGBMClassifier.

    Samples one hyperparameter configuration from ``trial``, fits a fresh
    ``LGBMClassifier`` on every pre-computed fold stored in the module-level
    lists ``X_train_cv`` / ``y_train_cv`` and scores its predictions with
    ``f1_score`` on the matching ``X_test_cv`` / ``y_test_cv`` fold.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean F1 score over all folds.

    Raises
    ------
    ValueError
        If the fold lists are empty, i.e. there is nothing to evaluate on.
    """
    param_grid = {
        "boosting_type": trial.suggest_categorical("boosting_type", ['gbdt', 'dart']),
        "n_estimators": trial.suggest_int("n_estimators", 100, 2000),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 50),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "num_leaves": trial.suggest_int("num_leaves", 10, 1000),
        # LightGBM requires 0 < colsample_bytree <= 1, so the lower bound must
        # be strictly positive (a sampled 0 would make the fit fail).
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 0.1),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 0.1),
        "min_split_gain": trial.suggest_float("min_split_gain", 0, 1),
        "min_child_weight": trial.suggest_float("min_child_weight", 0, 1),
    }

    # Iterate over the folds themselves instead of relying on a loop index
    # leaked from another cell and a hard-coded fold count: the number of
    # scores always matches the number of folds actually evaluated.
    fold_scores = []
    for X_fit, y_fit, X_val, y_val in zip(X_train_cv, y_train_cv, X_test_cv, y_test_cv):
        # Create a LightGBM classifier with the sampled hyperparameters
        model = LGBMClassifier(**param_grid)

        # Train on the fold's training part
        model.fit(X_fit, y_fit)

        # Predict on the fold's held-out part and score with F1
        y_pred = model.predict(X_val)
        fold_scores.append(f1_score(y_val, y_pred))

    if not fold_scores:
        raise ValueError("no cross-validation folds available to evaluate the trial")

    return np.mean(fold_scores)

In [12]:
# Create the study whose objective value (the value returned by `objective`) is maximized.
study = optuna.create_study(
    study_name="LightGBM Classifier",
    direction="maximize",
)

[32m[I 2023-04-02 23:04:55,824][0m A new study created in memory with name: LightGBM Classifier[0m


In [13]:
# Run the hyperparameter search: up to 1024 trials, executed in parallel
# (n_jobs=-1 lets Optuna use the CPU count), with a progress bar.
# NOTE(review): this cell is long-running; each trial fits one model per fold.
study.optimize(
    objective,
    n_trials=1024,
    n_jobs=-1,
    show_progress_bar=True,
)

  self._init_valid()


  0%|          | 0/1024 [00:00<?, ?it/s]

[32m[I 2023-04-02 23:05:28,184][0m Trial 0 finished with value: 0.9248534913241528 and parameters: {'boosting_type': 'gbdt', 'n_estimators': 455, 'min_child_samples': 15, 'learning_rate': 0.02063843917519303, 'num_leaves': 23, 'colsample_bytree': 0.10394735417927936, 'reg_alpha': 0.013171673003407892, 'reg_lambda': 0.08032145048804318, 'min_split_gain': 0.2604913696165442, 'min_child_weight': 0.3102408716601631}. Best is trial 0 with value: 0.9248534913241528.[0m
[32m[I 2023-04-02 23:06:24,608][0m Trial 3 finished with value: 0.9563930413224735 and parameters: {'boosting_type': 'gbdt', 'n_estimators': 1074, 'min_child_samples': 18, 'learning_rate': 0.02814328763286577, 'num_leaves': 55, 'colsample_bytree': 0.44162228271482373, 'reg_alpha': 0.029127921514823574, 'reg_lambda': 0.018357492556407874, 'min_split_gain': 0.9378306224620119, 'min_child_weight': 0.9904672552112376}. Best is trial 3 with value: 0.9563930413224735.[0m
[32m[I 2023-04-02 23:06:59,394][0m Trial 5 finished wi

[32m[I 2023-04-03 01:45:31,822][0m Trial 18 finished with value: 0.9566178775060532 and parameters: {'boosting_type': 'dart', 'n_estimators': 1508, 'min_child_samples': 33, 'learning_rate': 0.010806090622691784, 'num_leaves': 447, 'colsample_bytree': 0.8545053364969182, 'reg_alpha': 0.03094388881660018, 'reg_lambda': 0.09911382542892043, 'min_split_gain': 0.4730113059688307, 'min_child_weight': 0.3488716446807155}. Best is trial 16 with value: 0.9581454631856079.[0m
[32m[I 2023-04-03 01:46:05,955][0m Trial 19 finished with value: 0.957263225851366 and parameters: {'boosting_type': 'dart', 'n_estimators': 1473, 'min_child_samples': 34, 'learning_rate': 0.010310168022332674, 'num_leaves': 167, 'colsample_bytree': 0.8708695085396244, 'reg_alpha': 0.03229243006239531, 'reg_lambda': 0.09400848139222587, 'min_split_gain': 0.4707836586449117, 'min_child_weight': 0.3934409409375298}. Best is trial 16 with value: 0.9581454631856079.[0m
[32m[I 2023-04-03 01:57:35,535][0m Trial 20 finishe

[32m[I 2023-04-03 04:28:33,904][0m Trial 36 finished with value: 0.958921302415374 and parameters: {'boosting_type': 'dart', 'n_estimators': 942, 'min_child_samples': 28, 'learning_rate': 0.09711461761174865, 'num_leaves': 112, 'colsample_bytree': 0.8984131705867493, 'reg_alpha': 0.05215896155753625, 'reg_lambda': 0.0887966302996666, 'min_split_gain': 0.5940561081131466, 'min_child_weight': 0.3040548871638823}. Best is trial 36 with value: 0.958921302415374.[0m
[32m[I 2023-04-03 04:31:24,980][0m Trial 38 finished with value: 0.9584230851003778 and parameters: {'boosting_type': 'dart', 'n_estimators': 926, 'min_child_samples': 47, 'learning_rate': 0.08638023864199376, 'num_leaves': 112, 'colsample_bytree': 0.9245410723417901, 'reg_alpha': 0.04960553197101623, 'reg_lambda': 0.08428290512937675, 'min_split_gain': 0.616763830378552, 'min_child_weight': 0.3246605124590146}. Best is trial 36 with value: 0.958921302415374.[0m
[32m[I 2023-04-03 04:41:03,054][0m Trial 37 finished with v

[32m[I 2023-04-03 05:35:15,107][0m Trial 53 finished with value: 0.9581524228360051 and parameters: {'boosting_type': 'dart', 'n_estimators': 806, 'min_child_samples': 24, 'learning_rate': 0.09490648862808204, 'num_leaves': 320, 'colsample_bytree': 0.654532799537596, 'reg_alpha': 0.04514017523483636, 'reg_lambda': 0.09862991398804338, 'min_split_gain': 0.5111758772371898, 'min_child_weight': 0.3668709585271577}. Best is trial 36 with value: 0.958921302415374.[0m
[32m[I 2023-04-03 05:37:54,744][0m Trial 54 finished with value: 0.9586112492759806 and parameters: {'boosting_type': 'dart', 'n_estimators': 786, 'min_child_samples': 23, 'learning_rate': 0.08513224623266229, 'num_leaves': 573, 'colsample_bytree': 0.8913209584127401, 'reg_alpha': 0.04327579754447769, 'reg_lambda': 0.08772929593015204, 'min_split_gain': 0.5149395975926486, 'min_child_weight': 0.37919856454560963}. Best is trial 36 with value: 0.958921302415374.[0m
[32m[I 2023-04-03 05:40:09,961][0m Trial 56 finished wit

[32m[I 2023-04-03 06:44:30,001][0m Trial 69 finished with value: 0.9588196917917129 and parameters: {'boosting_type': 'dart', 'n_estimators': 712, 'min_child_samples': 5, 'learning_rate': 0.09567161808253108, 'num_leaves': 248, 'colsample_bytree': 0.9861029225867148, 'reg_alpha': 0.049301212353641044, 'reg_lambda': 0.07705391679977844, 'min_split_gain': 0.6114089487610781, 'min_child_weight': 0.13062231490736742}. Best is trial 64 with value: 0.9593524038016085.[0m
[32m[I 2023-04-03 06:49:23,514][0m Trial 71 finished with value: 0.9589935355127783 and parameters: {'boosting_type': 'dart', 'n_estimators': 856, 'min_child_samples': 5, 'learning_rate': 0.09599160539289776, 'num_leaves': 80, 'colsample_bytree': 0.9575829615778069, 'reg_alpha': 0.06102342866989431, 'reg_lambda': 0.06888203124965528, 'min_split_gain': 0.579353916072961, 'min_child_weight': 0.10190172377084351}. Best is trial 64 with value: 0.9593524038016085.[0m
[32m[I 2023-04-03 06:52:22,021][0m Trial 74 finished wi

[32m[I 2023-04-03 07:13:35,830][0m Trial 89 finished with value: 0.9589802464551067 and parameters: {'boosting_type': 'dart', 'n_estimators': 382, 'min_child_samples': 11, 'learning_rate': 0.08709960864446706, 'num_leaves': 130, 'colsample_bytree': 0.9716449314963419, 'reg_alpha': 0.06036857769118707, 'reg_lambda': 0.07410595931998673, 'min_split_gain': 0.6949231152645043, 'min_child_weight': 0.14491805301565477}. Best is trial 64 with value: 0.9593524038016085.[0m
[32m[I 2023-04-03 07:16:09,925][0m Trial 91 finished with value: 0.9584947671640309 and parameters: {'boosting_type': 'dart', 'n_estimators': 488, 'min_child_samples': 11, 'learning_rate': 0.0947484979541967, 'num_leaves': 88, 'colsample_bytree': 0.9132397789272139, 'reg_alpha': 0.0691453478301144, 'reg_lambda': 0.08165357309707995, 'min_split_gain': 0.599852493300399, 'min_child_weight': 0.0014244821439346467}. Best is trial 64 with value: 0.9593524038016085.[0m
[32m[I 2023-04-03 07:19:37,058][0m Trial 93 finished w

[32m[I 2023-04-03 08:03:18,443][0m Trial 111 finished with value: 0.957432159616717 and parameters: {'boosting_type': 'gbdt', 'n_estimators': 653, 'min_child_samples': 2, 'learning_rate': 0.08547906736451527, 'num_leaves': 210, 'colsample_bytree': 0.9228151710133817, 'reg_alpha': 0.05430035437771782, 'reg_lambda': 0.08532728457987411, 'min_split_gain': 0.5070299047069728, 'min_child_weight': 0.2653794760955786}. Best is trial 64 with value: 0.9593524038016085.[0m
[32m[I 2023-04-03 08:07:09,860][0m Trial 105 finished with value: 0.9590667973212721 and parameters: {'boosting_type': 'dart', 'n_estimators': 994, 'min_child_samples': 7, 'learning_rate': 0.0886189038593045, 'num_leaves': 146, 'colsample_bytree': 0.9225523203457126, 'reg_alpha': 0.06288903491767087, 'reg_lambda': 0.08540291195277565, 'min_split_gain': 0.5270489892667852, 'min_child_weight': 0.12638179998736207}. Best is trial 64 with value: 0.9593524038016085.[0m
[32m[I 2023-04-03 08:12:53,713][0m Trial 110 finished w

[32m[I 2023-04-03 09:41:26,612][0m Trial 127 finished with value: 0.9585594631638177 and parameters: {'boosting_type': 'dart', 'n_estimators': 768, 'min_child_samples': 6, 'learning_rate': 0.09074665026505979, 'num_leaves': 149, 'colsample_bytree': 0.9440869749379217, 'reg_alpha': 0.0528855481610014, 'reg_lambda': 0.09739262006365085, 'min_split_gain': 0.5455265355241911, 'min_child_weight': 0.202581316262239}. Best is trial 64 with value: 0.9593524038016085.[0m
[32m[I 2023-04-03 09:42:43,662][0m Trial 126 finished with value: 0.9590522789100138 and parameters: {'boosting_type': 'dart', 'n_estimators': 1011, 'min_child_samples': 3, 'learning_rate': 0.08147667353722364, 'num_leaves': 116, 'colsample_bytree': 0.8963906471213995, 'reg_alpha': 0.0556482797387972, 'reg_lambda': 0.09608513812273096, 'min_split_gain': 0.5519832035526059, 'min_child_weight': 0.2078786605975068}. Best is trial 64 with value: 0.9593524038016085.[0m
[32m[I 2023-04-03 09:42:55,966][0m Trial 130 finished wi

[32m[I 2023-04-03 11:18:39,913][0m Trial 147 finished with value: 0.9588044925717334 and parameters: {'boosting_type': 'dart', 'n_estimators': 788, 'min_child_samples': 3, 'learning_rate': 0.0836422266270139, 'num_leaves': 69, 'colsample_bytree': 0.9646957942579294, 'reg_alpha': 0.04790053275711644, 'reg_lambda': 0.08798722360760396, 'min_split_gain': 0.5565939386791106, 'min_child_weight': 0.20934666959066173}. Best is trial 64 with value: 0.9593524038016085.[0m
[32m[I 2023-04-03 11:21:51,869][0m Trial 144 finished with value: 0.9587075121485669 and parameters: {'boosting_type': 'dart', 'n_estimators': 1255, 'min_child_samples': 5, 'learning_rate': 0.08186757811637384, 'num_leaves': 200, 'colsample_bytree': 0.9314841746765031, 'reg_alpha': 0.04781359940045555, 'reg_lambda': 0.09496687362599185, 'min_split_gain': 0.5396309732109342, 'min_child_weight': 0.31860956203748064}. Best is trial 64 with value: 0.9593524038016085.[0m
[32m[I 2023-04-03 11:23:48,821][0m Trial 145 finished

[32m[I 2023-04-03 13:03:41,606][0m Trial 163 finished with value: 0.9588269468621192 and parameters: {'boosting_type': 'dart', 'n_estimators': 1143, 'min_child_samples': 7, 'learning_rate': 0.08018606917400754, 'num_leaves': 183, 'colsample_bytree': 0.8805962013608625, 'reg_alpha': 0.0643635309010882, 'reg_lambda': 0.09097996218513807, 'min_split_gain': 0.635706151905631, 'min_child_weight': 0.25697588979125224}. Best is trial 64 with value: 0.9593524038016085.[0m
[32m[I 2023-04-03 13:05:22,245][0m Trial 162 finished with value: 0.9588261939220549 and parameters: {'boosting_type': 'dart', 'n_estimators': 1158, 'min_child_samples': 7, 'learning_rate': 0.07370370485747, 'num_leaves': 182, 'colsample_bytree': 0.909141916393112, 'reg_alpha': 0.06439111513327264, 'reg_lambda': 0.0986696180673339, 'min_split_gain': 0.4039018312484641, 'min_child_weight': 0.21029378934994847}. Best is trial 64 with value: 0.9593524038016085.[0m
[32m[I 2023-04-03 13:07:34,396][0m Trial 164 finished wit

[32m[I 2023-04-03 14:04:21,049][0m Trial 178 finished with value: 0.9588166888068012 and parameters: {'boosting_type': 'dart', 'n_estimators': 653, 'min_child_samples': 4, 'learning_rate': 0.06245240268595767, 'num_leaves': 57, 'colsample_bytree': 0.9164999575718722, 'reg_alpha': 0.06689320758100885, 'reg_lambda': 0.0974772820132308, 'min_split_gain': 0.5692291916257176, 'min_child_weight': 0.22340316800886867}. Best is trial 64 with value: 0.9593524038016085.[0m
[32m[I 2023-04-03 14:09:39,446][0m Trial 179 finished with value: 0.9587008967806525 and parameters: {'boosting_type': 'dart', 'n_estimators': 640, 'min_child_samples': 8, 'learning_rate': 0.09708195571479224, 'num_leaves': 52, 'colsample_bytree': 0.9137094918283929, 'reg_alpha': 0.06734500181416515, 'reg_lambda': 0.09991032488751826, 'min_split_gain': 0.6051262499735661, 'min_child_weight': 0.22424739250661196}. Best is trial 64 with value: 0.9593524038016085.[0m
[32m[I 2023-04-03 14:10:53,461][0m Trial 182 finished w

[32m[I 2023-04-03 14:45:42,916][0m Trial 199 finished with value: 0.9525331158916988 and parameters: {'boosting_type': 'dart', 'n_estimators': 505, 'min_child_samples': 11, 'learning_rate': 0.05072808940311694, 'num_leaves': 16, 'colsample_bytree': 0.8427821962112568, 'reg_alpha': 0.03698973781177389, 'reg_lambda': 0.08991447223698024, 'min_split_gain': 0.4076125959063299, 'min_child_weight': 0.34313269881038716}. Best is trial 64 with value: 0.9593524038016085.[0m
[32m[I 2023-04-03 14:46:44,270][0m Trial 198 finished with value: 0.9576686754304721 and parameters: {'boosting_type': 'dart', 'n_estimators': 498, 'min_child_samples': 12, 'learning_rate': 0.04967493897915365, 'num_leaves': 34, 'colsample_bytree': 0.8338857977605995, 'reg_alpha': 0.03731955276246382, 'reg_lambda': 0.08889921114373261, 'min_split_gain': 0.4067390401552948, 'min_child_weight': 0.332900803359575}. Best is trial 64 with value: 0.9593524038016085.[0m
[32m[I 2023-04-03 14:51:18,991][0m Trial 200 finished 

[32m[I 2023-04-03 15:24:23,896][0m Trial 216 finished with value: 0.9588269563791834 and parameters: {'boosting_type': 'dart', 'n_estimators': 466, 'min_child_samples': 16, 'learning_rate': 0.05210418688495693, 'num_leaves': 138, 'colsample_bytree': 0.8162154956408503, 'reg_alpha': 0.03865400258693215, 'reg_lambda': 0.054433757491981254, 'min_split_gain': 0.42724972130216643, 'min_child_weight': 0.3675212952147815}. Best is trial 64 with value: 0.9593524038016085.[0m
[32m[I 2023-04-03 15:24:33,661][0m Trial 218 finished with value: 0.9590603996013056 and parameters: {'boosting_type': 'dart', 'n_estimators': 471, 'min_child_samples': 16, 'learning_rate': 0.059739994289212894, 'num_leaves': 78, 'colsample_bytree': 0.8149686041476134, 'reg_alpha': 0.039198082424831686, 'reg_lambda': 0.08869817968917534, 'min_split_gain': 0.42858495047991446, 'min_child_weight': 0.3876889917593234}. Best is trial 64 with value: 0.9593524038016085.[0m
[32m[I 2023-04-03 15:27:13,296][0m Trial 215 fin

KeyboardInterrupt: 

In [14]:
# Hyperparameters of the best trial recorded in the study so far.
study.best_params

{'boosting_type': 'dart',
 'n_estimators': 729,
 'min_child_samples': 5,
 'learning_rate': 0.09058025591953338,
 'num_leaves': 146,
 'colsample_bytree': 0.9773512265322472,
 'reg_alpha': 0.04836405507163106,
 'reg_lambda': 0.08467358653637265,
 'min_split_gain': 0.6060617236332217,
 'min_child_weight': 0.19214701592888236}

In [15]:
# Objective value (mean F1 over the CV folds) reached by the best trial.
study.best_value

0.9593524038016085