In [2]:
import warnings 
warnings.filterwarnings("ignore")

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RepeatedStratifiedKFold
import numpy as np
import pandas as pd
import pickle
import lightgbm as lgb

In [3]:
with open("probas_val.pkl", "rb") as f: 
    probas_val = pickle.load(f)
probas_val.drop(columns = ["index"], inplace = True)
y_val = np.array(probas_val["class"])
val_proba = np.array(probas_val.drop(columns = ["class"]))
val_proba_train, val_proba_validation, y_val_train, y_val_validation = train_test_split(
    val_proba, y_val, test_size=0.1, random_state=42)

In [4]:
#Set to large number, set early stopping rounds
n_estimators = [2000]
#Smaller dataset, try reducing learning rate
learning_rate = [0.001, 0.01]
#tree grows leaf wise instead of depth wise for lightgbm, focus on tuning num leaves
num_leaves = [5, 10, 15, 20, 50]
#set number of entries needed in a leaf to prevent overfitting
min_child_samples = [10, 25, 50, 100]
#could be redundancy in predictions since models can be similar,
#set L1 regularization to drop out unnecessary features
reg_alpha = [0.1, 0.2, 0.3]
early_stopping_rounds = [50]
random_state = [42]

param_grid = {"n_estimators": n_estimators,
              "learning_rate": learning_rate,
              "num_leaves": num_leaves,
              "min_child_samples": min_child_samples,
              "reg_alpha": reg_alpha,
              "early_stopping_round": early_stopping_rounds,
              "random_state": random_state}

In [5]:
lgb_model = lgb.LGBMClassifier(force_row_wise = True)
grid_search = GridSearchCV(estimator = lgb_model,
                           param_grid = param_grid,
                           cv = 5,
                           n_jobs = -1,
                           verbose = 2,
                           scoring = "f1")
grid_search.fit(val_proba_train, y_val_train, eval_set = [(val_proba_validation, y_val_validation)], eval_metric = "logloss")
grid_search.best_params_

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 50118, number of used features: 9
[LightGBM] [Info] Start training from score -1.099630
[LightGBM] [Info] Start training from score -1.098732
[LightGBM] [Info] Start training from score -1.097476
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's multi_logloss: 0.308071


{'early_stopping_round': 50,
 'learning_rate': 0.001,
 'min_child_samples': 10,
 'n_estimators': 2000,
 'num_leaves': 5,
 'random_state': 42,
 'reg_alpha': 0.1}