In [2]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
import pickle

# Read the data

In [3]:
#update the paths and files, if needed
data_X_train = pd.read_pickle('./wavelet_pickle_data/wavelet_pickle_data/2022/Scaled/X_train_scaled.pkl')
data_y_train = pd.read_pickle('./wavelet_pickle_data/wavelet_pickle_data/2022/y_train.pkl')

In [4]:
data_X_train = data_X_train.sort_index()
data_y_train = data_y_train.sort_index()

# Prepare the data for training

In [6]:
columns = ['Redox_Avg(1)', 'WC1','Temp_T12_Avg(1)','EC_Avg(1)','Matric_potential_Avg(1)','Redox_Avg(1)_sigma_b_24','Redox_Avg(1)_sigma_f_24','Redox_Avg(1)_sigma_b_12','Redox_Avg(1)_sigma_f_12',
           'Redox_Avg(2)', 'WC2','Temp_T12_Avg(2)','EC_Avg(2)','Matric_potential_Avg(2)','Redox_Avg(2)_sigma_b_24','Redox_Avg(2)_sigma_f_24','Redox_Avg(2)_sigma_b_12','Redox_Avg(2)_sigma_f_12',
           'Redox_Avg(3)', 'WC3','Temp_T12_Avg(3)','EC_Avg(3)','Matric_potential_Avg(3)','Redox_Avg(3)_sigma_b_24','Redox_Avg(3)_sigma_f_24','Redox_Avg(3)_sigma_b_12','Redox_Avg(3)_sigma_f_12',
           'Redox_Avg(4)', 'WC4','Temp_T12_Avg(4)','EC_Avg(4)','Matric_potential_Avg(4)','Redox_Avg(4)_sigma_b_24','Redox_Avg(4)_sigma_f_24','Redox_Avg(4)_sigma_b_12','Redox_Avg(4)_sigma_f_12',
           'Redox_Avg(5)', 'WC5','Temp_T12_Avg(5)','EC_Avg(5)','Matric_potential_Avg(5)','Redox_Avg(5)_sigma_b_24','Redox_Avg(5)_sigma_f_24','Redox_Avg(5)_sigma_b_12','Redox_Avg(5)_sigma_f_12'
            ,'Water_level_Avg','Temp_ottpls_Avg','BatterymV_Min','Wave_period_0.5(1)', 'Wave_period_0.7(1)', 'Wave_period_0.9(1)',
       'Wave_period_1.1(1)', 'Wave_period_1.5(1)', 'Wave_period_1.9(1)',
       'Wave_period_2.5(1)', 'Wave_period_3.3(1)', 'Wave_period_4.4(1)',
       'Wave_period_0.5(2)', 'Wave_period_0.7(2)', 'Wave_period_0.9(2)',
       'Wave_period_1.1(2)', 'Wave_period_1.5(2)', 'Wave_period_1.9(2)',
       'Wave_period_2.5(2)', 'Wave_period_3.3(2)', 'Wave_period_4.4(2)',
       'Wave_period_0.5(3)', 'Wave_period_0.7(3)', 'Wave_period_0.9(3)',
       'Wave_period_1.1(3)', 'Wave_period_1.5(3)', 'Wave_period_1.9(3)',
       'Wave_period_2.5(3)', 'Wave_period_3.3(3)', 'Wave_period_4.4(3)',
       'Wave_period_0.5(4)', 'Wave_period_0.7(4)', 'Wave_period_0.9(4)',
       'Wave_period_1.1(4)', 'Wave_period_1.5(4)', 'Wave_period_1.9(4)',
       'Wave_period_2.5(4)', 'Wave_period_3.3(4)', 'Wave_period_4.4(4)',
       'Wave_period_0.5(5)', 'Wave_period_0.7(5)', 'Wave_period_0.9(5)',
       'Wave_period_1.1(5)', 'Wave_period_1.5(5)', 'Wave_period_1.9(5)',
       'Wave_period_2.5(5)', 'Wave_period_3.3(5)', 'Wave_period_4.4(5)']


X_train = data_X_train[columns]
y_train= data_y_train[f'Redox_error_flag']

# Model's parameter selection

In [10]:
params = {
       'num_leaves': [10,12,15,17,20],
       'min_child_weight': [0.001,0.01,0.1,1,10],
       'subsample': [0.01,0.05,0.1,0.2]
}


gbm_clf = lgb.LGBMClassifier(max_depth=-1)

gsearch = GridSearchCV(estimator=gbm_clf,param_grid=params,cv=5)

gsearch.fit(X_train, y_train)


[LightGBM] [Info] Number of positive: 5876, number of negative: 71355
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045336 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22827
[LightGBM] [Info] Number of data points in the train set: 77231, number of used features: 93
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076083 -> initscore=-2.496791
[LightGBM] [Info] Start training from score -2.496791
[LightGBM] [Info] Number of positive: 5876, number of negative: 71355
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.047320 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22742
[LightGBM] [Info] Number of data points in the train set: 77231, number of used features: 93
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076083 -> initscore=-2.496791
[LightGBM] [Info] Start training from score -2.496791
[LightGBM] [

# Model training

In [8]:
gbm_clf = lgb.LGBMClassifier(max_depth=-1, **gsearch.best_params_)
gbm_clf.fit(X_train, y_train)
pickle.dump(gbm_clf, open('./LightGB_model.pkl', 'wb'))

[LightGBM] [Info] Number of positive: 7345, number of negative: 89194
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011511 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22927
[LightGBM] [Info] Number of data points in the train set: 96539, number of used features: 93
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076083 -> initscore=-2.496794
[LightGBM] [Info] Start training from score -2.496794
