In [1]:
import pickle
import numpy as np
import pandas as pd

from cuml.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix
from model_utils import get_dataset, do_HPO, timed, plot_search_results

## Hyperparameter optimization of the model

Defines three functions for fitting machine learning models: Random Forest, XGBoost and LightGBM.  

Each function calls `do_HPO()` with the following parameters:
+ model - classification model
+ params - a specified set of hyperparameters
+ mode - choose GridSearch or RandomSearch
+ n_folds - number of subsets for cross-validation

In [2]:
# Random forest
def fit_RF(x, y):
    """Run the hyperparameter search for a cuML random forest classifier.

    The grid below currently holds a single candidate per hyperparameter;
    add more values to the lists to widen the search.

    Args:
        x: Training features, forwarded unchanged to ``do_HPO``.
        y: Training labels, forwarded unchanged to ``do_HPO``.

    Returns:
        The ``(res, results)`` pair produced by ``do_HPO``. In this notebook
        ``res`` is used as the best model and ``results`` is the object
        handed to ``plot_search_results``.
    """
    search_grid = {"n_estimators": [100], "max_depth": [6]}
    estimator = RandomForestClassifier()
    # Search mode and fold count come from the notebook-level MODE / N_FOLDS
    best_model, search_results = do_HPO(
        estimator, search_grid, x, y, mode=MODE, n_folds=N_FOLDS
    )
    return best_model, search_results

# XGBoost
def fit_XGB(x, y):
    """Run the hyperparameter search for an XGBoost classifier.

    The grid below currently holds a single candidate per hyperparameter;
    add more values to the lists to widen the search.

    Args:
        x: Training features, forwarded unchanged to ``do_HPO``.
        y: Training labels, forwarded unchanged to ``do_HPO``.

    Returns:
        The ``(res, results)`` pair produced by ``do_HPO``. In this notebook
        ``res`` is used as the best model and ``results`` is the object
        handed to ``plot_search_results``.
    """
    search_grid = {
        "n_estimators": [100],
        "max_depth": [6],
        "learning_rate": [0.1],
    }
    # NOTE(review): 'gpu_hist' is deprecated in XGBoost >= 2.0 in favour of
    # tree_method='hist' with device='cuda' — confirm the installed version.
    estimator = XGBClassifier(tree_method='gpu_hist')
    # Search mode and fold count come from the notebook-level MODE / N_FOLDS
    best_model, search_results = do_HPO(
        estimator, search_grid, x, y, mode=MODE, n_folds=N_FOLDS
    )
    return best_model, search_results

# LightGBM
def fit_LGB(x, y):
    """Run the hyperparameter search for a LightGBM classifier.

    The grid below currently holds a single candidate per hyperparameter;
    add more values to the lists to widen the search.

    Args:
        x: Training features, forwarded unchanged to ``do_HPO``.
        y: Training labels, forwarded unchanged to ``do_HPO``.

    Returns:
        The ``(res, results)`` pair produced by ``do_HPO``. In this notebook
        ``res`` is used as the best model and ``results`` is the object
        handed to ``plot_search_results``.
    """
    search_grid = {
        "n_estimators": [100],
        "max_depth": [6],
        "learning_rate": [0.1],
        "num_leaves": [80],
    }
    estimator = LGBMClassifier()
    # Search mode and fold count come from the notebook-level MODE / N_FOLDS
    best_model, search_results = do_HPO(
        estimator, search_grid, x, y, mode=MODE, n_folds=N_FOLDS
    )
    return best_model, search_results

In [3]:
# Classification report
def get_report(actual, pred):
    """Print a labelled confusion matrix followed by a classification report.

    The confusion matrix is flipped along both axes before it is labelled
    with the "True" row/column first, and the matrix is expected to be 2x2
    to match the two row and two column labels.

    Args:
        actual: Ground-truth labels.
        pred: Predicted labels.

    Returns:
        None. Both tables are written to stdout.
    """
    flipped_matrix = np.flip(confusion_matrix(actual, pred))
    matrix_table = pd.DataFrame(
        flipped_matrix,
        index=["Actual True", "Actual False"],
        columns=["Predicted  True", "Predicted  False"],
    )
    metrics_text = classification_report(actual, pred, digits=3)
    # Three newlines visually separate the two tables
    print(matrix_table, '\n'*3, metrics_text)

# Save the model
def save_model(model, path):
    """Serialize ``model`` to ``path`` with pickle.

    Args:
        model: Any picklable object (here, a fitted estimator).
        path: Destination file path; an existing file is overwritten.

    Returns:
        None.
    """
    # The context manager flushes and closes the handle even if pickling
    # fails, instead of leaving it open until garbage collection.
    with open(path, "wb") as model_file:
        pickle.dump(model, model_file)
    
# Load the model
def load_model(path):
    """Load a pickled object from ``path``.

    Args:
        path: Location of a pickle file, e.g. one written by ``save_model``.

    Returns:
        The unpickled object.

    Warning:
        Unpickling can execute arbitrary code; only load files that come
        from a trusted source.
    """
    # The context manager closes the handle even if unpickling fails,
    # instead of leaving it open until garbage collection.
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)

#### Define HPO default parameters

In [4]:
# Search mode that the fit_* helpers forward to do_HPO as `mode`.
# NOTE(review): 'gpu-grid' presumably selects a GPU-backed grid search —
# confirm the accepted values against model_utils.do_HPO.
MODE = 'gpu-grid'
# Number of cross-validation folds that the fit_* helpers forward to do_HPO.
N_FOLDS = 10

#### Loading the dataset

In [5]:
# Parquet files holding the train / test splits (paths relative to this notebook)
train_path = '../../dataset/to_extractive/train.parquet'
test_path = '../../dataset/to_extractive/test.parquet'

x_train, y_train, x_test, y_test = get_dataset(
    train_set=train_path,
    test_set=test_path,
    # input data (x)
    cols=['section', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F10'],
    # target label (y)
    tgt='label',
)

#### Model Training  
Find the best hyperparameters by adjusting the parameter grid inside the chosen `fit_*` function.

In [None]:
# Run the hyperparameter search inside the timed() helper from model_utils
# (presumably reports the elapsed time — confirm). Swap fit_LGB for fit_RF or
# fit_XGB to train a different model.
with timed():
    res, results = fit_LGB(x_train, y_train)

In [None]:
# res is the best model returned by the search; predict on the test set
# and print the confusion matrix and classification report.
y_pred = res.predict(x_test)
get_report(y_test, y_pred)

In [None]:
## Visualizing the Search
## Plot the search results with matplotlib, varying one parameter at a time while all other parameters are held at their best values
# plot_search_results(results)

In [None]:
## Save Model
# save_model(res, 'model/example.pkl')

#### Additional

In [None]:
## Unused test code
## ===================================================== Adjust True threshold 
# model = load_model('model/RF_model_F10.pkl')
# y_pred = model.predict_proba(x_test)
# y_pred=(y_pred.iloc[:,1] > 0.7).astype('int')
# get_report(y_test, y_pred)
## ===================================================== Cuml KNN, LR
# from cuml.linear_model import LogisticRegression
# from cuml.neighbors import KNeighborsClassifier
# def fit_LR(x, y):
#     params = {
#         'penalty': ['l1','l2'],
#         'C': [0.01, 0.1, 1, 10],
#         'max_iter': [1000, 2000]
#     }
#     model = LogisticRegression(verbose=0) 
#     res, results = do_HPO(model, params, x, y, mode=MODE, n_folds=N_FOLDS)
#     return res, results
# def fit_KNN(x, y):
#     params = {
#         'n_neighbors':[10, 15, 20]
#     }
#     model = KNeighborsClassifier(verbose=0)
#     res, results = do_HPO(model, params, x, y, mode=MODE, n_folds=N_FOLDS)
#     return res, results

In [None]:
# SystemError: initialization of _internal failed without raising an exception
# =============================================================================
# Try installing the following version
# pip install numpy==1.23.5