In [1]:
import pandas as pd
import optuna
import lightgbm
import xgboost
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('clean_data_for_training.csv')

In [2]:
X = df.drop(['target'], axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [3]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 1000)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.5)
    max_depth = trial.suggest_int('max_depth', 2, 20)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    min_child_samples = trial.suggest_int('min_child_samples', 1, 10)
    model_lgbm = LGBMClassifier(
        n_estimators = n_estimators,
        learning_rate = learning_rate,
        max_depth = max_depth,
        subsample = subsample,
        min_child_samples = min_child_samples,
        verbose = -1
    )
    model_lgbm.fit(X_train, y_train)
    pred_lgbm = model_lgbm.predict(X_test)
    accuracy = accuracy_score(y_test, pred_lgbm)
    return accuracy
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)  

[I 2025-03-03 13:43:24,926] A new study created in memory with name: no-name-6d8ed03c-cdd0-4139-9bf6-f0cd39596075
[I 2025-03-03 13:43:25,891] Trial 0 finished with value: 0.8170527022490769 and parameters: {'n_estimators': 689, 'learning_rate': 0.4935207634188131, 'max_depth': 4, 'subsample': 0.6111876899452247, 'min_child_samples': 2}. Best is trial 0 with value: 0.8170527022490769.
[I 2025-03-03 13:43:26,043] Trial 1 finished with value: 0.8214165827458879 and parameters: {'n_estimators': 28, 'learning_rate': 0.07475301281687906, 'max_depth': 14, 'subsample': 0.7890090789173791, 'min_child_samples': 5}. Best is trial 1 with value: 0.8214165827458879.
[I 2025-03-03 13:43:26,669] Trial 2 finished with value: 0.8143672373279625 and parameters: {'n_estimators': 200, 'learning_rate': 0.231448965315955, 'max_depth': 13, 'subsample': 0.8759613673724648, 'min_child_samples': 10}. Best is trial 1 with value: 0.8214165827458879.
[I 2025-03-03 13:43:27,384] Trial 3 finished with value: 0.814367

In [4]:
model_lgbm = LGBMClassifier(**study.best_params)
model_lgbm.fit(X_train, y_train)
pred_lgbm = model_lgbm.predict(X_test)
precision_lgbm = precision_score(y_test, pred_lgbm, pos_label='NORM')
recall_lgbm = recall_score(y_test, pred_lgbm, pos_label='NORM')
f1_lgbm = f1_score(y_test, pred_lgbm, pos_label='NORM')

In [5]:
print('Precision:', precision_lgbm)
print('Recall:', recall_lgbm)
print('F1:', f1_lgbm)

Precision: 0.8138395590936925
Recall: 0.8812997347480106
F1: 0.8462273161413563


In [6]:
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_test_enc = label_encoder.fit_transform(y_test)
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 1000)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.5)
    max_depth = trial.suggest_int('max_depth', 2, 20)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 10)
    model_xgb = XGBClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        min_child_weight=min_child_weight
    )
    model_xgb.fit(X_train, y_train_enc)
    pred_xgb = model_xgb.predict(X_test)
    accuracy = accuracy_score(y_test_enc, pred_xgb)
    return accuracy
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)  

[I 2025-03-03 13:46:22,989] A new study created in memory with name: no-name-75ce9227-3eda-4b89-b12f-ad9771827d71
[I 2025-03-03 13:46:28,463] Trial 0 finished with value: 0.8267875125881168 and parameters: {'n_estimators': 716, 'learning_rate': 0.02889914177245901, 'max_depth': 8, 'subsample': 0.9755187527542764, 'min_child_weight': 9}. Best is trial 0 with value: 0.8267875125881168.
[I 2025-03-03 13:46:32,834] Trial 1 finished with value: 0.8130245048674052 and parameters: {'n_estimators': 752, 'learning_rate': 0.3817856004542725, 'max_depth': 9, 'subsample': 0.833640042795595, 'min_child_weight': 1}. Best is trial 0 with value: 0.8267875125881168.
[I 2025-03-03 13:46:35,984] Trial 2 finished with value: 0.8022826451829473 and parameters: {'n_estimators': 458, 'learning_rate': 0.33382213965208435, 'max_depth': 12, 'subsample': 0.6657072746531749, 'min_child_weight': 3}. Best is trial 0 with value: 0.8267875125881168.
[I 2025-03-03 13:46:41,749] Trial 3 finished with value: 0.817724068

In [7]:
model_xgb = XGBClassifier(**study.best_params)
model_xgb.fit(X_train, y_train_enc)
pred_xgb = model_xgb.predict(X_test)
precision_xgb = precision_score(y_test_enc, pred_xgb)
recall_xgb = recall_score(y_test_enc, pred_xgb)
f1_xgb= f1_score(y_test_enc, pred_xgb)
print('Precision:', precision_xgb)
print('Recall:', recall_xgb)
print('F1:', f1_xgb)

Precision: 0.8596750369276218
Recall: 0.7912984364377974
F1: 0.824070796460177


In [8]:
from sklearn.ensemble import RandomForestClassifier
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 1000)
    max_depth = trial.suggest_int('max_depth', 2, 20)
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])
    min_samples_split = trial.suggest_float('min_samples_split', 0.1, 1)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    rfc = RandomForestClassifier(
        n_estimators = n_estimators,
        max_depth = max_depth,
        bootstrap = bootstrap,
        min_samples_split = min_samples_split,
        min_samples_leaf = min_samples_leaf
    )
    rfc.fit(X_train, y_train)
    pred_rfc = rfc.predict(X_test)
    accuracy = accuracy_score(y_test, pred_rfc)
    return accuracy
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)  

[I 2025-03-03 13:50:53,707] A new study created in memory with name: no-name-1b559a9f-e5ec-441d-b836-efb5d879230d
[I 2025-03-03 13:51:14,324] Trial 0 finished with value: 0.792212151728768 and parameters: {'n_estimators': 586, 'max_depth': 14, 'bootstrap': False, 'min_samples_split': 0.17297029819392507, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.792212151728768.
[I 2025-03-03 13:51:19,376] Trial 1 finished with value: 0.75730110775428 and parameters: {'n_estimators': 747, 'max_depth': 19, 'bootstrap': False, 'min_samples_split': 0.9113528479423355, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.792212151728768.
[I 2025-03-03 13:51:20,979] Trial 2 finished with value: 0.7603222557905337 and parameters: {'n_estimators': 181, 'max_depth': 19, 'bootstrap': False, 'min_samples_split': 0.7529125893094862, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.792212151728768.
[I 2025-03-03 13:51:21,045] Trial 3 finished with value: 0.5062101376300772 and parameters: {'n_est

In [9]:
rfc = RandomForestClassifier(**study.best_params)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)
precision_rfc = precision_score(y_test, pred_rfc, pos_label='NORM')
recall_rfc = recall_score(y_test, pred_rfc, pos_label='NORM')
f1_rfc = f1_score(y_test, pred_rfc, pos_label='NORM')

In [10]:
print('Precision:', precision_rfc)
print('Recall:', recall_rfc)
print('F1:', f1_rfc)

Precision: 0.7928802588996764
Recall: 0.8123342175066313
F1: 0.8024893547330495
