**Further Fine-Tuning LightGBM**

Changes include:

(1) Shifted the tuning objective from F1 score to (weighted) precision

(2) Changed parameter search method from grid search to Bayesian Optimization for broader, more flexible parameter selection

In [None]:
import optuna
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
import numpy as np

# Collects one classification-report dict per model name.
model_precision_scores = {}

# Ensure data types are float32 to reduce memory usage
X_train_bal = X_train_bal.astype(np.float32)
X_test_bal = X_test_bal.astype(np.float32)

# Settings that are NOT tuned. Shared by every trial and by the final model so
# the model that gets evaluated is configured exactly like the ones that were scored.
FIXED_LGBM_PARAMS = {
    'random_state': 42,
    'device': 'gpu',
    'gpu_device_id': 0,
    'n_jobs': 1,
    'verbose': -1,
    # LightGBM only performs bagging when subsample_freq > 0 (its default is 0).
    # Without this, the `subsample` values suggested below would be ignored.
    'subsample_freq': 1,
}


def objective(trial):
    """Score one Optuna trial.

    Samples a LightGBM hyperparameter configuration from `trial`, combines it
    with FIXED_LGBM_PARAMS, and evaluates it with 2-fold cross-validation on
    (X_train_bal, y_train_bal).

    Returns:
        Mean `precision_weighted` score across the folds (higher is better).
    """
    tuned_params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 7, 127),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 0.5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 0.5),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
    }
    clf = LGBMClassifier(**tuned_params, **FIXED_LGBM_PARAMS)
    fold_scores = cross_val_score(
        clf, X_train_bal, y_train_bal, cv=2, scoring='precision_weighted', n_jobs=1
    )
    return fold_scores.mean()


# Create and run Optuna study.
# The sampler is seeded so that the sequence of suggested configurations is
# repeatable across runs (the model itself is already seeded via random_state).
study = optuna.create_study(
    direction='maximize',
    study_name='LGBM Precision Optimization',
    sampler=optuna.samplers.TPESampler(seed=FIXED_LGBM_PARAMS['random_state']),
)
# Stops at 100 trials or after 36000 s (10 h), whichever comes first.
study.optimize(objective, n_trials=100, timeout=36000)

# Output the best parameters found (tuned values only; fixed settings are in FIXED_LGBM_PARAMS)
print("\nBest Parameters found:", study.best_params)

# Train the best model on the full training set
best_model = LGBMClassifier(**study.best_params, **FIXED_LGBM_PARAMS)
best_model.fit(X_train_bal, y_train_bal)
y_pred = best_model.predict(X_test_bal)

# Convert labels back to original strings.
# `label_encoder` is not defined in this cell; assumes it is the fitted encoder
# from an earlier cell and that y_test_bal / y_pred hold its encoded values.
y_test_labels = label_encoder.inverse_transform(y_test_bal)
y_pred_labels = label_encoder.inverse_transform(y_pred)

# Print classification report
print("\nLightGBM Classification Report:")
print(classification_report(y_test_labels, y_pred_labels))

# Save the classification report as a dictionary
precision_scores_lgb = classification_report(y_test_labels, y_pred_labels, output_dict=True)
model_precision_scores['LightGBM'] = precision_scores_lgb

Best Parameters found: {
    'colsample_bytree': 0.7715490096744506,
    'learning_rate': 0.09741465956967921,
    'max_depth': 9,
    'min_child_samples': 37,
    'n_estimators': 169,
    'num_leaves': 90,
    'reg_alpha': 0.09536592999298432,
    'reg_lambda': 0.3981311006973933,
    'subsample': 0.946224464192131,
    'random_state': 42,
    'device': 'gpu',
    'gpu_device_id': 0,
    'n_jobs': 1,
    'verbose': -1
}

**Train LightGBM based on optimal parameters**

In [None]:
# Hyperparameters carried over from the earlier optimization run, hard-coded so
# the model can be rebuilt without repeating the search.
# NOTE(review): no `subsample_freq` is set here; per the LightGBM docs bagging is
# only enabled when it is > 0, so `subsample` likely has no effect - confirm.
best_params = dict(
    # --- tuned values ---
    colsample_bytree=0.7715490096744506,
    learning_rate=0.09741465956967921,
    max_depth=9,
    min_child_samples=37,
    n_estimators=169,
    num_leaves=90,
    reg_alpha=0.09536592999298432,
    reg_lambda=0.3981311006973933,
    subsample=0.946224464192131,
    # --- fixed runtime settings ---
    random_state=42,
    device='gpu',
    gpu_device_id=0,
    n_jobs=1,
    verbose=-1,
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from lightgbm import LGBMClassifier
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Ensure data types are float32 to reduce memory usage
X_train_bal = X_train_bal.astype(np.float32)
X_test_bal = X_test_bal.astype(np.float32)

# Train the model using the best parameters
model = LGBMClassifier(**best_params)
model.fit(X_train_bal, y_train_bal)

# Predict probabilities; column i of y_proba corresponds to model.classes_[i]
y_proba = model.predict_proba(X_test_bal)

# Encode the test labels against the classes the model was trained on, so an
# encoded value i always refers to the same class as column i of y_proba.
# Fitting on y_test_bal alone would shift the indices whenever the test split
# is missing a class. transform() raises ValueError if y_test_bal contains a
# label the model never saw.
# NOTE(review): this rebinds `label_encoder`, which the tuning cell uses to map
# encoded labels back to their original strings; re-running that cell after
# this one will use this encoder instead - confirm that is intended.
label_encoder = LabelEncoder()
label_encoder.fit(model.classes_)
y_test_bal_encoded = label_encoder.transform(y_test_bal)
label_names = label_encoder.inverse_transform(np.unique(y_test_bal_encoded))

In [None]:
# Hand-written lookup from encoded class index to a readable class name.
# The position of each name in the tuple is its encoded index.
# NOTE(review): presumably mirrors the sorted class order of the original label
# encoder - verify against that encoder's classes_.
label_map = dict(enumerate((
    "OTHER",
    "add_license OR remove_license",
    "add_user OR remove_user",
    "add_user_to_channel OR remove_user_from_channel",
    "os_update",
    "password_reset",
    "reset_mfa",
    "shipping_request",
)))

# Echo every entry so the mapping can be checked by eye
for index in label_map:
    label_name = label_map[index]
    print(f"Encoded label {index} corresponds to {label_name}")

Encoded label 0 corresponds to OTHER
Encoded label 1 corresponds to add_license OR remove_license
Encoded label 2 corresponds to add_user OR remove_user
Encoded label 3 corresponds to add_user_to_channel OR remove_user_from_channel
Encoded label 4 corresponds to os_update
Encoded label 5 corresponds to password_reset
Encoded label 6 corresponds to reset_mfa
Encoded label 7 corresponds to shipping_request


In [None]:
from sklearn.metrics import precision_score, recall_score

# Predicted class = index of the highest-probability column
y_pred = np.argmax(y_proba, axis=1)

# Pin the label set and order for the per-class scores. With average=None and
# no `labels`, scikit-learn returns one score per label that actually occurs in
# y_true or y_pred, so a missing class would shift (or overrun) the positions
# used below. With `labels` given, position i always belongs to label_ids[i].
label_ids = list(label_map)
precision = precision_score(y_test_bal_encoded, y_pred, labels=label_ids, average=None)
recall = recall_score(y_test_bal_encoded, y_pred, labels=label_ids, average=None)

# Store precision and recall results per label (long format: one row per label/metric)
precision_recall_results = []
for label_precision, label_recall, label in zip(precision, recall, label_ids):
    name = label_map[label]
    precision_recall_results.append({"Label": name, "Metric": "Precision", "Score": label_precision})
    precision_recall_results.append({"Label": name, "Metric": "Recall", "Score": label_recall})

# Calculate overall precision and recall (unweighted mean over classes)
overall_precision = precision_score(y_test_bal_encoded, y_pred, average="macro")
overall_recall = recall_score(y_test_bal_encoded, y_pred, average="macro")
precision_recall_results.append({"Label": "Overall", "Metric": "Precision", "Score": overall_precision})
precision_recall_results.append({"Label": "Overall", "Metric": "Recall", "Score": overall_recall})
# Support-weighted recall; computed here but not added to precision_recall_results
weighted_recall = recall_score(y_test_bal_encoded, y_pred, average="weighted")