In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.metrics import classification_report

from src.config import CORRELATED_FEATURES_TO_REMOVE, LOG_TRANSFORM_FEATURES, NAME_COLS, CLASS_WEIGHT
from src.helpers import remove_correlations, log_trainsform, train_lgbm_default, drop_names, encode_categorical, subset, train_lgbm_weighted, train_lgbm_class_weight

In [2]:
from dotenv import load_dotenv

# Load variables from a .env file (when one is found) into the process
# environment, so the data locations stay out of the notebook itself.
load_dotenv()

TRAIN_DATA_PATH = os.getenv("TRAIN_DATA_PATH")
TEST_DATA_PATH = os.getenv("TEST_DATA_PATH")

# Fail fast with an actionable message: os.getenv returns None for an unset
# variable, which would otherwise only surface later as an opaque
# pd.read_csv error in the loading cell.
_missing_paths = [
    name
    for name, value in (
        ("TRAIN_DATA_PATH", TRAIN_DATA_PATH),
        ("TEST_DATA_PATH", TEST_DATA_PATH),
    )
    if not value
]
if _missing_paths:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_paths)
        + ". Define them in the shell or in a .env file."
    )


In [3]:
# Read the raw train and test CSVs from the paths configured via the environment.
df_train = pd.read_csv(filepath_or_buffer=TRAIN_DATA_PATH)
df_test = pd.read_csv(filepath_or_buffer=TEST_DATA_PATH)

## **Preprocessing**

- Do NOT remove outliers: they carry useful signal.

In [4]:
# Preprocessing helpers applied in order; each one receives the frame produced
# by the previous step together with its configuration from src.config.
# NOTE(review): this cell rebinds `df_train` to its own output, so running it a
# second time without reloading the CSV feeds already-processed data back in.
preprocessing_steps = (
    (remove_correlations, CORRELATED_FEATURES_TO_REMOVE),
    (log_trainsform, LOG_TRANSFORM_FEATURES),
    (drop_names, NAME_COLS),
)
for apply_step, step_config in preprocessing_steps:
    df_train = apply_step(df_train, step_config)

# encode_categorical returns an indexable result; only element 0 is kept as the
# working frame (the remaining elements are presumably encoder artifacts —
# TODO confirm against src.helpers).
encoded_result = encode_categorical(df_train)
df_train = encoded_result[0]



Create a subset of the preprocessed training data.


In [5]:
# Continue with a subset of the preprocessed training frame.
# NOTE(review): 25 is presumably the sampling percentage expected by `subset`
# — confirm against src.helpers.
subset_arg = 25
df_train_subset = subset(df_train, subset_arg)
df_train_subset.shape


(1561118, 10)

In [6]:
# Distribution of the target classes in the subset, ordered by class label.
urgency_levels = df_train_subset['urgency_level']
emergency_counts = urgency_levels.value_counts().sort_index()
emergency_percentages = urgency_levels.value_counts(normalize=True).sort_index().mul(100)

# One row per class: label, absolute count, and share of rows (in percent,
# rounded to 3 decimals).
summary_columns = {
    'Emergency Level': emergency_counts.index,
    'Count': emergency_counts.values,
    'Percentage (%)': emergency_percentages.values.round(3),
}
summary_table = pd.DataFrame(summary_columns)
summary_table

Unnamed: 0,Emergency Level,Count,Percentage (%)
0,0,1559488,99.896
1,1,533,0.034
2,2,537,0.034
3,3,560,0.036


In [6]:
# Separate the label column from the feature matrix.
target_column = "urgency_level"
y_train_subset = df_train_subset[target_column]
X_train_subset = df_train_subset.drop(columns=[target_column])

**Default LightGBM model**

In [None]:
# Fit the baseline model. The helper returns three values; the third is later
# unpacked as (X_val, y_val) by the validation-report cell.
# It is bound to a descriptive name because `_` is also IPython's "last output"
# variable and is reassigned by the class-weighted training cell further down,
# so `_` alone is not a stable handle on this result.
model, metrics, default_val_split = train_lgbm_default(X_train_subset, y_train_subset)
_ = default_val_split  # keep `_` populated: the validation-report cell still reads it
metrics

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028963 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1023
[LightGBM] [Info] Number of data points in the train set: 1248894, number of used features: 9
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
Training until validation scores don't improve for 50 rounds
[50]	valid_0's multi_logloss: 0.0607124
[100]	valid_0's multi_logloss: 0.0145938
[150]	valid_0's multi_logloss: 0.0071725
[200]	valid_0's multi_logloss: 0.0045923
[250]	valid_0's multi_logloss: 0.00330159
[300]	valid_0's multi_logloss: 0.00267483
[350]	valid_0's multi_logloss: 0.00231311
[400]	valid_0's multi_logloss: 0.00212243
[450]	valid_0's multi_logloss: 0.00203782
[500]	valid_0's multi_logloss: 0.00201569
[550]	valid_0's mu

{'val_f1_macro': 0.8165808255611197,
 'val_f1_weighted': 0.9994713875455925,
 'best_iteration': 507}

In [None]:
# Per-class report over the entire subset that was handed to the trainer.
# NOTE(review): the trainer appears to hold out only part of this data for
# validation, so most of these rows were seen during fitting — treat these
# scores as optimistic and prefer the validation-only report.
subset_predictions = model.predict(X_train_subset, num_iteration=model.best_iteration_)
print(classification_report(y_train_subset, subset_predictions))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1559488
           1       0.85      0.92      0.88       533
           2       0.95      0.96      0.95       537
           3       0.99      1.00      1.00       560

    accuracy                           1.00   1561118
   macro avg       0.95      0.97      0.96   1561118
weighted avg       1.00      1.00      1.00   1561118



In [None]:
# Per-class report on the held-out validation split only.
# NOTE(review): `_` is the third value unpacked from train_lgbm_default in the
# training cell; it is also rebound by the class-weighted training cell, so this
# cell must run before that one to report on the intended split.
X_val, y_val = _  # from train_lgbm_default return
val_predictions = model.predict(X_val, num_iteration=model.best_iteration_)
print(classification_report(y_val, val_predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    311898
           1       0.44      0.58      0.50       107
           2       0.77      0.79      0.78       107
           3       0.97      1.00      0.99       112

    accuracy                           1.00    312224
   macro avg       0.79      0.84      0.82    312224
weighted avg       1.00      1.00      1.00    312224



**Grid-search LightGBM (class-weighted)**

In [8]:
# Grid-search variant, trained with the class weights from src.config.
# NOTE(review): rebinds `model` and `metrics`, replacing the baseline results.
model, metrics, artifacts = train_lgbm_weighted(
    X_train_subset,
    y_train_subset,
    class_weight=CLASS_WEIGHT,
)
metrics

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006439 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1023
[LightGBM] [Info] Number of data points in the train set: 1248894, number of used features: 9
[LightGBM] [Info] Start training from score -0.003450
[LightGBM] [Info] Start training from score -6.193976
[LightGBM] [Info] Start training from score -7.283242
[LightGBM] [Info] Start training from score -7.242234


{'val_f1_macro': 0.5688077973381812,
 'val_f1_weighted': 0.9986185233830811,
 'cv_best_score_f1_macro': 0.6134423656085568,
 'best_params': {'min_child_samples': 50, 'num_leaves': 31}}

**Class-weighted default LightGBM**

In [None]:
# Show the class weights in use so the output records them next to the metrics.
print(CLASS_WEIGHT)

# LightGBM settings forwarded to the helper for this run.
class_weight_lgbm_params = {
    "learning_rate": 0.03,
    "n_estimators": 5000,
    "subsample": 0.8,
    "subsample_freq": 1,
}

# NOTE(review): rebinds `model`, `metrics` and `_`; the validation-report cell
# reads `_`, so re-running that cell after this one uses this call's third
# return value instead of the baseline's.
model, metrics, _ = train_lgbm_class_weight(
    X_train_subset,
    y_train_subset,
    class_weight=CLASS_WEIGHT,
    early_stopping_rounds=500,
    lgbm_params=class_weight_lgbm_params,
)
metrics

{0: 1.0, 1: 3.0, 2: 2.0, 3: 2.0}
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009252 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1023
[LightGBM] [Info] Number of data points in the train set: 1248894, number of used features: 9
[LightGBM] [Info] Start training from score -0.002429
[LightGBM] [Info] Start training from score -6.886102
[LightGBM] [Info] Start training from score -7.282221
[LightGBM] [Info] Start training from score -7.241213
Training until validation scores don't improve for 500 rounds
[50]	valid_0's multi_logloss: 0.00445266
[100]	valid_0's multi_logloss: 0.00342409
[150]	valid_0's multi_logloss: 0.00310647
[200]	valid_0's multi_logloss: 0.00298238
[250]	valid_0's multi_logloss: 0.00292587
[300]	valid_0's multi_logloss: 0.00290685
[350]	valid_0's multi_logloss: 0.00290156
[400]	valid_0's multi_logloss: 0.00289222


{'val_f1_macro': 0.7749424474444162,
 'val_f1_weighted': 0.9994336349587584,
 'best_iteration': 399}