In [1]:
import xgboost as xgb
from sklearn.metrics import classification_report
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the preprocessed feature data (already encoded)
X_train = np.load('X_train.npy')  # Your preprocessed training features
y_train = np.load('y_train.npy')  # Your labels
X_val = np.load('X_val.npy')  # Your preprocessed validation features
y_val = np.load('y_val.npy')  # Your validation labels
X_test = np.load('X_test.npy')  # Your preprocessed test features
y_test = np.load('y_test.npy')  # Your test labels

In [3]:
# Reshape the input data to be flat for XGBoost, i.e., (samples, time_steps * features)
X_train_flattened = X_train.reshape(X_train.shape[0], -1)  # Reshape to (2622, 6000)
X_val_flattened = X_val.reshape(X_val.shape[0], -1)  # Reshape to (1041, 6000)
X_test_flattened = X_test.reshape(X_test.shape[0], -1)  # Reshape to (837, 6000)

print(f"Flattened Training Data: {X_train_flattened.shape}, {y_train.shape}")
print(f"Flattened Validation Data: {X_val_flattened.shape}, {y_val.shape}")
print(f"Flattened Test Data: {X_test_flattened.shape}, {y_test.shape}")

Flattened Training Data: (2622, 6000), (2622,)
Flattened Validation Data: (1041, 6000), (1041,)
Flattened Test Data: (837, 6000), (837,)


In [4]:
# XGBoost requires the data to be in DMatrix format, so convert the reshaped data
train_data = xgb.DMatrix(X_train_flattened, label=y_train)
val_data = xgb.DMatrix(X_val_flattened, label=y_val)
test_data = xgb.DMatrix(X_test_flattened, label=y_test)
# Set up XGBoost parameters for binary classification
params = {
    "objective": "binary:logistic",  # Binary classification
    "eval_metric": "logloss",  # Using logloss for binary classification
    "learning_rate": 0.1,  # Learning rate
    "max_depth": 6,  # Depth of trees
    "alpha": 10,  # L1 regularization term
    "lambda": 1,  # L2 regularization term
}

In [5]:
# Train the model with validation data (early stopping can be set as well)
evals = [(train_data, 'train'), (val_data, 'eval')]
model = xgb.train(
    params,
    train_data,
    num_boost_round=5,  # Max boosting rounds
    evals=evals,
    early_stopping_rounds=50  # Stop training early if the validation error doesn't improve
)

[0]	train-logloss:0.59979	eval-logloss:0.61240
[1]	train-logloss:0.53981	eval-logloss:0.55418
[2]	train-logloss:0.48911	eval-logloss:0.50487
[3]	train-logloss:0.44630	eval-logloss:0.46336
[4]	train-logloss:0.40936	eval-logloss:0.42706


In [6]:
# Predict the probabilities from the XGBoost model
y_pred_prob = model.predict(xgb.DMatrix(X_test_flattened))

# Convert the probabilities to binary labels (threshold = 0.5)
y_pred_binary = (y_pred_prob >= 0.5).astype(int)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_binary))

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       539
           1       0.98      0.91      0.94       298

    accuracy                           0.96       837
   macro avg       0.96      0.95      0.95       837
weighted avg       0.96      0.96      0.96       837



In [7]:
# Save the trained XGBoost model for later use
model.save_model("xgboost_model.json")