In [1]:
import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# Load the preprocessed data
X_train = np.load('X_train.npy')
y_train = np.load('y_train.npy')
X_val = np.load('X_val.npy')
y_val = np.load('y_val.npy')
X_test = np.load('X_test.npy')
y_test = np.load('y_test.npy')

In [3]:
# Flatten the input data for XGBoost
X_train_flattened = X_train.reshape(X_train.shape[0], -1)
X_val_flattened = X_val.reshape(X_val.shape[0], -1)
X_test_flattened = X_test.reshape(X_test.shape[0], -1)

# Combine train and validation data for grid search
X_train_combined = np.vstack((X_train_flattened, X_val_flattened))
y_train_combined = np.hstack((y_train, y_val))

# Convert to DMatrix format
train_data = xgb.DMatrix(X_train_combined, label=y_train_combined)
test_data = xgb.DMatrix(X_test_flattened, label=y_test)

In [4]:
# Parameter grid for hyperparameter tuning
param_grid = {
    "learning_rate": [0.1],
    "max_depth": [4, 5], # Depth of trees
    "alpha": [1, 10], # L1 regularization term
    "lambda": [1, 2], # L2 regularization term
    "num_boost_round": [2, 3], # Max boosting rounds
}

In [5]:
# Perform manual grid search
best_model = None
best_params = None
best_val_loss = float('inf')
results = []

for learning_rate in param_grid['learning_rate']:
    for max_depth in param_grid['max_depth']:
        for alpha in param_grid['alpha']:
            for lambda_ in param_grid['lambda']:
                for num_boost_round in param_grid['num_boost_round']:
                    params = {
                        "objective": "binary:logistic", # Binary classification
                        "eval_metric": "logloss", # Using logloss for binary classification
                        "learning_rate": learning_rate,
                        "max_depth": max_depth,
                        "alpha": alpha,
                        "lambda": lambda_,
                    }
                    evals = [(train_data, 'train')]
                    model = xgb.train(
                        params,
                        train_data,
                        num_boost_round=num_boost_round,
                    )

                    # Validate the model on the test set
                    y_pred_prob = model.predict(test_data)
                    y_pred_binary = (y_pred_prob >= 0.5).astype(int)

                    # Calculate logloss as the validation loss
                    val_loss = np.mean(-y_test * np.log(y_pred_prob) - (1 - y_test) * np.log(1 - y_pred_prob))

                    # Track the best model
                    if val_loss < best_val_loss:
                        best_val_loss = val_loss
                        best_model = model
                        best_params = {
                            "learning_rate": learning_rate,
                            "max_depth": max_depth,
                            "alpha": alpha,
                            "lambda": lambda_,
                            "num_boost_round": num_boost_round,
                        }

                    # Save results
                    results.append({
                        "params": params,
                        "num_boost_round": num_boost_round,
                        "val_loss": val_loss,
                    })

In [6]:
# Display the best parameters and validation loss
print(f"Best Parameters and Results:\n{best_params}, val_loss: {best_val_loss}")

Best Parameters and Results:
{'learning_rate': 0.1, 'max_depth': 5, 'alpha': 1, 'lambda': 1, 'num_boost_round': 3}, val_loss: 0.469704351402383


In [7]:
# Save the best model
best_model.save_model("Saved Models/best_xgboost.json")

In [8]:
# Generate classification report using the best model
y_pred_prob = best_model.predict(test_data)
y_pred_binary = (y_pred_prob >= 0.5).astype(int)

print("Classification Report:")
print(classification_report(y_test, y_pred_binary))

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       539
           1       0.98      0.88      0.93       298

    accuracy                           0.95       837
   macro avg       0.96      0.94      0.95       837
weighted avg       0.95      0.95      0.95       837

