In [1]:
import pandas as pd
from constants import FileDef
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV, RandomizedSearchCV
from scipy.stats import zscore

In [2]:
def load_data(train_path: str, val_path: str, test_path: str) -> pd.DataFrame:
    train = pd.read_csv(train_path)
    val = pd.read_csv(val_path)
    test = pd.read_csv(test_path)

    return train, val, test

In [3]:
def preprocess_data(
    train: pd.DataFrame, val: pd.DataFrame, test: pd.DataFrame
) -> pd.DataFrame:
    drop_cols = ["domain", "class"]
    normalize_cols = ["domain_length", "strange_char_count", "numeric_sequence", "special_char_sequence"]

    X_train = train.drop(columns=drop_cols)
    X_train[normalize_cols] = X_train[normalize_cols].apply(zscore)
    y_train = train["class"].convert_dtypes()

    X_val = val.drop(columns=drop_cols)
    X_val[normalize_cols] = X_val[normalize_cols].apply(zscore)
    y_val = val["class"].convert_dtypes()

    X_test = test.drop(columns=drop_cols)
    X_test[normalize_cols] = X_test[normalize_cols].apply(zscore)
    y_test = test["class"].convert_dtypes()

    return X_train, y_train, X_val, y_val, X_test, y_test

In [4]:
def train_model(
    X_train: pd.DataFrame,
    y_train: pd.DataFrame,
    eval_set: list,
    params: dict,
) -> XGBClassifier:
    print("*** TRAINING ***")
    model = XGBClassifier(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=eval_set, 
        verbose=False,
    )

    return model

In [5]:
def tune_model(
    model: XGBClassifier,
    # X_train: pd.DataFrame,
    # y_train: pd.DataFrame,
    X_val: pd.DataFrame,
    y_val: pd.DataFrame,
) -> dict:
    print("*** TUNING ***")
    # define the original parameter grid, used with RandomizedSearchCV
    # param_grid = {
    #     "learning_rate": [0.01, 0.05, 0.1],
    #     "max_depth": [3, 5, 7],
    #     "min_child_weight": [1, 3, 5],
    #     "gamma": [0.0, 0.1, 0.2],
    #     "subsample": [0.6, 0.8, 1.0],
    #     "colsample_bytree": [0.6, 0.8, 1.0],
    #     "reg_alpha": [0, 0.001, 0.01, 0.1, 1],
    #     "reg_lambda": [0, 0.001, 0.01, 0.1, 1],
    #     "n_estimators": [100, 300, 500],
    # }

    # this is the second round of tuning
    param_grid = {
        "gamma": [0.08, 0.1, 0.12],
        "subsample": [0.9, 1.0],
        "reg_lambda": [0.8, 1.0, 1.2],
        "reg_alpha": [0.05, 0.1, 0.15],
        "min_child_weight": [1, 2],
        "max_depth": [4, 5, 6],
        "colsample_bytree": [0.9, 1.0],
    }

    # define the cross-validation strategy
    cv = RepeatedStratifiedKFold(random_state=42)

    # perform the grid search
    grid_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid,
        cv=cv,
        n_jobs=-1,
        scoring="accuracy",
        verbose=2,
    )

    # fit the grid search to the data
    grid_search.fit(
        X_val,
        y_val,
    )

    # get the best hyperparameters
    best_params = grid_search.best_params_

    return best_params

In [6]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)
    print("Confusion matrix:\n", cm)

# Load data

In [7]:
train, val, test = load_data(
    FileDef.TRAIN.value, FileDef.VALIDATE.value, FileDef.TEST.value
)
X_train, y_train, X_val, y_val, X_test, y_test = preprocess_data(train, val, test)

# Train initial model

In [8]:
# define our parameters for the model
train_params = {
    "objective": "binary:logistic",
    "n_estimators": 500,
    "booster": "gbtree",
    "grow_policy": "lossguide",
    "n_jobs": -1,
    "subsample": 1.0,
    "reg_lambda": 1.0,
    "reg_alpha": 0.15,
    "min_child_weight": 1,
    "max_depth": 7,
    "learning_rate": 0.06,
    "gamma": 0.12,
    "colsample_bytree": 0.9,
    # "early_stopping_rounds": 10
} # these are the tuned hyperparameters

eval_set = [(X_val, y_val), (X_test, y_test)]
model = train_model(X_train, y_train, eval_set, train_params)
evaluate_model(model, X_test, y_test)

*** TRAINING ***
Accuracy: 0.8317559832564945
Precision: 0.9162872391112514
Recall: 0.7556866930336702
F1-score: 0.8282737670492772
Confusion matrix:
 [[341251  29693]
 [105075 325008]]


In [9]:
model.get_booster().get_score(importance_type='gain')

{'domain_length': 309.6103820800781,
 'strange_char_count': 42.95644760131836,
 'numeric_sequence': 51.983306884765625,
 'consonant_ratio': 55.15205383300781,
 'vowel_ratio': 104.23319244384766,
 'special_char_sequence': 59.00568771362305}

# Tune and retrain

In [12]:
# tuned_params = tune_model(model, X_val, y_val)  # X_test, y_train,
# print(tuned_params)

*** TUNING ***
Fitting 50 folds for each of 10 candidates, totalling 500 fits
[CV] END colsample_bytree=1.0, gamma=0.08, max_depth=5, min_child_weight=1, reg_alpha=0.05, reg_lambda=1.0, subsample=1.0; total time=  57.4s
[CV] END colsample_bytree=1.0, gamma=0.08, max_depth=5, min_child_weight=1, reg_alpha=0.05, reg_lambda=1.0, subsample=1.0; total time=  59.2s
[CV] END colsample_bytree=1.0, gamma=0.08, max_depth=5, min_child_weight=1, reg_alpha=0.05, reg_lambda=1.0, subsample=1.0; total time= 1.0min
[CV] END colsample_bytree=1.0, gamma=0.08, max_depth=5, min_child_weight=1, reg_alpha=0.05, reg_lambda=1.0, subsample=1.0; total time= 1.0min
[CV] END colsample_bytree=1.0, gamma=0.08, max_depth=5, min_child_weight=1, reg_alpha=0.05, reg_lambda=1.0, subsample=1.0; total time= 1.0min
[CV] END colsample_bytree=1.0, gamma=0.08, max_depth=5, min_child_weight=1, reg_alpha=0.05, reg_lambda=1.0, subsample=1.0; total time= 1.0min
[CV] END colsample_bytree=1.0, gamma=0.08, max_depth=5, min_child_weig

KeyboardInterrupt: 

In [None]:
# # retrain using better parameters
# tuned_model = train_model(X_train, y_train, X_val, y_val, tuned_params)

# evaluate_model(model, X_test, y_test)

Accuracy: 0.9999990637037259
Precision: 1.0
Recall: 0.9999982561535979
F1-score: 0.9999991280760386
Confusion matrix:
 [[494593      0]
 [     1 573444]]
