In [1]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, classification_report
import numpy as np
import pandas as pd
from xgboost import XGBClassifier

In [20]:
features = pd.read_csv('../data/features.csv', index_col=0)
labels = pd.read_csv('../data/labels.csv', index_col=0)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

In [21]:
def outlier_handler(df):
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    IQR = q3-q1
    lwr_bound = q1-(1.5*IQR)
    upr_bound = q3+(1.5*IQR)
    df = np.where(df > upr_bound, df.median(), np.where(df < lwr_bound, df.median(), df))
    return df

numeric_transformer = Pipeline([
    ('Outlier_handler', FunctionTransformer(outlier_handler)),
    ('Imputer', SimpleImputer(strategy='median'))
])

categorical_transformer = Pipeline([
    ('Imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('Binary_encoder', OneHotEncoder(sparse=False, drop='if_binary', handle_unknown='ignore'))
    # ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

numeric_features = X_train.select_dtypes(['int64', 'float64']).columns
cat_cols = X_train.select_dtypes('category').columns
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, cat_cols)
])
model = XGBClassifier(tree_method='hist',
                            objective='binary:logistic',
                            eval_metric='auc',
                            seed=0)
baseline_clf = Pipeline([
    ("preprocessor", preprocessor),
    ("scaler", StandardScaler()), 
    ("clf", model)
])

In [None]:
param_grid = {
    'clf__learning_rate': np.arange(0.05, 1, 0.05),
    'clf__max_depth': np.arange(3,100,1),
    'clf__n_estimators': np.arange(50,2500,50),
    'clf__eta'             : [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
    'clf__scale_pos_weight': [1, 1.5, 2, 2.5, 3, 3.5]
}
rand_auc = RandomizedSearchCV(estimator=baseline_clf, param_distributions=param_grid, scoring='recall', n_iter=100, verbose=False, 
                              n_jobs=-1, random_state=0)
rand_auc.fit(X_train, y_train)
rand_auc.best_score_, rand_auc.best_params_

In [6]:
learning_rate = rand_auc.best_params_['clf__learning_rate']
max_depth = rand_auc.best_params_['clf__max_depth']
n_estimators = rand_auc.best_params_['clf__n_estimators']
eta = rand_auc.best_params_['clf__learning_rate']
scale_pos_weight = rand_auc.best_params_['clf__scale_pos_weight']

model = XGBClassifier(learning_rate=learning_rate, max_depth=max_depth, n_estimators=n_estimators, eta=eta, scale_pos_weight=scale_pos_weight)

baseline_clf = Pipeline([
    ("preprocessor", preprocessor),
    ("scaler", StandardScaler()), 
    ("clf", model)
]).fit(X_train, y_train)

In [None]:
# baseline_clf.fit(X_train, y_train)

y_pred = baseline_clf.predict_proba(X_test)
print(f'ROC 1: {roc_auc_score(y_test, y_pred[:,1])}')
print(classification_report(y_test, baseline_clf.predict(X_test)))

ROC 1: 0.6284227418530853

                        precision    recall  f1-score   support
                    0       0.84      0.33      0.48      7336
                    1       0.31      0.83      0.45      2664

        accuracy                                0.47     10000
        macro avg           0.58      0.58      0.46     10000
        weighted avg        0.70      0.47      0.47     10000