In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

SEED = 314159265
VALID_SIZE = 0.2
TARGET = 'Diabetes_binary'
FEATURES = ['HighBP', 'PhysHlth', 'DiffWalk', 'BMI', 'GenHlth', 'HighChol', 'HeartDiseaseorAttack', 'Age']

def score(params):
    print("Training with params: ")
    print(params)
    num_round = int(params['n_estimators'])
    del params['n_estimators']
    dtrain = xgb.DMatrix(train_features, label=y_train)
    dvalid = xgb.DMatrix(valid_features, label=y_valid)
    watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    gbm_model = xgb.train(params, dtrain, num_round,
                          evals=watchlist,
                          early_stopping_rounds = 10,
                          verbose_eval=False)
    predictions = gbm_model.predict(dvalid)
    score = roc_auc_score(y_valid, predictions)

    print("\tScore {0}\n\n".format(score))
    loss = 1 - score
    return {'loss': loss, 'status': STATUS_OK}


def optimize(
             trials, 
             random_state=SEED):
    space = {
        'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        'max_depth':  hp.choice('max_depth', np.arange(1, 14, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        'eval_metric': 'auc',
        'objective': 'binary:logistic',
        'nthread': 4,
        'booster': 'gbtree',
        'tree_method': 'exact',
        'seed': random_state
    }
    best = fmin(score, space, algo=tpe.suggest, 
                trials=trials, 
                max_evals=50)
    return best


df = pd.read_csv(r"D:\CDC Diabetes\diabetes_binary_health_indicators_BRFSS2015.csv")

train_df, test_df = train_test_split(df.copy(),
                                                    shuffle = True,
                                                    random_state = 440,
                                                    test_size = .2,
                                                    stratify = df[TARGET].values)


train, valid = train_test_split(train_df, test_size=VALID_SIZE,
                                random_state=SEED)
train_features = train[FEATURES]
valid_features = valid[FEATURES]
y_train = train[TARGET]
y_valid = valid[TARGET]


trials = Trials()

best_hyperparams = optimize(
                            trials
                            )
print("The best hyperparameters are: ", "\n")
print(best_hyperparams)

Training with params:                                 
{'booster': 'gbtree', 'colsample_bytree': 0.8500000000000001, 'eta': 0.4, 'eval_metric': 'auc', 'gamma': 0.8, 'max_depth': 8, 'min_child_weight': 4.0, 'n_estimators': 251.0, 'nthread': 4, 'objective': 'binary:logistic', 'seed': 314159265, 'subsample': 0.9, 'tree_method': 'exact'}
	Score 0.8072607338690805                             


Training with params:                                                            
{'booster': 'gbtree', 'colsample_bytree': 0.55, 'eta': 0.325, 'eval_metric': 'auc', 'gamma': 1.0, 'max_depth': 2, 'min_child_weight': 1.0, 'n_estimators': 112.0, 'nthread': 4, 'objective': 'binary:logistic', 'seed': 314159265, 'subsample': 0.65, 'tree_method': 'exact'}
	Score 0.8288781422295777                                                        


Training with params:                                                            
{'booster': 'gbtree', 'colsample_bytree': 0.6000000000000001, 'eta': 0.125, 'eval_metric'

Training with params:                                                             
{'booster': 'gbtree', 'colsample_bytree': 0.65, 'eta': 0.45, 'eval_metric': 'auc', 'gamma': 0.5, 'max_depth': 9, 'min_child_weight': 3.0, 'n_estimators': 190.0, 'nthread': 4, 'objective': 'binary:logistic', 'seed': 314159265, 'subsample': 0.5, 'tree_method': 'exact'}
	Score 0.7974312405535757                                                         


Training with params:                                                             
{'booster': 'gbtree', 'colsample_bytree': 0.9, 'eta': 0.225, 'eval_metric': 'auc', 'gamma': 0.65, 'max_depth': 10, 'min_child_weight': 5.0, 'n_estimators': 795.0, 'nthread': 4, 'objective': 'binary:logistic', 'seed': 314159265, 'subsample': 0.8500000000000001, 'tree_method': 'exact'}
	Score 0.7843665472829655                                                         


Training with params:                                                             
{'booster': 'gbtree', 'colsa

	Score 0.8057170947223196                                                         


Training with params:                                                             
{'booster': 'gbtree', 'colsample_bytree': 0.9, 'eta': 0.17500000000000002, 'eval_metric': 'auc', 'gamma': 0.8, 'max_depth': 1, 'min_child_weight': 1.0, 'n_estimators': 253.0, 'nthread': 4, 'objective': 'binary:logistic', 'seed': 314159265, 'subsample': 0.75, 'tree_method': 'exact'}
	Score 0.8260839219776612                                                         


Training with params:                                                             
{'booster': 'gbtree', 'colsample_bytree': 0.7000000000000001, 'eta': 0.05, 'eval_metric': 'auc', 'gamma': 1.0, 'max_depth': 7, 'min_child_weight': 6.0, 'n_estimators': 607.0, 'nthread': 4, 'objective': 'binary:logistic', 'seed': 314159265, 'subsample': 0.8500000000000001, 'tree_method': 'exact'}
	Score 0.825446912610811                                                          



In [3]:
imbal = xgb.XGBClassifier(colsample_bytree = 0.8500000000000001,
                          eta = 0.125,
                          eval_metric = 'auc',
                          gamma = 0.75, 
                          max_depth = 1, 
                          min_child_weight = 3, 
                          n_estimators = 642,
                          objective = 'binary:logistic',
                          subsample = 0.7000000000000001,
                          seed = SEED,
                          tree_method = 'exact')

imbal.fit(train_features, y_train, 
          eval_set = [(valid_features, y_valid)],
          verbose = False)

pred = imbal.predict(test_df[FEATURES])
pred_imbal = imbal.predict_proba(test_df[FEATURES])[:,1]

print("Accuracy: ", accuracy_score(test_df[TARGET], pred))
print("Recall: ", recall_score(test_df[TARGET], pred, average = 'micro'))
print("ROC AUC: ", roc_auc_score(test_df[TARGET], pred_imbal))

Accuracy:  0.8656575212866604
Recall:  0.8656575212866604
ROC AUC:  0.8209993540828906
