In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

SEED = 314159265
VALID_SIZE = 0.2
TARGET = 'Diabetes_binary'
FEATURES = ['HighBP', 'PhysHlth', 'DiffWalk', 'BMI', 'GenHlth', 'HighChol', 'HeartDiseaseorAttack', 'Age']

def score(params):
    print("Training with params: ")
    print(params)
    num_round = int(params['n_estimators'])
    del params['n_estimators']
    dtrain = xgb.DMatrix(train_features, label=y_train)
    dvalid = xgb.DMatrix(valid_features, label=y_valid)
    watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    gbm_model = xgb.train(params, dtrain, num_round,
                          evals=watchlist,
                          early_stopping_rounds = 10,
                          verbose_eval=False)
    predictions = gbm_model.predict(dvalid)
    score = roc_auc_score(y_valid, predictions)

    print("\tScore {0}\n\n".format(score))
    loss = 1 - score
    return {'loss': loss, 'status': STATUS_OK}


def optimize(
             trials, 
             random_state=SEED):
    space = {
        'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        'max_depth':  hp.choice('max_depth', np.arange(1, 14, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        'eval_metric': 'auc',
        'objective': 'binary:logistic',
        'nthread': 4,
        'booster': 'gbtree',
        'tree_method': 'exact',
        'seed': random_state
    }
    best = fmin(score, space, algo=tpe.suggest, 
                trials=trials, 
                max_evals=50)
    return best


df = pd.read_csv(r"D:\CDC Diabetes\diabetes_binary_5050split_health_indicators_BRFSS2015.csv")

train_df, test_df = train_test_split(df.copy(),
                                                    shuffle = True,
                                                    random_state = 440,
                                                    test_size = .2,
                                                    stratify = df[TARGET].values)


train, valid = train_test_split(train_df, test_size=VALID_SIZE,
                                random_state=SEED)
train_features = train[FEATURES]
valid_features = valid[FEATURES]
y_train = train[TARGET]
y_valid = valid[TARGET]


trials = Trials()

best_hyperparams = optimize(
                            trials
                            )
print("The best hyperparameters are: ", "\n")
print(best_hyperparams)

Training with params:                                 
{'booster': 'gbtree', 'colsample_bytree': 0.8, 'eta': 0.275, 'eval_metric': 'auc', 'gamma': 0.65, 'max_depth': 5, 'min_child_weight': 4.0, 'n_estimators': 210.0, 'nthread': 4, 'objective': 'binary:logistic', 'seed': 314159265, 'subsample': 0.9500000000000001, 'tree_method': 'exact'}
	Score 0.8161256824676122                             


Training with params:                                                           
{'booster': 'gbtree', 'colsample_bytree': 0.9500000000000001, 'eta': 0.375, 'eval_metric': 'auc', 'gamma': 0.7000000000000001, 'max_depth': 7, 'min_child_weight': 4.0, 'n_estimators': 146.0, 'nthread': 4, 'objective': 'binary:logistic', 'seed': 314159265, 'subsample': 0.65, 'tree_method': 'exact'}
	Score 0.8018597222894418                                                       


Training with params:                                                           
{'booster': 'gbtree', 'colsample_bytree': 0.7000000000000001

Training with params:                                                             
{'booster': 'gbtree', 'colsample_bytree': 0.8500000000000001, 'eta': 0.5, 'eval_metric': 'auc', 'gamma': 0.9500000000000001, 'max_depth': 8, 'min_child_weight': 4.0, 'n_estimators': 959.0, 'nthread': 4, 'objective': 'binary:logistic', 'seed': 314159265, 'subsample': 0.6000000000000001, 'tree_method': 'exact'}
	Score 0.7741458970279136                                                         


Training with params:                                                             
{'booster': 'gbtree', 'colsample_bytree': 0.8500000000000001, 'eta': 0.05, 'eval_metric': 'auc', 'gamma': 0.9500000000000001, 'max_depth': 12, 'min_child_weight': 2.0, 'n_estimators': 887.0, 'nthread': 4, 'objective': 'binary:logistic', 'seed': 314159265, 'subsample': 0.65, 'tree_method': 'exact'}
	Score 0.7933709207173887                                                         


Training with params:                                 

	Score 0.7859485207099851                                                         


Training with params:                                                             
{'booster': 'gbtree', 'colsample_bytree': 0.8, 'eta': 0.125, 'eval_metric': 'auc', 'gamma': 0.6000000000000001, 'max_depth': 2, 'min_child_weight': 5.0, 'n_estimators': 806.0, 'nthread': 4, 'objective': 'binary:logistic', 'seed': 314159265, 'subsample': 0.7000000000000001, 'tree_method': 'exact'}
	Score 0.8222801781153395                                                         


Training with params:                                                             
{'booster': 'gbtree', 'colsample_bytree': 1.0, 'eta': 0.225, 'eval_metric': 'auc', 'gamma': 0.7000000000000001, 'max_depth': 11, 'min_child_weight': 1.0, 'n_estimators': 737.0, 'nthread': 4, 'objective': 'binary:logistic', 'seed': 314159265, 'subsample': 1.0, 'tree_method': 'exact'}
	Score 0.8045496673593735                                                         

In [4]:
fifty = xgb.XGBClassifier(colsample_bytree = 0.65,
                          eta = 0.025,
                          eval_metric = 'auc',
                          gamma = 0.7000000000000001, 
                          max_depth = 2, 
                          min_child_weight = 1, 
                          n_estimators = 438,
                          objective = 'binary:logistic',
                          seed = SEED,
                          subsample = 0.8,
                          tree_method = 'exact')

fifty.fit(train_features, y_train,
          eval_set = [(valid_features, y_valid)],
          verbose = False)

pred = fifty.predict(test_df[FEATURES])
pred_fifty = fifty.predict_proba(test_df[FEATURES])[:,1]

print("Accuracy: ", accuracy_score(test_df[TARGET], pred))
print("Recall: ", recall_score(test_df[TARGET], pred))
print("ROC AUC: ", roc_auc_score(test_df[TARGET], pred_fifty))

Accuracy:  0.747719074899215
Recall:  0.7883717640401754
ROC AUC:  0.8247246929288446
