In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

SEED = 314159265
VALID_SIZE = 0.2
TARGET = 'Diabetes_012'
FEATURES = ['HighBP', 'PhysHlth', 'DiffWalk', 'BMI', 'GenHlth', 'HighChol', 'HeartDiseaseorAttack', 'Age']

def score(params):
    print("Training with params: ")
    print(params)
    num_round = int(params['n_estimators'])
    del params['n_estimators']
    dtrain = xgb.DMatrix(train_features, label=y_train)
    dvalid = xgb.DMatrix(valid_features, label=y_valid)
    watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    gbm_model = xgb.train(params, dtrain, num_round,
                          evals=watchlist,
                          early_stopping_rounds = 10,
                          verbose_eval=False)
    predictions = gbm_model.predict(dvalid)
    score = roc_auc_score(y_valid, predictions, multi_class = 'ovr')

    print("\tScore {0}\n\n".format(score))
    loss = 1 - score
    return {'loss': loss, 'status': STATUS_OK}


def optimize(
             trials, 
             random_state=SEED):
    space = {
        'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        'max_depth':  hp.choice('max_depth', np.arange(1, 14, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        'eval_metric': 'auc',
        'objective': 'multi:softprob',
        'nthread': 4,
        'num_class': 3,
        'booster': 'gbtree',
        'tree_method': 'exact',
        'seed': random_state
    }
    best = fmin(score, space, algo=tpe.suggest, 
                trials=trials, 
                max_evals=50)
    return best


df = pd.read_csv(r"D:\CDC Diabetes\diabetes_012_health_indicators_BRFSS2015.csv")

train_df, test_df = train_test_split(df.copy(),
                                                    shuffle = True,
                                                    random_state = 440,
                                                    test_size = .2,
                                                    stratify = df[TARGET].values)


train, valid = train_test_split(train_df, test_size=VALID_SIZE,
                                random_state=SEED)
train_features = train[FEATURES]
valid_features = valid[FEATURES]
y_train = train[TARGET]
y_valid = valid[TARGET]


trials = Trials()

best_hyperparams = optimize(
                            trials
                            )
print("The best hyperparameters are: ", "\n")
print(best_hyperparams)

Training with params:                                 
{'booster': 'gbtree', 'colsample_bytree': 0.9500000000000001, 'eta': 0.07500000000000001, 'eval_metric': 'auc', 'gamma': 0.7000000000000001, 'max_depth': 7, 'min_child_weight': 3.0, 'n_estimators': 757.0, 'nthread': 4, 'num_class': 3, 'objective': 'multi:softprob', 'seed': 314159265, 'subsample': 0.9, 'tree_method': 'exact'}
	Score 0.764993313712468                              


Training with params:                                                               
{'booster': 'gbtree', 'colsample_bytree': 0.8500000000000001, 'eta': 0.35000000000000003, 'eval_metric': 'auc', 'gamma': 0.7000000000000001, 'max_depth': 10, 'min_child_weight': 6.0, 'n_estimators': 406.0, 'nthread': 4, 'num_class': 3, 'objective': 'multi:softprob', 'seed': 314159265, 'subsample': 0.55, 'tree_method': 'exact'}
	Score 0.7323910926413157                                                           


Training with params:                                       

	Score 0.7778965775912958                                                         


Training with params:                                                             
{'booster': 'gbtree', 'colsample_bytree': 0.8, 'eta': 0.4, 'eval_metric': 'auc', 'gamma': 0.65, 'max_depth': 1, 'min_child_weight': 1.0, 'n_estimators': 100.0, 'nthread': 4, 'num_class': 3, 'objective': 'multi:softprob', 'seed': 314159265, 'subsample': 0.6000000000000001, 'tree_method': 'exact'}
	Score 0.7771252633488306                                                         


Training with params:                                                             
{'booster': 'gbtree', 'colsample_bytree': 0.7000000000000001, 'eta': 0.45, 'eval_metric': 'auc', 'gamma': 0.5, 'max_depth': 5, 'min_child_weight': 3.0, 'n_estimators': 622.0, 'nthread': 4, 'num_class': 3, 'objective': 'multi:softprob', 'seed': 314159265, 'subsample': 0.9500000000000001, 'tree_method': 'exact'}
	Score 0.7630498466522425                              

In [2]:
multi = xgb.XGBClassifier(colsample_bytree = 0.6000000000000001,
                          eta = 0.07500000000000001,
                          eval_metric =  'auc',
                          gamma = 0.6000000000000001, 
                          max_depth = 1, 
                          min_child_weight = 4, 
                          n_estimators = 554,
                          num_class = 3,
                          objective = 'multi:softprob',
                          subsample = 0.65,
                          seed = SEED,
                          tree_method = 'exact')

multi.fit(train_features, y_train, 
          eval_set = [(valid_features, y_valid)],
          verbose = False)

pred = multi.predict(test_df[FEATURES])
pred_multi = multi.predict_proba(test_df[FEATURES])

print("Accuracy: ", accuracy_score(test_df[TARGET], pred))
print("Recall: ", recall_score(test_df[TARGET], pred, average = 'micro'))
print("ROC AUC: ", roc_auc_score(test_df[TARGET], pred_multi, multi_class='ovr'))

Accuracy:  0.8489632608010091
Recall:  0.8489632608010091
ROC AUC:  0.7760194294480312
