In [1]:
import pandas as pd
import numpy as np
import argparse
import pickle

import xgboost as xgb
from xgboost.sklearn import XGBClassifier

from sklearn.metrics import roc_auc_score, precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

# file
import os
os.chdir("..")
import handset_model_current as handset_model
os.chdir("xgboost")

Using TensorFlow backend.


In [2]:
def bool_arg(string):
    value = string.lower()
    if value == 'true':
        return True
    elif value == 'false':
        return False
    else:
        raise argparse.ArgumentTypeError("Expected True or False, but got {}".format(string))

# minimal preprocessing
parser = argparse.ArgumentParser()

# model hyperparameters
# small number of epochs for experimentation
parser.add_argument('--epochs', default=10, type=int,
                    help="Nr of epochs. Default is 100", dest="epochs")
parser.add_argument('--batch_size', default=256, type=int,
                    help="Batch size. Default is 32", dest="batch_size")
parser.add_argument('--earlystop', default=3, type=int,
                    help="Number of epochs with no improvement after which training will be stopped.",
                    dest="earlystop")
parser.add_argument('--verbose', default=True, type=bool_arg, help="If True (default), verbose output",
                    dest="verbose")

# cross_val is not ready to be used
parser.add_argument('--cross_val', default=0, type=int,
                    help="Number of folds (if bigger than 0) to use for cross validation. Default is 0.",
                    dest="cross_val")

# no applying class weights
parser.add_argument('--apply_class_weights', default=False, type=bool_arg,
                    help="If True, apply different loss weights (based on frequency of samples) to different "
                         "classes.",
                    dest="apply_class_weights")

# no smooth factor
parser.add_argument('--smooth_factor', default=0, type=float,
                    help="Smooth factor to be used when calculating class weights, so that highly unfrequent "
                    "classes do not get huge weights.",
                    dest="smooth_factor")

# oversampling with neg to pos ratio=3
parser.add_argument('--oversample', default=True, type=bool_arg,
                    help="If True (default), apply oversampling to generate balanced batches.",
                    dest="oversample")
parser.add_argument('--ratio', default=3, type=int,
                    help="Ratio of negative to positive samples to use for balanced batch generation "
                         "(if oversample=True)",
                    dest="ratio")

# activation: prelu
parser.add_argument('--activation', default='prelu',
                    help="NN activation to be used. Default is prelu",
                    dest="activation")

# no x_vars
parser.add_argument('--x_vars', default=False, type=bool_arg, help="If True (default), include X variables",
                    dest="x_vars")

# standardize numerical data
parser.add_argument('--std', default=True, type=bool_arg, help="If True (default), standardize data.",
                    dest="std")

# no pca
parser.add_argument('--pca_whiten', default=False, type=bool_arg, help="If True (default), PCA-whiten data.",
                    dest="pca_whiten")
parser.add_argument('--pca_reduce', default=0, type=float,
                    help="{0, 1, 0<x<1} If 0, no dimensionality reduction is done. If 1, Thomas P. Minka's method "
                         "('Automatic Choice of Dimensionality for PCA'. NIPS 2000) is used to determine the "
                         "number of dimensions to keep. If 0 < pca_reduce < 1, enough number of dimensions will "
                         "be kept to keep 'pca_reduce' percentage of variance explained. Default is 0.9.",
                    dest="pca_reduce")

# one-hot encode cat data (embeddings are not used)
parser.add_argument('--cat_enc', default='one-hot',
                    help="Encoding to be used for categorical variables. Default is 'integer' "
                         "(embedding layers will then be used). Other alternatives: 'hashing_char', "
                         "'hashing_all', 'one-hot'.",
                    dest="cat_enc")

# no log transform
parser.add_argument('--log_xform', default=False, type=bool_arg, help="If True (default), log-transform data.",
                    dest="log_xform")

# encode categorical and binary data as 1/0
parser.add_argument('--binary_enc', default=True, type=bool_arg,
                    help="If False (default), the negative cases of binary variables will be represented as -1 "
                         "instead of 0.", dest="binary_enc")

# id for saving/ loading
parser.add_argument('--data_split_id', default=2, type=int,
                    help="Id for the train-test data split to be used. If a new id is given, a new data split "
                         "will be generated and saved to disk with the given id. If id is 0 (default), a new "
                         "split will be generated, but not saved to disk. If a previously used id is given, "
                         "a previously generated and saved data split with that id will be used.",
                    dest="data_split_id")
parser.add_argument("-f")
args = parser.parse_args()

In [3]:
os.chdir("..")
data_train, data_test, cat_levels = handset_model.load_and_preprocess_data(args)  # split_id=2 for f_classif features
os.chdir("xgboost")

generating dictionary with levels of catagorical variables...
Reusing data split with id=2
Loading previously pre-processed numerical data...
Loading previously pre-processed categorical data...


In [4]:
data_train['num'].shape, data_train['cat'].shape  # 2nd dim should be 7 and 235

((466632, 7), (466632, 235))

In [5]:
X_train = pd.concat([data_train['num'], data_train['cat']], axis=1)
y_train = data_train['labels']['TARGET_S_TO_S_APPLE']
X_test = pd.concat([data_test['num'], data_test['cat']], axis=1)
y_test = data_test['labels']['TARGET_S_TO_S_APPLE']

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((466632, 242), (466632,), (116659, 242), (116659,))

In [6]:
# References:
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
# https://github.com/dmlc/xgboost/blob/master/doc/parameter.md
# https://github.com/dmlc/xgboost/blob/master/doc/how_to/param_tuning.md

npos = len([i for i in y_train if i == 1]) 
nneg = len([i for i in y_train if i == 0])

xgb1 = XGBClassifier(learning_rate=0.1,
                     n_estimators=30,
                     max_depth=5,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective='binary:logistic',
                     seed=0,
                     scale_pos_weight=nneg/npos,
                     silent=False
                    )

xgb1.get_params()

{'base_score': 0.5,
 'colsample_bylevel': 1,
 'colsample_bytree': 0.8,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 5,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 30,
 'nthread': -1,
 'objective': 'binary:logistic',
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 199.61564918314704,
 'seed': 0,
 'silent': False,
 'subsample': 0.8}

In [7]:
alg = xgb1
save=True
saveName='xgb1.pickle'
load=False
loadName='xgb1.pickle'

if load:
    print("loading model/alg from " + loadName)
    alg = pickle.load(open(loadName, 'rb'))
    print(alg)
else:
    # Fit the algorithm on the data
    print("fitting the model...")
    alg.fit(X_train.values, y_train.values, eval_metric='auc')

    if save:
        # save the algorithm
        pickle.dump(alg, open(saveName, 'wb'))
        print("saved model/alg as " + saveName)
        
print("finished")

fitting the model...
saved model/alg as xgb1.pickle
finished


In [8]:
# make sure that the model has predicted some positive labels

temp = alg.predict(X_train.values)
np.unique(temp)

array([0, 1])

In [9]:
print("making predictions...")
train_preds = alg.predict(X_train.values)
test_preds = alg.predict(X_test.values)

roc_auc_train = roc_auc_score(y_train.values, train_preds)
roc_auc_test = roc_auc_score(y_test.values, test_preds)

a_train = accuracy_score(y_train.values, np.rint(train_preds))
a_test = accuracy_score(y_test.values, np.rint(test_preds))

p_train = precision_score(y_train.values, np.rint(train_preds))
p_test = precision_score(y_test.values, np.rint(test_preds))

r_train = recall_score(y_train.values, np.rint(train_preds))
r_test = recall_score(y_test.values, np.rint(test_preds))

m_train = confusion_matrix(y_train.values, np.rint(train_preds))
m_test = confusion_matrix(y_test.values, np.rint(test_preds))
true_pos_rate_train = m_train[1][1]/(m_train[1][1]+m_train[1][0])
true_pos_rate_test = m_test[1][1]/(m_test[1][1]+m_test[1][0])

print('train-auc: %f\teval-auc: %f' % (roc_auc_train, roc_auc_test))
print('train-accuracy: %f\teval-accuracy: %f' % (a_train, a_test))
print('train-precision: %f\teval-precision: %f' % (p_train, p_test))
print('train-recall: %f\teval-recall: %f' % (r_train, r_test))

print('train-confusion-matrix:\n', m_train)
print('test-confusion-matrix:\n', m_test)
print('train-true-pos-rate: %f\teval-true-pos-rate: %f' % (true_pos_rate_train, true_pos_rate_test))

making predictions...
train-auc: 0.790120	eval-auc: 0.748477
train-accuracy: 0.754290	eval-accuracy: 0.755064
train-precision: 0.016544	eval-precision: 0.014937
train-recall: 0.826311	eval-recall: 0.741824
train-confusion-matrix:
 [[350054 114252]
 [   404   1922]]
test-confusion-matrix:
 [[87654 28424]
 [  150   431]]
train-true-pos-rate: 0.826311	eval-true-pos-rate: 0.741824


In [None]:
# example of grid search for param tuning (takes a really long time)

param_test1 = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5]
}
gsearch1 = GridSearchCV(estimator = xgb1,
                        param_grid = param_test1,
                        scoring='precision',
                        n_jobs=-1,
                        iid=False, 
                        cv=3,
                        verbose=10
                       )

gsearch1.fit(X_train.values, y_train.values)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_