# XGBoost

In [1]:
import numpy as np
import xgboost as xgb



In [4]:
# Show the distinct label values found in each dataset, train first, then test.
for header, dataset in (("Train possible labels: ", dtrain),
                        ("\nTest possible labels: ", dtest)):
    print(header)
    print(np.unique(dataset.get_label()))

Train possible labels: 
[ 0.  1.]

Test possible labels: 
[ 0.  1.]


### Training classifier

In [6]:
# Fit a booster on the training data for `num_rounds` boosting rounds.
bst = xgb.train(params, dtrain, num_boost_round=num_rounds)

We can also observe performance on the test dataset during training by passing a `watchlist`.

In [7]:
# Datasets to evaluate after every round, each paired with the name shown
# in the log (native interface only).
watchlist = [
    (dtest, 'test'),
    (dtrain, 'train'),
]
bst = xgb.train(params, dtrain, num_boost_round=num_rounds, evals=watchlist)

[0]	test-error:0.042831	train-error:0.046522
[1]	test-error:0.021726	train-error:0.022263
[2]	test-error:0.006207	train-error:0.007063
[3]	test-error:0.018001	train-error:0.0152
[4]	test-error:0.006207	train-error:0.007063


### Make predictions<a name='predict' />

In [8]:
# Score the test set with the trained booster; the bare name on the last
# line makes the notebook display the resulting array.
preds_prob = bst.predict(dtest)
preds_prob

array([ 0.08073306,  0.92217326,  0.08073306, ...,  0.98059034,
        0.01182149,  0.98059034], dtype=float32)

Calculate a simple accuracy metric to verify the results. Of course, the validation approach should be chosen according to the dataset, but in this case accuracy is sufficient.

In [9]:
# Turn the predicted scores into hard class predictions and compare them
# with the true labels of the test set.
labels = dtest.get_label()
preds = preds_prob > 0.5  # threshold: scores above 0.5 count as the positive class

# Vectorised element-wise comparison instead of a Python-level loop;
# int() keeps `correct` a plain Python integer, as it was before.
correct = int(np.sum(labels == preds))

print('Predicted correctly: {0}/{1}'.format(correct, len(preds)))
print('Error: {0:.4f}'.format(1-correct/len(preds)))

Predicted correctly: 1601/1611
Error: 0.0062


In [10]:
import numpy as np

from sklearn.datasets import load_svmlight_files
from sklearn.metrics import accuracy_score

from xgboost.sklearn import XGBClassifier

Load the data

In [12]:
# Read the train and test svmlight files in a single call; the result is
# unpacked into features and labels for train, then features and labels for test.
X_train, y_train, X_test, y_test = load_svmlight_files(('data/agaricus.txt.train', 'data/agaricus.txt.test'))

Examine what was loaded

In [15]:
# Keyword arguments that are unpacked into the XGBClassifier constructor.
params = dict(
    objective='binary:logistic',
    max_depth=2,
    learning_rate=1.0,
    silent=1.0,
    n_estimators=5,
)

### Training classifier<a name='train' />

In [16]:
# Build the classifier from `params`, then fit it on the training data.
# `bst` is rebound to whatever fit() returns, exactly as in the chained form.
bst = XGBClassifier(**params)
bst = bst.fit(X_train, y_train)

### Make predictions<a name='predict' />

In [17]:
# Predict on the test features; the bare name on the last line makes the
# notebook display the predictions.
preds = bst.predict(X_test)
preds

array([ 0.,  1.,  0., ...,  1.,  0.,  1.])

Calculate the obtained error

In [18]:
# Number of test samples whose prediction equals the true label.
# Vectorised comparison instead of a Python-level loop; int() keeps
# `correct` a plain Python integer, as it was before.
correct = int(np.sum(y_test == preds))

# Fraction of correct predictions, computed by scikit-learn.
acc = accuracy_score(y_test, preds)

print('Predicted correctly: {0}/{1}'.format(correct, len(preds)))
print('Error: {0:.4f}'.format(1-acc))

Predicted correctly: 1601/1611
Error: 0.0062


# Evaluate results

Specify the training parameters - we are going to use 5 decision tree stumps with an average learning rate (`eta` = 0.5).

In [21]:
# specify general training parameters
# general training parameters passed to xgb.train
params = dict(
    objective='binary:logistic',
    max_depth=1,
    silent=1,
    eta=0.5,
)

# number of boosting rounds
num_rounds = 5

Before training the model, let's also specify a `watchlist` array to observe its performance on both datasets.

In [22]:
# (dataset, name) pairs; passed to xgb.train below so that each set is
# evaluated every round and reported under its name.
watchlist  = [(dtest,'test'), (dtrain,'train')]

In [None]:
# Predefined evaluation metrics

In [23]:
# Train while reporting the metric on every set in the watchlist.
bst = xgb.train(params, dtrain, num_boost_round=num_rounds, evals=watchlist)

[0]	test-error:0.11049	train-error:0.113926
[1]	test-error:0.11049	train-error:0.113926
[2]	test-error:0.03352	train-error:0.030401
[3]	test-error:0.027312	train-error:0.021495
[4]	test-error:0.031037	train-error:0.025487


In [24]:
# Switch the reported metric to log loss (this modifies `params` in place),
# then retrain with the same watchlist.
params.update(eval_metric='logloss')
bst = xgb.train(params, dtrain, num_boost_round=num_rounds, evals=watchlist)

[0]	test-logloss:0.457887	train-logloss:0.460108
[1]	test-logloss:0.383911	train-logloss:0.378728
[2]	test-logloss:0.312678	train-logloss:0.308061
[3]	test-logloss:0.26912	train-logloss:0.26139
[4]	test-logloss:0.239746	train-logloss:0.232174


You can also use multiple evaluation metrics at the same time.

In [25]:
# Ask for two metrics at once (this modifies `params` in place); both are
# reported for every set in the watchlist.
params.update(eval_metric=['logloss', 'auc'])
bst = xgb.train(params, dtrain, num_boost_round=num_rounds, evals=watchlist)

[0]	test-logloss:0.457887	test-auc:0.892138	train-logloss:0.460108	train-auc:0.888997
[1]	test-logloss:0.383911	test-auc:0.938901	train-logloss:0.378728	train-auc:0.942881
[2]	test-logloss:0.312678	test-auc:0.976157	train-logloss:0.308061	train-auc:0.981415
[3]	test-logloss:0.26912	test-auc:0.979685	train-logloss:0.26139	train-auc:0.985158
[4]	test-logloss:0.239746	test-auc:0.9785	train-logloss:0.232174	train-auc:0.983744


In [26]:
# custom evaluation metric
def misclassified(pred_probs, dtrain):
    """Count the predictions that disagree with the true labels.

    Parameters
    ----------
    pred_probs : array of predicted scores; entries above 0.5 are treated
        as positive predictions.
    dtrain : object exposing ``get_label()``, which supplies the true labels.

    Returns
    -------
    tuple
        The metric name ``'misclassified'`` and the number of mismatches.
    """
    y_true = dtrain.get_label()
    y_hat = pred_probs > 0.5
    n_wrong = np.sum(y_hat != y_true)
    return 'misclassified', n_wrong

In [27]:
# Train with the custom metric; maximize=False marks it as a
# lower-is-better score.
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=num_rounds,
    evals=watchlist,
    feval=misclassified,
    maximize=False,
)

[0]	test-misclassified:178	train-misclassified:742
[1]	test-misclassified:178	train-misclassified:742
[2]	test-misclassified:54	train-misclassified:198
[3]	test-misclassified:44	train-misclassified:140
[4]	test-misclassified:50	train-misclassified:166
