# XGBoost Multiclass Classification

https://github.com/dmlc/xgboost/tree/master/demo/multiclass_classification

## Data Preparation

https://archive.ics.uci.edu/ml/datasets/Dermatology

https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data

https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.names

The aim for this dataset is to determine the type of erythemato-squamous disease.

* Data Set Characteristics: Multivariate
* Attribute Characteristics: Categorical, Integer
* Associated Tasks: Classification
* Number of Instances: 366
* Number of Attributes: 33
* Missing Values? Yes
* Area: Life
* Date Donated: 1998-01-01
* Number of Web Hits: 119491

In [5]:
import os
import requests

# Working directories: <cwd>/xgboost/data
HOME_DIR = os.path.abspath('xgboost')
DATA_DIR = os.path.join(HOME_DIR, 'data')

# makedirs creates HOME_DIR and DATA_DIR in one call and does nothing when
# they already exist, so this cell is safe to re-run.
os.makedirs(DATA_DIR, exist_ok=True)

DATASET_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data'
DATASET_FILENAME = DATASET_URL.split('/')[-1]
DATASET_FILE = os.path.join(DATA_DIR, DATASET_FILENAME)

# The local copy acts as a cache: the download only happens when it is missing.
dataset_not_found = not os.path.exists(DATASET_FILE)

if dataset_not_found:
    print('Downloading {}...'.format(DATASET_FILENAME))
    # Stream into a temporary ".part" file and rename it only after the whole
    # body has been written. Otherwise an interrupted download would leave a
    # truncated DATASET_FILE that later runs mistake for a complete one.
    partial_file = DATASET_FILE + '.part'
    with requests.get(DATASET_URL, stream=True, timeout=60) as r:
        # Fail loudly on 4xx/5xx instead of saving an error page as the dataset.
        r.raise_for_status()
        with open(partial_file, 'wb') as f:
            for chunk in r.iter_content(chunk_size=32768):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
    os.replace(partial_file, DATASET_FILE)
    print('Done!')

In [7]:
def show(file, lines=10):
    """Print the first ``lines`` lines of a text file.

    Args:
        file: Path of the file to preview.
        lines: Maximum number of lines to print. If the file is shorter,
            every line is printed and the function returns normally.
            Values <= 0 print nothing.
    """
    from itertools import islice

    with open(file) as f:
        # islice stops at end-of-file, unlike repeated next(f), which raises
        # StopIteration when the file has fewer than `lines` lines.
        for line in islice(f, max(lines, 0)):
            print(line, end='')

# Preview the first 10 lines (the default) of the downloaded file.
show(DATASET_FILE)

2,2,0,3,0,0,0,0,1,0,0,0,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,3,0,0,0,1,0,55,2
3,3,3,2,1,0,0,0,1,1,1,0,0,1,0,1,2,0,2,2,2,2,2,1,0,0,0,0,0,0,0,1,0,8,1
2,1,2,3,1,3,0,3,0,0,0,1,0,0,0,1,2,0,2,0,0,0,0,0,2,0,2,3,2,0,0,2,3,26,3
2,2,2,0,0,0,0,0,3,2,0,0,0,3,0,0,2,0,3,2,2,2,2,0,0,3,0,0,0,0,0,3,0,40,1
2,3,2,2,2,2,0,2,0,0,0,1,0,0,0,1,2,0,0,0,0,0,0,0,2,2,3,2,3,0,0,2,3,45,3
2,3,2,0,0,0,0,0,0,0,0,0,2,1,0,2,2,0,2,0,0,0,1,0,0,0,0,2,0,0,0,1,0,41,2
2,1,0,2,0,0,0,0,0,0,0,0,0,0,3,1,3,0,0,0,2,0,0,0,0,0,0,0,0,0,0,2,0,18,5
2,2,3,3,3,3,0,2,0,0,0,2,0,0,0,2,3,0,0,0,0,0,0,0,0,2,2,3,2,0,0,3,3,57,3
2,2,1,0,2,0,0,0,0,0,0,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,0,2,0,0,0,2,0,22,4
2,2,1,0,1,0,0,0,0,0,0,0,0,0,0,3,2,0,2,0,0,0,0,0,0,0,0,2,0,0,0,2,0,30,4


In [14]:
import numpy as np

# Labels need to be in the range 0 .. num_class - 1, so the converter for
# column 34 (the class label) subtracts 1 from the raw value.
#
# Column 33 is replaced by a 0/1 flag that is 1 when the raw field is the
# missing-value marker '?'. Depending on the NumPy version and encoding,
# converters receive either str or bytes, so both forms are checked; comparing
# bytes against the str '?' alone is always False and would silently yield an
# all-zero column.
# NOTE(review): column 33 looks like the age attribute (see dermatology.names);
# only the missing flag is kept here, not the value itself -- confirm intent.
data = np.loadtxt(DATASET_FILE, delimiter=',',
        converters={33: lambda x: int(x in ('?', b'?')), 34: lambda x: int(x) - 1})
sz = data.shape
sz

(366, 35)

In [16]:
# Row index separating the two subsets: 70% of the row count, truncated to an int.
split = int(sz[0] * 0.7)
split

256

In [17]:
# Rows before `split` form the training set, the remaining rows the test set.
# Rows are taken in file order; nothing is shuffled.
train, test = data[:split], data[split:]

# The first 33 columns are the features and column 34 is the class label.
# Column 33 is not included in the feature matrix.
train_X, train_Y = train[:, :33], train[:, 34]
test_X, test_Y = test[:, :33], test[:, 34]

print(train_X.shape, train_Y.shape)
print(test_X.shape, test_Y.shape)

(256, 33) (256,)
(110, 33) (110,)


## Training

In [21]:
import xgboost as xgb

# Wrap the feature matrices and labels in XGBoost's DMatrix container.
xg_train = xgb.DMatrix(train_X, label=train_Y)
xg_test = xgb.DMatrix(test_X, label=test_Y)

# Booster parameters.
param = {
    # softmax multi-class classification: predict() returns the class label
    'objective': 'multi:softmax',
    # step-size shrinkage (learning rate)
    'eta': 0.1,
    # maximum depth of each tree
    'max_depth': 6,
    # NOTE(review): newer XGBoost releases use `verbosity` instead of
    # `silent` -- confirm against the installed version.
    'silent': 1,
    # number of threads used for training
    'nthread': 4,
    # number of distinct class labels (0 .. 5)
    'num_class': 6,
}

# Boosting rounds, and the datasets whose error is reported after each round.
num_round = 5
watchlist = [(xg_train, 'train'), (xg_test, 'test')]

bst_softmax = xgb.train(param, xg_train, num_boost_round=num_round, evals=watchlist)

[0]	train-merror:0.011719	test-merror:0.127273
[1]	train-merror:0.015625	test-merror:0.127273
[2]	train-merror:0.011719	test-merror:0.109091
[3]	train-merror:0.007812	test-merror:0.081818
[4]	train-merror:0.007812	test-merror:0.090909


In [22]:
# Get predictions for the test set. With the multi:softmax objective the
# booster returns one predicted class label per test row (a 1-D array).
pred_softmax = bst_softmax.predict(xg_test)
print(pred_softmax.shape)

(110,)


In [23]:
# Predicted class label for the first test row.
pred_softmax[0]

3.0

In [20]:
# True class label of the first test row, for comparison with the prediction.
test_Y[0]

3.0

In [10]:
# Fraction of test rows whose predicted label differs from the true label.
# The mean of the boolean mismatch mask equals (number of mismatches) / (rows).
error_rate_softmax = np.mean(pred_softmax != test_Y)
print('Test error using softmax = {}'.format(error_rate_softmax))

Test error using softmax = 0.09090909090909091


In [24]:
# Train again with the same settings, changing only the objective so that
# predict() returns per-class probabilities instead of a single label.
# Note that this modifies the shared `param` dict in place.
param.update(objective='multi:softprob')
bst_softprob = xgb.train(param, xg_train, num_boost_round=num_round, evals=watchlist)

[0]	train-merror:0.011719	test-merror:0.127273
[1]	train-merror:0.015625	test-merror:0.127273
[2]	train-merror:0.011719	test-merror:0.109091
[3]	train-merror:0.007812	test-merror:0.081818
[4]	train-merror:0.007812	test-merror:0.090909


In [25]:
# With multi:softprob the prediction is a 2-D array: one row per test sample
# and one column per class.
pred_softprob = bst_softprob.predict(xg_test)
print(pred_softprob.shape)

(110, 6)


In [26]:
# Per-class probabilities predicted for the first test row.
pred_softprob[0]

array([ 0.12211822,  0.12527415,  0.12204564,  0.38662398,  0.12199699,
        0.12194106], dtype=float32)

In [27]:
# Turn the probabilities into hard labels by picking, for every row, the
# index of the column (class) with the highest probability.
pred_label_softprob = pred_softprob.argmax(axis=1)
print(pred_label_softprob.shape)

(110,)


In [28]:
# Most probable class for the first test row.
pred_label_softprob[0]

3

In [13]:
# Fraction of test rows whose most probable class differs from the true label.
# The mean of the boolean mismatch mask equals (number of mismatches) / (rows).
error_rate_softprob = np.mean(pred_label_softprob != test_Y)
print('Test error using softprob = {}'.format(error_rate_softprob))

Test error using softprob = 0.09090909090909091
