In [0]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb


#irisデータセットを使用
iris = load_iris()
X = iris.data
y = iris.target

#トレーニング用、検証用に分割
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)


#Xgboostの比較対象としてRandomForestで予測
clf_rf = RandomForestClassifier()
clf_rf.fit(X_train, y_train)
y_pred = clf_rf.predict(X_test)

accu = accuracy_score(y_test, y_pred)
print('accuracy = {:>.4f}'.format(accu))

# Feature Importance
fti = clf_rf.feature_importances_   

print('Feature Importances:')
for i, feat in enumerate(iris['feature_names']):
    print('\t{0:20s} : {1:>.6f}'.format(feat, fti[i]))

accuracy = 0.9667
Feature Importances:
	sepal length (cm)    : 0.153661
	sepal width (cm)     : 0.033728
	petal length (cm)    : 0.431286
	petal width (cm)     : 0.381325




In [0]:
#Xgboostを使用して予測

#モデル定義
clf_xgb = xgb.XGBClassifier(objective='multi:softmax',
                        max_depth = 5,
                        learning_rate=0.1,
                        n_estimators=100)

#学習
clf_xgb.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='mlogloss',
        early_stopping_rounds=10,
        verbose=False,#Trueにするとn_estimatorsごとのvalidation_0-mloglossが表示される
           )


#予測
y_pred_proba = clf_xgb.predict_proba(X_test)


#3つのラベルに割り振られた確率で最大のものを選択
y_pred = np.argmax(y_pred_proba, axis=1)


#検証用データの精度を計算
accu = accuracy_score(y_test, y_pred)
print('accuracy = {:>.4f}'.format(accu))

# Feature Importance
fti = clf_xgb.feature_importances_   

print('Feature Importances:')
for i, feat in enumerate(iris['feature_names']):
    print('\t{0:20s} : {1:>.6f}'.format(feat, fti[i]))

accuracy = 1.0000
Feature Importances:
	sepal length (cm)    : 0.020034
	sepal width (cm)     : 0.030724
	petal length (cm)    : 0.595451
	petal width (cm)     : 0.353791


###Pandasデータからxgboostを利用する流れ

In [12]:
#ダミーデータ用意

import pandas as pd
import numpy as np

data = pd.DataFrame(np.arange(30).reshape((10,3)), columns=['a', 'b', 'c'])
label = pd.DataFrame(np.random.randint(2, size=10))

data

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11
4,12,13,14
5,15,16,17
6,18,19,20
7,21,22,23
8,24,25,26
9,27,28,29


In [13]:
#01の二値分類ラベル

label

Unnamed: 0,0
0,1
1,0
2,1
3,0
4,0
5,1
6,1
7,1
8,1
9,0


In [0]:
#shapeの変換

X = data.as_matrix()
y = label.as_matrix().ravel()

In [0]:
#検証用ダミーデータ
data_t = pd.DataFrame(np.arange(12).reshape((4,3)), columns=['a', 'b', 'c'])
label_t = pd.DataFrame(np.random.randint(2, size=4))

#shapeの変換
X_t = data_t.as_matrix()
y_t = label_t.as_matrix().ravel()

In [19]:
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt

# モデル定義
clf_xgb = xgb.XGBClassifier(objective='binary:logistic', n_estimators=1000)

#学習
evals_result = {}
clf_xgb.fit(X, y,
        eval_metric='logloss',
        # トレーニングデータと検証用データをセット
        eval_set=[
            (X, y),
            (X_t, y_t),
        ],
        #改善が見られなくなったら学習停止
        early_stopping_rounds=10,
        # 学習過程の記録はコールバック API で記録
        callbacks=[
            xgb.callback.record_evaluation(evals_result)
        ],
        )

[0]	validation_0-logloss:0.693147	validation_1-logloss:0.693147
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.693147	validation_1-logloss:0.693147
[2]	validation_0-logloss:0.693147	validation_1-logloss:0.693147
[3]	validation_0-logloss:0.693147	validation_1-logloss:0.693147
[4]	validation_0-logloss:0.693147	validation_1-logloss:0.693147
[5]	validation_0-logloss:0.693147	validation_1-logloss:0.693147
[6]	validation_0-logloss:0.693147	validation_1-logloss:0.693147
[7]	validation_0-logloss:0.693147	validation_1-logloss:0.693147
[8]	validation_0-logloss:0.693147	validation_1-logloss:0.693147
[9]	validation_0-logloss:0.693147	validation_1-logloss:0.693147
[10]	validation_0-logloss:0.693147	validation_1-logloss:0.693147
Stopping. Best iteration:
[0]	validation_0-logloss:0.693147	validation_1-logloss:0.693147



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [20]:
#予測データの精度検証

y_pred = clf.predict(X_t)
acc = accuracy_score(y_t, y_pred)
print('Accuracy:', acc)

Accuracy: 0.75


###新規ダミーデータで予測

In [21]:
predict_data = np.array([[30,31,32]])
clf_xgb.predict(predict_data)

array([0])