In [396]:
from heamy.dataset import Dataset
from heamy.estimator import Classifier
from heamy.pipeline import ModelsPipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import log_loss

In [397]:
# Load the dataset
from sklearn.datasets import fetch_covtype

data = fetch_covtype()

# Work with the first 10,000 samples only.
X, y = data['data'][:10000], data['target'][:10000]

# Encode the class labels as consecutive codes starting at 0 (OrdinalEncoder
# returns floats). The encoder is deliberately NOT called `ord`: that name would
# shadow the builtin ord() for every later cell of the notebook.
target_encoder = OrdinalEncoder()
# OrdinalEncoder works on 2-D input, so reshape to a column and flatten the result.
y_enc = target_encoder.fit_transform(y.reshape(-1, 1)).ravel()

# Hold out 10% of the samples; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.1, random_state=1)

print(X_test.shape)
print(y_train.shape)

(1000, 54)
(9000,)


In [398]:
# Create the heamy dataset.
#
# use_cache : bool, default True
#     If use_cache=True then preprocessing step will be cached until function code is changed.
#
# The hold-out labels are not handed to heamy (y_test=None); they stay in the
# notebook-level `y_test` variable.
dataset = Dataset(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=None,
    use_cache=True,
)
dataset

Dataset(eff47fde69d1e04cb6dc241bb4c1d9b5)

In [399]:
# Print every component of the dataset in turn, each followed by a blank line.
for attr_name in ('X_train', 'y_train', 'X_test', 'y_test'):
    print(getattr(dataset, attr_name), end='\n\n')

[[2167.  129.   26. ...    0.    0.    0.]
 [2813.  117.   13. ...    0.    0.    0.]
 [2993.  286.   14. ...    0.    0.    0.]
 ...
 [2929.   75.   15. ...    0.    0.    0.]
 [2208.  317.   33. ...    0.    0.    0.]
 [2606.  356.   18. ...    0.    0.    0.]]

[3. 1. 0. ... 4. 5. 1.]

[[2979.   89.   18. ...    0.    0.    0.]
 [2083.   21.   28. ...    0.    0.    0.]
 [2322.  281.   17. ...    0.    0.    0.]
 ...
 [2306.  224.   25. ...    0.    0.    0.]
 [3029.  113.   14. ...    0.    0.    0.]
 [2882.   37.   10. ...    0.    0.    0.]]

None



In [400]:
def xgb_model(X_train, y_train, X_test, y_test=None):
    """Train a 7-class XGBoost booster and return its predictions for ``X_test``.

    The parameters must be named X_train, y_train, X_test, y_test.
    ``y_test`` is accepted but never read.

    Returns:
        The booster's prediction for ``X_test`` (objective ``multi:softprob``,
        i.e. class probabilities). The return value must be the prediction
        for X_test.
    """
    booster_params = dict(
        objective='multi:softprob',
        eval_metric='mlogloss',
        verbosity=0,
        num_class=7,
        nthread=-1,
    )

    train_matrix = xgb.DMatrix(X_train, y_train)
    booster = xgb.train(booster_params, train_matrix, num_boost_round=300)

    return booster.predict(xgb.DMatrix(X_test))


def lgb_model(X_train, y_train, X_test, y_test=None):
    """Train a 7-class LightGBM booster and return its predictions for ``X_test``.

    The signature mirrors the other estimator functions of this notebook
    (X_train, y_train, X_test, y_test); ``y_test`` is accepted but never read.

    Returns:
        The booster's prediction for ``X_test`` (objective ``multiclass``).
    """
    booster_params = dict(
        objective="multiclass",
        num_class=7,
        n_jobs=-1,
        verbose=-4,
        metric=("multi_logloss",),
    )
    booster = lgb.train(
        booster_params,
        lgb.Dataset(X_train, y_train),
        num_boost_round=300,
    )

    return booster.predict(X_test)


def rf_model(X_train, y_train, X_test, y_test=None):
    """Fit a 100-tree random forest and return class probabilities for ``X_test``.

    The signature mirrors the other estimator functions of this notebook
    (X_train, y_train, X_test, y_test); ``y_test`` is accepted but never read.

    Returns:
        The result of ``predict_proba(X_test)``. Note that scikit-learn emits
        one column per class present in ``y_train``.
    """
    # random_state is pinned (same seed as the train/test split and the stacking
    # folds) because an unseeded forest is re-drawn on every run, which makes the
    # scores reported by the notebook impossible to reproduce.
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=1)
    forest.fit(X_train, y_train)

    return forest.predict_proba(X_test)

In [401]:
# Wrap each first-level estimator function in a heamy Classifier.
#
# name : str, optional
#     The unique name of Estimator object.
#
# use_cache : bool, optional
#     if True then validate/predict/stack/blend results will be cached.
#
# `parameters` is left at its default (None); `use_cache` defaults to True and
# is switched off explicitly for all three models.
model_xgb = Classifier(
    dataset=dataset,
    estimator=xgb_model,
    name='xgb',
    use_cache=False,
)
model_lgb = Classifier(
    dataset=dataset,
    estimator=lgb_model,
    name='lgb',
    use_cache=False,
)
model_rf = Classifier(
    dataset=dataset,
    estimator=rf_model,
    name='rf',
    use_cache=False,
)

In [402]:
# Group the three first-level models into one pipeline: xgb, lgb, rf (in that order).
pipeline = ModelsPipeline(
    model_xgb,
    model_lgb,
    model_rf,
)
pipeline

<heamy.pipeline.ModelsPipeline at 0x23c6779b4f0>

In [403]:

# Finds optimal weights for weighted average of models.
#
# scorer : function
#     Scikit-learn like metric.
#
# test_size : float, default 0.2
#
# method : str
#     Type of solver. Should be one of:
#         'Nelder-Mead', 'Powell', 'CG', 'BFGS', 'Newton-CG', 'L-BFGS-B',
#         'TNC', 'COBYLA', 'SLSQP', 'dogleg', 'trust-ncg'
#
# Only the scorer is passed here; every other option keeps its default.
pipeline.find_weights(scorer=log_loss)

Best Score (log_loss): 0.36290251502879434
Best Weights: [0.30341374 0.33741425 0.35917201]


array([0.30341374, 0.33741425, 0.35917201])

In [404]:
# Applies weighted mean to models.
# Linear weighting
# pipeline.weight([0.5, 0.3, 0.2])  # assigns weight 0.5 to the xgb model, 0.3 to lgb and 0.2 to rf

In [405]:
# Build the second-level (stacked) dataset from the first-level models.
#
# k : int, default 5
#     Number of folds.
#
# stratify : bool, default False
#
# shuffle : bool, default True
#
# seed : int, default 100
#
# full_test : bool, default True
#     If True then evaluate test dataset on the full data otherwise take the mean of every fold.
stack_ds = pipeline.stack(
    k=5,
    stratify=False,  # whether to use stratified k-fold
    shuffle=True,
    seed=1,
    full_test=False,
)
stack_ds

Dataset(c0f2b2a65c70c009b273d6d72e33a74d)

In [406]:
# X_train: first-level model outputs used to train the second-level model. The xgb
#          model outputs probabilities, hence the columns xgb_0, xgb_1, ..., xgb_6.
# X_test:  first-level model outputs used to test the second-level model.
for attr_name in ('X_train', 'y_train', 'X_test', 'y_test'):
    print(getattr(stack_ds, attr_name))

             xgb_0         xgb_1         xgb_2         xgb_3         xgb_4  \
0     4.534841e-08  2.541179e-07  2.733032e-03  9.972505e-01  5.520512e-07   
1     1.026551e-03  5.745058e-01  6.193155e-05  6.089087e-06  4.243897e-01   
2     9.931676e-01  6.827588e-03  6.073082e-08  9.379712e-08  2.986425e-06   
3     1.024370e-05  1.215085e-03  2.224298e-06  2.022765e-06  9.987329e-01   
4     2.070961e-06  2.478734e-04  4.268239e-05  1.892980e-08  2.222941e-05   
...            ...           ...           ...           ...           ...   
8995  1.498728e-06  7.729919e-06  9.997123e-01  1.818914e-05  3.111986e-07   
8996  2.466629e-03  8.747428e-07  1.366907e-06  9.943136e-08  2.284424e-07   
8997  4.480495e-03  2.889213e-01  6.571934e-05  1.598638e-06  7.065006e-01   
8998  6.482577e-06  8.352361e-05  3.192859e-01  5.541099e-06  3.409723e-07   
8999  4.172878e-04  9.854028e-01  1.783861e-03  1.912048e-06  1.223199e-02   

             xgb_5         xgb_6         lgb_0         lgb_1   

In [407]:
# Second-level model: a logistic regression fed with the stacked dataset.
stacker = Classifier(
    dataset=stack_ds,
    estimator=LogisticRegression,
    parameters={"solver": 'lbfgs', "max_iter": 1000},
    use_cache=False,
)

# Predictions for stack_ds.X_test
predict_stack = stacker.predict()
predict_stack

array([[6.17553318e-01, 3.60830749e-01, 9.19086375e-04, ...,
        3.63706313e-03, 1.24903540e-03, 1.51891308e-02],
       [1.55367299e-03, 9.96381071e-03, 1.80455087e-01, ...,
        3.88229351e-03, 7.96309812e-01, 5.77207084e-04],
       [8.62217921e-04, 3.92357523e-03, 2.48886188e-02, ...,
        1.94906397e-03, 9.65114604e-01, 2.39838382e-04],
       ...,
       [2.46394030e-03, 1.17742519e-02, 7.18180360e-01, ...,
        7.78942773e-03, 1.13296532e-01, 1.44696358e-03],
       [2.12967336e-01, 7.72014207e-01, 1.22218033e-03, ...,
        8.04975898e-03, 2.18057651e-03, 3.13770198e-03],
       [3.12885419e-01, 6.27500426e-01, 2.16627637e-03, ...,
        4.78236524e-02, 2.40565796e-03, 6.50991160e-03]])

In [408]:
# Hold-out accuracy of each first-level model. stack_ds.X_test stores 7 probability
# columns per model, one block after another (columns 0-6, 7-13, 14-20), so the
# blocks are walked with a single loop instead of three hand-written slices.
n_classes = 7
for first_col in range(0, 3 * n_classes, n_classes):
    block_proba = stack_ds.X_test.iloc[:, first_col:first_col + n_classes].values
    # accuracy_score is documented as accuracy_score(y_true, y_pred): the true
    # labels go first. Accuracy is symmetric, so the printed values are unchanged,
    # but the documented order keeps the cell correct if the metric is swapped.
    print(accuracy_score(y_test, np.argmax(block_proba, axis=1)))

# Accuracy of the stacked (second-level) model; in the recorded run, model fusion
# via stacking improved the accuracy over every single first-level model.
print(accuracy_score(y_test, np.argmax(predict_stack, axis=1)))

0.858
0.865
0.864
0.87
