In [1]:
from heamy.dataset import Dataset
from heamy.estimator import Classifier
from heamy.pipeline import ModelsPipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import log_loss

In [2]:
# Load the dataset
from sklearn.datasets import fetch_covtype

data = fetch_covtype()

# Keep only the first 10000 samples
X, y = data['data'][:10000], data['target'][:10000]

# Re-encode the targets with OrdinalEncoder: y is reshaped into a single
# column for the encoder and the encoded result is flattened back to 1-D.
# The encoder is bound to `label_encoder` rather than `ord`, so the builtin
# ord() is not shadowed for the rest of the notebook.
label_encoder = OrdinalEncoder()
y_enc = label_encoder.fit_transform(y.reshape(-1, 1)).reshape(-1, )

X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.1, random_state=1)

print(X_test.shape)
print(y_train.shape)

(1000, 54)
(9000,)


In [3]:
# Create the heamy dataset
'''
use_cache : bool, default True
    If use_cache=True then preprocessing step will be cached until function code is changed.
'''
# y_test is passed as None, so no test labels are stored in the Dataset.
dataset = Dataset(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=None,
    use_cache=True,
)
dataset

Dataset(eff47fde69d1e04cb6dc241bb4c1d9b5)

In [4]:
# Print the four members of the dataset, each one followed by a blank line.
print(dataset.X_train, dataset.y_train, dataset.X_test, dataset.y_test,
      sep='\n\n', end='\n\n')

[[2167.  129.   26. ...    0.    0.    0.]
 [2813.  117.   13. ...    0.    0.    0.]
 [2993.  286.   14. ...    0.    0.    0.]
 ...
 [2929.   75.   15. ...    0.    0.    0.]
 [2208.  317.   33. ...    0.    0.    0.]
 [2606.  356.   18. ...    0.    0.    0.]]

[3. 1. 0. ... 4. 5. 1.]

[[2979.   89.   18. ...    0.    0.    0.]
 [2083.   21.   28. ...    0.    0.    0.]
 [2322.  281.   17. ...    0.    0.    0.]
 ...
 [2306.  224.   25. ...    0.    0.    0.]
 [3029.  113.   14. ...    0.    0.    0.]
 [2882.   37.   10. ...    0.    0.    0.]]

None



In [5]:
def xgb_model(X_train, y_train, X_test, y_test):
    """Train an XGBoost multiclass booster and predict class probabilities for X_test.

    The parameters must be X_train, y_train, X_test, y_test.
    y_test is accepted but not used.
    """
    booster_params = dict(
        objective='multi:softprob',
        eval_metric='mlogloss',
        verbosity=0,
        num_class=7,
        nthread=-1,
    )

    train_matrix = xgb.DMatrix(X_train, y_train)
    test_matrix = xgb.DMatrix(X_test)
    booster = xgb.train(booster_params, train_matrix, num_boost_round=300)

    # The return value must be the prediction for X_test.
    return booster.predict(test_matrix)


def lgb_model(X_train, y_train, X_test, y_test,
              **parameters):  # Classifier unpacks its parameters dict, so it is re-packed here
    """Train a LightGBM booster with the given parameters and predict on X_test.

    Parameters
    ----------
    X_train, y_train : training features and labels, wrapped in an lgb.Dataset.
    X_test : features to predict on.
    y_test : accepted but not used.
    **parameters : LightGBM training parameters, forwarded to lgb.train as `params`.

    Returns
    -------
    The result of the trained booster's predict(X_test).
    """
    # **parameters always binds a dict (empty when no keyword arguments are
    # passed), so it can never be None and needs no fallback.
    train_set = lgb.Dataset(X_train, y_train)

    booster = lgb.train(params=parameters, train_set=train_set, num_boost_round=300)
    return booster.predict(X_test)


def rf_model(X_train, y_train, X_test, y_test):
    """Fit a 100-tree random forest on the training data and return predict_proba for X_test.

    y_test is accepted but not used.
    """
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    fitted = forest.fit(X_train, y_train)
    return fitted.predict_proba(X_test)

In [6]:
# LightGBM parameters, handed to the lgb Classifier below via `parameters=`
params = dict(
    objective="multiclass",
    num_class=7,
    n_jobs=-1,
    verbose=-4,
    metric=("multi_logloss",),
)
'''
name : str, optional
    The unique name of Estimator object.

parameters : dict, optional
    Arguments for estimator object.

use_cache : bool, optional
    if True then validate/predict/stack/blend results will be cached.
'''
model_xgb = Classifier(
    estimator=xgb_model,
    dataset=dataset,
    name='xgb',
    use_cache=False,
)
model_lgb = Classifier(
    estimator=lgb_model,
    dataset=dataset,
    name='lgb',
    parameters=params,
    use_cache=False,
)
model_rf = Classifier(
    estimator=rf_model,
    dataset=dataset,
    name='rf',  # parameters defaults to None
    use_cache=False,  # use_cache defaults to True
)

In [7]:
# Group the three first-layer models into one pipeline.
pipeline = ModelsPipeline(
    model_xgb,
    model_lgb,
    model_rf,
)
pipeline

<heamy.pipeline.ModelsPipeline at 0x270452e65b0>

In [8]:

# Finds optimal weights for weighted average of models.
'''
scorer : function
    Scikit-learn like metric.

test_size : float, default 0.2

method : str
    Type of solver. Should be one of:
        ‘Nelder-Mead’
        ‘Powell’
        ‘CG’
        ‘BFGS’
        ‘Newton-CG’
        ‘L-BFGS-B’
        ‘TNC’
        ‘COBYLA’
        ‘SLSQP’
        ‘dogleg’
        ‘trust-ncg’
'''
# Outputs the optimal combination of weights
pipeline.find_weights(scorer=log_loss)

{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}
{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}
{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}
{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': No

array([0.36520686, 0.36295923, 0.27183391])

In [9]:
# Applies weighted mean to models.
# Linear weighting
# pipeline.weight([0.5, 0.3, 0.2])  # here the xgb model gets weight 0.5, lgb 0.3 and rf 0.2

In [10]:
'''
k : int, default 5
    Number of folds.

stratify : bool, default False

shuffle : bool, default True

seed : int, default 100

full_test : bool, default True
    If True then evaluate test dataset on the full data otherwise take the mean of every fold.
'''
# Build the second-layer dataset from 5-fold predictions of the first-layer models.
stack_ds = pipeline.stack(
    k=5,
    stratify=False,  # whether to use stratified k-fold
    shuffle=True,
    seed=1,
    full_test=False,
)
stack_ds

{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}
{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}
{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}
{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': No

Dataset(82d81a96e92b9275fcfaad504f42268a)

In [11]:
# X_train: outputs of the first-layer models, used to train the second-layer model;
# the xgb model outputs probabilities, hence the columns xgb_0, xgb_1, ..., xgb_6.
# X_test: outputs of the first-layer models, used to test the second-layer model.
print(stack_ds.X_train, stack_ds.y_train, stack_ds.X_test, stack_ds.y_test, sep='\n')

             xgb_0         xgb_1         xgb_2         xgb_3         xgb_4  \
0     4.534841e-08  2.541179e-07  2.733032e-03  9.972505e-01  5.520512e-07   
1     1.026551e-03  5.745058e-01  6.193155e-05  6.089087e-06  4.243897e-01   
2     9.931676e-01  6.827588e-03  6.073082e-08  9.379712e-08  2.986425e-06   
3     1.024370e-05  1.215085e-03  2.224298e-06  2.022765e-06  9.987329e-01   
4     2.070961e-06  2.478734e-04  4.268239e-05  1.892980e-08  2.222941e-05   
...            ...           ...           ...           ...           ...   
8995  1.498728e-06  7.729919e-06  9.997123e-01  1.818914e-05  3.111986e-07   
8996  2.466629e-03  8.747428e-07  1.366907e-06  9.943136e-08  2.284424e-07   
8997  4.480495e-03  2.889213e-01  6.571934e-05  1.598638e-06  7.065006e-01   
8998  6.482577e-06  8.352361e-05  3.192859e-01  5.541099e-06  3.409723e-07   
8999  4.172878e-04  9.854028e-01  1.783861e-03  1.912048e-06  1.223199e-02   

             xgb_5         xgb_6         lgb_0         lgb_1   

In [12]:
# Second-layer model: logistic regression on top of the stacked dataset
stacker = Classifier(
    estimator=LogisticRegression,
    dataset=stack_ds,
    parameters=dict(solver='lbfgs', max_iter=1000),
    use_cache=False,
)
# Predictions for stack_ds.X_test
predict_stack = stacker.predict()
predict_stack

array([[6.03722656e-01, 3.73781367e-01, 9.89615059e-04, ...,
        4.49136615e-03, 1.33593031e-03, 1.50242305e-02],
       [1.44654239e-03, 9.46348836e-03, 1.76467576e-01, ...,
        3.86417773e-03, 8.01514943e-01, 5.68749745e-04],
       [8.18883226e-04, 4.21437492e-03, 3.30892764e-02, ...,
        2.00355422e-03, 9.56142085e-01, 2.67323794e-04],
       ...,
       [2.59388653e-03, 1.19928650e-02, 6.93582675e-01, ...,
        8.36630028e-03, 1.20881332e-01, 1.53229228e-03],
       [1.71136048e-01, 8.15486605e-01, 1.08040751e-03, ...,
        7.52031213e-03, 1.98592123e-03, 2.41761313e-03],
       [2.94233261e-01, 6.47412342e-01, 2.11817047e-03, ...,
        4.62637840e-02, 2.43883147e-03, 6.82969018e-03]])

In [13]:
# Test-set accuracy of each first-layer model. The stacked test features hold
# 7 probability columns per model, in the order xgb, lgb, rf.
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the true
# labels are passed first and the argmax predictions second.
print(accuracy_score(y_test, np.argmax(stack_ds.X_test.iloc[:, :7].values, axis=1)))
print(accuracy_score(y_test, np.argmax(stack_ds.X_test.iloc[:, 7:14].values, axis=1)))
print(accuracy_score(y_test, np.argmax(stack_ds.X_test.iloc[:, 14:].values, axis=1)))

# Fusing the models through stacking improves the accuracy
print(accuracy_score(y_test, np.argmax(predict_stack, axis=1)))

0.858
0.865
0.861
0.872


In [14]:
# Split the stacked test features into one block of columns per first-layer model.
xgb_t, lgb_t, rf_t = (
    stack_ds.X_test.iloc[:, cols].values
    for cols in (slice(None, 7), slice(7, 14), slice(14, None))
)

In [15]:
# Linearly weighted blend of the three models' test-set outputs,
# with fixed weights 0.2 (xgb), 0.4 (lgb) and 0.4 (rf).
result =  0.2 * xgb_t +  0.4 * lgb_t + 0.4 * rf_t
result

array([[5.85561098e-01, 3.17589063e-01, 4.00209028e-03, ...,
        5.52277362e-02, 9.60661942e-03, 2.72107432e-02],
       [1.20855528e-06, 4.74073804e-03, 2.78828596e-01, ...,
        3.46179151e-07, 7.12423291e-01, 3.11759999e-06],
       [8.15949367e-07, 1.60240044e-03, 4.41192607e-02, ...,
        4.38204625e-06, 9.22860807e-01, 1.01507293e-07],
       ...,
       [4.44137708e-06, 1.46998647e-04, 6.08681210e-01, ...,
        1.73260053e-06, 9.58145796e-02, 8.30107165e-07],
       [1.63556785e-01, 8.04290982e-01, 2.17962280e-05, ...,
        1.60077307e-02, 5.18871841e-06, 1.61107146e-02],
       [2.49616060e-01, 5.88103073e-01, 1.11673729e-05, ...,
        1.58260651e-01, 1.60057323e-03, 2.40082027e-03]])

In [16]:
# Accuracy of the linearly weighted blend; the true labels go first, matching
# accuracy_score(y_true, y_pred). In the recorded run the stacked model scored
# higher than this linear blend.
print(accuracy_score(y_test, np.argmax(result, axis=1)))

0.865
