In [1]:
from heamy.dataset import Dataset
from heamy.estimator import Classifier
from heamy.pipeline import ModelsPipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import log_loss

  from pandas import MultiIndex, Int64Index


In [2]:
# 加载数据集
from sklearn.datasets import fetch_covtype

data = fetch_covtype()

X, y = data['data'][:10000], data['target'][:10000]

ord = OrdinalEncoder()
y_enc = ord.fit_transform(y.reshape(-1, 1))
y_enc = y_enc.reshape(-1, )

X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.1, random_state=1)

print(X_test.shape)
print(y_train.shape)

(1000, 54)
(9000,)


In [3]:
# 创建数据集
'''
use_cache : bool, default True
    If use_cache=True then preprocessing step will be cached until function code is changed.
'''
dataset = Dataset(X_train=X_train, y_train=y_train, X_test=X_test, y_test=None, use_cache=True)
dataset

Dataset(eff47fde69d1e04cb6dc241bb4c1d9b5)

In [4]:
print(dataset.X_train, end='\n\n')
print(dataset.y_train, end='\n\n')
print(dataset.X_test, end='\n\n')
print(dataset.y_test, end='\n\n')

[[2167.  129.   26. ...    0.    0.    0.]
 [2813.  117.   13. ...    0.    0.    0.]
 [2993.  286.   14. ...    0.    0.    0.]
 ...
 [2929.   75.   15. ...    0.    0.    0.]
 [2208.  317.   33. ...    0.    0.    0.]
 [2606.  356.   18. ...    0.    0.    0.]]

[3. 1. 0. ... 4. 5. 1.]

[[2979.   89.   18. ...    0.    0.    0.]
 [2083.   21.   28. ...    0.    0.    0.]
 [2322.  281.   17. ...    0.    0.    0.]
 ...
 [2306.  224.   25. ...    0.    0.    0.]
 [3029.  113.   14. ...    0.    0.    0.]
 [2882.   37.   10. ...    0.    0.    0.]]

None



In [5]:
def xgb_model(X_train, y_train, X_test, y_test):
    """参数必须为X_train,y_train,X_test,y_test"""
    params = {'objective': 'multi:softprob',
              "eval_metric": 'mlogloss',
              "verbosity": 0,
              'num_class': 7,
              'nthread': -1}

    dtrain = xgb.DMatrix(X_train, y_train)
    dtest = xgb.DMatrix(X_test)
    model = xgb.train(params, dtrain, num_boost_round=300)
    predict = model.predict(dtest)

    return predict  # 返回值必须为X_test的预测


def lgb_model(X_train, y_train, X_test, y_test,
              **parameters):  # Classifier处对字典进行了解包,此处需要重新打包
    if parameters is None:
        parameters = {}
    lgb_train = lgb.Dataset(X_train, y_train)

    model = lgb.train(params=parameters, train_set=lgb_train, num_boost_round=300)
    predict = model.predict(X_test)

    return predict


def rf_model(X_train, y_train, X_test, y_test):
    params = {"n_estimators": 100, "n_jobs": -1}
    model = RandomForestClassifier(**params).fit(X_train, y_train)
    predict = model.predict_proba(X_test)

    return predict

In [6]:
params = {"objective": "multiclass",
          "num_class": 7,
          "n_jobs": -1,
          "verbose": -4, "metric": ("multi_logloss",)}
'''
name : str, optional
    The unique name of Estimator object.

parameters : dict, optional
    Arguments for estimator object.

use_cache : bool, optional
    if True then validate/predict/stack/blend results will be cached.
'''
model_xgb = Classifier(dataset=dataset, estimator=xgb_model, name='xgb', use_cache=False)
model_lgb = Classifier(dataset=dataset, estimator=lgb_model, name='lgb',
                       parameters=params,
                       use_cache=False)
model_rf = Classifier(dataset=dataset, estimator=rf_model,
                      name='rf',  # 默认parameters=None
                      use_cache=False)  # 默认use_cache=True

In [7]:
pipeline = ModelsPipeline(model_xgb, model_lgb, model_rf)
pipeline

<heamy.pipeline.ModelsPipeline at 0x221578995e0>

In [8]:

# Finds optimal weights for weighted average of models.
'''
scorer : function
    Scikit-learn like metric.

test_size : float, default 0.2

method : str
    Type of solver. Should be one of:
        ‘Nelder-Mead’
        ‘Powell’
        ‘CG’
        ‘BFGS’
        ‘Newton-CG’
        ‘L-BFGS-B’
        ‘TNC’
        ‘COBYLA’
        ‘SLSQP’
        ‘dogleg’
        ‘trust-ncg’
'''
# 使用留出法计算每个模型的评估结果
# 求出最优权重组合使得整体评估结果最小
pipeline.find_weights(scorer=log_loss, test_size=0.2)  # 输出最优权重组合

Best Score (log_loss): 0.36464098170245846
Best Weights: [0.35853867 0.36476817 0.27669316]


array([0.35853867, 0.36476817, 0.27669316])

In [9]:
# Applies weighted mean to models.
# 线性加权
# pipeline.weight([0.5, 0.3, 0.2])  # 这里指定xgb模型权重0.5,lgb权重为0.3,rf权重为0.2

In [10]:
'''
k : int, default 5
    Number of folds.

stratify : bool, default False

shuffle : bool, default True

seed : int, default 100

full_test : bool, default True
    If True then evaluate test dataset on the full data otherwise take the mean of every fold.
'''
stack_ds = pipeline.stack(k=5,
                          stratify=False,  # 是否为分层k折
                          shuffle=True,
                          seed=1,
                          full_test=False)
stack_ds

Dataset(1585543b5c284b2fd7b13ceb7fb68816)

In [11]:
# 第一层模型训练第二层模型的输出;这里xgb模型输出为概率,故有xgb_0, xgb_1,...,xgb_6
print(stack_ds.X_train.shape)
print(stack_ds.y_train.shape)
print(stack_ds.X_test.shape)  # 第一层模型测试第二层模型的输出
print(stack_ds.y_test)

(9000, 21)
(9000,)
(1000, 21)
None


In [12]:
stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression, parameters={"solver": 'lbfgs', "max_iter": 1000},
                     use_cache=False)
# stack_ds.X_test的预测结果
predict_stack = stacker.predict()
predict_stack

array([[6.11388578e-01, 3.65444804e-01, 1.08756013e-03, ...,
        4.41845714e-03, 1.42378642e-03, 1.55677176e-02],
       [1.46281786e-03, 1.00738688e-02, 1.88128889e-01, ...,
        3.73144024e-03, 7.89072609e-01, 5.71020859e-04],
       [7.63720491e-04, 3.99136593e-03, 2.83299624e-02, ...,
        1.69592047e-03, 9.61871688e-01, 2.44968597e-04],
       ...,
       [2.30384862e-03, 1.03668595e-02, 7.41239023e-01, ...,
        6.78895418e-03, 1.09809566e-01, 1.34047346e-03],
       [1.89951634e-01, 7.94199853e-01, 1.27640494e-03, ...,
        8.70828391e-03, 2.26309865e-03, 3.18150280e-03],
       [2.64335020e-01, 6.75597374e-01, 2.09267026e-03, ...,
        4.77810202e-02, 2.61898589e-03, 6.89277645e-03]])

In [13]:
print(accuracy_score(np.argmax(stack_ds.X_test.iloc[:, :7].values, axis=1), y_test))
print(accuracy_score(np.argmax(stack_ds.X_test.iloc[:, 7:14].values, axis=1), y_test))
print(accuracy_score(np.argmax(stack_ds.X_test.iloc[:, 14:].values, axis=1), y_test))

# 通过stacking模型融合,准确率得到了提升
print(accuracy_score(np.argmax(predict_stack, axis=1), y_test))

0.858
0.865
0.863
0.869


In [14]:
xgb_t = stack_ds.X_test.iloc[:, :7].values
lgb_t = stack_ds.X_test.iloc[:, 7:14].values
rf_t = stack_ds.X_test.iloc[:, 14:].values

In [15]:
result = 0.2 * xgb_t + 0.4 * lgb_t + 0.4 * rf_t
result

array([[5.84761098e-01, 3.08789063e-01, 8.00209028e-03, ...,
        5.84277362e-02, 1.20066194e-02, 2.72107432e-02],
       [1.20855528e-06, 5.54073804e-03, 2.86028596e-01, ...,
        3.46179151e-07, 7.06023291e-01, 3.11759999e-06],
       [8.00815949e-04, 1.60240044e-03, 3.45192607e-02, ...,
        8.04382046e-04, 9.38060807e-01, 1.01507293e-07],
       ...,
       [4.44137708e-06, 1.46998647e-04, 6.23881210e-01, ...,
        1.73260053e-06, 9.82145796e-02, 8.30107165e-07],
       [1.61156785e-01, 7.90690982e-01, 2.17962280e-05, ...,
        2.00077307e-02, 5.18871841e-06, 2.81107146e-02],
       [2.37616060e-01, 5.92903073e-01, 1.11673729e-05, ...,
        1.63860651e-01, 1.60057323e-03, 4.00082027e-03]])

In [16]:
print(accuracy_score(np.argmax(result, axis=1), y_test))  # 相比于线性加权,准确率得到了提升

0.863
