# GBDT + LR (gradient-boosted tree leaves as features for logistic regression)

In [1]:
# Load the breast-cancer data set; return_X_y=True makes the loader hand back
# the (features, labels) pair directly.
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
# Display (n_samples, n_features).
X.shape

(569, 30)

In [2]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

In [3]:
from sklearn.ensemble import GradientBoostingClassifier
# Small booster: 10 trees, each limited to depth 2.
# NOTE(review): random_state is left at its default; pass a seed here if the
# AUC figures reported further down need to be exactly reproducible.
mm = GradientBoostingClassifier(n_estimators=10, max_depth=2)
mm.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=2,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=10,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [4]:
# NOTE(review): roc_curve and auc are imported but not used in the visible cells.
from sklearn.metrics import roc_curve, auc, roc_auc_score

# Column 1 of predict_proba is the probability of the positive class.
y_pred = mm.predict_proba(X_test)[:, 1]

# Baseline: AUC of the GBDT on its own, evaluated on the held-out split.
print('GBDT AUC: %.5f' % roc_auc_score(y_test, y_pred))

GBDT AUC: 0.96856


In [5]:
# mm.apply() reports, for every sample, the index of the leaf it reaches in
# each tree; [:, :, 0] drops the trailing axis so one column per tree remains.
X_train_leaves = mm.apply(X_train)[:, :, 0]
# train_rows is reused later to split the stacked one-hot matrix back into its
# train and test parts. `cols` is not used in the visible cells.
train_rows, cols = X_train_leaves.shape
X_test_leaves = mm.apply(X_test)[:, :, 0]

In [7]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the per-tree leaf indices so that every (tree, leaf) pair
# becomes a binary feature for the downstream linear model.
#
# The encoder is fitted on the training leaves only, so nothing from the test
# set enters preprocessing. Every leaf of a fitted tree is populated by
# training samples, so the learned categories are the same ones a fit on
# train+test would have produced. handle_unknown='ignore' is a safety net:
# a leaf index never seen during fit encodes to all zeros instead of raising.
enc = OneHotEncoder(categories='auto', handle_unknown='ignore')
enc.fit(X_train_leaves)

# Rows are stacked train first, then test; downstream cells split the matrix
# again at `train_rows`.
X_trans = enc.transform(
    np.concatenate((X_train_leaves, X_test_leaves), axis=0))

In [8]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='lbfgs')
# X_trans was built from the training leaves followed by the test leaves, so
# the first train_rows rows are the training samples.
lr.fit(X_trans[:train_rows, :], y_train)

# The remaining rows are the test samples; keep the positive-class probability.
y_pred_gbdtlr = lr.predict_proba(X_trans[train_rows:, :])[:, 1]

# AUC of logistic regression on the one-hot leaf features, same held-out split.
print('GBDT+LR AUC: %.5f' % roc_auc_score(y_test, y_pred_gbdtlr))


GBDT+LR AUC: 0.97019


# Super Learner 

## 自定义实现

In [18]:
# Execute superLearnerexample.py inside the notebook namespace.
# SuperLearnerExample, used in a later cell, is presumably defined there - TODO confirm.
%run superLearnerexample.py

In [19]:
# Test data set: synthetic regression problem with 1000 samples and 3 features.
# NOTE: X and y are rebound here, so the breast-cancer arrays from the GBDT+LR
# section are no longer reachable under these names.
from sklearn.datasets import make_regression

random_state = 42
X, y = make_regression(n_samples=1000,
                       n_features=3,
                       noise=1,
                       random_state=random_state)
# Peek at the first sample.
X[0]

array([-0.18912039, -1.33031363,  0.92165011])

In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor

# Define the base models: a linear model, a decision tree and a bagging ensemble.
base_models = [
    LinearRegression(),
    DecisionTreeRegressor(random_state=random_state),
    BaggingRegressor(random_state=random_state)
]

# Define the meta-model.
meta_model = LinearRegression()

# SuperLearnerExample is not defined in this notebook; presumably it comes from
# the %run of superLearnerexample.py - TODO confirm.
spl = SuperLearnerExample(X,
                          y,
                          base_models,
                          meta_model,
                          random_state=random_state)
spl.fit()
# Prints the "Super Learner: MSE ..." line shown in the cell output.
spl.evaluate_meta_models()

Super Learner: MSE 0.978


In [21]:
# Coefficients of the fitted meta-model - presumably one weight per base model,
# in base_models order (TODO confirm against SuperLearnerExample).
spl.meta_model.coef_

array([9.97922299e-01, 2.42673881e-04, 2.14871114e-03])

## 使用SuperLearner库实现

In [23]:
from mlens.ensemble import SuperLearner

[MLENS] backend: threading


In [24]:
def bulid_super_learner(X,
                        y,
                        base_models,
                        meta_model,
                        score_fun,
                        kfolds=3,
                        test_size=0.3,
                        random_state=None):
    """Fit an mlens SuperLearner and report its MSE on a held-out split.

    The data is split once into a training part and a validation part. The
    ensemble is fitted on the training part only and scored on the validation
    part, so the printed figure is a validation error, not a training error.

    Args:
        X: Feature matrix.
        y: Target vector aligned with ``X``.
        base_models: Estimators added as the first (base) layer.
        meta_model: Estimator added as the meta layer.
        score_fun: Scoring callable handed to ``SuperLearner`` as ``scorer``.
        kfolds: Number of folds handed to ``SuperLearner`` as ``folds``.
        test_size: Fraction of the data held out for validation.
        random_state: Seed used for both the split and the ensemble.

    Returns:
        The fitted ``SuperLearner`` instance.
    """
    # Imported locally so the function does not depend on the name having
    # leaked into the notebook namespace from another cell or script.
    from sklearn.metrics import mean_squared_error

    # Separate names for the split keep the caller's X / y arguments intact
    # and make it obvious which part the ensemble is trained on.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    ensemble = SuperLearner(scorer=score_fun,
                            folds=kfolds,
                            shuffle=True,
                            sample_size=len(X_train),
                            random_state=random_state)
    ensemble.add(base_models)
    ensemble.add_meta(meta_model)
    ensemble.fit(X_train, y_train)

    # Score on the rows the ensemble never saw during fitting.
    val_preds = ensemble.predict(X_val)
    print('Super Learner Validation MSE {:.3f}'.format(
        mean_squared_error(y_val, val_preds)))
    return ensemble

In [25]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
# mean_squared_error is passed as the scorer below; import it here so the cell
# does not rely on the name having been injected by another cell or script.
from sklearn.metrics import mean_squared_error

# Define the seed before its first use so the cell does not depend on a value
# left behind by an earlier cell.
random_state = 42

base_models = [
    LinearRegression(),
    DecisionTreeRegressor(random_state=random_state),
    BaggingRegressor(random_state=random_state)
]
meta_model = LinearRegression()

# Same synthetic regression problem as in the custom implementation above.
X, y = make_regression(n_samples=1000,
                       n_features=3,
                       noise=1,
                       random_state=random_state)

# Fit the mlens ensemble; the helper prints its MSE on a held-out split.
ensemble = bulid_super_learner(X,
                               y,
                               base_models,
                               meta_model,
                               mean_squared_error,
                               random_state=random_state)

Super Learner Train MSE 0.975


In [26]:
# Per-learner summary recorded for the base layer. The columns are presumably
# mean (-m) and std (-s) of score, fit time and predict time across folds -
# TODO confirm against the mlens documentation.
print(ensemble.data)

                                  score-m  score-s  ft-m  ft-s  pt-m  pt-s
layer-1  baggingregressor          680.29   102.88  0.06  0.00  0.00  0.00
layer-1  decisiontreeregressor    1262.92   279.05  0.01  0.00  0.00  0.00
layer-1  linearregression            1.00     0.02  0.00  0.00  0.00  0.00



In [27]:
# Number of layers in the ensemble: the base-learner layer plus the meta layer.
len(ensemble.layers)

2

In [28]:
# Inspect both layers before the meta layer is removed in the next cell.
ensemble.layers

[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,
    name='layer-1', propagate_features=None, raise_on_exception=True,
    random_state=7270, shuffle=True,
    stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,
    indexer=FoldIndex(X=None, folds=3, raise_on_exception=True),
    learners=[Learner(attr='predict', backend='threading', dtype=<class 'numpy.float32'>,
     estimator=BaggingRegressor(base_estimator=None, bootstrap=True, bootstrap_feat...ed_error at 0x1217e3d90>)],
    n_jobs=-1, name='group-0', raise_on_exception=True, transformers=[])],
    verbose=0),
 Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,
    name='layer-2', propagate_features=None, raise_on_exception=True,
    random_state=7270, shuffle=True,
    stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,
    indexer=FullIndex(X=None),
    learners=[Learner(attr='predict', backend='threading', dtype=<class 'numpy.float32'>,
     estimator=LinearRegres

In [29]:
# Remove the layer at index 1 (the meta layer, 'layer-2'), leaving only the
# base layer. NOTE: this mutates `ensemble` in place, so the cell is not
# idempotent - re-running it acts on the already shortened layer list.
ensemble.remove(1)

SuperLearner(array_check=None, backend=None, folds=3,
       layers=[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,
   name='layer-1', propagate_features=None, raise_on_exception=True,
   random_state=7270, shuffle=True,
   stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,
   indexer=FoldIndex(X=None, folds=3, raise_on_exc...17e3d90>)],
   n_jobs=-1, name='group-0', raise_on_exception=True, transformers=[])],
   verbose=0)],
       model_selection=False, n_jobs=None, raise_on_exception=True,
       random_state=42, sample_size=700,
       scorer=<function mean_squared_error at 0x1217e3d90>, shuffle=True,
       verbose=False)

In [30]:
# Confirm that only 'layer-1' remains after the removal.
ensemble.layers

[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,
    name='layer-1', propagate_features=None, raise_on_exception=True,
    random_state=7270, shuffle=True,
    stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,
    indexer=FoldIndex(X=None, folds=3, raise_on_exception=True),
    learners=[Learner(attr='predict', backend='threading', dtype=<class 'numpy.float32'>,
     estimator=BaggingRegressor(base_estimator=None, bootstrap=True, bootstrap_feat...ed_error at 0x1217e3d90>)],
    n_jobs=-1, name='group-0', raise_on_exception=True, transformers=[])],
    verbose=0)]