Reference article: https://towardsdatascience.com/ensemble-learning-stacking-blending-voting-b37737c4f483

Reference code repository: https://github.com/FernandoLpz/Stacking-Blending-Voting-Ensembles

In [1]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

In [2]:
class Ensemble:
    """Hand-written stacking ensemble for the scikit-learn breast-cancer dataset.

    Five level-0 ("weak") classifiers produce out-of-fold predictions on the
    training split; those predictions are the features on which a level-1
    logistic-regression meta model is trained and scored.
    """

    # Seed shared by the train/test split and the stochastic weak learners so
    # that repeated runs print the same accuracies.
    random_state = 23

    def __init__(self):
        # Train/test splits; populated by load_data().
        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None
        # Number of contiguous folds used for the out-of-fold predictions.
        self.k = 5

    def load_data(self):
        """Load the breast-cancer dataset and store a seeded 70/30 train/test split."""
        x, y = load_breast_cancer(return_X_y=True)
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            x, y, test_size=0.3, random_state=self.random_state)

    def StackingClassifier(self):
        """Run the full stacking pipeline and print train/test accuracy.

        For every weak learner this collects (a) out-of-fold predictions on
        the training split and (b) predictions on the test split after a refit
        on the whole training split. Each learner contributes one column to
        the meta-model feature matrices.
        """
        # Define weak learners (seeded where the estimator is stochastic)
        weak_learners = [('dt', DecisionTreeClassifier(random_state=self.random_state)),
                         ('knn', KNeighborsClassifier()),
                         ('rf', RandomForestClassifier(random_state=self.random_state)),
                         ('gb', GradientBoostingClassifier(random_state=self.random_state)),
                         ('gn', GaussianNB())]

        # Final learner or meta model
        final_learner = LogisticRegression()

        train_columns = []
        test_columns = []

        # Start stacking
        for _clf_id, clf in weak_learners:
            # Out-of-fold predictions on the training split
            train_columns.append(self.k_fold_cross_validation(clf))
            # Predictions on the test split after refitting on all training rows
            test_columns.append(self.train_level_0(clf))

        # One column per weak learner, one row per sample
        train_meta_model = np.column_stack(train_columns)
        test_meta_model = np.column_stack(test_columns)

        # Training level 1
        self.train_level_1(final_learner, train_meta_model, test_meta_model)

    def k_fold_cross_validation(self, clf):
        """Return out-of-fold predictions of ``clf`` for every training row.

        The training split is cut into ``self.k`` contiguous, unshuffled folds
        (the last fold absorbs the remainder). For each fold ``clf`` is fitted
        on all other rows and asked to predict the held-out rows. The
        per-fold predictions are concatenated in row order.

        Note: ``clf`` is refitted in place on every fold.

        Raises:
            RuntimeError: if the training split has not been loaded yet.
            ValueError: if ``self.k`` is smaller than 2 or larger than the
                number of training rows (either would yield an empty
                training or held-out set).
        """
        if self.x_train is None or self.y_train is None:
            raise RuntimeError("Training data is not loaded; call load_data() first.")

        n_samples = self.x_train.shape[0]
        if not 2 <= self.k <= n_samples:
            raise ValueError(
                f"k must be between 2 and the number of training samples "
                f"({n_samples}), got {self.k}")

        # Number of samples per fold
        batch_size = n_samples // self.k

        fold_predictions = []

        # Starts k-fold cross validation
        for fold in range(self.k):
            batch_start = batch_size * fold
            # The last fold runs to the end so no sample is left out
            batch_finish = n_samples if fold == self.k - 1 else batch_size * (fold + 1)

            # Boolean mask of the held-out rows; its complement is the fit set
            held_out = np.zeros(n_samples, dtype=bool)
            held_out[batch_start:batch_finish] = True

            # Fit current classifier on everything outside the fold
            clf.fit(self.x_train[~held_out], self.y_train[~held_out])

            # Store predictions for the held-out rows
            fold_predictions.append(clf.predict(self.x_train[batch_start:batch_finish]))

        return np.concatenate(fold_predictions)

    def train_level_0(self, clf):
        """Fit ``clf`` on the full training split and return its test-split predictions."""
        # Train in full real training set
        clf.fit(self.x_train, self.y_train)
        # Get predictions from full real test set
        return clf.predict(self.x_test)

    def train_level_1(self, final_learner, train_meta_model, test_meta_model):
        """Fit the meta model on the stacked predictions and print both accuracies.

        Args:
            final_learner: estimator exposing ``fit`` and ``score``.
            train_meta_model: meta features for the training split.
            test_meta_model: meta features for the test split.
        """
        # Train is carried out with final learner or meta model
        final_learner.fit(train_meta_model, self.y_train)
        # Getting train and test accuracies from meta_model
        print(f"Train accuracy: {final_learner.score(train_meta_model, self.y_train)}")
        print(f"Test accuracy: {final_learner.score(test_meta_model, self.y_test)}")

In [3]:
ensemble = Ensemble()
ensemble.load_data()
ensemble.StackingClassifier()

Train accuracy: 0.957286432160804
Test accuracy: 0.9766081871345029


In [12]:
def k_fold_cross_validation(clf):
    """Return out-of-fold predictions of ``clf`` over the global training split.

    Reads the notebook globals ``x_train``, ``y_train`` and ``k``. The rows are
    cut into ``k`` contiguous folds (the final fold also takes the remainder);
    for each fold ``clf`` is fitted on the other rows and predicts the fold.
    The per-fold predictions are joined in row order.
    """
    # Rows per fold
    fold_len = int(len(x_train) / k)

    oof = None

    # Starts k-fold cross validation
    for fold_no in range(k):
        n_rows = x_train.shape[0]
        lo = fold_len * fold_no
        # The last fold extends to the end of the training split
        hi = n_rows if fold_no == (k - 1) else fold_len * (fold_no + 1)

        # Every row index outside [lo, hi) is used for fitting
        fit_rows = [*range(lo), *range(hi, n_rows)]

        # Fit on the remaining rows, predict the held-out slice
        clf.fit(x_train[fit_rows, :], y_train[fit_rows])
        fold_pred = clf.predict(x_train[lo:hi, :])

        # First fold seeds the result; later folds are appended
        oof = np.concatenate((oof, fold_pred)) if isinstance(oof, np.ndarray) else fold_pred

    return oof

In [13]:
def train_level_0(clf):
    """Fit ``clf`` on the global training split and return its predictions for ``x_test``."""
    # Refit on every training row (the estimator is modified in place)
    clf.fit(x_train, y_train)
    # Predictions for the real held-out test split
    return clf.predict(x_test)

In [14]:
def train_level_1(final_learner, train_meta_model, test_meta_model):
    """Fit the meta model on the stacked predictions and print train/test accuracy.

    Uses the notebook globals ``y_train`` and ``y_test`` as targets.
    """
    # Level-1 training on the weak learners' predictions
    final_learner.fit(train_meta_model, y_train)

    # Report accuracy on the data it was fitted on, then on the test split
    train_accuracy = final_learner.score(train_meta_model, y_train)
    print(f"Train accuracy: {train_accuracy}")
    test_accuracy = final_learner.score(test_meta_model, y_test)
    print(f"Test accuracy: {test_accuracy}")

In [15]:
# Placeholders for the train/test splits (assigned by the data-loading cell)
x_train = x_test = y_train = y_test = None
# Number of folds used for the out-of-fold predictions
k = 5

In [16]:
# Load features and labels, then hold out 30% of the rows as the test split
# (fixed seed so the split is the same on every run)
x, y = load_breast_cancer(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=23,
)

In [17]:
# Level-0 (weak) learners as (label, estimator) pairs, all with default settings
weak_learners = [
    ('dt', DecisionTreeClassifier()),
    ('knn', KNeighborsClassifier()),
    ('rf', RandomForestClassifier()),
    ('gb', GradientBoostingClassifier()),
    ('gn', GaussianNB()),
]

# Level-1 learner (meta model) trained on the weak learners' predictions
final_learner = LogisticRegression()

# Accumulators for the meta-model inputs; filled by the stacking loop
train_meta_model, test_meta_model = None, None

In [18]:
# Build the meta-model inputs: one row of predictions per weak learner
for clf_id, clf in weak_learners:
    # Out-of-fold predictions over the training split
    predictions_clf = k_fold_cross_validation(clf)

    # Refit on the whole training split, then predict the test split
    test_predictions_clf = train_level_0(clf)

    # The first learner seeds the training matrix; later ones are appended as rows
    if not isinstance(train_meta_model, np.ndarray):
        train_meta_model = predictions_clf
    else:
        train_meta_model = np.vstack((train_meta_model, predictions_clf))

    # Same accumulation for the test-split predictions
    if not isinstance(test_meta_model, np.ndarray):
        test_meta_model = test_predictions_clf
    else:
        test_meta_model = np.vstack((test_meta_model, test_predictions_clf))

In [19]:
# Inspect the stacked predictions before transposing (one row per weak learner)
train_meta_model

array([[1, 1, 1, ..., 0, 1, 1],
       [1, 0, 1, ..., 0, 1, 0],
       [1, 0, 1, ..., 0, 1, 1],
       [1, 0, 1, ..., 0, 1, 1],
       [1, 1, 0, ..., 0, 1, 1]])

In [30]:
# Shape of the last weak learner's out-of-fold predictions (left over from the loop)
predictions_clf.shape

(398,)

In [33]:
# Number of rows in the training and test splits
len(x_train), len(x_test)

(398, 171)

In [34]:
# Orient both matrices as (n_samples, n_learners), the layout the meta model
# is fitted on. The shape guards make this cell safe to re-run: a plain `.T`
# would flip the matrices back to (n_learners, n_samples) on a second run.
# NOTE: the guard cannot tell the orientations apart if the number of
# learners ever equals the number of samples.

# Transpose train_meta_model
if train_meta_model.shape[0] != len(y_train):
    train_meta_model = train_meta_model.T

# Transpose test_meta_model
if test_meta_model.shape[0] != len(y_test):
    test_meta_model = test_meta_model.T

# Training level 1
#train_level_1(final_learner, train_meta_model, test_meta_model)

In [37]:
# After the transpose: one row per training sample, one column per weak learner
train_meta_model.shape

(398, 5)

In [38]:
# The target vector must have as many entries as train_meta_model has rows
y_train.shape

(398,)

In [35]:
# Transposed meta features: each row holds the weak learners' predictions for one training sample
train_meta_model

array([[1, 1, 1, 1, 1],
       [1, 0, 0, 0, 1],
       [1, 1, 1, 1, 0],
       ...,
       [0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1],
       [1, 0, 1, 1, 1]])

In [42]:
# Training targets used to fit the meta model (note: this dumps the whole array)
y_train

array([1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1,

In [None]:
# Fit the meta model on the stacked level-0 predictions
final_learner.fit(train_meta_model, y_train)
# Getting train and test accuracies from meta_model.
# The targets are the notebook globals; there is no `self` at cell level.
print(f"Train accuracy: {final_learner.score(train_meta_model, y_train)}")
print(f"Test accuracy: {final_learner.score(test_meta_model, y_test)}")