https://machinelearningmastery.com/out-of-fold-predictions-in-machine-learning/

In [1]:
# example of a stacked model for binary classification
import pandas as pd
from numpy import hstack
from numpy import array
from sklearn.datasets import make_blobs
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# create a meta dataset
def create_meta_dataset(data_x, yhat1, yhat2):
    # convert to columns
    yhat1 = array(yhat1).reshape((len(yhat1), 1))
    yhat2 = array(yhat2).reshape((len(yhat2), 1))
    # stack as separate columns
    meta_X = hstack((data_x, yhat1, yhat2))
    return meta_X


In [3]:
# make predictions with stacked model
def stack_prediction(model1, model2, meta_model, X):
    # make predictions
    yhat1 = model1.predict_proba(X)[:, 0]
    yhat2 = model2.predict_proba(X)[:, 0]
    # create input dataset
    meta_X = create_meta_dataset(X, yhat1, yhat2)
    # predict
    return meta_model.predict(meta_X)

In [4]:
# create the inputs and outputs
X, y = make_blobs(n_samples=1000, centers=2, n_features=100, cluster_std=20)
# split
X, X_val, y, y_val = train_test_split(X, y, test_size=0.33)

In [5]:
# collect out of sample predictions
data_x, data_y, knn_yhat, cart_yhat = list(), list(), list(), list()
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

for train_ix, test_ix in kfold.split(X):
    # get data
    train_X, test_X = X[train_ix], X[test_ix]
    train_y, test_y = y[train_ix], y[test_ix]
    data_x.extend(test_X)
    data_y.extend(test_y)
    # fit and make predictions with cart
    model1 = DecisionTreeClassifier()
    model1.fit(train_X, train_y)
    yhat1 = model1.predict_proba(test_X)[:, 0]
    cart_yhat.extend(yhat1)
    # fit and make predictions with cart
    model2 = KNeighborsClassifier()
    model2.fit(train_X, train_y)
    yhat2 = model2.predict_proba(test_X)[:, 0]
    knn_yhat.extend(yhat2)

In [15]:
data_y

[0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,


In [16]:
test_y

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       1])

In [14]:
pd.DataFrame(data_x)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-19.752081,-19.527539,3.002907,13.796752,5.988156,-9.246324,-0.694677,18.719332,-24.149986,9.853819,...,21.757180,-15.484629,8.432853,-28.332952,-9.808311,-32.212887,6.778358,25.022740,-14.049355,11.187837
1,-0.022236,3.207684,-9.184435,30.116221,-1.563353,-12.317129,1.730860,-30.186921,-10.889516,50.681302,...,-7.263085,5.206488,-15.305211,-35.795614,-15.456047,-6.887416,14.715998,-32.097752,3.202088,-23.603065
2,5.629981,16.459545,-5.126357,4.991694,3.711470,-17.137145,39.600401,-22.890150,6.088340,11.365548,...,19.844000,-19.594527,6.157826,-24.923586,3.505773,22.472013,0.822416,13.314386,-8.424767,-4.751421
3,-1.245725,-61.890360,-10.944223,-28.744022,-21.122776,5.692782,6.425568,-23.328235,12.141918,-26.956546,...,9.306114,-48.630856,-18.495349,38.186314,35.827498,0.698516,-32.681089,-37.501800,0.721025,-2.531215
4,-2.220171,-10.517216,11.609531,-38.920287,-5.892966,-16.320742,-9.273932,-27.707361,-13.854269,10.280570,...,-20.070528,28.307084,6.986093,-8.857934,12.256922,4.276123,14.467975,-10.705385,13.231528,-8.750230
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,-28.569509,-28.308935,-30.539431,35.985106,-6.512955,-8.778813,17.990215,-15.999391,-4.439684,47.295864,...,11.999165,-8.167683,7.689935,-8.331371,4.342946,12.497296,13.024842,-6.794076,-26.197068,23.458648
666,-17.063599,9.985508,-2.198110,7.652448,-24.864333,-12.079481,-17.011674,-4.491991,18.043424,14.764026,...,16.205670,18.794549,-4.575483,16.289337,11.720775,-10.685365,-12.825035,-6.985276,39.686379,4.781869
667,2.394116,-2.139921,2.009705,7.510385,-14.321433,-40.951049,19.609748,53.610940,-26.081454,4.527180,...,31.433962,-7.966083,-4.634480,27.071341,-9.443227,9.452167,-22.992694,-7.486482,-4.579166,-12.421986
668,36.761927,-25.000343,3.301388,0.362135,18.327313,-41.295488,1.343736,-18.088403,-10.867919,31.844518,...,-21.491986,3.743148,41.694799,-2.352035,6.136603,34.469769,-1.209818,-4.232712,18.122589,-2.093984


In [6]:
# construct meta dataset
meta_X = create_meta_dataset(data_x, knn_yhat, cart_yhat)
pd.DataFrame(meta_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,92,93,94,95,96,97,98,99,100,101
0,-19.752081,-19.527539,3.002907,13.796752,5.988156,-9.246324,-0.694677,18.719332,-24.149986,9.853819,...,8.432853,-28.332952,-9.808311,-32.212887,6.778358,25.022740,-14.049355,11.187837,1.0,1.0
1,-0.022236,3.207684,-9.184435,30.116221,-1.563353,-12.317129,1.730860,-30.186921,-10.889516,50.681302,...,-15.305211,-35.795614,-15.456047,-6.887416,14.715998,-32.097752,3.202088,-23.603065,0.0,1.0
2,5.629981,16.459545,-5.126357,4.991694,3.711470,-17.137145,39.600401,-22.890150,6.088340,11.365548,...,6.157826,-24.923586,3.505773,22.472013,0.822416,13.314386,-8.424767,-4.751421,0.8,0.0
3,-1.245725,-61.890360,-10.944223,-28.744022,-21.122776,5.692782,6.425568,-23.328235,12.141918,-26.956546,...,-18.495349,38.186314,35.827498,0.698516,-32.681089,-37.501800,0.721025,-2.531215,0.8,1.0
4,-2.220171,-10.517216,11.609531,-38.920287,-5.892966,-16.320742,-9.273932,-27.707361,-13.854269,10.280570,...,6.986093,-8.857934,12.256922,4.276123,14.467975,-10.705385,13.231528,-8.750230,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,-28.569509,-28.308935,-30.539431,35.985106,-6.512955,-8.778813,17.990215,-15.999391,-4.439684,47.295864,...,7.689935,-8.331371,4.342946,12.497296,13.024842,-6.794076,-26.197068,23.458648,0.6,1.0
666,-17.063599,9.985508,-2.198110,7.652448,-24.864333,-12.079481,-17.011674,-4.491991,18.043424,14.764026,...,-4.575483,16.289337,11.720775,-10.685365,-12.825035,-6.985276,39.686379,4.781869,0.4,1.0
667,2.394116,-2.139921,2.009705,7.510385,-14.321433,-40.951049,19.609748,53.610940,-26.081454,4.527180,...,-4.634480,27.071341,-9.443227,9.452167,-22.992694,-7.486482,-4.579166,-12.421986,1.0,1.0
668,36.761927,-25.000343,3.301388,0.362135,18.327313,-41.295488,1.343736,-18.088403,-10.867919,31.844518,...,41.694799,-2.352035,6.136603,34.469769,-1.209818,-4.232712,18.122589,-2.093984,0.6,0.0


In [7]:
# fit final submodels
model1 = DecisionTreeClassifier()
model1.fit(X, y)
model2 = KNeighborsClassifier()
model2.fit(X, y)

KNeighborsClassifier()

In [8]:
# construct meta classifier
meta_model = LogisticRegression(solver='liblinear')
meta_model.fit(meta_X, data_y)

LogisticRegression(solver='liblinear')

In [9]:
# evaluate sub models on hold out dataset
acc1 = accuracy_score(y_val, model1.predict(X_val))
acc2 = accuracy_score(y_val, model2.predict(X_val))
print('Model1 Accuracy: %.3f, Model2 Accuracy: %.3f' % (acc1, acc2))

Model1 Accuracy: 0.773, Model2 Accuracy: 0.882


In [10]:
# evaluate meta model on hold out dataset
yhat = stack_prediction(model1, model2, meta_model, X_val)
acc = accuracy_score(y_val, yhat)
print('Meta Model Accuracy: %.3f' % (acc))

Meta Model Accuracy: 0.927
