https://towardsdatascience.com/simple-model-stacking-explained-and-automated-1b54e4357916

In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import make_classification

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score


  from pandas import MultiIndex, Int64Index


In [2]:
RANDOM_STATE : int = 42
N_SAMPLES : int = 10000
N_FEATURES : int = 25
N_CLASSES : int = 3
N_CLUSTERS_PER_CLASS : int = 2
    
FEATURE_NAME_PREFIX : str = "Feature"
TARGET_NAME : str = "Target"
    
N_SPLITS : int = 5
    
np.set_printoptions(suppress=True)

In [3]:
def make_classification_dataframe(n_samples : int = 10000, n_features : int = 25, n_classes : int = 2, n_clusters_per_class : int = 2, feature_name_prefix : str = "Feature", target_name : str = "Target", random_state : int = 42) -> pd.DataFrame:
    X, y = make_classification(n_samples=n_samples, n_features=n_features, n_classes=n_classes, n_informative = n_classes * n_clusters_per_class, random_state=random_state)

    feature_names = [feature_name_prefix + " " + str(v) for v in np.arange(1, n_features+1)]
    return pd.concat([pd.DataFrame(X, columns=feature_names), pd.DataFrame(y, columns=[target_name])], axis=1)

df_data = make_classification_dataframe(n_samples=N_SAMPLES, n_features=N_FEATURES, n_classes=N_CLASSES, n_clusters_per_class=N_CLUSTERS_PER_CLASS, feature_name_prefix=FEATURE_NAME_PREFIX, target_name=TARGET_NAME, random_state=RANDOM_STATE)

X = df_data.drop([TARGET_NAME], axis=1).to_numpy()
y = df_data[TARGET_NAME].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=RANDOM_STATE)

df_data.head()

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,...,Feature 17,Feature 18,Feature 19,Feature 20,Feature 21,Feature 22,Feature 23,Feature 24,Feature 25,Target
0,-0.131637,2.281512,0.46881,0.707735,1.628051,0.622273,-0.434003,-0.992722,0.053795,-1.764985,...,-1.673779,0.276305,-1.685462,-0.801336,0.806151,0.369108,-0.843748,0.966868,-0.547149,2
1,-1.231544,-1.58088,0.684543,-0.343771,0.498176,-0.008396,-0.859592,-0.666477,-0.832989,-0.287655,...,0.341136,1.116596,1.134896,1.232907,1.295312,-0.253926,-0.528711,0.502124,0.896065,1
2,-1.301585,-1.922563,-0.623878,-0.740534,-0.723667,1.484827,1.227018,-0.050878,0.164059,0.301672,...,-0.90029,0.682905,0.680959,-0.02355,0.932216,0.109495,0.500366,0.956182,-2.268742,0
3,-0.899385,0.991619,0.494529,-0.672954,0.421605,-0.271674,1.245351,0.146567,0.389313,1.479558,...,-0.285753,-1.446158,-0.062296,0.583408,1.588965,0.412651,-1.891714,-0.575163,0.786847,0
4,-3.026721,0.745777,0.18845,-0.794256,1.40257,1.057481,0.454773,-0.174391,0.951417,-0.403872,...,0.959229,-1.964891,-0.296422,-0.755737,-0.489769,0.516726,-4.807225,1.215506,0.799321,1


In [50]:
logreg = LogisticRegression(random_state=RANDOM_STATE)
forest = RandomForestClassifier(random_state=RANDOM_STATE)
xgboost = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE)
xtrees = ExtraTreesClassifier(random_state=RANDOM_STATE)

classifiers = dict()
classifiers["Logistic Regression"] = logreg
classifiers["Random Forest"] = forest
classifiers["XG Boost"] = xgboost
classifiers["Extra Random Trees"] = xtrees

In [57]:
stacking = StackingClassifier(estimators=list(classifiers.items()), final_estimator=forest, passthrough=False, stack_method='predict_proba')
X_train_transformed = pd.DataFrame(stacking.fit_transform(X_train, y_train))
X_train_transformed.head(50)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.225436,0.320894,0.45367,0.14,0.84,0.02,0.022234,0.964327,0.013439,0.0,1.0,0.0
1,0.495543,0.163691,0.340766,0.07,0.08,0.85,0.001658,0.067161,0.931181,0.0,0.0,1.0
2,0.083266,0.901984,0.01475,0.03,0.94,0.03,0.004582,0.994284,0.001134,0.0,1.0,0.0
3,0.040706,0.354275,0.605019,0.03,0.03,0.94,0.023676,0.002184,0.97414,0.0,0.0,1.0
4,0.029568,0.895861,0.074572,0.76,0.19,0.05,0.907884,0.052006,0.04011,1.0,0.0,0.0
5,0.447072,0.380114,0.172814,0.15,0.81,0.04,0.131787,0.794447,0.073766,0.0,1.0,0.0
6,0.136434,0.634008,0.229558,0.0,1.0,0.0,0.001136,0.99826,0.000604,0.0,1.0,0.0
7,0.324279,0.352179,0.323542,0.13,0.86,0.01,0.024332,0.974973,0.000695,0.0,1.0,0.0
8,0.608856,0.295115,0.096029,0.97,0.0,0.03,0.998816,0.000907,0.000277,1.0,0.0,0.0
9,0.251321,0.187937,0.560743,0.04,0.07,0.89,0.027435,0.069736,0.90283,0.0,0.0,1.0


In [62]:
from sklearn.preprocessing import OneHotEncoder

In [64]:
xtrees.fit(X_train, y_train)
y_pred = xtrees.predict(X_train)

In [68]:
y_pred

array([1, 2, 1, ..., 1, 1, 1])

In [123]:
OneHotEncoder().fit_transform(y_pred.reshape(-1, 1)).toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [73]:
enc = OneHotEncoder()
pd.DataFrame(enc.fit_transform(pd.DataFrame(y_pred)).toarray()).head(50)

Unnamed: 0,0,1,2
0,0.0,1.0,0.0
1,0.0,0.0,1.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0
5,0.0,1.0,0.0
6,0.0,1.0,0.0
7,0.0,1.0,0.0
8,1.0,0.0,0.0
9,0.0,0.0,1.0


In [56]:
stacking = StackingClassifier(estimators=list(classifiers.items()), final_estimator=forest, passthrough=False, stack_method='predict')
X_train_transformed = pd.DataFrame(stacking.fit_transform(X_train, y_train))
X_train_transformed.head(50)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.225436,0.320894,0.45367,0.14,0.84,0.02,0.022234,0.964327,0.013439,0.0,1.0,0.0
1,0.495543,0.163691,0.340766,0.07,0.08,0.85,0.001658,0.067161,0.931181,0.0,0.0,1.0
2,0.083266,0.901984,0.01475,0.03,0.94,0.03,0.004582,0.994284,0.001134,0.0,1.0,0.0
3,0.040706,0.354275,0.605019,0.03,0.03,0.94,0.023676,0.002184,0.97414,0.0,0.0,1.0
4,0.029568,0.895861,0.074572,0.76,0.19,0.05,0.907884,0.052006,0.04011,1.0,0.0,0.0
5,0.447072,0.380114,0.172814,0.15,0.81,0.04,0.131787,0.794447,0.073766,0.0,1.0,0.0
6,0.136434,0.634008,0.229558,0.0,1.0,0.0,0.001136,0.99826,0.000604,0.0,1.0,0.0
7,0.324279,0.352179,0.323542,0.13,0.86,0.01,0.024332,0.974973,0.000695,0.0,1.0,0.0
8,0.608856,0.295115,0.096029,0.97,0.0,0.03,0.998816,0.000907,0.000277,1.0,0.0,0.0
9,0.251321,0.187937,0.560743,0.04,0.07,0.89,0.027435,0.069736,0.90283,0.0,0.0,1.0


In [45]:
pd.DataFrame(y_train)

Unnamed: 0,0
0,1
1,2
2,1
3,2
4,0
...,...
7495,1
7496,0
7497,1
7498,1


In [43]:
X_train_transformed.iloc[:,6].value_counts()

0.0    4985
1.0    2515
Name: 6, dtype: int64

In [7]:
X_test_transformed = pd.DataFrame(stacking.transform(X_test))
X_test_transformed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,0.70,0.06,0.24,0.963820,0.001190,0.034990,0.72,0.10,0.18,-1.923680,...,-0.266477,-1.024834,-1.062337,-1.306103,0.184019,1.433802,-0.919363,-0.080104,2.121039,-0.156530
1,0.06,0.04,0.90,0.022556,0.047805,0.929639,0.08,0.14,0.78,-2.239125,...,0.242999,1.739232,1.649360,-0.976718,1.183260,-0.933728,-0.524347,-1.170550,-0.885650,1.189926
2,0.72,0.01,0.27,0.478593,0.002203,0.519203,0.62,0.11,0.27,-0.500576,...,-1.176086,1.107225,0.581034,-0.949959,1.437078,-1.176040,1.082862,-1.009292,-0.441545,0.968120
3,0.63,0.33,0.04,0.964388,0.027408,0.008205,0.62,0.29,0.09,0.845388,...,0.143708,-1.725321,0.084987,1.258574,-0.326457,-0.427567,-2.325347,3.777429,-0.555361,0.972041
4,0.31,0.08,0.61,0.170270,0.004125,0.825605,0.23,0.04,0.73,-2.532036,...,-0.733581,2.254119,0.373960,1.297756,-1.290494,-0.033883,0.487246,-3.598182,-1.025821,0.397011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,0.11,0.11,0.78,0.017525,0.006610,0.975865,0.18,0.12,0.70,1.278444,...,1.851266,0.910141,0.290917,1.176805,-0.766175,-0.769960,0.304190,-0.003604,-0.639746,-0.210730
2496,0.91,0.04,0.05,0.999155,0.000714,0.000132,0.94,0.04,0.02,2.606106,...,1.358832,0.761766,-1.532720,0.705756,-0.275926,-0.425304,-0.535919,1.451105,0.521238,1.173778
2497,0.37,0.02,0.61,0.117575,0.000518,0.881907,0.39,0.00,0.61,0.102733,...,0.607906,0.854272,-0.654608,0.357886,-0.635132,0.776470,0.392627,-0.126965,-0.839296,-1.069467
2498,0.09,0.83,0.08,0.070149,0.928011,0.001840,0.17,0.73,0.10,-0.919652,...,0.254336,-1.200407,1.490431,-0.745242,-1.682237,1.202939,-0.466124,-3.233246,-1.103422,-1.044651


In [8]:
logreg.fit(X_train, y_train)
forest.fit(X_train, y_train)
xgboost.fit(X_train, y_train)
xtrees.fit(X_train, y_train)

ExtraTreesClassifier(random_state=42)

In [9]:
#print(f"Accuracy of logreg: {accuracy_score(y_test, logreg.predict(X_test))}")
#print(f"Accuracy of forest: {accuracy_score(y_test, forest.predict(X_test))}")
#print(f"Accuracy of xgboost: {accuracy_score(y_test, xgboost.predict(X_test))}")
#print(f"Accuracy of xtrees: {accuracy_score(y_test, xtrees.predict(X_test))}")
#print(f"Accuracy of stacking: {accuracy_score(y_test, stacking.predict(X_test))}")

In [9]:
1/0

ZeroDivisionError: division by zero

In [13]:
kfold = KFold(n_splits=5, shuffle=False)

In [None]:
print(f"Accuracy of logreg: {np.mean(cross_val_score(logreg, X_train, y_train, cv=kfold))}")
print(f"Accuracy of forest: {np.mean(cross_val_score(forest, X_train, y_train, cv=kfold))}")
print(f"Accuracy of xgboost: {np.mean(cross_vbal_score(xgboost, X_train, y_train, cv=kfold))}")
print(f"Accuracy of xtrees: {np.mean(cross_val_score(xtrees, X_train, y_train, cv=kfold))}")
print(f"Accuracy of stacking: {np.mean(cross_val_score(stacking, X_train, y_train, cv=kfold))}")

In [16]:
from typing import Tuple
import copy as cp

In [17]:
def cross_val_predict(model, kfold : KFold, X : np.array, y : np.array) -> Tuple[np.array, np.array, np.array]:

    model_ = cp.deepcopy(model)
    
    no_classes = len(np.unique(y))
    
    actual_classes = np.empty([0], dtype=int)
    predicted_classes = np.empty([0], dtype=int)
    predicted_proba = np.empty([0, no_classes]) 

    for train_ndx, test_ndx in kfold.split(X):

        train_X, train_y, test_X, test_y = X[train_ndx], y[train_ndx], X[test_ndx], y[test_ndx]

        actual_classes = np.append(actual_classes, test_y)

        model_.fit(train_X, train_y)
        predicted_classes = np.append(predicted_classes, model_.predict(test_X))

        try:
            predicted_proba = np.append(predicted_proba, model_.predict_proba(test_X), axis=0)
        except:
            predicted_proba = np.append(predicted_proba, np.zeros((len(test_X), no_classes), dtype=float), axis=0)

    return actual_classes, predicted_classes, predicted_proba

In [18]:
actual_classes, predicted_classes, predicted_proba = cross_val_predict(forest, kfold, X, y)

In [19]:
predicted_classes

array([2, 1, 0, ..., 1, 0, 0])

In [20]:
train_ndx

NameError: name 'train_ndx' is not defined

In [21]:
model = forest

In [22]:
model_ = cp.deepcopy(model)

no_classes = len(np.unique(y))

actual_classes = np.empty([0], dtype=int)
predicted_classes = np.empty([0], dtype=int)
predicted_proba = np.empty([0, no_classes]) 

for train_ndx, test_ndx in kfold.split(X):

    train_X, train_y, test_X, test_y = X[train_ndx], y[train_ndx], X[test_ndx], y[test_ndx]

    actual_classes = np.append(actual_classes, test_y)

    model_.fit(train_X, train_y)
    predicted_classes = np.append(predicted_classes, model_.predict(test_X))

    try:
        predicted_proba = np.append(predicted_proba, model_.predict_proba(test_X), axis=0)
    except:
        predicted_proba = np.append(predicted_proba, np.zeros((len(test_X), no_classes), dtype=float), axis=0)

In [24]:
train_ndx

array([   0,    1,    2, ..., 7997, 7998, 7999])

In [26]:
model_.predict(test_X)

array([1, 1, 2, ..., 1, 0, 0])

In [27]:
model_.predict_proba(test_X)

array([[0.06, 0.85, 0.09],
       [0.05, 0.86, 0.09],
       [0.11, 0.08, 0.81],
       ...,
       [0.02, 0.95, 0.03],
       [0.79, 0.05, 0.16],
       [0.72, 0.07, 0.21]])

In [34]:
model_ = cp.deepcopy(model)

no_rows = X.shape[0]
no_classes = len(np.unique(y))

In [95]:
predicted_classes = np.zeros((no_rows))
predicted_probas = np.zeros((no_rows, no_classes))

In [96]:
kfold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

In [129]:
def convert_predicted_to_predicted_proba(predicted : np.ndarray) -> np.ndarray:
    return OneHotEncoder().fit_transform(predicted.reshape(-1, 1)).toarray()

VERY IMPORTANT!!!

In [132]:
for train_ndx, test_ndx in kfold.split(X):

    train_X, train_y, test_X, test_y = X[train_ndx], y[train_ndx], X[test_ndx], y[test_ndx]
    model_.fit(train_X, train_y)
    
    predicted_classes[test_ndx] = model_.predict(test_X)
    
    if hasattr(model_, "predict_proba"):
        predicted_probas[test_ndx] = model_.predict_proba(test_X)
    else:
        predicted_probas[test_ndx] = convert_predicted_to_predicted_proba(predicted_classes[test_ndx])

In [135]:
model_.predict_proba(test_X)

array([[0.22, 0.61, 0.17],
       [0.05, 0.94, 0.01],
       [0.06, 0.03, 0.91],
       ...,
       [0.46, 0.08, 0.46],
       [0.15, 0.  , 0.85],
       [0.01, 0.97, 0.02]])

In [134]:
hasattr(model_, "predict_proba")

True

In [127]:
logreg.fit(train_X, train_y)
logreg.predict_proba(test_X)

array([[0.38985601, 0.51936127, 0.09078271],
       [0.26500694, 0.32920024, 0.40579283],
       [0.1129679 , 0.01012418, 0.87690792],
       ...,
       [0.38909524, 0.26281518, 0.34808958],
       [0.15035778, 0.03440465, 0.81523757],
       [0.02246867, 0.84043645, 0.13709489]])

In [125]:
predicted_probas

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       ...,
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [128]:
predicted_probas[test_ndx] = logreg.predict_proba(test_X)

In [117]:
type(model_.predict(test_X))

numpy.ndarray

In [116]:
hasattr(model_, "predicted_proba")

False

In [114]:
model_.predicted_proba(test_X)

AttributeError: 'RandomForestClassifier' object has no attribute 'predicted_proba'

In [112]:
accuracy_score(y, predicted_classes)

0.8742

In [108]:
unique, frequency = np.unique(predicted_classes, return_counts = True)

In [109]:
print(unique)
print(frequency)

[0. 1. 2.]
[3377 3499 3124]


In [92]:
predicted_classes.shape

(10000, 1)

In [97]:
predicted_classes[test_ndx]

array([0., 0., 0., ..., 0., 0., 0.])

In [81]:
train_ndx

array([2000, 2001, 2002, ..., 9997, 9998, 9999])

In [41]:
model_.decision_function(test_X)

AttributeError: 'RandomForestClassifier' object has no attribute 'decision_function'

In [None]:
actual_classes = np.empty([0], dtype=int)
predicted_classes = np.empty([0], dtype=int)
predicted_proba = np.empty([0, no_classes]) 

for train_ndx, test_ndx in kfold.split(X):

    train_X, train_y, test_X, test_y = X[train_ndx], y[train_ndx], X[test_ndx], y[test_ndx]

    actual_classes = np.append(actual_classes, test_y)

    model_.fit(train_X, train_y)
    predicted_classes = np.append(predicted_classes, model_.predict(test_X))

    try:
        predicted_proba = np.append(predicted_proba, model_.predict_proba(test_X), axis=0)
    except:
        predicted_proba = np.append(predicted_proba, np.zeros((len(test_X), no_classes), dtype=float), axis=0)