https://towardsdatascience.com/simple-model-stacking-explained-and-automated-1b54e4357916

In [1]:
import pandas as pd
import numpy as np
import copy as cp

from sklearn.datasets import make_classification

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier

from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score

#from sklearn.preprocessing import OneHotEncoder
from typing import Tuple

  from pandas import MultiIndex, Int64Index


In [2]:
# Seed shared by every stochastic component in the notebook.
RANDOM_STATE: int = 42

# Shape of the synthetic classification problem.
N_SAMPLES: int = 10_000
N_FEATURES: int = 25
N_CLASSES: int = 3
N_CLUSTERS_PER_CLASS: int = 2

# Column naming used when the generated arrays are wrapped in a DataFrame.
FEATURE_NAME_PREFIX: str = "Feature"
TARGET_NAME: str = "Target"

# Fold count intended for the cross-validation splitter.
N_SPLITS: int = 5

# Show floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [3]:
def make_classification_dataframe(n_samples : int = 10000, n_features : int = 25, n_classes : int = 2, n_clusters_per_class : int = 2, feature_name_prefix : str = "Feature", target_name : str = "Target", random_state : int = 42) -> pd.DataFrame:
    """Generate a synthetic classification dataset and return it as one DataFrame.

    Args:
        n_samples: Number of rows to generate.
        n_features: Number of feature columns to generate.
        n_classes: Number of distinct target labels.
        n_clusters_per_class: Number of clusters per class; also used to size
            ``n_informative`` as ``n_classes * n_clusters_per_class``.
        feature_name_prefix: Prefix for feature columns, which are named
            ``"<prefix> 1"`` .. ``"<prefix> n_features"``.
        target_name: Name of the label column, appended as the last column.
        random_state: Seed forwarded to ``make_classification``.

    Returns:
        A DataFrame with ``n_features`` feature columns followed by the target
        column.
    """
    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_classes=n_classes,
        # Previously this argument was accepted but never forwarded, so the
        # generator always fell back to its own default cluster count.
        n_clusters_per_class=n_clusters_per_class,
        n_informative=n_classes * n_clusters_per_class,
        random_state=random_state,
    )

    feature_names = [f"{feature_name_prefix} {i}" for i in range(1, n_features + 1)]
    return pd.concat([pd.DataFrame(X, columns=feature_names), pd.DataFrame(y, columns=[target_name])], axis=1)

# Build the synthetic dataset from the notebook-level configuration constants.
df_data = make_classification_dataframe(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_classes=N_CLASSES,
    n_clusters_per_class=N_CLUSTERS_PER_CLASS,
    feature_name_prefix=FEATURE_NAME_PREFIX,
    target_name=TARGET_NAME,
    random_state=RANDOM_STATE,
)

# Split the frame into a feature matrix and a label vector (plain NumPy arrays).
y = df_data[TARGET_NAME].to_numpy()
X = df_data.drop(columns=[TARGET_NAME]).to_numpy()

# Hold-out split, stratified on the label so class proportions are preserved.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=RANDOM_STATE
)

df_data.head()

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,...,Feature 17,Feature 18,Feature 19,Feature 20,Feature 21,Feature 22,Feature 23,Feature 24,Feature 25,Target
0,-0.131637,2.281512,0.46881,0.707735,1.628051,0.622273,-0.434003,-0.992722,0.053795,-1.764985,...,-1.673779,0.276305,-1.685462,-0.801336,0.806151,0.369108,-0.843748,0.966868,-0.547149,2
1,-1.231544,-1.58088,0.684543,-0.343771,0.498176,-0.008396,-0.859592,-0.666477,-0.832989,-0.287655,...,0.341136,1.116596,1.134896,1.232907,1.295312,-0.253926,-0.528711,0.502124,0.896065,1
2,-1.301585,-1.922563,-0.623878,-0.740534,-0.723667,1.484827,1.227018,-0.050878,0.164059,0.301672,...,-0.90029,0.682905,0.680959,-0.02355,0.932216,0.109495,0.500366,0.956182,-2.268742,0
3,-0.899385,0.991619,0.494529,-0.672954,0.421605,-0.271674,1.245351,0.146567,0.389313,1.479558,...,-0.285753,-1.446158,-0.062296,0.583408,1.588965,0.412651,-1.891714,-0.575163,0.786847,0
4,-3.026721,0.745777,0.18845,-0.794256,1.40257,1.057481,0.454773,-0.174391,0.951417,-0.403872,...,0.959229,-1.964891,-0.296422,-0.755737,-0.489769,0.516726,-4.807225,1.215506,0.799321,1


In [4]:
# Base estimators for the stack, keyed by a human-readable name.
# Insertion order matters: it fixes the column order of the stacked features.
classifiers = {
    "Logistic Regression": LogisticRegression(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "XG Boost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE),
    "Extra Random Trees": ExtraTreesClassifier(random_state=RANDOM_STATE),
    "Light Gradient Boost": LGBMClassifier(random_state=RANDOM_STATE),
}

In [5]:
# Shuffled K-fold splitter shared by every cross-validated step below.
# The fold count comes from the N_SPLITS config constant instead of a literal.
kfold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

In [6]:
# Stack the base classifiers using their hard class predictions
# (stack_method='predict') as the only inputs to the final estimator.
stacking = StackingClassifier(
    estimators=[(name, model) for name, model in classifiers.items()],
    final_estimator=classifiers["Random Forest"],
    passthrough=False,
    stack_method='predict',
    cv=kfold,
)

# One column per base classifier, holding the label it predicts for each training row.
X_train_transformed = pd.DataFrame(stacking.fit_transform(X_train, y_train))
X_train_transformed.head(50)

Unnamed: 0,0,1,2,3,4
0,2,1,1,1,1
1,0,2,2,2,2
2,1,1,1,1,1
3,2,2,2,2,2
4,1,0,0,0,0
5,0,1,1,1,1
6,1,1,1,1,1
7,1,1,1,1,1
8,0,0,0,0,0
9,2,2,2,2,2


In [7]:
# Same stack as before, but the final estimator now receives class
# probabilities (stack_method='predict_proba') instead of hard labels.
stacking = StackingClassifier(
    estimators=[(name, model) for name, model in classifiers.items()],
    final_estimator=classifiers["Random Forest"],
    passthrough=False,
    stack_method='predict_proba',
    cv=kfold,
)

# One column per (base classifier, class) pair for each training row.
X_train_transformed = pd.DataFrame(stacking.fit_transform(X_train, y_train))
X_train_transformed.head(50)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.225436,0.320894,0.45367,0.14,0.84,0.02,0.022234,0.964327,0.013439,0.0,1.0,0.0,0.063863,0.900234,0.035904
1,0.495543,0.163691,0.340766,0.07,0.08,0.85,0.001658,0.067161,0.931181,0.0,0.0,1.0,0.021835,0.169403,0.808762
2,0.083266,0.901984,0.01475,0.03,0.94,0.03,0.004582,0.994284,0.001134,0.0,1.0,0.0,0.015852,0.9788,0.005347
3,0.040706,0.354275,0.605019,0.03,0.03,0.94,0.023676,0.002184,0.97414,0.0,0.0,1.0,0.007546,0.002722,0.989732
4,0.029568,0.895861,0.074572,0.76,0.19,0.05,0.907884,0.052006,0.04011,1.0,0.0,0.0,0.729493,0.16047,0.110037
5,0.447072,0.380114,0.172814,0.15,0.81,0.04,0.131787,0.794447,0.073766,0.0,1.0,0.0,0.194288,0.681134,0.124578
6,0.136434,0.634008,0.229558,0.0,1.0,0.0,0.001136,0.99826,0.000604,0.0,1.0,0.0,0.004742,0.992248,0.00301
7,0.324279,0.352179,0.323542,0.13,0.86,0.01,0.024332,0.974973,0.000695,0.0,1.0,0.0,0.097017,0.898744,0.004238
8,0.608856,0.295115,0.096029,0.97,0.0,0.03,0.998816,0.000907,0.000277,1.0,0.0,0.0,0.987112,0.004181,0.008707
9,0.251321,0.187937,0.560743,0.04,0.07,0.89,0.027435,0.069736,0.90283,0.0,0.0,1.0,0.036889,0.229955,0.733156


In [8]:
# Apply the already-fitted stack to the held-out rows to get their stacked features.
X_test_transformed = pd.DataFrame(data=stacking.transform(X_test))
X_test_transformed.head(50)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.62179,0.103891,0.274319,0.7,0.06,0.24,0.96382,0.00119,0.03499,0.72,0.1,0.18,0.863187,0.011215,0.125598
1,0.193028,0.176281,0.630691,0.06,0.04,0.9,0.022556,0.047805,0.929639,0.08,0.14,0.78,0.031565,0.04105,0.927385
2,0.600337,0.305408,0.094255,0.72,0.01,0.27,0.478593,0.002203,0.519203,0.62,0.11,0.27,0.689631,0.011785,0.298584
3,0.376967,0.466962,0.156071,0.63,0.33,0.04,0.964388,0.027408,0.008205,0.62,0.29,0.09,0.968239,0.025662,0.006099
4,0.275219,0.050864,0.673917,0.31,0.08,0.61,0.17027,0.004125,0.825605,0.23,0.04,0.73,0.328383,0.012138,0.659478
5,0.150842,0.218049,0.631109,0.73,0.1,0.17,0.997399,0.000581,0.002021,0.61,0.24,0.15,0.975294,0.003459,0.021247
6,0.434125,0.389606,0.176269,0.1,0.31,0.59,0.125768,0.096489,0.777742,0.13,0.31,0.56,0.150311,0.058854,0.790835
7,0.503085,0.458733,0.038182,0.11,0.21,0.68,0.002583,0.005254,0.992163,0.19,0.13,0.68,0.005783,0.016183,0.978034
8,0.492724,0.146682,0.360594,0.61,0.19,0.2,0.874488,0.046873,0.078639,0.59,0.17,0.24,0.683293,0.115734,0.200973
9,0.047425,0.00589,0.946684,0.02,0.07,0.91,6e-05,0.000686,0.999254,0.04,0.13,0.83,0.000622,0.000744,0.998634


In [9]:
#logreg.fit(X_train, y_train)
#forest.fit(X_train, y_train)
#xgboost.fit(X_train, y_train)
#xtrees.fit(X_train, y_train)
#light.fit(X_train, y_train)

In [10]:
#print(f"Accuracy of logreg: {np.mean(cross_val_score(logreg, X_train, y_train, cv=kfold))}")
#print(f"Accuracy of forest: {np.mean(cross_val_score(forest, X_train, y_train, cv=kfold))}")
#print(f"Accuracy of xgboost: {np.mean(cross_val_score(xgboost, X_train, y_train, cv=kfold))}")
#print(f"Accuracy of xtrees: {np.mean(cross_val_score(xtrees, X_train, y_train, cv=kfold))}")
#print(f"Accuracy of stacking: {np.mean(cross_val_score(stacking, X_train, y_train, cv=kfold))}")

In [11]:
def cross_val_predict(model, kfold : KFold, X : np.ndarray, y : np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Collect out-of-fold predictions for ``model`` over the splits of ``kfold``.

    A deep copy of ``model`` is refitted on each training split, so the object
    passed in is never modified.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; ``predict_proba`` is
            used as well when the fitted estimator provides it.
        kfold: Splitter whose ``split(X)`` yields ``(train_idx, test_idx)`` pairs.
        X: Feature matrix, indexed by the row positions that ``kfold`` yields.
        y: Labels aligned with the rows of ``X``.

    Returns:
        ``(predicted_classes, predicted_probas)``. ``predicted_classes`` has one
        integer per row of ``X``. ``predicted_probas`` has one row per row of
        ``X`` and one column per distinct value in ``y``; it stays all zeros
        when the estimator has no ``predict_proba``.
    """
    model_ = cp.deepcopy(model)

    predicted_classes = np.zeros(X.shape[0], dtype=int)
    predicted_probas = np.zeros((X.shape[0], len(np.unique(y))))

    for train_ndx, test_ndx in kfold.split(X):
        model_.fit(X[train_ndx], y[train_ndx])

        test_X = X[test_ndx]
        predicted_classes[test_ndx] = model_.predict(test_X)

        # Probabilities are optional: skip estimators that do not offer them,
        # but let genuine failures (e.g. a shape mismatch) surface instead of
        # silently leaving zero rows behind.
        if hasattr(model_, "predict_proba"):
            predicted_probas[test_ndx] = model_.predict_proba(test_X)

    return predicted_classes, predicted_probas

In [12]:
def stacking(X, y, classifiers : dict, kfold : KFold) -> Tuple[list, list]:
    """Run ``cross_val_predict`` for every classifier and collect the results.

    Args:
        X: Feature matrix passed to each cross-validated fit.
        y: Labels aligned with ``X``.
        classifiers: Mapping of display name to estimator; iteration order
            determines the order of the returned lists.
        kfold: Splitter forwarded unchanged to ``cross_val_predict``.

    Returns:
        ``(all_prediction_classes, all_predicted_probas)``: two lists with one
        entry per classifier, each holding what ``cross_val_predict`` returned
        for that classifier.
    """
    import time  # local import keeps this cell self-contained

    all_prediction_classes = []
    all_predicted_probas = []

    for model_name, model in classifiers.items():
        print(model_name)

        # Plain-Python timing replaces the %time magic, which is not valid
        # inside a function body outside IPython.
        started = time.perf_counter()
        # Use the X / y arguments rather than the notebook globals.
        predicted_classes, predicted_probas = cross_val_predict(model, kfold, X, y)
        print(f"Wall time: {time.perf_counter() - started:.3g} s")

        all_prediction_classes.append(predicted_classes)
        all_predicted_probas.append(predicted_probas)

    return all_prediction_classes, all_predicted_probas

In [13]:
def zip_uneven(A, B):
    """Return every ``(a, b)`` pair with ``a`` from ``A`` and ``b`` from ``B``.

    Pairs are ordered with ``A`` as the outer loop and ``B`` as the inner
    loop, so all pairs for the first element of ``A`` come first.
    """
    return [(outer, inner) for outer in A for inner in B]

In [18]:
def stacking_arrays_to_dataframes(y, all_predicted_classes : np.ndarray, all_predicted_probas : np.ndarray) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Turn per-classifier prediction arrays into two labelled DataFrames.

    Args:
        y: Labels; the number of distinct values gives the number of
            probability columns per classifier.
        all_predicted_classes: One 1-D array of predicted labels per classifier.
        all_predicted_probas: One 2-D array of class probabilities per
            classifier (rows are samples, columns are classes).

    Returns:
        ``(df_all_predicted_classes, df_all_predicted_probas)``. The first has
        one ``classifier_<i>_predicted_class`` column per classifier. The second
        has one ``classifier_<i>_class_<j>_predicted_proba`` column per
        (classifier, class) pair, grouped by classifier.
    """
    # Take the counts from the arguments, not from a notebook-level global.
    n_classifiers = len(all_predicted_classes)
    n_classes = len(np.unique(y))

    df_all_predicted_classes = pd.DataFrame(np.array(all_predicted_classes).T)
    df_all_predicted_classes.columns = [f"classifier_{i}_predicted_class" for i in range(n_classifiers)]

    df_all_predicted_probas = pd.concat([pd.DataFrame(x) for x in all_predicted_probas], axis=1)
    df_all_predicted_probas.columns = [
        f"classifier_{i}_class_{j}_predicted_proba"
        for i in range(n_classifiers)
        for j in range(n_classes)
    ]

    return df_all_predicted_classes, df_all_predicted_probas

In [19]:
# Run every base classifier through the cross-validated prediction loop ...
all_predicted_classes, all_predicted_probas = stacking(
    X=X_train, y=y_train, classifiers=classifiers, kfold=kfold
)

# ... then wrap the raw arrays in DataFrames with descriptive column names.
df_all_predicted_classes, df_all_predicted_probas = stacking_arrays_to_dataframes(
    y=y_train,
    all_predicted_classes=all_predicted_classes,
    all_predicted_probas=all_predicted_probas,
)

Logistic Regression
Wall time: 241 ms
Random Forest
Wall time: 10.3 s
XG Boost
Wall time: 12.4 s
Extra Random Trees
Wall time: 3.76 s
Light Gradient Boost
Wall time: 1.56 s


In [20]:
# Show the predicted-class frame built by stacking_arrays_to_dataframes.
# The previous name, df_stacking_classes, is never assigned in this notebook.
df_all_predicted_classes.head(50)

Unnamed: 0,classifier_0_predicted_class,classifier_1_predicted_class,classifier_2_predicted_class,classifier_3_predicted_class,classifier_4_predicted_class
0,2,1,1,1,1
1,0,2,2,2,2
2,1,1,1,1,1
3,2,2,2,2,2
4,1,1,2,1,1
5,0,0,1,1,1
6,1,1,1,1,1
7,1,1,1,1,1
8,0,0,0,0,0
9,2,2,2,2,2


In [21]:
# Show the predicted-probability frame built by stacking_arrays_to_dataframes.
# The previous name, df_stacking_probas, is never assigned in this notebook.
df_all_predicted_probas.head(50)

Unnamed: 0,classifier_0_class_0_predicted_proba,classifier_0_class_1_predicted_proba,classifier_0_class_2_predicted_proba,classifier_1_class_0_predicted_proba,classifier_1_class_1_predicted_proba,classifier_1_class_2_predicted_proba,classifier_2_class_0_predicted_proba,classifier_2_class_1_predicted_proba,classifier_2_class_2_predicted_proba,classifier_3_class_0_predicted_proba,classifier_3_class_1_predicted_proba,classifier_3_class_2_predicted_proba,classifier_4_class_0_predicted_proba,classifier_4_class_1_predicted_proba,classifier_4_class_2_predicted_proba
0,0.209734,0.310986,0.47928,0.26,0.66,0.08,0.058285,0.926034,0.015682,0.29,0.47,0.24,0.070437,0.893598,0.035965
1,0.502314,0.163884,0.333802,0.07,0.24,0.69,0.013956,0.203732,0.782313,0.13,0.29,0.58,0.05985,0.295898,0.644252
2,0.07197,0.913253,0.014777,0.2,0.72,0.08,0.018456,0.979195,0.002349,0.17,0.75,0.08,0.019265,0.97001,0.010725
3,0.03904,0.360447,0.600513,0.13,0.06,0.81,0.013684,0.000345,0.985971,0.18,0.08,0.74,0.008801,0.001011,0.990188
4,0.028016,0.903524,0.06846,0.27,0.54,0.19,0.075446,0.255408,0.669145,0.29,0.53,0.18,0.046453,0.692597,0.26095
5,0.439519,0.384033,0.176449,0.4,0.38,0.22,0.368864,0.471128,0.160008,0.37,0.44,0.19,0.344692,0.502292,0.153017
6,0.134552,0.638719,0.226729,0.0,1.0,0.0,0.000714,0.998805,0.000481,0.08,0.87,0.05,0.002605,0.994973,0.002422
7,0.317693,0.359471,0.322836,0.41,0.52,0.07,0.128778,0.870393,0.000829,0.33,0.57,0.1,0.246591,0.747595,0.005814
8,0.628097,0.271256,0.100648,0.78,0.14,0.08,0.997731,0.001785,0.000484,0.78,0.15,0.07,0.991307,0.00621,0.002483
9,0.24419,0.193804,0.562006,0.12,0.12,0.76,0.036776,0.343504,0.619721,0.27,0.22,0.51,0.042116,0.362149,0.595735
