https://towardsdatascience.com/simple-model-stacking-explained-and-automated-1b54e4357916

In [1]:
import copy as cp
from typing import Tuple

import numpy as np
import pandas as pd

from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from xgboost import XGBClassifier


  from pandas import MultiIndex, Int64Index


In [2]:
# --- Reproducibility -------------------------------------------------------
RANDOM_STATE: int = 42

# --- Shape of the synthetic classification problem -------------------------
N_SAMPLES: int = 10_000
N_FEATURES: int = 25
N_CLASSES: int = 2
N_CLUSTERS_PER_CLASS: int = 2

# --- Column naming for the generated DataFrame -----------------------------
FEATURE_NAME_PREFIX: str = "Feature"
TARGET_NAME: str = "Target"

# --- Cross-validation ------------------------------------------------------
N_SPLITS: int = 5

# Print NumPy floats in fixed-point rather than scientific notation.
np.set_printoptions(suppress=True)

In [3]:
def make_classification_dataframe(n_samples : int = 10000, n_features : int = 25, n_classes : int = 2, n_clusters_per_class : int = 2, feature_name_prefix : str = "Feature", target_name : str = "Target", random_state : int = 42) -> pd.DataFrame:
    """Generate a synthetic classification dataset as a single DataFrame.

    Thin wrapper around ``sklearn.datasets.make_classification`` that labels
    the generated columns.

    Args:
        n_samples: Number of rows to generate.
        n_features: Number of feature columns to generate.
        n_classes: Number of distinct target classes.
        n_clusters_per_class: Number of clusters per class. Forwarded to
            ``make_classification`` and also used to size ``n_informative``
            (``n_classes * n_clusters_per_class``).
        feature_name_prefix: Prefix for feature column names; columns are
            named ``"<prefix> 1"`` .. ``"<prefix> <n_features>"``.
        target_name: Name of the target column.
        random_state: Seed passed to ``make_classification``.

    Returns:
        A DataFrame holding the feature columns followed by the target column.
    """
    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_classes=n_classes,
        # Previously accepted but never forwarded, so any non-default value
        # was silently ignored by the generator.
        n_clusters_per_class=n_clusters_per_class,
        n_informative=n_classes * n_clusters_per_class,
        random_state=random_state,
    )

    feature_names = [f"{feature_name_prefix} {i}" for i in range(1, n_features + 1)]
    return pd.concat([pd.DataFrame(X, columns=feature_names), pd.DataFrame(y, columns=[target_name])], axis=1)

# Build the synthetic dataset from the notebook-level configuration.
df_data = make_classification_dataframe(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_classes=N_CLASSES,
    n_clusters_per_class=N_CLUSTERS_PER_CLASS,
    feature_name_prefix=FEATURE_NAME_PREFIX,
    target_name=TARGET_NAME,
    random_state=RANDOM_STATE,
)

# Separate the label vector from the feature matrix (both as NumPy arrays).
y = df_data[TARGET_NAME].to_numpy()
X = df_data.drop(columns=[TARGET_NAME]).to_numpy()

# Stratified, seeded hold-out split; no test_size is given, so the library
# default is used.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=RANDOM_STATE,
)

df_data.head()

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,...,Feature 17,Feature 18,Feature 19,Feature 20,Feature 21,Feature 22,Feature 23,Feature 24,Feature 25,Target
0,0.070571,-1.130283,0.839677,1.274615,0.806554,0.381146,-0.204937,-0.53031,-0.643782,-1.660538,...,-0.829622,0.998934,-1.218144,-1.498923,-0.956609,-2.44064,1.867419,0.294196,-1.553635,0
1,-0.84059,1.494274,0.482513,0.656468,-0.545999,0.60465,-1.146993,1.292036,-1.72538,0.352301,...,0.628848,-1.041448,-1.194422,0.940196,0.443271,-1.61809,-0.434262,1.237573,-0.557803,0
2,2.722732,1.359169,0.97256,-0.830303,0.575233,1.68603,-0.512335,-0.911075,-0.675029,1.585742,...,-0.089353,0.196096,-1.96733,1.286646,2.213952,-1.054453,0.730491,-0.341204,1.033947,1
3,1.23824,0.671604,0.934616,2.15817,0.022018,0.574397,0.668441,-0.513554,1.65908,-2.31786,...,-0.66701,0.192512,0.759408,-0.341545,-1.269465,-3.275364,-0.021736,-0.178306,-0.732551,1
4,-0.140533,-0.367666,-0.354481,0.052573,1.010696,-1.063206,0.651451,-1.27946,-0.428956,-2.797526,...,3.016907,0.677306,-2.877704,-0.196636,-0.900747,3.589431,0.135074,0.059449,2.157824,0


In [4]:
# Individual models, all seeded with the same RANDOM_STATE.
logreg = LogisticRegression(random_state=RANDOM_STATE)
forest = RandomForestClassifier(random_state=RANDOM_STATE)
xtrees = ExtraTreesClassifier(random_state=RANDOM_STATE)
xgboost = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=RANDOM_STATE,
)

# Named base learners handed to the stacking ensemble (logreg is not included).
classifiers = {
    "Random Forest": forest,
    "XG Boost": xgboost,
    "Extra Random Trees": xtrees,
}

In [5]:
# Stacking ensemble: the named base learners feed a random-forest final
# estimator; passthrough=True is requested so the original features are kept
# alongside the base-model outputs.
stacking = StackingClassifier(
    estimators=[(label, estimator) for label, estimator in classifiers.items()],
    final_estimator=forest,
    passthrough=True,
    stack_method='auto',
)

In [6]:
# Fit the stack on the training split and keep the transformed training
# matrix, wrapped in a DataFrame for display.
X_train_transformed = pd.DataFrame(
    data=stacking.fit_transform(X_train, y_train),
)
X_train_transformed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,0.00,0.000986,0.0,-1.890798,-0.820897,0.027454,-0.758479,-0.121849,-0.450950,-0.825182,...,0.800682,0.249175,0.010539,-1.184596,-0.058743,0.010661,-1.083764,0.656168,-0.846507,1.176347
1,0.01,0.000449,0.0,1.037791,-0.810117,0.878458,0.169326,0.539767,-1.263531,-0.089366,...,0.925682,-2.111345,0.072262,-0.601497,-1.229546,-1.684535,-1.078337,0.335416,0.018787,-0.053672
2,1.00,0.999901,1.0,-0.472732,-1.146887,-0.828661,0.619617,-1.641017,0.024614,2.049131,...,1.009305,1.004784,1.350867,1.215216,1.097283,0.744286,0.470633,-2.178767,0.104064,0.190475
3,0.99,0.999987,1.0,0.027449,0.577326,-0.243292,-0.073800,0.283755,0.428542,0.599659,...,0.899345,1.788373,-0.362664,1.292242,-0.546338,0.697256,-0.649141,0.976553,0.143573,-1.074178
4,0.91,0.985718,1.0,-1.162818,1.079029,-0.285378,-0.202614,-0.041241,-0.442634,0.440834,...,-2.526848,1.093803,0.391426,-1.269720,1.230010,-1.376395,-1.239515,-0.551828,-0.850033,-0.373546
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,0.02,0.000473,0.0,-0.894065,-1.099041,0.694203,0.784716,1.772547,0.724153,1.619800,...,2.463182,-1.288113,0.687864,-1.207455,-0.541035,0.927435,1.775722,0.727120,0.286639,0.769577
7496,0.01,0.001918,0.0,-1.925427,-0.748352,-0.719456,-0.321320,-1.397995,1.211832,0.453788,...,1.614808,-0.188150,-1.645921,-2.399614,2.335478,-0.400701,4.783661,-0.755358,-0.118149,0.821963
7497,0.06,0.039249,0.0,1.377576,0.622223,-1.882078,-0.054372,0.021745,-0.853999,-0.189219,...,0.373809,0.022213,1.545025,-0.205557,1.027997,0.361686,1.736339,-0.439960,-0.043446,0.599378
7498,0.91,0.952235,1.0,0.528051,-1.386392,-0.230937,-0.565218,1.861328,-0.134951,-0.714880,...,-0.283669,-0.128680,-1.955854,0.478391,-0.332809,0.421780,2.327378,-0.224688,-0.351049,0.178467


In [7]:
# Apply the already-fitted stack to the held-out split (no refitting here).
X_test_transformed = pd.DataFrame(
    data=stacking.transform(X_test),
)
X_test_transformed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,0.36,0.021654,0.38,-1.116231,-0.165217,-0.453874,0.894287,0.905040,-0.414390,0.782977,...,1.228276,-0.964808,1.267398,0.943528,-0.992728,-0.370286,3.182693,0.264317,-1.686468,-0.896478
1,0.90,0.998550,0.80,-0.464860,0.770462,-0.339809,-0.609718,0.433825,0.421999,-0.623473,...,-0.108923,0.581493,0.291507,1.288656,-1.552046,0.185461,-3.414209,1.601693,2.591739,0.012746
2,0.06,0.000495,0.08,0.527295,0.369743,0.082750,-0.463134,1.599087,1.854572,0.764849,...,1.681180,-0.712856,-1.059541,-1.641284,-1.100612,1.168561,2.680552,1.473598,0.373078,-1.029882
3,0.60,0.535063,0.54,1.248859,0.854145,-0.664763,1.341085,-2.292267,1.105842,-0.449332,...,-1.066580,0.068374,-0.460319,0.541586,1.332353,-1.341671,-1.430363,-1.260925,-0.078624,-0.525947
4,0.93,0.999097,0.93,-0.879259,0.706285,-1.093639,2.012105,-1.161510,-1.046290,1.156288,...,0.071314,2.162160,0.424344,0.248813,0.806376,1.560065,0.217187,-0.108620,-0.647381,0.518267
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,0.14,0.006708,0.14,0.419126,-1.810173,-1.003915,-0.934490,-0.305715,1.664819,1.516759,...,-1.135885,1.586598,0.112717,-0.405194,-0.296500,-1.557889,-1.990256,-0.135821,-1.523495,0.026518
2496,0.28,0.431432,0.41,0.171527,0.619409,0.249913,-1.238045,0.067260,-0.087755,-0.383894,...,-0.346917,-1.375387,-0.380063,-1.719611,1.594456,1.081443,0.495701,2.079036,-0.170664,-1.103172
2497,0.98,0.999858,0.91,-1.130117,1.185428,0.038803,-1.247651,-1.457146,-1.018690,0.946337,...,1.044901,0.387935,-0.059200,-0.151501,-0.973389,0.459888,-0.860254,-1.110882,1.107713,-1.561118
2498,0.32,0.118486,0.44,-0.018714,1.981024,-2.912818,0.704183,-1.461290,1.247252,-1.952860,...,0.576238,-1.918325,-1.414725,-0.445390,1.042036,-0.310295,1.192157,0.891257,0.291166,-0.111179


In [8]:
# Fit each standalone model on the training split.
for clf in (logreg, forest, xgboost):
    clf.fit(X_train, y_train)

# Fitted last as a bare expression so the cell still displays its repr.
xtrees.fit(X_train, y_train)

ExtraTreesClassifier(random_state=42)

In [12]:
# NOTE(review): this expression always raises ZeroDivisionError. Presumably a
# deliberate "stop here" guard so that Run All halts before the
# cross-validation cells below -- confirm, and remove before sharing.
1/0

ZeroDivisionError: division by zero

In [10]:
# Unshuffled K-fold splitter shared by the cross-validation cells; the fold
# count comes from the N_SPLITS config constant instead of a repeated literal.
kfold = KFold(n_splits=N_SPLITS, shuffle=False)

In [11]:
# Mean cross-validated score of every model on the training split, using the
# shared `kfold` splitter. No `scoring` argument is passed, so cross_val_score
# falls back to each estimator's default scorer.
for model_name, model in (
    ("logreg", logreg),
    ("forest", forest),
    ("xgboost", xgboost),  # this line previously called the misspelled `cross_vbal_score`
    ("xtrees", xtrees),
    ("stacking", stacking),
):
    print(f"Accuracy of {model_name}: {np.mean(cross_val_score(model, X_train, y_train, cv=kfold))}")

Accuracy of logreg: 0.7313333333333334
Accuracy of forest: 0.9248
Accuracy of xgboost: 0.9212
Accuracy of xtrees: 0.9224


KeyboardInterrupt: 

In [None]:
def cross_val_predict(model, kfold : KFold, X : np.ndarray, y : np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Collect out-of-fold labels, predictions and class probabilities.

    A deep copy of ``model`` is refitted on the training part of every split
    produced by ``kfold.split(X)`` and evaluated on the held-out part; the
    per-fold results are concatenated in split order.

    Args:
        model: Estimator exposing ``fit`` and ``predict`` (``predict_proba``
            is optional). The object passed in is never fitted; only a copy is.
        kfold: Splitter whose ``split(X)`` yields ``(train_idx, test_idx)``.
        X: Feature matrix, indexable by the index arrays from ``kfold``.
        y: Target vector aligned with ``X``.

    Returns:
        ``(actual_classes, predicted_classes, predicted_proba)``: two 1-D
        arrays plus an array with one column per distinct value in ``y``.
        Rows of ``predicted_proba`` are all zeros for any fold where the model
        offers no usable ``predict_proba`` or returns an unexpected shape.
    """
    # Work on a private copy so the caller's estimator is left untouched.
    model_ = cp.deepcopy(model)

    no_classes = len(np.unique(y))

    # Seed each list with a typed empty array so the result has the same
    # dtype/shape as before, even when the splitter yields no folds.
    actual_parts = [np.empty([0], dtype=int)]
    predicted_parts = [np.empty([0], dtype=int)]
    proba_parts = [np.empty([0, no_classes])]

    for train_ndx, test_ndx in kfold.split(X):
        train_X, train_y = X[train_ndx], y[train_ndx]
        test_X, test_y = X[test_ndx], y[test_ndx]

        actual_parts.append(np.ravel(test_y))

        model_.fit(train_X, train_y)
        predicted_parts.append(np.ravel(model_.predict(test_X)))

        fallback = np.zeros((len(test_X), no_classes), dtype=float)
        try:
            fold_proba = np.asarray(model_.predict_proba(test_X))
        except (AttributeError, NotImplementedError):
            # Model has no probability output: keep the zero placeholder.
            # Anything else (including KeyboardInterrupt) now propagates
            # instead of being silently replaced by zeros.
            fold_proba = fallback
        else:
            # A fold whose probabilities do not line up with the expected
            # (rows, classes) layout falls back to zeros, as it did when the
            # append itself failed inside the old bare ``except``.
            if fold_proba.shape != fallback.shape:
                fold_proba = fallback
        proba_parts.append(fold_proba)

    # One concatenation per output instead of re-copying on every fold.
    actual_classes = np.concatenate(actual_parts)
    predicted_classes = np.concatenate(predicted_parts)
    predicted_proba = np.concatenate(proba_parts, axis=0)

    return actual_classes, predicted_classes, predicted_proba