In [2]:
import pandas as pd
import numpy as np
import copy as cp
import matplotlib.pyplot as plt
from icecream import ic

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor

from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict, train_test_split
from sklearn.ensemble import StackingClassifier, StackingRegressor
from sklearn.metrics import accuracy_score
from imblearn.pipeline import Pipeline

from typing import Tuple

import sys
sys.path.insert(1, r'C:\Users\GHarrison\OneDrive - Lincoln College\Python Projects\Data Science\Utilities')

from misc_tools import make_classification_dataframe, make_regression_dataframe
#from pipeline_tools import DataBinaryClassifierStacker

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
RANDOM_STATE : int = 42

FEATURE_NAME_PREFIX : str = "feature_"
TARGET_NAME : str = "target"
    
ic.enable()

In [4]:
df_classification = make_classification_dataframe(n_samples=1200, n_features=25, n_classes=2, feature_name_prefix=FEATURE_NAME_PREFIX, target_name=TARGET_NAME, random_state=RANDOM_STATE)

X = df_classification.drop([TARGET_NAME], axis=1)
y = df_classification[TARGET_NAME]

X, X_val, y, y_val = train_test_split(X, y, test_size=200, random_state=RANDOM_STATE)

X.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25
808,1.51273,-0.169446,0.014846,-2.828687,-0.030509,2.571755,-1.063632,1.073635,0.775254,2.460824,...,0.866567,0.458418,0.147444,0.010072,-0.526291,-1.093318,0.339562,0.509393,-2.220689,2.06267
907,0.240776,0.019366,-1.43009,0.152658,0.701261,0.32804,-0.276158,-0.534686,0.378379,1.643665,...,1.42399,2.052654,1.185314,-1.046971,0.976296,1.33573,1.223209,0.174802,-0.029188,-0.484682
657,-1.346543,-0.282905,-1.24729,0.457061,-0.010965,0.272648,-1.098173,1.208717,0.613984,-0.064836,...,1.096648,0.675428,-0.237639,-1.609636,0.844471,0.490604,-1.327062,0.682108,-0.907528,0.729052
956,-0.284754,-1.049815,2.402844,1.198485,-0.214063,-0.438494,-2.103193,-0.811754,0.130803,1.349681,...,-0.050278,-0.094065,1.518612,1.673591,0.413294,2.114339,-0.045554,-0.789869,0.520921,-2.004984
885,0.130724,2.003741,1.417081,-1.149047,0.87507,2.064449,0.085855,-0.028957,-0.750355,0.103558,...,-2.152612,1.143486,-1.496239,-0.765354,-1.158733,-0.303889,-0.410975,0.201964,-0.766173,0.538481


In [5]:
level_0_classifiers = dict()
level_0_classifiers["logreg"] = LogisticRegression(random_state=RANDOM_STATE)
level_0_classifiers["forest"] = RandomForestClassifier(random_state=RANDOM_STATE)
level_0_classifiers["xgboost"] = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE)
level_0_classifiers["xtrees"] = ExtraTreesClassifier(random_state=RANDOM_STATE)

level_1_classifier = RandomForestClassifier(random_state=RANDOM_STATE)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

stacking_clf = StackingClassifier(estimators=list(level_0_classifiers.items()), final_estimator=level_1_classifier, passthrough=False, cv=kfold)

In [6]:
df_X_new_features = pd.DataFrame(stacking_clf.fit_transform(X, y), columns=[f"{name}_prediction" for name in level_0_classifiers.keys()])
df_X_new_features

Unnamed: 0,logreg_prediction,forest_prediction,xgboost_prediction,xtrees_prediction
0,0.076957,0.07,0.000514,0.0
1,0.513460,0.95,0.994133,1.0
2,0.852920,0.94,0.990162,1.0
3,0.879913,0.97,0.999854,1.0
4,0.721514,0.77,0.940432,1.0
...,...,...,...,...
995,0.312670,0.09,0.010553,0.0
996,0.832097,1.00,0.999595,1.0
997,0.547054,0.11,0.018569,0.0
998,0.081879,0.02,0.002624,0.0


In [6]:
stacker = DataBinaryClassifierStacker(level_0_classifiers)
stacker.fit(X, y)
stacker.transform(X)

ic| 'DataBinaryClassifierStacker.init'
ic| 'DataBinaryClassifierStacker.fit'
ic| 'DataBinaryClassifierStacker.transform'


Unnamed: 0,logreg_prediction,forest_prediction,xgboost_prediction,xtrees_prediction
0,0.076957,0.07,0.000514,0.0
1,0.513460,0.95,0.994133,1.0
2,0.852920,0.94,0.990162,1.0
3,0.879913,0.97,0.999854,1.0
4,0.721514,0.77,0.940432,1.0
...,...,...,...,...
995,0.312670,0.09,0.010553,0.0
996,0.832097,1.00,0.999595,1.0
997,0.547054,0.11,0.018569,0.0
998,0.081879,0.02,0.002624,0.0


In [7]:
df_X_val_features = pd.DataFrame(stacking_clf.transform(X_val), columns=[f"{name}_prediction" for name in level_0_classifiers.keys()])
df_X_val_features

Unnamed: 0,logreg_prediction,forest_prediction,xgboost_prediction,xtrees_prediction
0,0.098688,0.52,0.797417,0.54
1,0.680668,0.52,0.892057,0.57
2,0.118382,0.67,0.956150,0.59
3,0.408617,0.30,0.041542,0.35
4,0.376713,0.21,0.000640,0.16
...,...,...,...,...
195,0.642742,0.89,0.998618,0.82
196,0.184693,0.59,0.920475,0.64
197,0.916046,0.97,0.999343,0.93
198,0.157261,0.04,0.000503,0.11


In [8]:
stacker.transform(X_val)

ic| 'DataBinaryClassifierStacker.transform'


Unnamed: 0,logreg_prediction,forest_prediction,xgboost_prediction,xtrees_prediction
0,0.098688,0.52,0.797417,0.54
1,0.680668,0.52,0.892057,0.57
2,0.118382,0.67,0.956150,0.59
3,0.408617,0.30,0.041542,0.35
4,0.376713,0.21,0.000640,0.16
...,...,...,...,...
195,0.642742,0.89,0.998618,0.82
196,0.184693,0.59,0.920475,0.64
197,0.916046,0.97,0.999343,0.93
198,0.157261,0.04,0.000503,0.11


In [7]:
pd.DataFrame(xtrees.predict_proba(X_val)[:,1], columns=["xtrees_prediction"])

NameError: name 'xtrees' is not defined

In [9]:
pd.DataFrame(stacking_clf.predict_proba(X_val)[:, 1], columns=["y_val_predict_proba"])

Unnamed: 0,y_val_predict_proba
0,0.79
1,0.82
2,0.84
3,0.00
4,0.00
...,...
195,1.00
196,0.96
197,1.00
198,0.00


In [10]:
print(f"Accuracy of scikit-learn stacking: {accuracy_score(y_val, stacking_clf.predict(X_val))}")

Accuracy of scikit-learn stacking: 0.885


In [13]:
forest = cp.deepcopy(level_1_classifier)
forest.fit(df_X_new_features, y)

pd.DataFrame(forest.predict_proba(df_X_val_features)[:,1], columns=["y_val_predict_proba"])

Unnamed: 0,y_val_predict_proba
0,0.75
1,0.75
2,1.00
3,0.00
4,0.00
...,...
195,1.00
196,1.00
197,1.00
198,0.00


In [14]:
y_val_predict_proba = cross_val_predict(estimator=forest, X=df_X_val_features, y=y_val, cv=kfold, method="predict_proba")
pd.DataFrame(y_val_predict_proba[:,1], columns=["y_val_predict_proba"])

Unnamed: 0,y_val_predict_proba
0,0.92
1,0.99
2,0.95
3,0.05
4,0.05
...,...
195,0.86
196,0.99
197,1.00
198,0.00


In [15]:
print(f"Accuracy of scratch-built stacking: {accuracy_score(y_val, forest.predict(df_X_val_features))}")

Accuracy of scratch-built stacking: 0.89


In [16]:
print(f"Accuracy of scratch-built stacking with level 1 cross validation: {accuracy_score(y_val, cross_val_predict(estimator=forest, X=df_X_val_features, y=y_val, cv=kfold))}")

Accuracy of scratch-built stacking with level 1 cross validation: 0.88


In [29]:
stacker = DataBinaryClassifierStacker(level_0_classifiers, save_x=True)
forest = RandomForestClassifier(random_state=RANDOM_STATE)

stacking_clf = Pipeline([
                         ('stacker', stacker), 
                         ('forest', forest) 
                        ])

stacking_clf.fit(X, y)
stacker.X

ic| 'DataBinaryClassifierStacker.init'
ic| 'DataBinaryClassifierStacker.fit'
ic| 'DataBinaryClassifierStacker.transform'


Unnamed: 0,logreg_prediction,forest_prediction,xgboost_prediction,xtrees_prediction
0,0.076957,0.07,0.000514,0.0
1,0.513460,0.95,0.994133,1.0
2,0.852920,0.94,0.990162,1.0
3,0.879913,0.97,0.999854,1.0
4,0.721514,0.77,0.940432,1.0
...,...,...,...,...
995,0.312670,0.09,0.010553,0.0
996,0.832097,1.00,0.999595,1.0
997,0.547054,0.11,0.018569,0.0
998,0.081879,0.02,0.002624,0.0


In [31]:
y_val_predict_probas = stacking_clf.predict_proba(X_val)[:, 1]
y_val_predict_probas

ic| 'DataBinaryClassifierStacker.transform'


array([0.75, 0.75, 1.  , 0.  , 0.  , 0.  , 1.  , 1.  , 0.  , 0.  , 1.  ,
       1.  , 0.  , 0.  , 0.  , 1.  , 1.  , 1.  , 0.04, 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.33, 0.  , 0.  , 1.  , 0.42, 1.  , 1.  , 0.33, 0.67,
       0.  , 0.  , 1.  , 0.  , 1.  , 1.  , 1.  , 1.  , 0.  , 0.  , 1.  ,
       0.  , 0.  , 0.28, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.72, 0.  ,
       1.  , 1.  , 0.  , 1.  , 0.  , 1.  , 0.67, 1.  , 1.  , 0.  , 0.  ,
       0.  , 0.95, 1.  , 1.  , 0.  , 1.  , 1.  , 0.08, 1.  , 0.  , 0.  ,
       1.  , 0.  , 0.  , 0.  , 0.  , 1.  , 0.  , 0.  , 0.  , 1.  , 1.  ,
       1.  , 0.  , 0.  , 0.  , 0.  , 1.  , 0.  , 0.  , 0.  , 0.  , 1.  ,
       0.  , 0.  , 1.  , 1.  , 1.  , 0.  , 0.  , 0.  , 0.  , 1.  , 0.  ,
       0.  , 1.  , 1.  , 0.  , 0.  , 0.  , 1.  , 0.72, 0.  , 1.  , 1.  ,
       0.68, 0.  , 0.  , 1.  , 1.  , 0.  , 0.  , 1.  , 0.67, 0.  , 1.  ,
       1.  , 0.  , 0.  , 0.  , 0.65, 1.  , 1.  , 0.  , 0.61, 0.  , 1.  ,
       1.  , 0.39, 0.  , 1.  , 0.14, 1.  , 0.  , 1.

In [32]:
y_val_predict = stacking_clf.predict(X_val)
y_val_predict

ic| 'DataBinaryClassifierStacker.transform'


array([1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 1])

In [33]:
stacker.X

Unnamed: 0,logreg_prediction,forest_prediction,xgboost_prediction,xtrees_prediction
0,0.098688,0.52,0.797417,0.54
1,0.680668,0.52,0.892057,0.57
2,0.118382,0.67,0.956150,0.59
3,0.408617,0.30,0.041542,0.35
4,0.376713,0.21,0.000640,0.16
...,...,...,...,...
195,0.642742,0.89,0.998618,0.82
196,0.184693,0.59,0.920475,0.64
197,0.916046,0.97,0.999343,0.93
198,0.157261,0.04,0.000503,0.11


In [34]:
print(f"Accuracy of scratch-built stacking classifier: {accuracy_score(y_val, stacking_clf.predict(X_val))}")

ic| 'DataBinaryClassifierStacker.transform'


Accuracy of scratch-built stacking classifier: 0.89
