In [1]:
import pandas as pd
import numpy as np
import copy as cp
import matplotlib.pyplot as plt
from icecream import ic

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor

from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict, train_test_split
from sklearn.ensemble import StackingClassifier, StackingRegressor
from sklearn.metrics import accuracy_score
from imblearn.pipeline import Pipeline

from typing import Tuple

import sys
sys.path.insert(1, r'C:\Users\GHarrison\OneDrive - Lincoln College\Python Projects\Data Science\Utilities')

from misc_tools import make_classification_dataframe, make_regression_dataframe
from pipeline_tools import DataBinaryClassifierLevel0Stacker

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

  from pandas import MultiIndex, Int64Index


In [2]:
# Notebook-wide settings.
RANDOM_STATE: int = 42  # seed shared by dataset generation, the split, the CV splitter and every model

FEATURE_NAME_PREFIX: str = "feature_"
TARGET_NAME: str = "target"
PASSTHROUGH: bool = True  # forwarded to both stacking implementations below

# Turn icecream tracing on.
ic.enable()

# Accumulator for per-model score arrays; in the visible cells it is only
# referenced by the commented-out cross-validation code.
results = []

In [3]:
# Synthetic binary-classification dataset: 12,000 rows, 25 features.
df_classification = make_classification_dataframe(
    n_samples=12000,
    n_features=25,
    n_classes=2,
    feature_name_prefix=FEATURE_NAME_PREFIX,
    target_name=TARGET_NAME,
    random_state=RANDOM_STATE,
)

# Separate the label column from the predictors.
y = df_classification[TARGET_NAME]
X = df_classification.drop(columns=[TARGET_NAME])

# Hold out 2,000 rows for validation. X / y are rebound to the remaining
# training rows, so every later cell that uses X / y sees the training split only.
X, X_val, y, y_val = train_test_split(X, y, test_size=2000, random_state=RANDOM_STATE)

X.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25
9334,0.136638,-0.713842,0.157276,0.055179,-1.499486,-0.801896,2.413641,1.05526,-0.360709,0.390722,...,-0.951505,0.919107,0.327989,1.240349,1.916928,0.560735,0.149824,1.345824,1.194973,-0.079852
895,-1.966571,0.152219,0.859824,-2.668824,-1.029491,-1.125497,-2.036671,-0.798295,-0.256613,-0.717745,...,0.977914,0.464718,-0.742434,-1.61832,0.198326,-1.046548,0.523912,0.939122,-0.528114,0.179478
11264,-0.373778,-1.119354,-1.258326,-0.852311,-2.152573,-0.423335,-1.046887,0.50248,-0.140387,-0.276063,...,-1.698537,1.660224,-1.115649,-0.0828,-0.492556,1.075708,0.199834,1.419582,-0.767532,1.047571
7724,-0.755063,-1.821703,-1.222018,-0.566342,-1.165231,2.542285,0.20489,0.127901,1.21173,-2.145323,...,-0.380833,1.201113,1.500583,0.250396,1.149989,3.020883,-1.3429,-0.488408,0.632942,-0.5831
765,0.975995,-0.876509,0.98074,0.997508,-0.926742,5.945441,0.244172,-0.699326,2.134937,0.793963,...,-1.506136,-1.196388,3.100057,0.917948,0.522144,2.914381,-2.820182,0.18517,0.630581,-0.837476


In [4]:
# Base (level-0) learners, keyed by the short name that later prefixes their
# "<name>_prediction" column. Insertion order matters: it fixes the column order.
level_0_classifiers = {
    "logreg": LogisticRegression(random_state=RANDOM_STATE),
    "forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "xgboost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE),
    "xtrees": ExtraTreesClassifier(random_state=RANDOM_STATE),
}

# Meta (level-1) learner trained on the level-0 outputs.
# Alternative that was tried: ExtraTreesClassifier(random_state=RANDOM_STATE)
level_1_classifier = RandomForestClassifier(random_state=RANDOM_STATE)

# Shuffled, stratified 5-fold splitter handed to the scikit-learn stack.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

In [5]:
# Reference implementation: scikit-learn's own stacking ensemble, built from the
# same level-0 / level-1 estimators and the shared passthrough setting.
sk_stacking_clf = StackingClassifier(
    estimators=list(level_0_classifiers.items()),
    final_estimator=level_1_classifier,
    passthrough=PASSTHROUGH,
    cv=kfold,
)

In [6]:
# Labels for the array returned by StackingClassifier.transform(): one
# "<name>_prediction" column per level-0 estimator, in dictionary order.
columns = [f"{name}_prediction" for name in level_0_classifiers]

# With passthrough enabled the original features are appended after the
# predictions, so their labels must follow in the same position.
# A plain truthiness check is used instead of comparing against True.
if PASSTHROUGH:
    columns += list(X.columns)

In [7]:
# Fit the scikit-learn stack on the training split and label what transform() emits.
# NOTE(review): fit_transform() is fit() followed by transform(), so these prediction
# columns presumably come from the fully fitted level-0 models scoring their own
# training rows, not the out-of-fold predictions the final estimator was trained on —
# confirm against the StackingClassifier documentation.
sk_train_meta = sk_stacking_clf.fit_transform(X, y)
df_X_new_features = pd.DataFrame(sk_train_meta, columns=columns)
df_X_new_features

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,logreg_prediction,forest_prediction,xgboost_prediction,xtrees_prediction
0,0.746721,0.91,0.908653,1.0,0.136638,-0.713842,0.157276,0.055179,-1.499486,-0.801896,...,-0.951505,0.919107,0.327989,1.240349,1.916928,0.560735,0.149824,1.345824,1.194973,-0.079852
1,0.301752,0.07,0.058239,0.0,-1.966571,0.152219,0.859824,-2.668824,-1.029491,-1.125497,...,0.977914,0.464718,-0.742434,-1.618320,0.198326,-1.046548,0.523912,0.939122,-0.528114,0.179478
2,0.786726,0.89,0.963420,1.0,-0.373778,-1.119354,-1.258326,-0.852311,-2.152573,-0.423335,...,-1.698537,1.660224,-1.115649,-0.082800,-0.492556,1.075708,0.199834,1.419582,-0.767532,1.047571
3,0.681193,0.96,0.979050,1.0,-0.755063,-1.821703,-1.222018,-0.566342,-1.165231,2.542285,...,-0.380833,1.201113,1.500583,0.250396,1.149989,3.020883,-1.342900,-0.488408,0.632942,-0.583100
4,0.206296,0.03,0.008406,0.0,0.975995,-0.876509,0.980740,0.997508,-0.926742,5.945441,...,-1.506136,-1.196388,3.100057,0.917948,0.522144,2.914381,-2.820182,0.185170,0.630581,-0.837476
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.805857,0.91,0.972783,1.0,-0.110827,-1.415858,0.770497,-0.934064,0.392124,-0.365793,...,-0.841045,1.081216,0.603585,-0.391405,0.062470,1.166114,0.599579,-0.116912,-0.228063,-0.725702
9996,0.808834,1.00,0.994323,1.0,1.015395,0.206763,0.251755,0.512048,0.105519,-0.597608,...,-0.418663,0.237959,0.200852,-0.286942,0.459424,-1.125065,1.595078,-1.334188,-0.977567,-0.034062
9997,0.595495,0.93,0.984356,1.0,0.053380,2.320757,-0.843711,2.480900,0.470590,0.787763,...,-0.581459,-0.674763,-0.203324,-1.646048,-1.236306,-1.222219,-1.697821,1.534112,1.330090,0.602752
9998,0.655721,0.89,0.872995,1.0,0.114489,-0.611051,0.800832,0.411475,1.681659,0.123260,...,-1.325198,-1.955506,-0.665176,-0.851898,-0.283757,1.030944,-0.749034,-0.589223,1.008358,-0.312456


In [8]:
# Level-1 input features for the hold-out rows, produced by the already-fitted stack.
sk_val_meta = sk_stacking_clf.transform(X_val)
df_X_val_features = pd.DataFrame(sk_val_meta, columns=columns)
df_X_val_features

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,logreg_prediction,forest_prediction,xgboost_prediction,xtrees_prediction
0,0.717201,0.64,0.178837,0.52,0.415469,0.411862,-1.634722,0.493020,-1.250545,-0.908314,...,0.736321,-1.220649,-0.405279,-0.412312,-0.261661,-1.121773,0.932722,0.669847,-0.089025,-1.017019
1,0.381207,0.61,0.477847,0.53,0.170100,-0.104261,1.917754,0.183027,0.163020,0.471627,...,1.014576,-0.318079,0.773256,-1.076906,-1.732387,0.432308,-0.959863,0.766296,2.154993,-0.094526
2,0.598337,0.05,0.000796,0.04,-0.084925,-0.633780,1.418194,-0.117754,-0.709695,2.174506,...,-1.612763,-0.449997,0.467080,0.288697,0.648790,0.249238,0.957683,0.544059,0.958509,0.028478
3,0.800211,0.88,0.913390,0.83,0.655658,-1.313357,-0.348048,-0.570056,1.358423,0.496086,...,0.461585,-0.567885,-0.513629,-0.292769,-1.225837,1.828138,-0.486385,0.779645,0.014967,-0.820339
4,0.639493,0.26,0.147193,0.26,-1.700129,0.245802,-1.620627,-1.255857,0.499093,1.223248,...,1.021395,-0.603323,0.975825,-0.014682,-1.132178,0.567860,-1.137821,0.064875,-0.827798,0.031466
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.371030,0.69,0.861986,0.66,0.461364,0.063958,1.132255,0.461108,0.875910,1.692888,...,-1.718175,-0.568335,0.847304,-0.527557,-0.021910,1.466664,-2.612767,0.186981,0.378239,-0.006171
1996,0.135996,0.67,0.935728,0.71,-0.205624,2.511897,-0.329651,1.070317,-0.284051,2.489013,...,-0.809895,1.588344,1.408561,-1.359416,0.970821,-2.603045,-0.385369,1.441050,0.035564,0.257851
1997,0.494486,0.27,0.298372,0.31,-0.510378,0.463750,-2.569178,0.491924,1.129880,-2.018141,...,0.477701,0.451899,-0.985815,-0.982354,0.813381,-1.165687,0.264327,-1.038564,0.419433,-0.410661
1998,0.246052,0.45,0.576192,0.43,0.044800,0.082125,-1.948593,0.626859,0.656630,0.393313,...,0.880140,-0.108929,-0.540090,0.339565,-0.251932,-0.126483,-0.734668,-0.446358,0.695878,-0.024620


In [9]:
# Second predict_proba column (index 1) from the scikit-learn stack, for the hold-out rows.
sk_val_proba = sk_stacking_clf.predict_proba(X_val)[:, 1]
pd.DataFrame(sk_val_proba, columns=["y_val_predict_proba"])

Unnamed: 0,y_val_predict_proba
0,0.36
1,0.62
2,0.00
3,0.98
4,0.07
...,...
1995,0.89
1996,0.82
1997,0.21
1998,0.46


In [10]:
#scores = cross_val_score(sk_stacking_clf, X_val, y_val, cv=kfold)
#results.append(scores)
#print(f"Accuracy of scikit-learn stacking: {np.mean(scores)}")

In [11]:
# Hold-out accuracy of the scikit-learn stacking classifier.
sk_val_accuracy = accuracy_score(y_val, sk_stacking_clf.predict(X_val))
print(f"Accuracy of scikit-learn stacking classifier: {sk_val_accuracy}")

Accuracy of scikit-learn stacking classifier: 0.882


In [12]:
# Scratch-built counterpart of the scikit-learn stack. Deep copies are used so this
# pipeline fits its own estimator objects rather than the shared
# level_0_classifiers / level_1_classifier instances.
stacker = DataBinaryClassifierLevel0Stacker(
    cp.deepcopy(level_0_classifiers),
    passthrough=PASSTHROUGH,
    save_x=True,
)
level_1 = cp.deepcopy(level_1_classifier)

# Two-step pipeline: level-0 feature generation, then the level-1 classifier.
stacking_clf = Pipeline([("stacker", stacker), ("level_1", level_1)])
stacking_clf.fit(X, y)

# NOTE(review): save_x=True appears to keep the most recently transformed frame on
# stacker.X — confirm in pipeline_tools.
stacker.X

ic| 'DataBinaryClassifierLevel0Stacker.init'
ic| 'DataBinaryClassifierLevel0Stacker.fit'
ic| 'DataBinaryClassifierLevel0Stacker.transform'


Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,logreg_prediction,forest_prediction,xgboost_prediction,xtrees_prediction
0,0.136638,-0.713842,0.157276,0.055179,-1.499486,-0.801896,2.413641,1.055260,-0.360709,0.390722,...,1.916928,0.560735,0.149824,1.345824,1.194973,-0.079852,0.746721,0.91,0.908653,1.0
1,-1.966571,0.152219,0.859824,-2.668824,-1.029491,-1.125497,-2.036671,-0.798295,-0.256613,-0.717745,...,0.198326,-1.046548,0.523912,0.939122,-0.528114,0.179478,0.301752,0.07,0.058239,0.0
2,-0.373778,-1.119354,-1.258326,-0.852311,-2.152573,-0.423335,-1.046887,0.502480,-0.140387,-0.276063,...,-0.492556,1.075708,0.199834,1.419582,-0.767532,1.047571,0.786726,0.89,0.963420,1.0
3,-0.755063,-1.821703,-1.222018,-0.566342,-1.165231,2.542285,0.204890,0.127901,1.211730,-2.145323,...,1.149989,3.020883,-1.342900,-0.488408,0.632942,-0.583100,0.681193,0.96,0.979050,1.0
4,0.975995,-0.876509,0.980740,0.997508,-0.926742,5.945441,0.244172,-0.699326,2.134937,0.793963,...,0.522144,2.914381,-2.820182,0.185170,0.630581,-0.837476,0.206296,0.03,0.008406,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-0.110827,-1.415858,0.770497,-0.934064,0.392124,-0.365793,0.072646,2.370523,1.014336,-0.242340,...,0.062470,1.166114,0.599579,-0.116912,-0.228063,-0.725702,0.805857,0.91,0.972783,1.0
9996,1.015395,0.206763,0.251755,0.512048,0.105519,-0.597608,0.728701,-1.131978,-0.395691,0.636372,...,0.459424,-1.125065,1.595078,-1.334188,-0.977567,-0.034062,0.808834,1.00,0.994323,1.0
9997,0.053380,2.320757,-0.843711,2.480900,0.470590,0.787763,-0.871454,0.445174,-0.438621,-0.802435,...,-1.236306,-1.222219,-1.697821,1.534112,1.330090,0.602752,0.595495,0.93,0.984356,1.0
9998,0.114489,-0.611051,0.800832,0.411475,1.681659,0.123260,0.910106,2.215521,1.134489,0.162280,...,-0.283757,1.030944,-0.749034,-0.589223,1.008358,-0.312456,0.655721,0.89,0.872995,1.0


In [13]:
# Second predict_proba column (index 1) from the scratch-built stack, for the hold-out rows.
scratch_val_proba = stacking_clf.predict_proba(X_val)
y_val_predict_proba = scratch_val_proba[:, 1]
pd.DataFrame({"y_val_predict_proba": y_val_predict_proba})

ic| 'DataBinaryClassifierLevel0Stacker.transform'


Unnamed: 0,y_val_predict_proba
0,0.90
1,0.94
2,0.01
3,1.00
4,0.00
...,...
1995,1.00
1996,1.00
1997,0.00
1998,0.04


In [14]:
# Re-inspect the frame saved on the stacker. This cell runs after the
# predict_proba(X_val) call, and the rows displayed are the hold-out features
# rather than the training features shown right after fit.
# NOTE(review): this implies every transform() overwrites stacker.X — confirm in pipeline_tools.
stacker.X

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,logreg_prediction,forest_prediction,xgboost_prediction,xtrees_prediction
0,0.415469,0.411862,-1.634722,0.493020,-1.250545,-0.908314,0.774669,-0.654257,-0.310905,-1.278994,...,-0.261661,-1.121773,0.932722,0.669847,-0.089025,-1.017019,0.717201,0.64,0.178837,0.52
1,0.170100,-0.104261,1.917754,0.183027,0.163020,0.471627,0.234345,1.001994,0.710237,-0.888785,...,-1.732387,0.432308,-0.959863,0.766296,2.154993,-0.094526,0.381207,0.61,0.477847,0.53
2,-0.084925,-0.633780,1.418194,-0.117754,-0.709695,2.174506,-0.795447,-0.165586,-0.789234,0.240126,...,0.648790,0.249238,0.957683,0.544059,0.958509,0.028478,0.598337,0.05,0.000796,0.04
3,0.655658,-1.313357,-0.348048,-0.570056,1.358423,0.496086,-0.268185,-0.201080,-0.479328,-0.538986,...,-1.225837,1.828138,-0.486385,0.779645,0.014967,-0.820339,0.800211,0.88,0.913390,0.83
4,-1.700129,0.245802,-1.620627,-1.255857,0.499093,1.223248,-1.394655,-0.948536,1.233670,1.263330,...,-1.132178,0.567860,-1.137821,0.064875,-0.827798,0.031466,0.639493,0.26,0.147193,0.26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.461364,0.063958,1.132255,0.461108,0.875910,1.692888,-0.398486,-0.651813,-1.268504,1.095510,...,-0.021910,1.466664,-2.612767,0.186981,0.378239,-0.006171,0.371030,0.69,0.861986,0.66
1996,-0.205624,2.511897,-0.329651,1.070317,-0.284051,2.489013,0.066763,1.889454,-2.236448,1.070888,...,0.970821,-2.603045,-0.385369,1.441050,0.035564,0.257851,0.135996,0.67,0.935728,0.71
1997,-0.510378,0.463750,-2.569178,0.491924,1.129880,-2.018141,0.224241,0.028497,-0.432341,0.183069,...,0.813381,-1.165687,0.264327,-1.038564,0.419433,-0.410661,0.494486,0.27,0.298372,0.31
1998,0.044800,0.082125,-1.948593,0.626859,0.656630,0.393313,-0.006703,1.985926,2.359756,-1.140756,...,-0.251932,-0.126483,-0.734668,-0.446358,0.695878,-0.024620,0.246052,0.45,0.576192,0.43


In [15]:
#scores = cross_val_score(stacking_clf, X_val, y_val, cv=kfold)
#results.append(scores)
#print(f"Accuracy of scratch-built stacking classifier: {np.mean(scores)}")

In [16]:
# Hold-out accuracy of the scratch-built stacking pipeline.
scratch_val_accuracy = accuracy_score(y_val, stacking_clf.predict(X_val))
print(f"Accuracy of scratch-built stacking classifier: {scratch_val_accuracy}")

ic| 'DataBinaryClassifierLevel0Stacker.transform'


Accuracy of scratch-built stacking classifier: 0.8735


In [17]:
# Baseline: train each level-0 model on its own with the training split and score it
# on the hold-out rows. This fits the estimator objects stored in
# level_0_classifiers in place.
for classifier_name, classifier in level_0_classifiers.items():
    #scores = cross_val_score(classifier, X_val, y_val, cv=kfold)
    #results.append(scores)
    classifier.fit(X, y)
    standalone_accuracy = accuracy_score(y_val, classifier.predict(X_val))
    print(f"Accuracy of standalone {classifier_name} classifier: {standalone_accuracy}")

Accuracy of standalone logreg classifier: 0.737
Accuracy of standalone forest classifier: 0.8675
Accuracy of standalone xgboost classifier: 0.8645
Accuracy of standalone xtrees classifier: 0.8635


In [18]:
#labels=["scikit stack", "scratch stack"] + list(level_0_classifiers.keys())

#plt.figure(figsize=(10,10))
#plt.boxplot(results, labels=labels, showmeans=True)
#plt.show()

Grid searching next!!