# Stacking Ensemble


In [19]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, r2_score, accuracy_score
from sklearn.svm import  SVC
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, ElasticNet
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.naive_bayes import GaussianNB

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import warnings
warnings.simplefilter('ignore')
from xgboost import XGBRegressor, XGBClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier, BaggingRegressor, RandomForestClassifier, AdaBoostClassifier, AdaBoostRegressor, GradientBoostingRegressor, GradientBoostingClassifier, StackingClassifier
import matplotlib.pyplot as plt

## Wisconsin Dataset (Breast Cancer)

In [7]:
# Load the Wisconsin breast-cancer table; the first CSV column is used as the row index.
cancer = pd.read_csv(
    "./Cases/Wisconsin/BreastCancer.csv",
    index_col=0,
)

# Preview the first rows (last expression, so the notebook renders it).
cancer.head()

Unnamed: 0_level_0,Clump,UniCell_Size,Uni_CellShape,MargAdh,SEpith,BareN,BChromatin,NoemN,Mitoses,Class
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
61634,5,4,3,1,2,2,2,3,1,Benign
63375,9,1,2,6,4,10,7,7,2,Malignant
76389,10,4,7,2,2,8,6,1,1,Malignant
95719,6,10,10,10,8,10,7,10,7,Malignant
128059,1,1,1,1,2,5,5,1,1,Benign


In [8]:
# 'Class' is the target; every remaining column is a predictor.
y = cancer['Class']
X = cancer.drop(columns='Class')

# 70/30 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=24,
)

In [9]:
# Base learners whose predictions feed the final estimator.
svm = SVC(kernel='linear')
lr = LogisticRegression()
dtc = DecisionTreeClassifier(random_state=24)

# Final (meta) estimator: a small, seeded random forest.
rf = RandomForestClassifier(n_estimators=25, random_state=24)

# Build the stack and fit it in one step (fit returns the estimator itself).
stack = StackingClassifier(
    estimators=[
        ('SVM', svm),
        ('LR', lr),
        ('TREE', dtc),
    ],
    final_estimator=rf,
).fit(X_train, y_train)

# Score on the hold-out set, treating 'Malignant' as the positive class.
y_pred = stack.predict(X_test)
print(f"F1 Score : {f1_score(y_test, y_pred, pos_label='Malignant')}")

F1 Score : 0.9523809523809523


In [10]:
# Same base learners and final estimator as the previous cell, but with
# passthrough=True the final estimator is trained on the original features
# in addition to the base learners' predictions.
stack = StackingClassifier(
    estimators=[
        ('SVM', svm),
        ('LR', lr),
        ('TREE', dtc),
    ],
    final_estimator=rf,
    passthrough=True,
).fit(X_train, y_train)

# Score on the hold-out set, treating 'Malignant' as the positive class.
y_pred = stack.predict(X_test)
print(f"F1 Score : {f1_score(y_test, y_pred, pos_label='Malignant')}")

F1 Score : 0.9523809523809523


## HR Dataset

In [11]:
# Load the HR analytics table from the local Cases directory.
hr = pd.read_csv(
    "./Cases/human-resources-analytics/HR_comma_sep.csv"
)

# Preview the first rows (last expression, so the notebook renders it).
hr.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.1,0.77,6,247,4,0,1,0,sales,low
3,0.92,0.85,5,259,5,0,1,0,sales,low
4,0.89,1.0,5,224,5,0,1,0,sales,low


In [12]:
# 'left' is the target; every remaining column is a predictor.
y = hr['left']
X = hr.drop(columns='left')

# Dense one-hot encoder that drops the first level of each category,
# ignores categories unseen at fit time, and returns a DataFrame.
ohe = OneHotEncoder(
    sparse_output=False,
    drop='first',
    handle_unknown='ignore',
).set_output(transform='pandas')

# Encode object-dtype columns and pass every other column through unchanged;
# feature names are kept without a transformer prefix.
ct = make_column_transformer(
    (ohe, make_column_selector(dtype_include=object)),
    ("passthrough", make_column_selector(dtype_exclude=object)),
    verbose_feature_names_out=False,
).set_output(transform='pandas')

X_ohe = ct.fit_transform(X)

In [13]:
# Split the RAW features (not the pre-encoded frame) so the encoder is never
# fitted on test rows. The previous version split the already-encoded data and
# then re-fitted the column transformer on it, which was a no-op because no
# object-dtype columns remained.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=24,
    stratify=y,
)

# Learn the one-hot categories from the training fold only, then apply the
# fitted transformer to the test fold.
X_trn_ohe = ct.fit_transform(X_train)
X_tst_ohe = ct.transform(X_test)

In [24]:
# Base learners for the HR stack.
lr = LogisticRegression()
ld = LinearDiscriminantAnalysis()
nb = GaussianNB()

# Final estimator: a decision tree. It is seeded so the reported score is
# reproducible across runs; 24 matches the seed used elsewhere in this notebook.
dtc = DecisionTreeClassifier(random_state=24)

# passthrough=True lets the final estimator see the encoded features as well
# as the base learners' predictions.
stack = StackingClassifier(
    estimators=[('LR', lr), ('LD', ld), ('NB', nb)],
    final_estimator=dtc,
    passthrough=True,
)
stack.fit(X_trn_ohe, y_train)

# Score on the hold-out set with label 1 as the positive class.
y_pred = stack.predict(X_tst_ohe)
print(f"F1 Score for dtc as final estimator: {f1_score(y_test, y_pred, pos_label=1)}")

F1 Score for dtc as final estimator: 0.9402985074626866


In [26]:
# Base learners for the HR stack.
lr = LogisticRegression()
ld = LinearDiscriminantAnalysis()
nb = GaussianNB()

# Final estimator: XGBoost classifier with default settings.
xgb = XGBClassifier()

# passthrough=True lets the final estimator see the encoded features as well
# as the base learners' predictions.
stack = StackingClassifier(
    estimators=[('LR', lr), ('LD', ld), ('NB', nb)],
    final_estimator=xgb,
    passthrough=True,
)
stack.fit(X_trn_ohe, y_train)

# Score on the hold-out set with label 1 as the positive class.
# The label names the estimator actually used (it previously said "dtc").
y_pred = stack.predict(X_tst_ohe)
print(f"F1 Score for xgb as final estimator: {f1_score(y_test, y_pred, pos_label=1)}")

F1 Score for dtc as final estimator: 0.96633475580844


In [31]:
from lightgbm import LGBMRegressor, LGBMClassifier

In [30]:
# Base learners for the HR stack.
lr = LogisticRegression()
ld = LinearDiscriminantAnalysis()
nb = GaussianNB()

# Final estimator: LightGBM classifier with default settings.
lgbm = LGBMClassifier()

# passthrough=True lets the final estimator see the encoded features as well
# as the base learners' predictions.
stack = StackingClassifier(
    estimators=[('LR', lr), ('LD', ld), ('NB', nb)],
    final_estimator=lgbm,
    passthrough=True,
)
stack.fit(X_trn_ohe, y_train)

# Score on the hold-out set with label 1 as the positive class.
# The label names the estimator actually used (it previously said "dtc").
y_pred = stack.predict(X_tst_ohe)
print(f"F1 Score for lgbm as final estimator: {f1_score(y_test, y_pred, pos_label=1)}")

[LightGBM] [Info] Number of positive: 2497, number of negative: 7999
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000333 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1182
[LightGBM] [Info] Number of data points in the train set: 10496, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.237900 -> initscore=-1.164227
[LightGBM] [Info] Start training from score -1.164227
F1 Score for dtc as final estimator: 0.9662065683008091
