## Stacking Ensemble

In [1]:
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the breast-cancer dataset bundled with scikit-learn and pull out the
# feature matrix and the target vector.
cancer_data = load_breast_cancer()
X_data, y_label = cancer_data.data, cancer_data.target

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_label,
    test_size=0.2,
    random_state=0,
)

In [11]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

def get_stacking_base_datasets(model, X_train_n, y_train_n, X_test_n, n_folds):
    """Build stacking meta-features for one base model using K-fold CV.

    The training rows are split into ``n_folds`` consecutive (unshuffled)
    folds. For every fold the model is refit on the remaining folds and then
    used to predict (a) the held-out fold, so each training row receives a
    prediction from a model that never saw it, and (b) the whole test set.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refit in
        place on every fold, so on return it holds the last fold's fit.
    X_train_n : array
        Training features; must support ``.shape`` and integer-array indexing.
    y_train_n : array
        Training targets; must support integer-array indexing.
    X_test_n : array
        Test features; must support ``.shape``.
    n_folds : int
        Number of folds passed to ``KFold``.

    Returns
    -------
    train_fold_pred : numpy.ndarray of shape (n_train, 1)
        Out-of-fold predictions for the training rows.
    test_pred_mean : numpy.ndarray of shape (n_test, 1)
        Per-row mean of the ``n_folds`` test-set predictions. When the model
        predicts class labels this mean can be fractional.
    """
    # shuffle=False already gives a deterministic split, so no random_state is
    # passed: recent scikit-learn raises ValueError when random_state is set
    # while shuffle is False.
    kf = KFold(n_splits=n_folds, shuffle=False)

    # One out-of-fold prediction per training row; one test prediction column
    # per fold (averaged at the end).
    train_fold_pred = np.zeros((X_train_n.shape[0], 1))
    test_pred = np.zeros((X_test_n.shape[0], n_folds))
    print(model.__class__.__name__, 'model start')

    for folder_counter, (train_index, valid_index) in enumerate(kf.split(X_train_n)):
        print('\t Fold set', folder_counter, 'start')
        X_tr = X_train_n[train_index]
        y_tr = y_train_n[train_index]
        X_te = X_train_n[valid_index]

        # Refit on the folds that exclude the validation rows.
        model.fit(X_tr, y_tr)

        # Fill in predictions only for the rows this fold held out.
        train_fold_pred[valid_index, :] = model.predict(X_te).reshape(-1, 1)

        # Keep this fold's view of the test set in its own column.
        test_pred[:, folder_counter] = model.predict(X_test_n)

    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)

    return train_fold_pred, test_pred_mean

In [12]:
# Base classifiers whose out-of-fold predictions become the stacking features.
# Every estimator that accepts a seed gets random_state=0 so that a fresh
# Restart & Run All gives the same result each time (KNN takes no seed).
knn_clf  = KNeighborsClassifier(n_neighbors=4)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=0)
dt_clf = DecisionTreeClassifier(random_state=0)
ada_clf = AdaBoostClassifier(n_estimators=100, random_state=0)

# Meta-classifier trained on the stacked base-model predictions.
lr_final = LogisticRegression(C=10)


In [13]:
# Build the stacking meta-features for each base model separately, using
# 7-fold cross-validation on the training set.

knn_train, knn_test = get_stacking_base_datasets(
    knn_clf, X_train, y_train, X_test, n_folds=7
)
rf_train, rf_test = get_stacking_base_datasets(
    rf_clf, X_train, y_train, X_test, n_folds=7
)
dt_train, dt_test = get_stacking_base_datasets(
    dt_clf, X_train, y_train, X_test, n_folds=7
)
ada_train, ada_test = get_stacking_base_datasets(
    ada_clf, X_train, y_train, X_test, n_folds=7
)


KNeighborsClassifier model start
	 Fold set 0 start
	 Fold set 1 start
	 Fold set 2 start
	 Fold set 3 start
	 Fold set 4 start
	 Fold set 5 start
	 Fold set 6 start
RandomForestClassifier model start
	 Fold set 0 start




	 Fold set 1 start
	 Fold set 2 start
	 Fold set 3 start
	 Fold set 4 start
	 Fold set 5 start
	 Fold set 6 start




DecisionTreeClassifier model start
	 Fold set 0 start
	 Fold set 1 start
	 Fold set 2 start
	 Fold set 3 start
	 Fold set 4 start
	 Fold set 5 start
	 Fold set 6 start
AdaBoostClassifier model start
	 Fold set 0 start
	 Fold set 1 start
	 Fold set 2 start
	 Fold set 3 start
	 Fold set 4 start
	 Fold set 5 start
	 Fold set 6 start


In [14]:
# Place each base model's prediction column side by side; the resulting
# matrices are the inputs of the final logistic-regression meta-model.
Stack_final_X_train = np.hstack((knn_train, rf_train, dt_train, ada_train))
Stack_final_X_test = np.hstack((knn_test, rf_test, dt_test, ada_test))

# Show how the feature dimensionality changes from raw inputs to meta-features.
print('Orignal train feature Shape:', X_train.shape, 'Original test feature Shape:', X_test.shape)
print(
    'Stakcing train feature data Shape:', Stack_final_X_train.shape,
    'stacking test feature data Shape:', Stack_final_X_test.shape,
)

Orignal train feature Shape: (455, 30) Original test feature Shape: (114, 30)
Stakcing train feature data Shape: (455, 4) stacking test feature data Shape: (114, 4)


In [15]:
# Fit the meta-model on the stacked training predictions, then predict from
# the stacked test predictions (fit() returns the fitted estimator itself).
stack_final = lr_final.fit(Stack_final_X_train, y_train).predict(Stack_final_X_test)

# Accuracy of the stacked ensemble on the held-out test labels.
print('final model accuracy', accuracy_score(y_test, stack_final))


final model accuracy 0.9736842105263158
