In [1]:
# # code for loading the format for the notebook
# import os

# # path : store the current path to convert back to it later
# path = os.getcwd()
# os.chdir( os.path.join('..', 'notebook_format') )
# from formats import load_style
# load_style(css_style = 'custom2.css', plot_style = False)

In [2]:
# os.chdir(path)

# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
%matplotlib inline
%load_ext watermark
%load_ext autoreload 
%autoreload 2

import numpy as np
import pandas as pd

%watermark -a 'Ethen' -d -t -v -p numpy,pandas,sklearn

Ethen 2017-06-17 16:45:07 

CPython 3.5.2
IPython 5.3.0

numpy 1.12.1
pandas 0.19.2
sklearn 0.18.1


In [33]:
filename = 'BikeBuyerWithLocation.csv'
data = pd.read_csv(filename, encoding = 'latin1')

# columns that we won't be using at all
drop_cols = ['ID', 'Latitude', 'Longitude', 'City', 'Zip Code', 'Country']
data = data.drop(drop_cols, axis = 1)
print('dimensions: ', data.shape)
data.head()

dimensions:  (10000, 12)


Unnamed: 0,Marital Status,Gender,Yearly Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age,BikeBuyer
0,Married,Female,20000,0,Partial College,Manual,No,1,0-1 Miles,Europe,47,Yes
1,Married,Female,10000,1,High School,Manual,No,1,1-2 Miles,Europe,46,No
2,Single,Female,10000,1,High School,Manual,No,1,2-5 Miles,Europe,46,No
3,Single,Male,10000,1,High School,Manual,No,1,1-2 Miles,Europe,46,No
4,Single,Male,10000,0,Partial College,Manual,No,1,2-5 Miles,Europe,64,No


In [34]:
from collections import defaultdict
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler


label_col = 'BikeBuyer'
label = data[label_col]
data = data.drop(label_col, axis = 1)
label_encode = LabelEncoder()
y = label_encode.fit_transform(label)

# slightly imbalanced problem
print('original labels: ', label_encode.classes_)
print('encoded labels distribution:', np.bincount(y))

test_size = 0.2
random_state = 1234
data_train, data_test, y_train, y_test = train_test_split(
    data, y, test_size = test_size, random_state = random_state, stratify = y)

original labels:  ['No' 'Yes']
encoded labels distribution: [9000 1000]


In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class Preprocess(BaseEstimator, TransformerMixin):
    
    def __init__(self, num_cols, cat_cols):
        self.num_cols = num_cols
        self.cat_cols = cat_cols

    def fit(self, data):
        data = data.copy()
        
        # Label encoding across multiple columns in scikit-learn
        # https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
        self.label_encode_dict_ = defaultdict(LabelEncoder)
        label_encoded = (data[self.cat_cols]
                         .apply(lambda x: self.label_encode_dict_[x.name].fit_transform(x)))

        self.cat_encode_ = OneHotEncoder(sparse = False)
        self.cat_encode_.fit(label_encoded)

        self.scaler_ = StandardScaler().fit(data[self.num_cols])
        
        # store the column names (numeric columns comes before the
        # categorical columns) so we can refer to them later
        colnames = self.num_cols.copy()
        for col in self.cat_cols:
            cat_colnames = [col + '_' + classes 
                            for classes in self.label_encode_dict_[col].classes_]
            colnames += cat_colnames

        self.colnames = colnames
        return self

    def transform(self, data):
        label_encoded = (data[self.cat_cols]
                         .apply(lambda x: self.label_encode_dict_[x.name].transform(x)))
        cat_encoded = self.cat_encode_.transform(label_encoded)
        scaled = self.scaler_.transform(data[self.num_cols])

        # combine encoded categorical columns and scaled numerical
        # columns, it's the same as concatenate it along axis 1
        X = np.hstack((scaled, cat_encoded))
        return X

In [7]:
num_cols = ['Yearly Income', 'Children', 'Cars', 'Age']
cat_cols = ['Marital Status', 'Gender', 'Education', 'Occupation',
            'Home Owner', 'Commute Distance', 'Region']

preprocess = Preprocess(num_cols, cat_cols)
X_train = preprocess.fit_transform(data_train)
X_test = preprocess.transform(data_test)
X_train

array([[ 0.11431662, -0.69821864,  0.29928303, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.35562367,  2.41917677,  2.07544344, ...,  0.        ,
         0.        ,  1.        ],
       [-0.81666367,  1.17221861,  0.29928303, ...,  1.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.73497015, -0.69821864,  0.29928303, ...,  0.        ,
         1.        ,  0.        ],
       [-0.19601014, -0.69821864,  0.29928303, ...,  0.        ,
         1.        ,  0.        ],
       [-1.12699043, -0.69821864,  0.29928303, ...,  0.        ,
         1.        ,  0.        ]])

In [37]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state = 1234)
X_resampled, y_resampled = sm.fit_sample(X_train, y_train)

In [26]:
# encode = Pipeline([
#     ('label_encode', label_encode), 
#     ('onehot_encode', onehot_encode)
# ])

Pipeline(steps=[('label_encode', LabelEncoder()), ('onehot_encode', OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=False))])

# Stacking

Stacking is an ensemble learning technique to combine multiple predictive models (also referred to as base models) via a meta-classifier. Often times (especially in multi-class classification problems) the stacked model will outperform each of the individual models due its prowess to highlight each individual models where it performs best and discredit where it performs poorly. 

Thus, in order for this strategy to be effective it should be obvious that the base models should be different in some way so they don't all create the same error. Popular and powerful non-linear algorithms that are commonly used as base models includes [Random Forest, ExtraTrees](http://nbviewer.jupyter.org/github/ethen8181/machine-learning/blob/master/trees/random_forest.ipynb), [Gradient Boosting Machine](http://nbviewer.jupyter.org/github/ethen8181/machine-learning/blob/master/trees/gbm/gbm.ipynb), [Feed Forward Deep Learning](http://nbviewer.jupyter.org/github/ethen8181/machine-learning/blob/master/deep_learning/nn_tensorflow.ipynb). As for the meta-classifier, logistic regression is one of the most common ways of stacking the base models. We can probably use any model as the meta-classifier, but sticking with simpler ones like logistic regression or decision tree will most likely be a safer bet that prevents overfitting.

Letâ€™s say we want to perform 2-fold stacking (we can choose the number of folds we wish to use, the 2 is simply for easier illustration purpose), then the overall process will be:

- Split the train set in 2 parts: train_a and train_b
- Fit the base model(s) on train_a and create predictions for train_b
- Fit the same model on train_b and create predictions for train_a
- Train the meta-classifier/stacking model on the probabilities generated from the base model(s)
- Finally fit the model on the entire train set and create predictions for the test set.



That was a mouthful, in the next section, we will look at how this can be implemented.

In [8]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.datasets import make_classification, load_iris, load_digits
import numpy as np

# n_features = 20
# n_samples = 10000
# X, y = make_classification(n_features = n_features, n_samples = n_samples)

#digits = load_digits()
#X, y = digits.data, digits.target
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1234, stratify = y)
# X_train.shape

In [38]:
from sklearn.tree import DecisionTreeClassifier

model_tree = DecisionTreeClassifier(max_depth = 13)
# model_tree.fit(X_train, y_train)
model_tree.fit(X_resampled, y_resampled)

tree_train_pred = model_tree.predict(X_train)
tree_test_pred = model_tree.predict(X_test)

# accuracy_score(y_test, y_pred)
tree_train_score = roc_auc_score(y_train, tree_train_pred)
tree_test_score = roc_auc_score(y_test, tree_test_pred)
score_output = 'training score: {}, testing score: {}'.format(tree_train_score, tree_test_score)
print(score_output)

training score: 0.7946527777777779, testing score: 0.7022222222222222


In [57]:
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# from mlxtend.classifier import StackingCVClassifier

model_xgb = XGBClassifier(max_depth = 11, learning_rate = 0.01, 
                          colsample_bytree = 0.9)
# model_xgb.fit(X_train, y_train)

model_xgb.fit(X_resampled, y_resampled)

xgb_train_pred = model_xgb.predict(X_train)
xgb_test_pred = model_xgb.predict(X_test)
xgb_train_score = roc_auc_score(y_train, xgb_train_pred)
xgb_test_score = roc_auc_score(y_test, xgb_test_pred)
score_output = 'training score: {}, testing score: {}'.format(xgb_train_score, xgb_test_score)
print(score_output)

training score: 0.7910416666666668, testing score: 0.7208333333333334


In [88]:
from copy import deepcopy
from sklearn.base import BaseEstimator

class StackingClassifier(BaseEstimator):
    
    def __init__(self, classifiers, meta_classifier, kfold):
        self.kfold = kfold
        self.classifiers = classifiers
        self.meta_classifier = meta_classifier
        
    def fit(self, X, y):
        self.clfs_ = [deepcopy(clf) for clf in self.classifiers]
        self.meta_clf_ = deepcopy(self.meta_classifier)
        
        n_rows = X.shape[0]
        self._n_models = len(self.clfs_)
        self._n_classes = np.unique(y).shape[0] - 1
        
        all_model_predictions = np.zeros((n_rows, self._n_models * self._n_classes))
        for model_idx, model in enumerate(self.clfs_):
            
            model_prediction = np.zeros((n_rows, self._n_classes))
            for fold_idx, (train_idx, test_idx) in enumerate(self.kfold.split(X, y)):
                model.fit(X[train_idx], y[train_idx])
                # do we need just n_classes - 1 prediction ??
                model_prediction[test_idx] = model.predict_proba(X[test_idx])[:, self._n_classes:]
            
            # do we need just n_classes - 1 prediction ??
            start_idx = model_idx * self._n_classes
            columns = slice(start_idx, start_idx + self._n_classes)
            all_model_predictions[:, columns] = model_prediction
        
        print(all_model_predictions.shape)
        # is calling split multiple times a runtime bottleneck ??
        # compare this memory efficient way versus storing it directly as a list
        # and re-use the index multiple times
        reordered_labels = np.zeros(n_rows)
        for _, test_idx in self.kfold.split(X, y):
            reordered_labels[test_idx] = y[test_idx]
        
        # stacking
        self.meta_clf_.fit(all_model_predictions, reordered_labels)
        
        # fit the base models this time using all the input data
        for model in self.clfs_:
            model.fit(X, y)
        
        return self
    
    def predict_proba(self, X):
        n_rows = X.shape[0]
        all_model_predictions = np.zeros((n_rows, self._n_models * self._n_classes))
        for model_idx, model in enumerate(self.clfs_):
            model_prediction = model.predict_proba(X)[:, self._n_classes:]
            
            # do we need just n_classes - 1 prediction ??
            start_idx = model_idx * self._n_classes
            columns = slice(start_idx, start_idx + self._n_classes)
            all_model_predictions[:, columns] = model_prediction
            
        y_pred_proba = self.meta_clf_.predict_proba(all_model_predictions)
        return y_pred_proba
    
    def predict(self, X):
        y_pred_proba = self.predict_proba(X)
        y_pred = np.argmax(y_pred_proba, axis = 1)
        return y_pred

In [86]:
model_xgb = XGBClassifier(max_depth = 12, learning_rate = 0.01, 
                          colsample_bytree = 0.9)
model_rf = RandomForestClassifier(max_depth = 12)
model_lr = LogisticRegression()

n_splits = 10
shuffle = True
random_state = 1234
skf = StratifiedKFold(n_splits = n_splits, shuffle = shuffle, random_state = random_state)
stack_param = {
    'classifiers': [model_rf, model_xgb],
    'meta_classifier': model_lr,
    'kfold': skf
}

In [83]:
model_stacking.meta_clf_.coef_

array([[-2.16835025,  2.13210661, -4.54822883,  4.51198521]])

In [89]:
model_stacking = StackingClassifier(**stack_param)
# model_stacking.fit(X_train, y_train)
model_stacking.fit(X_resampled, y_resampled)

stacking_train_pred = model_stacking.predict(X_train)
stacking_test_pred = model_stacking.predict(X_test)
stacking_train_score = roc_auc_score(y_train, stacking_train_pred)
stacking_test_score = roc_auc_score(y_test, stacking_test_pred)
score_output = 'training score: {}, testing score: {}'.format(stacking_train_score, stacking_test_score)
print(score_output)

(14400, 2)
training score: 0.8228472222222222, testing score: 0.7122222222222222


In [82]:
model_stacking = StackingClassifier(**stack_param)
# model_stacking.fit(X_train, y_train)
model_stacking.fit(X_resampled, y_resampled)

stacking_train_pred = model_stacking.predict(X_train)
stacking_test_pred = model_stacking.predict(X_test)
stacking_train_score = roc_auc_score(y_train, stacking_train_pred)
stacking_test_score = roc_auc_score(y_test, stacking_test_pred)
score_output = 'training score: {}, testing score: {}'.format(stacking_train_score, stacking_test_score)
print(score_output)

(14400, 4)
training score: 0.8227083333333334, testing score: 0.7061111111111111


In [67]:
model_stacking = StackingClassifier(**stack_param)
# model_stacking.fit(X_train, y_train)
model_stacking.fit(X_resampled, y_resampled)

stacking_train_pred = model_stacking.predict(X_train)
stacking_test_pred = model_stacking.predict(X_test)
stacking_train_score = roc_auc_score(y_train, stacking_train_pred)
stacking_test_score = roc_auc_score(y_test, stacking_test_pred)
score_output = 'training score: {}, testing score: {}'.format(stacking_train_score, stacking_test_score)
print(score_output)

training score: 0.8174999999999999, testing score: 0.7144444444444445


In [35]:
stacking = StackingCVClassifier(classifiers = [rf, xgb], 
                                meta_classifier = lr, n_folds = 10)
stacking.fit(X_train, y_train)
y_pred_stacking = stacking.predict(X_test)

model_xgb.fit(X_train, y_train)
y_pred_xgb = model_xgb.predict(X_test)

print('stacking score:', roc_auc_score(y_test, y_pred_stacking))
print('single model score: ', xgb_test_score)

stacking accuracy: 0.9656
a single model accuracy:  0.9656


In [5]:
n_splits = 3
skf = StratifiedKFold(n_splits = n_splits, random_state = 1234)

n_models = 3
clfs = [DecisionTreeClassifier()] * n_models

In [6]:
level1_train = np.zeros((X_train.shape[0], n_models))
level1_test = np.zeros((X_test.shape[0], n_models))

In [7]:
for clf_idx, clf in enumerate(clfs):
    level1_test_fold = np.zeros((X_test.shape[0], n_splits))
    for fold_idx, (train_idx, test_idx) in enumerate(skf.split(X_train, y_train)):
        X_train_fold = X_train[train_idx]
        X_test_fold = X_train[test_idx]      
        y_train_fold = y_train[train_idx]
        y_test_fold = y_train[test_idx]
        clf.fit(X_train_fold, y_train_fold)
        
        # only need the positive class for 2-class classification
        level1_train[test_idx, clf_idx] = clf.predict_proba(X_test_fold)[:, 1]
        level1_test_fold[:, fold_idx] = clf.predict_proba(X_test)[:, 1]
    
    level1_test[:, clf_idx] = level1_test_fold.mean(axis = 1)

In [13]:
from sklearn.linear_model import LogisticRegression 
lr = LogisticRegression()
lr.fit(level1_train, y_train)
pred = lr.predict(level1_test)
accuracy_score(y_test, pred)

0.94079999999999997

In [18]:
for k, clf in enumerate(clfs):
    for j, (train_index, test_index) in enumerate(skf.split(X_train, y)):
        # L^(-j), L_j 
        X_train_cv, X_test_cv = X_train[train_index], X_test[test_index]
        y_train_cv, y_test_cv = y_train[train_index], y_test[test_index]
        
        # M_k^(-j) - level 0 model (M_k) on the training set L^{-j}
        clf.fit(X_train_cv, y_train_cv)
        
        # L_cv = z_kj 
        # we use this dataset to train the level-1 model 
        # this is a 2-class problems, so we consider only the probability
        # p of class 0. 
        level_1_train[test_index, k] = clf.predict_proba(X_test_cv)[:, 0]
        
        # We build a level-1 test set to be used with the level 1 classifier.
        # This is the output of model M_k^(-j) on the held out test set
        level_1_test[:, k, j] = clf.predict_proba(X_test)[:, 0]

IndexError: index 7500 is out of bounds for axis 0 with size 7500

# Reference

- [Blog: KAGGLE ENSEMBLING GUIDE](https://mlwave.com/kaggle-ensembling-guide/)
- [mlxtend Documentation: StackingClassifier](https://rasbt.github.io/mlxtend/user_guide/classifier/StackingClassifier/)