# stackEnsemble

The stackEnsemble function implements stack ensembling: first-level models are fitted on K-1 folds of the train set and predict on the held-out fold, so that every train row receives an out-of-fold prediction. The same models are also used to predict on the entire test set. These predictions are called 'meta-features' and can be used as additional features for second-level models.


In [None]:
# imports for predictive models and validation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# imports for stackEnsemble()
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split

In [None]:
# Load the pickled train features, test features and train labels (in that order).
Xtr, Xte, y = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('./X_train1.pkl', './X_test1.pkl', '../y_train.pkl')
)

In [None]:
def _rows_at(data, positions):
    ''' Select rows of `data` by integer position (pandas object or array-like). '''
    if hasattr(data, 'iloc'):
        return data.iloc[positions]
    return np.asarray(data)[positions]


def stackEnsemble(models, X, y, Xtest, splits, verbose):
    ''' Function to perform stack ensembling on arbitrary dataset with sklearn models.

    For every model, out-of-fold predictions are produced for each row of the
    trainset with K-fold cross-validation. The model is then refitted on the
    complete trainset and used to predict the testset. Both sets of predictions
    are appended as extra columns named 'model1', 'model2', ... in the order of
    `models`.

    # Arguments:
        models:  list, a list of models to be used to create meta-features;
                 each must offer fit(X, y) and predict(X). The models are
                 fitted in place.
        X:       dataframe, the trainset features
        y:       series / dataframe / array, the trainset labels, row-aligned
                 with X by position
        Xtest:   dataframe, the testset features
        splits:  int, the number of splits for trainset CV
        verbose: bool, true when print outputs are desired

    # Returns:
        X:       dataframe, new trainset with meta-features
        Xtest:   dataframe, new testset with meta-features

    # Raises:
        AssertionError: when models, splits or verbose have the wrong type
    '''

    # assert correct data-types
    assert isinstance(models, list), 'models must be a list'
    assert isinstance(splits, int) and not isinstance(splits, bool), 'splits must be an int'
    assert isinstance(verbose, bool), 'verbose must be a bool'

    # init variables
    kf = KFold(n_splits=splits)
    names = []       # explicit column order; does not rely on dict ordering
    train_meta = {}
    test_meta = {}

    # iterate over all inserted models
    for n, model in enumerate(models):
        name = 'model' + str(n + 1)
        names.append(name)
        if verbose: print('Using model %d to make predictions..' % (n+1))

        fold_rows = []
        fold_preds = []
        for i, (train, test) in enumerate(kf.split(X)):
            if verbose: print('..on split %d' % (i+1))

            # kf.split yields integer positions, so both X and y must be
            # indexed positionally; y[train] would be a label lookup.
            model.fit(X.iloc[train], _rows_at(y, train))
            fold_rows.append(test)
            fold_preds.append(np.asarray(model.predict(X.iloc[test])))

        # scatter the fold predictions back to the row positions they belong to
        stacked = np.concatenate(fold_preds)
        out_of_fold = np.empty_like(stacked)
        out_of_fold[np.concatenate(fold_rows)] = stacked
        train_meta[name] = out_of_fold

        # refit on the complete trainset before predicting the testset, so the
        # test meta-features do not come from a model that only saw the
        # training part of the last fold
        if verbose: print('..on full trainset')
        model.fit(X, y)
        test_meta[name] = np.asarray(model.predict(Xtest))

    # reuse the original indexes: concat(axis=1) aligns on index, so a fresh
    # RangeIndex would misalign rows whenever X / Xtest are not 0..n-1 indexed
    meta_train = pd.DataFrame(train_meta, index=X.index, columns=names)
    meta_test = pd.DataFrame(test_meta, index=Xtest.index, columns=names)

    # return trainset and testset with metafeatures
    X = pd.concat([X, meta_train], axis=1)
    Xtest = pd.concat([Xtest, meta_test], axis=1)
    return X, Xtest

## Example

In the blocks below I use the stackEnsemble function to create meta-features for the provided dataset. The meta-features come from six first-level models: two Logistic Regression models with different regularisation strengths, a Gradient Boosting classifier, and three k-Nearest-Neighbours classifiers (k = 2, 4 and 16).

In [None]:
# First-level models used to build the meta-features.

# Two logistic regressions that differ only in C.
model1 = LogisticRegression(class_weight='balanced', C=1e5)
model4 = LogisticRegression(class_weight='balanced', C=1e3)

# Gradient boosting with hand-set hyper-parameters.
model3 = GradientBoostingClassifier(
    learning_rate=0.5,
    max_depth=9,
    max_features=0.05,
    min_samples_leaf=18,
    min_samples_split=12,
    subsample=1.0,
)

# k-nearest-neighbours at three neighbourhood sizes.
model5 = KNeighborsClassifier(n_neighbors=2, n_jobs=-1)
model6 = KNeighborsClassifier(n_neighbors=4, n_jobs=-1)
model7 = KNeighborsClassifier(n_neighbors=16, n_jobs=-1)

In [None]:
# Collect the first-level models and build 3-fold meta-features with progress output.
models = [
    model1,
    model3,
    model4,
    model5,
    model6,
    model7,
]
X_train_stack, X_test_stack = stackEnsemble(
    models=models,
    X=Xtr,
    y=y,
    Xtest=Xte,
    splits=3,
    verbose=True,
)

In [None]:
# Show the first 10 rows of the test set with its meta-feature columns.
X_test_stack.head(10)

In [None]:
def _read_xgb_column(csv_path):
    ''' Read a header-less CSV and return its second column as a one-column frame named 'xgb'. '''
    return pd.read_csv(csv_path, header=None).iloc[:, 1].to_frame(name='xgb')


te_preds = _read_xgb_column('../../xgb_te_preds.csv')
tr_preds = _read_xgb_column('../../xgb_tr_preds.csv')

# Append the 'xgb' column to the stacked train and test sets.
X_train_stack = pd.concat([X_train_stack, tr_preds], axis=1)
X_test_stack = pd.concat([X_test_stack, te_preds], axis=1)

In [None]:
# Persist the stacked train and test sets one directory up.
X_train_stack.to_pickle('../X_train_stack.pkl')
X_test_stack.to_pickle('../X_test_stack.pkl')

## Validation

Below I test whether stack ensembling improves the classification accuracy of a Logistic Regression model. The first example uses the raw dataset, which has no meta-features. The second example uses the dataset extended with the meta-features from the models stacked above.

In [None]:
# Baseline: accuracy of model1 fitted on the raw features.
# NOTE(review): X_train, y_train, X_test and y_test are not defined in the cells
# shown here; presumably they come from a hold-out split (train_test_split is
# imported) - confirm.
# print(...) with a single argument is valid on both Python 2 and Python 3.
print(accuracy_score(y_pred=model1.fit(X_train, y_train).predict(X_test), y_true=y_test))

In [None]:
# Stacked: accuracy of model1 fitted on the features plus meta-features.
# NOTE(review): X_train_stack / X_test_stack are built from the full Xtr / Xte
# above, so y_train and y_test must have matching row counts - confirm.
# print(...) with a single argument is valid on both Python 2 and Python 3.
print(accuracy_score(y_pred=model1.fit(X_train_stack, y_train).predict(X_test_stack), y_true=y_test))