In [113]:
from functools import reduce
import operator

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline

from tabulate import tabulate
from tqdm import tqdm

In [2]:
from utils import *

In [3]:
# Load the training data; the 'ID' column becomes the DataFrame index.
df = pd.read_csv('train.csv', index_col='ID')
print("Loading dataset... ok")

# xy_split comes from utils (star import); presumably it separates the
# features from the target column -- confirm in utils.
X, y = xy_split(df)
# Hold out 10% of the rows for validation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=57)
print("Creating train and validation datasets... ok")

# Split the training set into 2 dataframes (one with many NaNs, another with few NaNs).
# NOTE(review): going by the names, X_nan is the NaN-heavy part and X_full the
# other one -- confirm in utils.preprocess.
X_full, X_nan, y_full, y_nan, idx_full, idx_nan = preprocess(X_train, y_train)
print("Splitting dataset... ok")

Loading dataset... ok
Creating train and validation datasets... ok
Splitting dataset... ok


In [4]:
def make_reduction_pipeline():
    """Build one unfitted preprocessing + dimensionality-reduction pipeline.

    Both data partitions use the same three steps, so the definition lives in
    one place instead of being copy-pasted. The pipeline deliberately has no
    final classifier: it is a pure transformer.

    Returns:
        sklearn.pipeline.Pipeline: imputer -> dummyfier -> 25-component PCA.
    """
    return Pipeline([
        # DataFrameImputer / Dummyfier come from utils; presumably they fill
        # missing values and one-hot encode categoricals -- confirm there.
        ('imputer', DataFrameImputer()),
        ('dummyfier', Dummyfier()),
        # ARPACK starts from a random vector; seed it (same seed as the
        # train/validation split) so the projection is reproducible.
        ('pca', PCA(n_components=25, svd_solver='arpack', random_state=57)),
    ])


pipe_full = make_reduction_pipeline()
pipe_nan = make_reduction_pipeline()

pipe_full.fit(X_full, y_full)
print("Reducing Full Model... ok")
pipe_nan.fit(X_nan, y_nan)
print("Reducing NaN Model... ok")

Reducing Full Model... ok
Reducing NaN Model... ok


In [5]:
# First-level estimators of the stack, as (name, estimator) pairs.
# The random forests are seeded (same seed as the train/validation split) so
# that re-running the notebook reproduces the same models.
clfs_full = [
    ('rf25', RandomForestClassifier(n_estimators=25, max_depth=4, n_jobs=3, random_state=57)),
    ('rf40', RandomForestClassifier(n_estimators=40, max_depth=3, n_jobs=3, random_state=57)),
    ('logreg1', LogisticRegression(C=1.0)),
    ('logreg3', LogisticRegression(C=3.0)),
]

In [96]:
def copy_estimator(est):
    """Return a new, unfitted estimator configured like ``est``.

    Only the constructor parameters are carried over, so any fitted state on
    ``est`` is left behind.

    ``get_params(deep=False)`` is used on purpose: with ``deep=True`` a
    composite estimator also reports its sub-estimators' parameters under
    ``outer__inner`` keys, which the constructor does not accept.

    Args:
        est: object exposing ``get_params(deep=...)`` whose class can be
            instantiated from those parameters (the scikit-learn estimator
            protocol).

    Returns:
        A new instance of ``est.__class__``. Nested estimators are copied
        recursively and every other parameter value is deep-copied, so the
        copy shares no mutable state with ``est``.
    """
    import copy  # local import keeps this cell self-contained

    params = {}
    for name, value in est.get_params(deep=False).items():
        # Instances (not classes) exposing get_params are nested estimators.
        if hasattr(value, 'get_params') and not isinstance(value, type):
            params[name] = copy_estimator(value)
        else:
            params[name] = copy.deepcopy(value)
    return est.__class__(**params)

In [125]:
class StackingClassifier:
    """Two-level stacked classifier.

    First level: every estimator in ``probas_clfs`` is trained once per CV
    fold and produces out-of-fold probabilities (column 1 of its
    ``predict_proba`` output). Second level: ``final_clf`` is trained on those
    out-of-fold probabilities.

    At prediction time the ``k`` per-fold copies of each first-level estimator
    are averaged before being fed to ``final_clf``.

    Because only column 1 of ``predict_proba`` is used throughout, the class
    is written for two-class problems.
    """

    def __init__(self, probas_clfs, final_clf, transformer=None, k=3):
        """
        Args:
            probas_clfs (list of tuples): list of estimators to predict the probabilities, it takes the form of a list of tuples `[('est1_name', est1), ('est2_name', est2), ...]`
            final_clf: the final classifier, trained over the intermediate probabilities
            transformer (sklearn.base.TransformerMixin): object that processes transformations over the dataset (dummyfication, PCA, etc.)
            k (int): number of folds in the CV process
        """
        self.k = k
        self.transformer = transformer
        # we need an independent set of estimators for each fold of the CV for the 1st level model
        self.probas_clfs = [[(name, copy_estimator(clf)) for name, clf in probas_clfs] for _ in range(self.k)]
        self.final_clf = final_clf
        # With no transformer there is nothing to fit, so it counts as fitted.
        self._transformer_fitted = transformer is None
        self._skf = StratifiedKFold(n_splits=self.k)
        self._layer_fitted = False
        self._layer_probas = None
        self._final_fitted = False

    def _fit_transformer(self, X, y=None):
        """Fit the optional transformer on ``X`` (no-op when there is none)."""
        if self.transformer is not None:
            self.transformer.fit(X, y)
            self._transformer_fitted = True

    def _transform(self, X):
        """Apply the fitted transformer to ``X``; return ``X`` as-is without one.

        Raises:
            Exception: if a transformer was given but has not been fitted.
        """
        if not self._transformer_fitted:
            raise Exception("Transformer not fitted.")
        return X if self.transformer is None else self.transformer.transform(X)

    def _fit_layer(self, X, y):
        """Train the first-level estimators and collect out-of-fold probabilities.

        Args:
            X: feature table supporting positional ``.iloc`` indexing.
            y: labels supporting ``.iloc`` and ``.shape``.

        Returns:
            numpy.ndarray of shape ``(len(y), n_estimators)``; each row holds
            the probabilities predicted by the estimators that did not see
            that row during training.
        """
        probas = np.zeros((y.shape[0], len(self.probas_clfs[0])))
        for i, (train_idx, test_idx) in enumerate(self._skf.split(X, y)):
            print("Fold %d/%d" % (i + 1, len(self.probas_clfs)))
            # The transformed fold data is the same for every estimator, so
            # compute it once per fold rather than once per estimator.
            X_fold_train = self._transform(X.iloc[train_idx])
            X_fold_test = self._transform(X.iloc[test_idx])
            y_fold_train = y.iloc[train_idx]
            for j, (clf_name, clf) in enumerate(self.probas_clfs[i]):
                print("    Estimator '%s'" % (clf_name))
                clf.fit(X_fold_train, y_fold_train)
                probas[test_idx, j] = clf.predict_proba(X_fold_test)[:, 1]
        self._layer_fitted = True
        return probas

    def _predict_layer_probas(self, X):
        """Average, over the ``k`` fold models, each estimator's probability on ``X``.

        Returns:
            numpy.ndarray of shape ``(n_rows, n_estimators)``.

        Raises:
            Exception: if the first-level estimators have not been fitted.
        """
        if not self._layer_fitted:
            raise Exception("Intermediate estimators not fitted.")
        # Transform once; every fold model consumes the same features.
        X_transformed = self._transform(X)
        probas = np.empty((X.shape[0], len(self.probas_clfs[0]), self.k))
        for i in range(self.k):
            for j, (_, c) in enumerate(self.probas_clfs[i]):
                probas[:, j, i] = c.predict_proba(X_transformed)[:, 1]
        return np.mean(probas, axis=2)

    def _fit_final(self, probas, y):
        """Fit the second-level classifier on the first-level probabilities.

        Raises:
            Exception: if the first-level estimators have not been fitted.
        """
        if not self._layer_fitted:
            raise Exception("Intermediate estimators not fitted.")
        self.final_clf.fit(probas, y)
        print("Final classifier fitted")
        self._final_fitted = True

    def fit(self, X, y):
        """Fit the transformer, the first-level estimators and the final classifier.

        Note that the transformer is fitted on all of ``X`` before the
        cross-validation loop, so it has seen every fold's rows.

        Returns:
            StackingClassifier: ``self``, following the scikit-learn convention.
        """
        self._fit_transformer(X, y)
        self._layer_probas = self._fit_layer(X, y)
        self._fit_final(self._layer_probas, y)
        return self

    def predict_proba(self, X):
        """Return column 1 of the final classifier's probabilities for each row.

        Unlike scikit-learn's ``predict_proba``, the result is a 1-D array of
        shape ``(n_rows,)``, not an ``(n_rows, n_classes)`` matrix.

        Raises:
            Exception: if either level of the stack has not been fitted.
        """
        if not self._layer_fitted:
            raise Exception("Intermediate estimators not fitted.")
        if not self._final_fitted:
            raise Exception("Final classifier not fitted.")
        probas = self._predict_layer_probas(X)
        return self.final_clf.predict_proba(probas)[:, 1]

    def predict(self, X, threshold=0.5):
        """Return 0/1 labels: 1 where ``predict_proba(X) >= threshold``.

        Raises:
            Exception: if either level of the stack has not been fitted.
        """
        if not self._layer_fitted:
            raise Exception("Intermediate estimators not fitted.")
        if not self._final_fitted:
            raise Exception("Final classifier not fitted.")
        return (self.predict_proba(X) >= threshold).astype(int)

In [126]:
# Stack the four first-level estimators of clfs_full under a logistic-regression
# final classifier, using pipe_full as the shared feature transformer.
# Default k=3 folds, so three independent copies of each estimator are created.
stack_full = StackingClassifier(clfs_full, LogisticRegression(), transformer=pipe_full)

In [127]:
# Fits, in order: the transformer (pipe_full is re-fitted here), the per-fold
# first-level estimators, and finally the classifier trained on their
# out-of-fold probabilities.
stack_full.fit(X_full, y_full)

Fold 1/3
    Estimator 'rf25'
    Estimator 'rf40'
    Estimator 'logreg1'
    Estimator 'logreg3'
Fold 2/3
    Estimator 'rf25'
    Estimator 'rf40'
    Estimator 'logreg1'
    Estimator 'logreg3'
Fold 3/3
    Estimator 'rf25'
    Estimator 'rf40'
    Estimator 'logreg1'
    Estimator 'logreg3'
Final classifier fitted


In [129]:
# Smoke test on the rows the stack was trained on (in-sample, so these
# probabilities are not a performance estimate).
pr = stack_full.predict_proba(X_full)

In [130]:
# 1-D: predict_proba keeps only column 1 of the final classifier's output,
# i.e. one probability per row.
pr.shape

(58036,)

In [131]:
# Hard 0/1 labels with a 0.8 probability cut-off instead of the default 0.5.
stack_full.predict(X_full, 0.8)

array([1, 1, 0, ..., 1, 1, 1])

In [None]:
# pipe_full / pipe_nan end in a PCA step and contain no classifier, so they
# expose neither predict nor predict_proba. Predictions therefore come from
# the stacked models, which wrap those pipelines as transformers.
# No earlier cell builds a stack for the NaN partition, so it is fitted here
# with the same first-level estimators as stack_full.
stack_nan = StackingClassifier(clfs_full, LogisticRegression(), transformer=pipe_nan)
stack_nan.fit(X_nan, y_nan)

# Validation set predictions
Xv_full, Xv_nan, yv_full, yv_nan, idxv_full, idxv_nan = preprocess(X_val, y_val)
preds_full = stack_full.predict(Xv_full)
# StackingClassifier.predict_proba already returns the 1-D positive-class
# probability, so no [:, 1] column selection is needed downstream.
probas_full = stack_full.predict_proba(Xv_full)
preds_nan = stack_nan.predict(Xv_nan)
probas_nan = stack_nan.predict_proba(Xv_nan)
# reconstruct comes from utils; presumably it merges the two partitions back
# into the original row order using the saved indices -- confirm there.
y_pred = reconstruct([preds_full, preds_nan], [idxv_full, idxv_nan])
y_probas = reconstruct([probas_full, probas_nan], [idxv_full, idxv_nan])

# Evaluation
confusion = metrics.confusion_matrix(y_val, y_pred)
loss = metrics.log_loss(y_val, y_probas)
print("Confusion matrix")
print(tabulate(confusion, tablefmt="fancy_grid"))
print("Log-loss: {:0.4f}".format(loss))

# Prediction
# Keep the raw frame untouched (no inplace drop) so re-running the cell is safe.
X_test_raw = pd.read_csv('test.csv')
X_test_ID = X_test_raw['ID'].copy()
X_test = X_test_raw.drop(columns=['ID'])
Xt_full, Xt_nan, _, _, idxt_full, idxt_nan = preprocess(X_test)
probas_full = stack_full.predict_proba(Xt_full)
probas_nan = stack_nan.predict_proba(Xt_nan)
y_probas = reconstruct([probas_full, probas_nan], [idxt_full, idxt_nan])
# np.asarray drops any index so the probabilities pair with the IDs by position.
y_probas_df = pd.DataFrame({'ID': X_test_ID, 'PredictedProb': np.asarray(y_probas)})
y_probas_df.to_csv('submission.csv', index=False)