# Cross validation comparison of different models

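In this notebook, I compare the cross-validated performance of different models and stacking configurations on the pre-processed data of the Kaggle competition Mercedes-Benz Greener Manufacturing. The value printed as "Accuracy" in each cell is the mean 10-fold `cross_val_score`, which for these regressors is the coefficient of determination ($R^2$) rather than a classification accuracy.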

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
% matplotlib inline
import seaborn as sns
sns.set_palette('Spectral')
import time
import os

#import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.utils import check_array
from sklearn.linear_model import LassoLarsCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder


import warnings
warnings.filterwarnings('ignore')

In [7]:
# Quickstart: read the train and test tables
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')

# Label-encode every object-typed column. Each encoder is fitted on the
# combined train + test values so both frames share one integer mapping.
categorical_columns = [col for col in train.columns if train[col].dtype == 'object']
for col in categorical_columns:
    train_values = list(train[col].values)
    test_values = list(test[col].values)
    encoder = LabelEncoder()
    encoder.fit(train_values + test_values)
    train[col] = encoder.transform(train_values)
    test[col] = encoder.transform(test_values)

# remove the previously identified outlier (row with index label 883)
train = train.drop(labels=883, axis=0)

# separate the target 'y' from the features; the test frame is used as-is
y_train = train['y']
X_train = train.drop(labels='y', axis=1)
X_test = test

In [4]:
class StackingEstimator(BaseEstimator, TransformerMixin):
    """Transformer that augments the feature matrix with a wrapped estimator's output.

    ``fit`` trains the wrapped estimator; ``transform`` returns the input
    features with the estimator's predictions (and, for classifiers that
    expose ``predict_proba``, its class probabilities) stacked in front.
    This lets an estimator be used as an intermediate pipeline step.
    """

    def __init__(self, estimator):
        # stored under the same name as the argument so the estimator
        # parameter machinery inherited from BaseEstimator can find it
        self.estimator = estimator

    def fit(self, X, y=None, **fit_params):
        """Fit the wrapped estimator on ``(X, y)`` and return ``self``."""
        self.estimator.fit(X, y, **fit_params)
        return self

    def transform(self, X):
        """Return ``X`` with the wrapped estimator's outputs prepended as columns.

        Column layout of the result: ``[prediction, (class probabilities), X]``.
        The probability columns are only present when the wrapped estimator
        is a classifier providing ``predict_proba``.
        """
        X = check_array(X)
        model = self.estimator

        # add class probabilities as synthetic additional features
        provides_probabilities = isinstance(model, ClassifierMixin) and hasattr(model, 'predict_proba')
        if provides_probabilities:
            features = np.hstack((model.predict_proba(X), X))
        else:
            features = np.copy(X)

        # add the prediction as the leading synthetic feature column
        prediction_column = np.reshape(model.predict(X), (-1, 1))
        return np.hstack((prediction_column, features))

## LassoLarsCV stand-alone model

In [5]:
# Cross-validate a stand-alone LassoLarsCV model and report the mean fold score.
t0 = time.time()
lassolarscv = LassoLarsCV()
# Fit on the full training set; cross_val_score below is not affected by this
# fit (it refits per fold), but the time spent here is included in "Done".
lassolarscv.fit(X_train, y_train)
seed = 420
# KFold only consults random_state when shuffle=True. With the default
# shuffle=False the seed never influenced the splits, and newer scikit-learn
# raises ValueError for that combination, so it is not passed.
kfold = KFold(n_splits=10)
results_lasso = cross_val_score(lassolarscv, X_train, y_train, cv=kfold)
# No `scoring` is given, so the estimator's own score is used, which is R^2 for
# regressors; the 'Accuracy' label is kept to match the recorded output.
print('Accuracy:', results_lasso.mean())
print("Done: {:.1f} s".format(time.time() - t0))

Accuracy: 0.56071456294
Done: 3.1 s


## GradientBoostingRegressor stand-alone model

In [9]:
# Cross-validate a stand-alone GradientBoostingRegressor and report the mean fold score.
t0 = time.time()
seed = 420
# the seed makes the stochastic parts of the booster (subsample / max_features) repeatable
gradient = GradientBoostingRegressor(
    learning_rate=0.001, loss="huber", max_depth=3,
    max_features=0.55, min_samples_leaf=18, min_samples_split=14,
    subsample=0.7, random_state=seed)
# Fit on the full training set; cross_val_score below is not affected by this
# fit (it refits per fold), but the time spent here is included in "Done".
gradient.fit(X_train, y_train)
# KFold only consults random_state when shuffle=True. With the default
# shuffle=False the seed never influenced the splits, and newer scikit-learn
# raises ValueError for that combination, so it is not passed.
kfold = KFold(n_splits=10)
results_gradient = cross_val_score(gradient, X_train, y_train, cv=kfold)
# No `scoring` is given, so the estimator's own score is used, which is R^2 for
# regressors; the 'Accuracy' label is kept to match the recorded output.
print('Accuracy:', results_gradient.mean())
print("Done: {:.1f} s".format(time.time() - t0))

Accuracy: 0.0882573111935
Done: 38.6 s


## LassoLarsCV double-stacked model

In [10]:
# Two-stage stack: the first LassoLarsCV's prediction is prepended to the
# features (via StackingEstimator) and a second LassoLarsCV is fitted on top.
# NOTE(review): `normalize` was deprecated and later removed from LassoLarsCV
# in newer scikit-learn releases; confirm against the installed version.
stacked_pipeline_1 = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    LassoLarsCV()
)

# stacked model is trained without the extra features
t0 = time.time()
# Fit on the full training set; cross_val_score below is not affected by this
# fit (it refits per fold), but the time spent here is included in "Done".
stacked_pipeline_1.fit(X_train, y_train)
seed = 420
# KFold only consults random_state when shuffle=True. With the default
# shuffle=False the seed never influenced the splits, and newer scikit-learn
# raises ValueError for that combination, so it is not passed.
kfold = KFold(n_splits=10)
results_pipe = cross_val_score(stacked_pipeline_1, X_train, y_train, cv=kfold)
# No `scoring` is given, so the final estimator's own score is used, which is
# R^2 for regressors; the 'Accuracy' label is kept to match the recorded output.
print('Accuracy:', results_pipe.mean())
print("Done: {:.1f} s".format(time.time() - t0))

Accuracy: 0.573467511832
Done: 7.5 s


## GradientBoostingRegressor / LassoLarsCV stacked model

In [12]:
# Two-stage stack: the gradient booster's prediction is prepended to the
# features (via StackingEstimator) and a LassoLarsCV is fitted on top.
seed = 420
stacked_pipeline_1 = make_pipeline(
    StackingEstimator(estimator=GradientBoostingRegressor(
        learning_rate=0.001, loss="huber", max_depth=3,
        max_features=0.55, min_samples_leaf=18, min_samples_split=14,
        subsample=0.7, random_state=seed)),
    LassoLarsCV()
)

# stacked model is trained without the extra features
t0 = time.time()
# Fit on the full training set; cross_val_score below is not affected by this
# fit (it refits per fold), but the time spent here is included in "Done".
stacked_pipeline_1.fit(X_train, y_train)
# KFold only consults random_state when shuffle=True. With the default
# shuffle=False the seed never influenced the splits, and newer scikit-learn
# raises ValueError for that combination, so it is not passed.
kfold = KFold(n_splits=10)
results_pipe = cross_val_score(stacked_pipeline_1, X_train, y_train, cv=kfold)
# No `scoring` is given, so the final estimator's own score is used, which is
# R^2 for regressors; the 'Accuracy' label is kept to match the recorded output.
print('Accuracy:', results_pipe.mean())
print("Done: {:.1f} s".format(time.time() - t0))

Accuracy: 0.595300388475
Done: 44.7 s


## LassoLarsCV / GradientBoostingRegressor stacked model

In [13]:
# Two-stage stack: LassoLarsCV's prediction is prepended to the features
# (via StackingEstimator) and the gradient booster is the final estimator.
# NOTE(review): `normalize` was deprecated and later removed from LassoLarsCV
# in newer scikit-learn releases; confirm against the installed version.
seed = 420
stacked_pipeline_1 = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    GradientBoostingRegressor(
        learning_rate=0.001, loss="huber", max_depth=3,
        max_features=0.55, min_samples_leaf=18, min_samples_split=14,
        subsample=0.7, random_state=seed))

# stacked model is trained without the extra features
t0 = time.time()
# Fit on the full training set; cross_val_score below is not affected by this
# fit (it refits per fold), but the time spent here is included in "Done".
stacked_pipeline_1.fit(X_train, y_train)
# KFold only consults random_state when shuffle=True. With the default
# shuffle=False the seed never influenced the splits, and newer scikit-learn
# raises ValueError for that combination, so it is not passed.
kfold = KFold(n_splits=10)
results_pipe = cross_val_score(stacked_pipeline_1, X_train, y_train, cv=kfold)
# No `scoring` is given, so the final estimator's own score is used, which is
# R^2 for regressors; the 'Accuracy' label is kept to match the recorded output.
print('Accuracy:', results_pipe.mean())
print("Done: {:.1f} s".format(time.time() - t0))

Accuracy: 0.0904126700999
Done: 45.2 s


## LassoLarsCV / GradientBoostingRegressor / LassoLarsCV stacked model

In [14]:
# Three-stage stack: LassoLarsCV -> GradientBoostingRegressor -> LassoLarsCV.
# Each StackingEstimator prepends its estimator's prediction to the features
# seen by the next stage.
# NOTE(review): `normalize` was deprecated and later removed from LassoLarsCV
# in newer scikit-learn releases; confirm against the installed version.
seed = 420
stacked_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=GradientBoostingRegressor(
        learning_rate=0.001, loss="huber", max_depth=3,
        max_features=0.55, min_samples_leaf=18, min_samples_split=14,
        subsample=0.7, random_state=seed)),
    LassoLarsCV()
)
# stacked model is trained without the extra features
t0 = time.time()
# KFold only consults random_state when shuffle=True. With the default
# shuffle=False the seed never influenced the splits, and newer scikit-learn
# raises ValueError for that combination, so it is not passed.
kfold = KFold(n_splits=10)
results = cross_val_score(stacked_pipeline, X_train, y_train, cv=kfold)
# No `scoring` is given, so the final estimator's own score is used, which is
# R^2 for regressors; the 'Accuracy' label is kept to match the recorded output.
print('Accuracy:', results.mean())
print("Done: {:.1f} s".format(time.time() - t0))

Accuracy: 0.59736964941
Done: 43.8 s


## 2xLassoLarsCV / GradientBoostingRegressor / LassoLarsCV stacked model

In [15]:
# Four-stage stack: LassoLarsCV -> LassoLarsCV -> GradientBoostingRegressor
# -> LassoLarsCV. Each StackingEstimator prepends its estimator's prediction
# to the features seen by the next stage.
# NOTE(review): `normalize` was deprecated and later removed from LassoLarsCV
# in newer scikit-learn releases; confirm against the installed version.
seed = 420
stacked_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=GradientBoostingRegressor(
        learning_rate=0.001, loss="huber", max_depth=3,
        max_features=0.55, min_samples_leaf=18, min_samples_split=14,
        subsample=0.7, random_state=seed)),
    LassoLarsCV()
)
# stacked model is trained without the extra features
t0 = time.time()
# KFold only consults random_state when shuffle=True. With the default
# shuffle=False the seed never influenced the splits, and newer scikit-learn
# raises ValueError for that combination, so it is not passed.
kfold = KFold(n_splits=10)
results = cross_val_score(stacked_pipeline, X_train, y_train, cv=kfold)
# No `scoring` is given, so the final estimator's own score is used, which is
# R^2 for regressors; the 'Accuracy' label is kept to match the recorded output.
print('Accuracy:', results.mean())
print("Done: {:.1f} s".format(time.time() - t0))

Accuracy: 0.596577261699
Done: 43.0 s


## LassoLarsCV / GradientBoostingRegressor / 2xLassoLarsCV stacked model

In [16]:
# Four-stage stack: LassoLarsCV -> GradientBoostingRegressor -> LassoLarsCV
# -> LassoLarsCV. Each StackingEstimator prepends its estimator's prediction
# to the features seen by the next stage.
# NOTE(review): `normalize` was deprecated and later removed from LassoLarsCV
# in newer scikit-learn releases; confirm against the installed version.
seed = 420
stacked_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=GradientBoostingRegressor(
        learning_rate=0.001, loss="huber", max_depth=3,
        max_features=0.55, min_samples_leaf=18, min_samples_split=14,
        subsample=0.7, random_state=seed)),
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    LassoLarsCV()
)
# stacked model is trained without the extra features
t0 = time.time()
# KFold only consults random_state when shuffle=True. With the default
# shuffle=False the seed never influenced the splits, and newer scikit-learn
# raises ValueError for that combination, so it is not passed.
kfold = KFold(n_splits=10)
results = cross_val_score(stacked_pipeline, X_train, y_train, cv=kfold)
# No `scoring` is given, so the final estimator's own score is used, which is
# R^2 for regressors; the 'Accuracy' label is kept to match the recorded output.
print('Accuracy:', results.mean())
print("Done: {:.1f} s".format(time.time() - t0))

Accuracy: 0.597468073476
Done: 42.9 s


Of the tested models and stacking configurations, this one gives the second-highest cross-validation score (0.597468). The next configuration scores highest (0.597472), but the improvement is only marginal and comes at the cost of an additional stacking stage. To reduce the risk of overfitting, we select this simpler model as the best stacking model of the group.

## LassoLarsCV / GradientBoostingRegressor / 3xLassoLarsCV stacked model

In [17]:
# Five-stage stack: LassoLarsCV -> GradientBoostingRegressor -> LassoLarsCV
# -> LassoLarsCV -> LassoLarsCV. Each StackingEstimator prepends its
# estimator's prediction to the features seen by the next stage.
# NOTE(review): `normalize` was deprecated and later removed from LassoLarsCV
# in newer scikit-learn releases; confirm against the installed version.
seed = 420
stacked_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=GradientBoostingRegressor(
        learning_rate=0.001, loss="huber", max_depth=3,
        max_features=0.55, min_samples_leaf=18, min_samples_split=14,
        subsample=0.7, random_state=seed)),
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    LassoLarsCV()
)
# stacked model is trained without the extra features
t0 = time.time()
# KFold only consults random_state when shuffle=True. With the default
# shuffle=False the seed never influenced the splits, and newer scikit-learn
# raises ValueError for that combination, so it is not passed.
kfold = KFold(n_splits=10)
results = cross_val_score(stacked_pipeline, X_train, y_train, cv=kfold)
# No `scoring` is given, so the final estimator's own score is used, which is
# R^2 for regressors; the 'Accuracy' label is kept to match the recorded output.
print('Accuracy:', results.mean())
print("Done: {:.1f} s".format(time.time() - t0))

Accuracy: 0.597471532978
Done: 46.4 s
