In [234]:
import io, os, sys, types
from IPython import get_ipython
from nbformat import read
from IPython.core.interactiveshell import InteractiveShell

import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import(
    KFold,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
    GridSearchCV
)


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import(
    train_test_split,
    cross_val_score,
)

from sklearn.linear_model import(
    LassoCV,
    ElasticNetCV,
    LinearRegression
)

from sklearn.feature_selection import(
    VarianceThreshold,
    chi2,
    SelectKBest,
)

from sklearn.ensemble import(
    RandomForestRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor
)

from sklearn.metrics import(
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    explained_variance_score
)

from scipy.stats import skew

import pickle
import xgboost
from sklearn.externals import joblib

In [246]:
# Basic dummy variable dataframe

train = pd.read_csv("train.csv")

# Drop the rows with GrLivArea > 4000 that sold for less than 300000.
train = train.drop(train[(train['GrLivArea']>4000) & (train['SalePrice']<300000)].index)

# NOTE: every fill below assigns the result back to the column. Calling
# ``fillna(..., inplace=True)`` on a column selection is chained assignment:
# it is deprecated in pandas 2.x and does not modify ``train`` at all once
# Copy-on-Write is enabled.
train['GarageYrBlt'] = train['GarageYrBlt'].fillna(train['YearBuilt'])

# Categorical columns whose missing values are replaced by the literal "None"
# (presumably NA means the feature is absent -- confirm against the data dictionary).
for col in ("PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu"):
    train[col] = train[col].fillna("None")
train["MasVnrArea"] = train["MasVnrArea"].fillna(0)

# train["TotalSF"] = train["TotalBsmtSF"] + train["1stFlrSF"] + train["2ndFlrSF"]
# train.drop(["TotalBsmtSF","1stFlrSF","2ndFlrSF"], axis=1, inplace=True)

# Categorical columns imputed with their most frequent value.
for col in ('Electrical', 'KitchenQual', 'Exterior1st', 'Exterior2nd', 'SaleType'):
    train[col] = train[col].fillna(train[col].mode()[0])

# LotFrontage: fill with the median LotFrontage of the same Neighborhood.
train["LotFrontage"] = train.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    train[col] = train[col].fillna('None')
# GarageYrBlt was already filled from YearBuilt above, so the 0 only applies
# to rows where YearBuilt itself is missing.
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    train[col] = train[col].fillna(0)
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    train[col] = train[col].fillna('None')
train["Functional"] = train["Functional"].fillna("Typ")
train = train.drop(['Utilities'], axis=1)

def categorize(df):
    """Cast the code-like numeric columns of ``df`` to strings.

    The columns MSSubClass, OverallCond, YrSold, MoSold and OverallQual are
    converted with ``astype("str")``. ``df`` is modified in place and the same
    object is returned.
    """
    for column in ('MSSubClass', 'OverallCond', 'YrSold', 'MoSold', 'OverallQual'):
        df[column] = df[column].astype("str")
    return df

train = categorize(train)

# Every column that is not of object dtype after categorize()
numeric_feats = train.dtypes[train.dtypes != "object"].index

# Applying logarithmic transform to skewed features
# This code was pulled from Alexandru Papiu:
# https://www.kaggle.com/apapiu/regularized-linear-models

# Keep the names of the numeric columns whose skewness (NaNs dropped) exceeds 0.75
skewed_feats = (
    train[numeric_feats]
    .apply(lambda x: skew(x.dropna()))
    .loc[lambda s: s > 0.75]
    .index
)
train[skewed_feats] = np.log1p(train[skewed_feats])

# Getting dummy variables: one-hot encode, zero-fill what is still missing,
# then discard every indicator column whose name contains '_None'
train_dummies = pd.get_dummies(train).fillna(0)
train_dummies = train_dummies.loc[:, lambda d: ~d.columns.str.contains('_None')]

In [247]:
# Full dataframe into Train-test Split
def full_set_split(df, test_size=0.25, random_state=42):
    """Split a fully prepared frame into train/test features and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns plus ``"SalePrice"`` and ``"Id"``.
    test_size : float, optional
        Forwarded to ``train_test_split``. Defaults to 0.25.
    random_state : int, optional
        Forwarded to ``train_test_split``. Defaults to 42.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as returned by ``train_test_split``.
    """
    # "Id" and the target itself must not be used as features.
    X = df.drop(["SalePrice", "Id"], axis=1)
    # Optional (disabled) collinearity filter: X = vif_filter(X, thresh=5)
    y = df["SalePrice"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    return (X_train, X_test, y_train, y_test)
# Notebook-level split of the dummy-encoded frame; these four names are read
# as globals by the modelling helpers defined in the following cells.
X_train, X_test, y_train, y_test = full_set_split(train_dummies)

In [282]:
from sklearn.pipeline import make_pipeline

def get_models(random_state=None):
    """Generate a library of base learners.

    Parameters
    ----------
    random_state : int or None, optional
        Seed forwarded to the stochastic learners (gradient boosting and
        random forest) so repeated runs can be reproduced. The default
        ``None`` leaves them unseeded.

    Returns
    -------
    dict
        Mapping of model name -> unfitted estimator, in the order
        Lasso, Linear, ElasticNet, GradientBoost, RandomForest5.
    """
    # NOTE(review): ``normalize`` was deprecated in scikit-learn 1.0 and removed
    # in 1.2; on newer versions this needs a scaler in a pipeline instead.
    lasso = LassoCV(normalize=True, cv=6)
    lm = LinearRegression()
    elastic_net = ElasticNetCV(alphas=[.001])
    gbm = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.085,
                                    max_depth=2, max_features=10, loss="huber",
                                    random_state=random_state)
    rf_5 = RandomForestRegressor(n_estimators=500, random_state=random_state)

    return {
        'Lasso': lasso,
        "Linear": lm,
        'ElasticNet': elastic_net,
        'GradientBoost': gbm,
        'RandomForest5': rf_5,
    }


def train_predict(model_list):
    """Fit every model on the training split and predict the test split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``.

    Parameters
    ----------
    model_list : dict
        Mapping of model name -> estimator exposing ``fit`` and ``predict``.

    Returns
    -------
    pandas.DataFrame
        One column per model (named after its key) and one row per test sample.
    """
    # Work from the argument; the previous body iterated the global ``models``,
    # so the models actually passed in were ignored.
    names = list(model_list)
    preds = np.zeros((y_test.shape[0], len(names)))

    print("Fitting models.")
    for i, name in enumerate(names):
        m = model_list[name]
        print("%s..." % name, end=" ", flush=False)
        m.fit(X_train, y_train)
        preds[:, i] = m.predict(X_test)
        print("done")

    print("Done.\n")
    return pd.DataFrame(preds, columns=names)


def rmse_cv(model):
    """Return the per-fold RMSE of ``model`` from 5-fold CV on the global training split."""
    neg_mse = cross_val_score(model, X_train, y_train, scoring="neg_mean_squared_error", cv=5)
    # The scorer reports negated MSE, so flip the sign before taking the root.
    return np.sqrt(-neg_mse)

def score_models(P, y):
    """Score every model in the prediction frame against the true targets.

    Parameters
    ----------
    P : pandas.DataFrame
        One column of predictions per model, rows in the same order as ``y``.
    y : array-like
        True target values.

    Returns
    -------
    dict
        Model name -> RMSE of that model's predictions against ``y``.
    """
    # The previous body ignored both arguments and cross-validated the global
    # ``models`` on the training split instead of scoring the predictions.
    P = pd.DataFrame(P)
    # Compare by position: predictions and targets may carry different indexes.
    y_true = np.asarray(y, dtype=float)

    print("Scoring models.")
    scores = {}
    for name in P.columns:
        residual = np.asarray(P[name], dtype=float) - y_true
        scores[name] = float(np.sqrt(np.mean(residual ** 2)))
        print(name)
        print(scores[name])
    return scores

In [274]:
# models = get_models()
# P = train_predict(models)
# score_models(P, y_test)
# P

In [275]:
def train_base_learners(base_learners, inp, out):
    """Fit each learner of the ``base_learners`` dict, in order, on ``(inp, out)``."""
    print("Fitting models.")
    for label in base_learners:
        print("%s..." % label, end=" ", flush=False)
        base_learners[label].fit(inp, out)
        print("done")

In [276]:
def predict_base_learners(pred_base_learners, inp):
    """Build the prediction matrix of the base learners for ``inp``.

    Returns an array with one row per row of ``inp`` and one column per
    learner, columns following the iteration order of the dict.
    """
    n_rows, n_learners = inp.shape[0], len(pred_base_learners)
    P = np.zeros((n_rows, n_learners))
    print("Generating base learner predictions.")
    for column, label in enumerate(pred_base_learners):
        print("%s..." % label, end=" ", flush=False)
        P[:, column] = pred_base_learners[label].predict(inp)
        print("done")

    return P

In [285]:
def ensemble_predict(base_learners, meta_learner, inp):
    """Run the stacked ensemble on ``inp``.

    Returns a tuple ``(base prediction matrix, meta learner prediction)``.
    """
    base_matrix = predict_base_learners(base_learners, inp)
    final_prediction = meta_learner.predict(base_matrix)
    return base_matrix, final_prediction

In [278]:
# P_pred, p = ensemble_predict(base_learners, meta_learner, xtest)

In [279]:
from sklearn.base import clone

def stacking(base_learners, meta_learner, X, y, generator):
    """Simple training routine for stacking.

    Parameters
    ----------
    base_learners : dict
        Name -> estimator. These instances are fitted on all of ``(X, y)``
        and returned for use at prediction time.
    meta_learner : estimator
        Fitted on the cross-validated base-learner predictions.
    X, y : array-like
        Training features and targets. pandas objects are converted to
        arrays first because the folds are selected by position.
    generator : object
        Cross-validation splitter; ``generator.split(X)`` must yield
        ``(train_idx, test_idx)`` pairs.

    Returns
    -------
    tuple
        ``(base_learners, meta_learner)``, both fitted.
    """
    # ``X[idx, :]`` / ``y[idx]`` below are positional; on a DataFrame / Series
    # they would raise or look up labels, so unwrap pandas inputs up front.
    if isinstance(X, pd.DataFrame):
        X = X.to_numpy()
    if isinstance(y, (pd.Series, pd.DataFrame)):
        y = y.to_numpy()

    # Train final base learners for test time
    print("Fitting final base learners...", end="")
    train_base_learners(base_learners, X, y)
    print("done")

    # Generate predictions for training the meta learner
    print("Generating cross-validated predictions...")
    cv_preds, cv_y = [], []
    for i, (train_idx, test_idx) in enumerate(generator.split(X)):

        fold_xtrain, fold_ytrain = X[train_idx, :], y[train_idx]
        fold_xtest, fold_ytest = X[test_idx, :], y[test_idx]

        # Fresh clones per fold, so the fold models never touch the final learners
        fold_base_learners = {name: clone(model)
                              for name, model in base_learners.items()}
        train_base_learners(
            fold_base_learners, fold_xtrain, fold_ytrain)

        fold_P_base = predict_base_learners(
            fold_base_learners, fold_xtest)

        cv_preds.append(fold_P_base)
        cv_y.append(fold_ytest)
        print("Fold %i done" % (i + 1))

    print("CV-predictions done")

    # Predictions and targets are stacked in the same fold order, so rows stay aligned
    cv_preds = np.vstack(cv_preds)
    cv_y = np.hstack(cv_y)

    # Train meta learner
    print("Fitting meta learner...", end="")
    meta_learner.fit(cv_preds, cv_y)
    print("done")

    return base_learners, meta_learner

In [280]:
# Template for the second-stage (meta) model; the stacking call clones it before
# fitting, so this instance itself stays unfitted. No random_state is set here.
meta_learner = GradientBoostingRegressor(n_estimators=2000)

In [286]:
from sklearn.model_selection import KFold

# Train with stacking: base learners come fresh from get_models(), the meta
# learner is a clone of the template, folds come from a 3-fold KFold.
cv_base_learners, cv_meta_learner = stacking(
    get_models(), clone(meta_learner), X_train.values, y_train.values, KFold(3))

# Predict on the raw array too, matching how the learners were fitted above
# (they saw ``X_train.values``, i.e. no feature names, at fit time).
P_pred, p = ensemble_predict(cv_base_learners, cv_meta_learner, X_test.values)

Fitting final base learners...Fitting models.
Lasso... done
Linear... done
ElasticNet... done
GradientBoost... done
RandomForest5... done
done
Generating cross-validated predictions...
Fitting models.
Lasso... done
Linear... done
ElasticNet... done
GradientBoost... done
RandomForest5... done
Generating base learner predictions.
Lasso... done
Linear... done
ElasticNet... done
GradientBoost... done
RandomForest5... done
Fold 1 done
Fitting models.
Lasso... done
Linear... done
ElasticNet... done
GradientBoost... done
RandomForest5... done
Generating base learner predictions.
Lasso... done
Linear... done
ElasticNet... done
GradientBoost... done
RandomForest5... done
Fold 2 done
Fitting models.
Lasso... done
Linear... done
ElasticNet... done
GradientBoost... done
RandomForest5... done
Generating base learner predictions.
Lasso... done
Linear... done
ElasticNet... done
GradientBoost... done
RandomForest5... done
Fold 3 done
CV-predictions done
Fitting meta learner...done
Generating base lear

In [287]:
def rmse_cv_stack(model, X, y):
    """Return the per-fold RMSE of ``model`` from 10-fold cross-validation on ``(X, y)``."""
    neg_mse = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=10)
    # The scorer reports negated MSE, so flip the sign before taking the root.
    return np.sqrt(-neg_mse)
# NOTE(review): cross_val_score refits clones of the meta learner on folds of the
# *test-set* base predictions, so this figure is not the hold-out error of the
# ensemble fitted above (and it never uses ``p``). Kept for comparison.
print(rmse_cv_stack(cv_meta_learner, P_pred, y_test).mean())

# Hold-out RMSE of the fitted stacked ensemble: its predictions vs. the true targets.
print(np.sqrt(mean_squared_error(y_test, p)))

0.14501375363992536
