In [15]:
import pandas as pd
import numpy as np
from scipy.stats import skew
import xgboost as xgb
from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, Lasso
from sklearn.model_selection import GridSearchCV
from math import sqrt

In [16]:
TARGET = 'SalePrice'
NFOLDS = 4
SEED = 0
SUBMISSION_FILE = '/Users/danielfriar/Desktop/CSML/applied_ml/Applied_ML/house_prices_regression/submissions/xgboost_submission.csv'

train = pd.read_csv('/Users/danielfriar/Desktop/CSML/applied_ml/Applied_ML/house_prices_regression/data/train_clean.csv')
test = pd.read_csv('/Users/danielfriar/Desktop/CSML/applied_ml/Applied_ML/house_prices_regression/data/test_clean.csv')
ntrain = train.shape[0]
ntest = test.shape[0]

In [17]:
y_train = np.log(train[TARGET]+1)  # log response variables

train.drop([TARGET], axis=1, inplace=True)
all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'], test.loc[:,'MSSubClass':'SaleCondition']))

# # Remove possible outliers
# outlier_indices = train[train.LotFrontage<300].index

# # Polynomial features
# all_data['carsSq'] = all_data.GarageCars ** 2
# train['carsSq'] = train.GarageCars ** 2

# all_data['gAreaSq'] = all_data.GarageArea ** 2
# train['gAreaSq'] = train.GarageArea ** 2

# train['fullBathSq'] = train.FullBath ** 2
# all_data['fullBathSq']= all_data.FullBath ** 2

# train['oQualSq'] = train.OverallQual ** 2
# all_data['oQualSq'] = all_data.OverallQual ** 2


# Try adding interaction terms between numeric variables
all_data['interQual'] = all_data.OverallQual * all_data.OverallCond
train['interQual'] = train.OverallQual * train.OverallCond

all_data['RemodFlrSF'] = all_data.YearRemodAdd * all_data.X2ndFlrSF
train['RemodFlrSF'] = train.YearRemodAdd * train.X2ndFlrSF

all_data['Basement'] = all_data.TotalBsmtSF * all_data.BsmtUnfSF
train['Basement'] = train.TotalBsmtSF * train.BsmtUnfSF

all_data['Ground'] = all_data.GrLivArea * all_data.YearBuilt
train['Ground'] = train.GrLivArea * train.YearBuilt

all_data['Bath'] = all_data.FullBath * all_data.YearBuilt
train['Bath'] = train.FullBath * train.YearBuilt

all_data['AboveGround'] = all_data.TotRmsAbvGrd * all_data.BedroomAbvGr
train['AboveGround'] = train.TotRmsAbvGrd * train.BedroomAbvGr

all_data['livingArea'] = all_data.TotalBsmtSF * all_data.GrLivArea
train['livingArea'] = train.TotalBsmtSF * train.GrLivArea

all_data['years'] = all_data.YearBuilt * all_data.YearRemodAdd
train['years'] = train.YearBuilt * train.YearRemodAdd

all_data['Lot'] = all_data.X1stFlrSF * all_data.LotFrontage
train['Lot'] = train.X1stFlrSF * train.LotFrontage

# # Log transform skewed numeric variables
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

all_data = pd.get_dummies(all_data)

# Create matrices for scikit learn
x_train = np.array(all_data[:train.shape[0]])
x_test = np.array(all_data[train.shape[0]:])

# Create cross val object
kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED)

In [18]:
# Wrapper class for scikit learn
class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))

In [19]:
def get_oof(clf):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [20]:
# Model parameters
et_params = {
    'n_jobs': 40,
    'n_estimators': 100,
    'max_features': 0.6,
    'max_depth': 16,
    'min_samples_leaf': 1,
}

rf_params = {
    'n_jobs': 16,
    'n_estimators': 100,
    'max_features': 0.6,
    'max_depth': 16,
    'min_samples_leaf': 2,
}

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'reg:linear',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'rmse',
    'nrounds': 1000
}

rd_params={
    'alpha': 10
}

ls_params={
    'alpha': 0.0008
}

In [None]:
# Running cross validation
xg = XgbWrapper(seed=SEED, params=xgb_params)
et = SklearnWrapper(clf=ExtraTreesRegressor, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestRegressor, seed=SEED, params=rf_params)
rd = SklearnWrapper(clf=Ridge, seed=SEED, params=rd_params)
ls = SklearnWrapper(clf=Lasso, seed=SEED, params=ls_params)

xg_oof_train, xg_oof_test = get_oof(xg)  # xgboost
et_oof_train, et_oof_test = get_oof(et)  # extra trees
rf_oof_train, rf_oof_test = get_oof(rf)  # random forest
rd_oof_train, rd_oof_test = get_oof(rd)  # ridge regression
ls_oof_train, ls_oof_test = get_oof(ls)  # lasso

# # Save predictions to CSV files
# np.savetxt("predictions/xg.csv", xg_oof_train, delimiter=",")
# np.savetxt("predictions/et.csv", et_oof_train, delimiter=",")
# np.savetxt("predictions/rf.csv", rf_oof_train, delimiter=",")
# np.savetxt("predictions/rd.csv", rd_oof_train, delimiter=",")
# np.savetxt("predictions/ls.csv", ls_oof_train, delimiter=",")
# y_train.to_csv("predictions/y_train.csv")

print("XG-CV: {}".format(sqrt(mean_squared_error(y_train, xg_oof_train))))
print("ET-CV: {}".format(sqrt(mean_squared_error(y_train, et_oof_train))))
print("RF-CV: {}".format(sqrt(mean_squared_error(y_train, rf_oof_train))))
print("RD-CV: {}".format(sqrt(mean_squared_error(y_train, rd_oof_train))))
print("LS-CV: {}".format(sqrt(mean_squared_error(y_train, ls_oof_train))))

In [31]:
# Stack the models together with XGBoost
x_train = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train, rd_oof_train, ls_oof_train), axis=1)
x_test = np.concatenate((xg_oof_test, et_oof_test, rf_oof_test, rd_oof_test, ls_oof_test), axis=1)

print("{},{}".format(x_train.shape, x_test.shape))

dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.8,
    'silent': 1,
    'subsample': 0.6,
    'learning_rate': 0.01,
    'objective': 'reg:linear',
    'max_depth': 1,
    'num_parallel_tree': 1,
    'min_child_weight': 5,
    'eval_metric': 'rmse',
}

# Optimizing number of rounds with cross val
res = np.array(xgb.cv(xgb_params, dtrain, num_boost_round=1000, nfold=4, seed=SEED, show_stdv=True))

[0]	cv-test-rmse:11.415963+0.022152	cv-train-rmse:11.415981+0.007305
[1]	cv-test-rmse:11.302162+0.022218	cv-train-rmse:11.302182+0.007242
[2]	cv-test-rmse:11.189412+0.022281	cv-train-rmse:11.189432+0.007191
[3]	cv-test-rmse:11.077799+0.022298	cv-train-rmse:11.077817+0.007186
[4]	cv-test-rmse:10.967371+0.022371	cv-train-rmse:10.967393+0.007118
[5]	cv-test-rmse:10.857962+0.022496	cv-train-rmse:10.857983+0.007001
[6]	cv-test-rmse:10.749652+0.022578	cv-train-rmse:10.749673+0.006925
[7]	cv-test-rmse:10.642532+0.022667	cv-train-rmse:10.642554+0.006846
[8]	cv-test-rmse:10.536440+0.022759	cv-train-rmse:10.536459+0.006767
[9]	cv-test-rmse:10.431309+0.022841	cv-train-rmse:10.431330+0.006692
[10]	cv-test-rmse:10.327326+0.022847	cv-train-rmse:10.327348+0.006696
[11]	cv-test-rmse:10.224466+0.022921	cv-train-rmse:10.224486+0.006628
[12]	cv-test-rmse:10.122467+0.023036	cv-train-rmse:10.122492+0.006518
[13]	cv-test-rmse:10.021549+0.023065	cv-train-rmse:10.021570+0.006505
[14]	cv-test-rmse:9.921663+0.0

(1460, 5),(1459, 5)


[999]	cv-test-rmse:0.119493+0.010639	cv-train-rmse:0.105965+0.003559


In [32]:
best_nrounds = res.shape[0] - 1
cv_mean = res[-1]
cv_std = res[-1]

print('Ensemble-CV: {0}+{1}'.format(cv_mean, cv_std))

Ensemble-CV: [999]	cv-test-rmse:0.119493+0.010639	cv-train-rmse:0.105965+0.003559+[999]	cv-test-rmse:0.119493+0.010639	cv-train-rmse:0.105965+0.003559


In [101]:
# # Grid search to optimize xgboost params

# cv_params = {
#     'learning_rate': [0.01],
#     'colsample_bytree': [0.7, 0.8, 0.9],
#     'min_child_weight': [5, 6, 7, 8],
#     'max_depth' : [1, 2, 3],
#     'subsample': [0.6, 0.7, 0.8]
# }

# cv = StratifiedKFold(y_train)
# grid = GridSearchCV(xgb.XGBRegressor(seed=0, silent=1, subsample=0.6, objective='reg:linear'), 
#                                      param_grid=cv_params, scoring='neg_mean_squared_error', cv=cv)
# grid.fit(x_train, y_train)
# print (grid.best_params_)

In [33]:
gbdt = xgb.train(xgb_params, dtrain, best_nrounds)

submission = pd.read_csv(SUBMISSION_FILE)
submission.iloc[:, 1] = gbdt.predict(dtest)
saleprice = np.exp(submission['SalePrice'])-1
submission['SalePrice'] = saleprice
submission.to_csv('xgstacker_submission.csv', index=None)