# Initialization

In [138]:
# %load data_cleaning.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

# sklearn imports
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

# import custom modules
import model_params as mp
import data_cleaning as dc

# load data and take a look at it
df_train = pd.read_csv("./data/train.csv")
df_test = pd.read_csv("./data/test.csv")

# Data Cleaning

In [139]:
# Dropping useless columns
trainID = df_train['Id']
df_train.drop('Id', axis=1, inplace=True)
testID = df_test['Id']
df_test.drop('Id', axis=1, inplace=True)

# Removing outliers
df_train = df_train[df_train['GrLivArea'] < 4000]

# Target feature and Log transform the target for official scoring
y_train = np.log1p(df_train['SalePrice'].values)
df_train.drop('SalePrice', axis=1, inplace=True)

In [140]:
# Dataframes length
len_train = len(df_train)
len_test = len(df_test)

In [141]:
# Putting together the dataframes for easier cleaning
df_all = [df_train, df_test]

In [142]:
mean_group_cols = ['LotFrontage', 'MasVnrArea', 'BsmtUnfSF', 'BsmtUnfSF']
grp = 'Neighborhood'
for df in df_all:
    dc.replace_nan(df, mean_group_cols, 'mean_group', group=grp)

In [143]:
mode_group_cols = ['MSZoning', 'Utilities', 'Electrical', 'Exterior1st', 'Exterior2nd', 'Functional']
grp = 'Neighborhood'
for df in df_all:
    dc.replace_nan(df, mode_group_cols, 'mode_group', group=grp)

In [144]:
none_cols = ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
             'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC',
             'Fence', 'MiscFeature']
for df in df_all:
    dc.replace_nan(df, none_cols, 'const', value='None')

In [145]:
zero_cols = ['GarageCars', 'GarageArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath',
             'TotalBsmtSF']
for df in df_all:
    dc.replace_nan(df, zero_cols, 'const', value=0)

In [146]:
mode_cols = ['KitchenQual', 'SaleType']
for df in df_all:
    dc.replace_nan(df, mode_cols, 'mode')

In [147]:
num_to_cat_cols = ['MSSubClass', 'OverallCond', 'YrSold', 'MoSold']
for df in df_all:
    dc.numerical_to_categorical(df, num_to_cat_cols)

In [148]:
# Number of NaNs
print(df_train.isna().sum())

MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
                ..
MiscVal          0
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
Length: 79, dtype: int64


In [149]:
# Enconding and labeling train and test dataframes together
df_enc = df_train.append(df_test, ignore_index=True)

In [150]:
encode_cols = ['FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 'ExterQual',
            'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 'BsmtFinType2',
            'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope', 'LotShape',
            'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
            'YrSold', 'MoSold']
dc.encode_labels(df_enc, encode_cols)

In [151]:
# Inserting dummy variables to categorical features
df_enc = pd.get_dummies(df_enc)

In [153]:
# Splitting dataframes again into test and train sets
df_train = df_enc[:len_train]
df_test = df_enc[len_train:]

In [154]:
print(df_train.shape)

(1456, 324)


In [155]:
print(df_test.shape)

(1459, 324)


# Training data splitting

In [156]:
# Some useful parameters which will come in handy later on
ntrain = df_train.shape[0]
ntest = df_test.shape[0]
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(n_splits= NFOLDS)

In [157]:
#Validation function
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True).get_n_splits(df_train.values)
    rmse= np.sqrt(-cross_val_score(model, df_train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [158]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))

In [159]:
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))

In [160]:
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

In [161]:
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

In [162]:
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)

In [163]:
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

In [164]:
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


Lasso score: 0.1126 (0.0065)



In [165]:
lasso.fit(df_train, y_train)
pred = lasso.predict(df_test)

In [167]:
sub = pd.DataFrame()
sub['Id'] = testID
sub['SalePrice'] = pred
sub.to_csv('submission.csv',index=False)