In [2]:
#import some necessary librairies

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline
import matplotlib.pyplot as plt  # Matlab-style plotting
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
import warnings
warnings.filterwarnings(action='ignore')


from scipy import stats
from scipy.stats import norm, skew #for some statistics


pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) #Limiting floats output to 3 decimal points


In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
# nominal vars
nominal_vars = [
    'MSZoning', 'LandContour', 'Utilities',
    'LotConfig', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
    'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 
    'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'Electrical', 
    'GarageType', 'MiscFeature', 
    'SaleType', 'SaleCondition'
]

In [5]:
# ranking vars
ranking_vars = [
    'OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual',
    'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'Street', 'Alley',
    'LandSlope', 'Functional', 'GarageFinish', 'MoSold', 'YrSold', 'PavedDrive', 
    'CentralAir', 'LotShape', 'MSSubClass', 
]

In [79]:
# continuous vars
continue_vars = [
    'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
    'BsmtHalfBath', 'FullBath', 'HalfBath', 'TotRmsAbvGrd', 'BedroomAbvGr', 'KitchenAbvGr',
    'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
    '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'GarageYrBlt', 'YearBuilt', 'YearRemodAdd',
    'OverallQual'
]

In [6]:
train.SalePrice = np.log1p(train.SalePrice)

In [7]:
# delete ID
train_id = train.Id
test_id = test.Id
test_idx = test.index
train.drop(['Id'], axis=1, inplace=True)
test.drop(['Id'], axis=1, inplace=True)

In [8]:
train.drop(train.GrLivArea.sort_values(ascending=False)[:2].index, axis=0, inplace=True)

In [9]:
ntrain = train.shape[0]
ntest = test.shape[0]
y_train = train['SalePrice'].values
all_data = pd.concat((train, test), axis=0).reset_index(drop=True)
all_data.drop(['SalePrice'], axis=1, inplace=True)

In [84]:
count = all_data.isnull().sum()
percent = (count / all_data.isnull().count() * 100).sort_values(ascending=False)
missing_table = pd.DataFrame({'percent': percent})
missing_table.head(10)

Unnamed: 0,percent
PoolQC,99.691
MiscFeature,96.4
Alley,93.212
Fence,80.425
FireplaceQu,48.68
LotFrontage,16.661
GarageCond,5.451
GarageQual,5.451
GarageYrBlt,5.451
GarageFinish,5.451


In [85]:
# specific value replacements
all_data["PoolQC"] = all_data["PoolQC"].fillna("None")
all_data["MiscFeature"] = all_data["MiscFeature"].fillna("None")
all_data["Alley"] = all_data["Alley"].fillna("None")
all_data["Fence"] = all_data["Fence"].fillna("None")
all_data["FireplaceQu"] = all_data["FireplaceQu"].fillna("None")

for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    all_data[col] = all_data[col].fillna('None')

for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    all_data[col] = all_data[col].fillna(0)
    
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
    all_data[col] = all_data[col].fillna(0)
    
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    all_data[col] = all_data[col].fillna('None')

all_data["MasVnrType"] = all_data["MasVnrType"].fillna("None")
all_data["MasVnrArea"] = all_data["MasVnrArea"].fillna(0)
all_data["Functional"] = all_data["Functional"].fillna("Typ")
all_data['MSSubClass'] = all_data['MSSubClass'].fillna("None")

In [86]:
# mean value replacements
all_data['MSZoning'] = all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])
all_data['Electrical'] = all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])
all_data['KitchenQual'] = all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])
all_data['Exterior1st'] = all_data['Exterior1st'].fillna(all_data['Exterior1st'].mode()[0])
all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna(all_data['Exterior2nd'].mode()[0])
all_data['SaleType'] = all_data['SaleType'].fillna(all_data['SaleType'].mode()[0])

all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))

In [87]:
all_data.drop(['Utilities'], axis=1, inplace=True)

In [88]:
# Complete
all_data.isnull().sum().sum()

0

In [89]:
all_data['HasMasVnr'] = all_data.MasVnrArea.apply(lambda x: 1 if x else 0)
all_data['Has2ndFlrSF'] = all_data['2ndFlrSF'].apply(lambda x: 1 if x else 0)
all_data['HasGarageArea'] = all_data['GarageArea'].apply(lambda x: 1 if x else 0)
all_data['HasWoodDeckSF'] = all_data['WoodDeckSF'].apply(lambda x: 1 if x else 0)
all_data['HasOpenPorchSF'] = all_data['OpenPorchSF'].apply(lambda x: 1 if x else 0)
all_data['HasEnclosedPorch'] = all_data['EnclosedPorch'].apply(lambda x: 1 if x else 0)
all_data['Has3SsnPorch'] = all_data['3SsnPorch'].apply(lambda x: 1 if x else 0)
all_data['HasScreenPorch'] = all_data['ScreenPorch'].apply(lambda x: 1 if x else 0)
all_data['HasPoolArea'] = all_data['PoolArea'].apply(lambda x: 1 if x else 0)
all_data['HasMiscVal'] = all_data['MiscVal'].apply(lambda x: 1 if x else 0)

all_data['TotalBath'] = all_data['BsmtFullBath'] + all_data['BsmtHalfBath'] * 0.5\
+ all_data['FullBath'] + all_data['HalfBath'] * 0.5
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
#all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['2ndFlrSF']
all_data['TotalBsmtSF'] = all_data['TotalBsmtSF'] + all_data['BsmtFinSF1'] \
+ all_data['BsmtFinSF2']

In [90]:
nominal_vars = list(set(nominal_vars) - set(['Utilities']))

In [91]:
all_data[nominal_vars] = all_data[nominal_vars].astype(str)
all_data[ranking_vars] = all_data[ranking_vars].astype(str)

In [92]:
from sklearn.preprocessing import LabelEncoder

for c in ranking_vars:
    lbl = LabelEncoder() 
    lbl.fit(list(all_data[c].values)) 
    all_data[c] = lbl.transform(list(all_data[c].values))

In [93]:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness.head()

Unnamed: 0,Skew
MiscVal,21.94
PoolArea,17.689
HasPoolArea,15.495
LotArea,13.109
LowQualFinSF,12.085


In [94]:
skewness = skewness[abs(skewness) > 0.75]

from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    all_data[feat] = boxcox1p(all_data[feat], lam)

In [95]:
all_data = pd.get_dummies(all_data)
print(all_data.shape)

(2917, 231)


In [96]:
X_train = all_data[:ntrain]
X_test = all_data[ntrain:]

In [97]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.model_selection import KFold, GridSearchCV, train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.metrics import mean_squared_error as mse
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor as rfr, GradientBoostingRegressor as gbr
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingCVRegressor

import optuna
from functools import partial


In [98]:
n_folds = 5

def rmsle(model, x, y):
    return np.sqrt(mean_squared_error(y, model.predict(x)))

def rmsle_cv(model, x, y):
    #kf = KFold(n_splits=5, shuffle=True, random_state=42)
    kf = 5
    rmse =np.sqrt(-cross_val_score(model, x.values, y, scoring='neg_mean_squared_error', cv=kf))
    return rmse

In [99]:
pipe_lasso = Pipeline([
    ('scaler', RobustScaler()),
    ('model', Lasso())
])
pipe_enet = Pipeline([
    ('scaler', RobustScaler()),
    ('model', ElasticNet(max_iter=5000))
])
pipe_krr = Pipeline([
    ('scaler', RobustScaler()),
    ('model', KernelRidge())
])

grid_param_lasso = [{
    'model__alpha': 0.0001 * np.arange(1, 100)
}]
grid_param_enet = [{
    'model__alpha': 0.0001 * np.arange(1, 100),
    'model__l1_ratio': 0.001 * np.arange(1, 10)
}]
grid_param_krr = [{
    'model__alpha': list(0.0001 * np.arange(1, 100)) + [0.6],
    'model__degree': [1, 2, 3],
    'model__kernel': ['polynomial'],
    'model__coef0': [2.5]
}]

In [100]:
best_params = {
    'Lasso': None,
    'ElasticNet': None,
    'Kernel Ridge': None
}

In [70]:
search_lasso = GridSearchCV(pipe_lasso, grid_param_lasso, scoring='neg_mean_squared_error', n_jobs=-1).fit(X_train, y_train)
best_params['Lasso'] = search_lasso.best_params_

In [71]:
search_enet = GridSearchCV(pipe_enet, grid_param_enet, scoring='neg_mean_squared_error', n_jobs=-1).fit(X_train, y_train)
best_params['ElasticNet'] = search_enet.best_params_

In [101]:
search_krr = GridSearchCV(pipe_krr, grid_param_krr, scoring='neg_mean_squared_error', n_jobs=-1).fit(X_train, y_train)
best_params['Kernel Ridge'] = search_krr.best_params_

In [102]:
best_params

{'Lasso': None,
 'ElasticNet': None,
 'Kernel Ridge': {'model__alpha': 0.0099,
  'model__coef0': 2.5,
  'model__degree': 1,
  'model__kernel': 'polynomial'}}

In [67]:
model_lasso = Pipeline([
    ('scaler', RobustScaler()),
    ('model', Lasso(alpha=0.0005))
])

model_enet = Pipeline([
    ('scaler', RobustScaler()),
    ('model', ElasticNet(alpha=0.0089, l1_ratio=0.009000000000000001, random_state=3))
])

model_krr = Pipeline([
    ('scaler', RobustScaler()),
    ('model', KernelRidge(alpha=0.6,
                        kernel='polynomial',
                        degree=2,
                        coef0=2.5))
])

In [26]:
model_xgbr = XGBRegressor(colsample_bytree=0.4, learning_rate=0.00898718134841855, max_depth=8, 
                             n_estimators=2200, reg_alpha=0.036142628805195254, reg_lambda=0.03188665185506858,
                             subsample=0.6, random_state =42)

In [27]:
model_gbr = gbr(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state=5)

In [28]:
model_lgbm = LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

In [29]:
stack_gen = StackingCVRegressor(regressors=(model_lgbm, model_lasso, model_enet, model_krr, model_gbr),
                               meta_regressor=model_xgbr,
                               use_features_in_secondary=True)

In [30]:
models = [
    model_lasso, model_enet, model_krr, model_gbr, model_xgbr, model_lgbm
]
cross_score = {
    'Lasso': 0,
    'ElasticNet': 0,
    'Kernel Ridge': 0,
    'GradientBoosting': 0,
    'XGBoost': 0,
    'LightGBM': 0,
}

for idx, model in enumerate(models):
    cross_score[list(cross_score.keys())[idx]] = rmsle_cv(model, X_train, y_train).mean()



In [31]:
cross_score

{'Lasso': 0.11146742693188405,
 'ElasticNet': 0.11334756941313837,
 'Kernel Ridge': 0.11846942905147206,
 'GradientBoosting': 0.11638980595469176,
 'XGBoost': 0.11370977524666484,
 'LightGBM': 0.11457622325507826}

In [32]:
for model in models:
    model = model.fit(X_train, y_train)



In [33]:
stack_gen = stack_gen.fit(X_train, y_train)



In [34]:
def blend(X):
    return ((0.15 * model_lasso.predict(X)) + \
            (0.15 * model_enet.predict(X)) + \
            (0.05 * model_krr.predict(X)) + \
            (0.15 * model_xgbr.predict(X)) + \
            (0.15 * model_lgbm.predict(X)) + \
            (0.35 * stack_gen.predict(np.array(X))))

In [35]:
np.sqrt(mse(y_train, blend(X_train)))

0.046255324643040246

In [36]:
sub = pd.DataFrame()
sub['Id'] = test_id
sub['SalePrice'] = score = np.expm1(blend(X_test))
sub.to_csv('submission.csv',index=False)