# Set model

In [1]:
#Visualizing tools
import seaborn as sns
import matplotlib.pyplot as plt

#preprocessing tools
import pandas as pd
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

#ML Algorithm
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
import sklearn.linear_model as linear_model
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from mlxtend.regressor import StackingCVRegressor

In [2]:
# Load the preprocessed dataset; later cells expect it to contain a "SalePrice" column.
df = pd.read_csv(filepath_or_buffer='preprossed_data/preprossed_data.csv')

We use scikit-learn's `cross_val_score` function. However, this function has no `shuffle` argument, so we add one line of code to shuffle the dataset prior to cross-validation.

In [3]:
# Separate the feature columns from the regression target.
X = df.drop(columns="SalePrice")
y = df["SalePrice"]

# Hold out 30% of the rows for the final evaluation.
# The seed is fixed so that the split -- and therefore every score computed
# from it -- is identical after "Restart Kernel -> Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [4]:
# Seeded, shuffled 12-fold splitter; reused by cv_rmse and by RidgeCV.
kf = KFold(n_splits=12, shuffle=True, random_state=42)

# Define error metrics
def rmsle(y_test, y_pred):
    """Return the square root of the mean squared error of ``y_pred`` vs ``y_test``.

    No logarithm is taken here, so the result is an RMSLE only when both
    arguments are already on a log scale (presumably SalePrice was
    log-transformed during preprocessing -- TODO confirm).
    """
    mse = mean_squared_error(y_test, y_pred)
    return np.sqrt(mse)

def cv_rmse(model, X=None, y=None):
    """Return the per-fold cross-validated RMSE of ``model``.

    Parameters
    ----------
    model : estimator
        Estimator or pipeline handed unchanged to ``cross_val_score``.
    X : optional
        Feature matrix to cross-validate on. When omitted, the notebook-level
        ``X_train`` is used.
    y : optional
        Targets aligned row-for-row with ``X``. When omitted, the
        notebook-level ``y_train`` is used; pass ``y`` explicitly whenever
        ``X`` is not the training split.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold of the notebook-level ``kf`` splitter.
    """
    # Defaults are resolved at call time rather than in the signature, so the
    # function always sees the current training split instead of whatever
    # object happened to be bound when the cell defining it was executed.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    # scikit-learn reports the *negated* MSE for this scorer; flip the sign
    # before taking the square root.
    neg_mse = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-neg_mse)

In [5]:
# Ridge Regressor
# Candidate regularisation strengths handed to RidgeCV (original order kept).
ridge_alphas = [
    1e-15, 1e-10, 1e-8,
    9e-4, 7e-4, 5e-4, 3e-4, 1e-4, 1e-3,
    5e-2, 1e-2, 0.1, 0.3,
    1, 3, 5, 10, 15, 18, 20, 30, 50, 75, 100,
]
# RobustScaler step followed by RidgeCV, which selects alpha using the shared `kf` folds.
ridge = make_pipeline(
    RobustScaler(),
    RidgeCV(cv=kf, alphas=ridge_alphas),
)

# Support Vector Regressor, preceded by a RobustScaler step in the same pipeline
svr = make_pipeline(
    RobustScaler(),
    SVR(C=20, epsilon=0.008, gamma=0.0003),
)

# Light Gradient Boosting Regressor
lightgbm = LGBMRegressor(
    objective='regression',
    # tree size and boosting schedule
    num_leaves=6,
    learning_rate=0.01,
    n_estimators=7000,
    max_bin=200,
    # bagging settings
    bagging_fraction=0.8,
    bagging_freq=4,
    bagging_seed=8,
    # feature-fraction settings
    feature_fraction=0.2,
    feature_fraction_seed=8,
    min_sum_hessian_in_leaf=11,
    # logging and reproducibility
    verbose=-1,
    random_state=42,
)


# Gradient Boosting Regressor (Huber loss, seeded)
gbr = GradientBoostingRegressor(
    loss='huber',
    learning_rate=0.01,
    n_estimators=6000,
    max_depth=4,
    max_features='sqrt',
    min_samples_split=10,
    min_samples_leaf=15,
    random_state=42,
)

# XGBoost Regressor
# - 'reg:squarederror' is the current name of the squared-error objective;
#   'reg:linear' is its deprecated alias.
# - n_jobs is the scikit-learn-API name for the thread count (replaces the
#   deprecated nthread alias); -1 asks for all available threads.
# - Exactly one seed is passed. Supplying both `seed` and `random_state`
#   leaves it to the installed XGBoost version which one takes effect, so
#   only random_state=42 is kept, matching the other models in this notebook.
xgboost = XGBRegressor(learning_rate=0.01, n_estimators=3460,
                       max_depth=3, min_child_weight=0,
                       gamma=0, subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:squarederror', n_jobs=-1,
                       scale_pos_weight=1,
                       reg_alpha=0.00006, random_state=42)

# StackingCVRegressor
# Base learners are xgboost, svr, ridge and gbr (lightgbm is not part of the stack);
# the same xgboost estimator object is also passed as the meta-regressor.
stackReg = StackingCVRegressor(
    regressors=(xgboost, svr, ridge, gbr),
    meta_regressor=xgboost,
    use_features_in_secondary=True,
    random_state=42,
)

In [6]:
# Mean RMSE per model, keyed by a short model name; filled in by the cells that follow.
model_score = {}

# LightGBM: per-fold CV RMSE on the training split, then a refit on that split.
# Despite the "_full_data" suffix, the model is fitted on X_train / y_train only.
score = cv_rmse(lightgbm)
lgb_model_full_data = lightgbm.fit(X_train, y_train)
model_score['lgb'] = score.mean()
print(f"lightgbm: {model_score['lgb']:.4f}")

lightgbm: 0.0654


In [7]:
# XGBoost: per-fold CV RMSE on the training split, then a refit on that split.
# Despite the "_full_data" suffix, the model is fitted on X_train / y_train only.
score = cv_rmse(xgboost)
xgb_model_full_data = xgboost.fit(X_train, y_train)
# Only the mean is reported, so the format string carries a single field and
# no closing parenthesis.
print("xgboost: {:.4f}".format(score.mean()))
model_score['xgb'] = score.mean()

xgboost: 0.0654)


In [8]:
# SVR: per-fold CV RMSE on the training split (mean and standard deviation
# are both printed), then a refit on that split.
# Despite the "_full_data" suffix, the model is fitted on X_train / y_train only.
score = cv_rmse(svr)
svr_model_full_data = svr.fit(X_train, y_train)
model_score['svr'] = score.mean()
print(f"SVR: {score.mean():.4f} ({score.std():.4f})")

SVR: 0.0635 (0.0245)


In [9]:
# Ridge: per-fold CV RMSE on the training split, then a refit on that split.
# Despite the "_full_data" suffix, the model is fitted on X_train / y_train only.
score = cv_rmse(ridge)
ridge_model_full_data = ridge.fit(X_train, y_train)
model_score['ridge'] = score.mean()
print(f"ridge: {model_score['ridge']:.4f}")

ridge: 0.0662


In [10]:
# Gradient boosting: per-fold CV RMSE on the training split, then a refit on that split.
# Despite the "_full_data" suffix, the model is fitted on X_train / y_train only.
score = cv_rmse(gbr)
gbr_model_full_data = gbr.fit(X_train, y_train)
model_score['gbr'] = score.mean()
print(f"gbr: {model_score['gbr']:.4f}")

gbr: 0.0646


In [11]:
# Fit the stacked ensemble on the training split. It is given plain NumPy
# arrays rather than the pandas objects; blended_predictions converts its
# input the same way before calling this model.
stack_reg_model = stackReg.fit(
    np.array(X_train),
    np.array(y_train),
)



In [12]:
# Sanity check: (rows, columns) of the training split
X_train.shape

(1022, 258)

In [13]:
# Sanity check: (rows, columns) of the hold-out split
X_test.shape

(438, 258)

In [14]:
def blended_predictions(X, weight):
    """Return the weighted sum of the six fitted models' predictions for ``X``.

    Parameters
    ----------
    X : feature matrix
        Passed unchanged to the ridge, svr, gbr, xgb and lgb models; converted
        with ``np.array`` for the stacked regressor only.
    weight : indexable of numbers
        Read at positions 0..5, in this order: ridge, svr, gbr, xgb, lgb,
        stacked regressor. The weights are applied as given -- they are
        neither normalised nor validated here.

    Returns
    -------
    The element-wise weighted sum of the six prediction arrays.
    """
    # (fitted model, input it is given), listed in the same order as the weights.
    members = (
        (ridge_model_full_data, X),
        (svr_model_full_data, X),
        (gbr_model_full_data, X),
        (xgb_model_full_data, X),
        (lgb_model_full_data, X),
        (stack_reg_model, np.array(X)),
    )

    # Accumulate left to right, starting from the first term.
    blend = None
    for position, (model, features) in enumerate(members):
        term = weight[position] * model.predict(features)
        blend = term if blend is None else blend + term
    return blend

In [15]:
# Blended model predictions
# Weight order follows blended_predictions: ridge, svr, gbr, xgb, lgb, stack.
blend_weights = [0.10, 0.10, 0.20, 0.15, 0.15, 0.3]

# Scored on the hold-out split (X_test / y_test), whereas the per-model
# entries already in model_score are cross-validation means on the training split.
blended_score = rmsle(y_test, blended_predictions(X_test, blend_weights))
model_score['blended_model'] = blended_score
print(f"blended score: {blended_score:.4f}")

# Alternative weights (presumably tried earlier): 0.15,0.20,0.10,0.15,0.10,0.30

blended score: 0.0610


In [16]:
# Rank all recorded scores from lowest to highest RMSE.
# 'blended_model' is a hold-out score while the other entries are
# cross-validation means, so the comparison is indicative only.
pd.Series(model_score).sort_values()

blended_model    0.061017
svr              0.063495
gbr              0.064572
lgb              0.065355
xgb              0.065401
ridge            0.066235
dtype: float64

In [17]:
import pickle
from pathlib import Path

In [18]:
# save lgb model
# This is the first cell that writes into ./models, so make sure the directory
# exists; otherwise open() raises FileNotFoundError on a fresh checkout.
Path('./models').mkdir(parents=True, exist_ok=True)
with open('./models/lgb_model_full_data.pkl','wb') as f:
    # The pickle protocol is pinned explicitly instead of relying on the
    # interpreter default.
    pickle.dump(lgb_model_full_data, f, protocol=4)

In [19]:
# Persist the fitted ridge pipeline (pickle protocol pinned to 4)
with Path('./models/ridge_model_full_data.pkl').open('wb') as f:
    pickle.dump(ridge_model_full_data, f, protocol=4)

In [20]:
# Persist the fitted SVR pipeline (pickle protocol pinned to 4)
with Path('./models/svr_model_full_data.pkl').open('wb') as f:
    pickle.dump(svr_model_full_data, f, protocol=4)

In [21]:
# Persist the fitted gradient boosting model (pickle protocol pinned to 4)
with Path('./models/gbr_model_full_data.pkl').open('wb') as f:
    pickle.dump(gbr_model_full_data, f, protocol=4)

In [22]:
# Persist the fitted XGBoost model (pickle protocol pinned to 4)
with Path('./models/xgb_model_full_data.pkl').open('wb') as f:
    pickle.dump(xgb_model_full_data, f, protocol=4)

In [23]:
# Persist the fitted stacked regressor (pickle protocol pinned to 4)
with Path('./models/stack_reg_model.pkl').open('wb') as f:
    pickle.dump(stack_reg_model, f, protocol=4)