In [6]:
import pandas as pd
import numpy as np
import random

import warnings
warnings.filterwarnings('ignore')

seed = 42

# Data Preparation

In [7]:
# prepare the dataset with a subset of features
path = '../data/preprocessed_house_prices'
df = pd.read_csv(path, index_col=0)
selected_feature_names = ['ExterQual', 'GrLivArea', 'TotRmsAbvGrd', 'Total_Bathrooms', 'MedianHousePrice', 'OverallQual']
X = df[selected_feature_names]
y = df['SalePrice']
X.head(3)

Unnamed: 0,ExterQual,GrLivArea,TotRmsAbvGrd,Total_Bathrooms,MedianHousePrice,OverallQual
0,4,1710,8,3.5,197200.0,7
1,3,1262,6,2.5,218000.0,6
2,4,1786,6,3.5,197200.0,7


In [8]:
X.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ExterQual,1456.0,3.393544,0.571667,2.0,3.0,3.0,4.0,5.0
GrLivArea,1456.0,1509.201923,506.902735,334.0,1128.0,1458.5,1775.25,4476.0
TotRmsAbvGrd,1456.0,6.508242,1.615199,2.0,5.0,6.0,7.0,14.0
Total_Bathrooms,1456.0,2.207074,0.781582,1.0,2.0,2.0,2.5,6.0
MedianHousePrice,1456.0,174718.457418,55916.967685,88000.0,135000.0,179900.0,197200.0,315000.0
OverallQual,1456.0,6.093407,1.377107,1.0,5.0,6.0,7.0,10.0


In [9]:
from sklearn.model_selection import train_test_split

# prepare the base_layer_model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
categorical_features = X.select_dtypes(include='object').columns.to_list()
numerical_features = X.select_dtypes(include='number').columns.to_list()

# Base Layer Model Building

In [10]:
from fatih_regression import create_pipelines
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor

In [11]:
algorithms_scaled = [Ridge(alpha=10), 
                     SVR(kernel='rbf', gamma=0.01, C=100)] 

algorithms_unscaled = [LGBMRegressor(random_state=seed, verbose=-1, learning_rate=0.1, n_estimators=50, num_leaves=11), 
                       GradientBoostingRegressor(random_state=seed, subsample=0.8, n_estimators=50, max_depth=3, learning_rate=0.1)]

base_pipelines = create_pipelines(algorithms_scaled, algorithms_unscaled, X, y)

#for example
base_pipelines['SVR']

In [12]:
from fatih_regression import calculate_scores, calculate_score

calculate_scores(base_pipelines, X_train, X_test, y_train, y_test)

Unnamed: 0,Ridge_train,Ridge_test,Ridge_CV,SVR_train,SVR_test,SVR_CV,LGBMR_train,LGBMR_test,LGBMR_CV,Gradi_train,Gradi_test,Gradi_CV
R2,0.82863,0.819573,0.821959,0.860392,0.823881,0.840798,0.889229,0.828972,0.83859,0.888821,0.831124,0.848101
-mae,0.124344,0.124497,-0.125609,0.110217,0.121703,-0.116929,0.097108,0.118362,-0.116142,0.098754,0.117259,-0.112679
-mse,0.027393,0.028637,-0.028014,0.022316,0.027954,-0.025031,0.017706,0.027146,-0.025395,0.017771,0.026804,-0.023852
-rmse,0.165507,0.169226,-0.167169,0.149385,0.167194,-0.158094,0.133064,0.164759,-0.159288,0.13331,0.163719,-0.154351


In [13]:
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import cross_validate, GridSearchCV

base_model = VotingRegressor(list(base_pipelines.items()), weights=[0.3, 0.2, 0.3, 0.2])

cv_scores_base = pd.DataFrame(cross_validate(base_model, X_train, y_train, scoring=['r2', 'neg_root_mean_squared_error'])).iloc[:,2:]
cv_scores_base.mean()

test_r2                             0.848938
test_neg_root_mean_squared_error   -0.153973
dtype: float64

In [14]:
base_model.fit(X_train, y_train)

In [15]:
import pickle

with open("../models/base_model.pkl", "wb") as file:
    pickle.dump(base_model, file)

In [16]:
# Create a new feature with the predictions from the blended model
X_train_stacked = X_train.copy()
X_train_stacked['Blended_Predictions'] = base_model.predict(X_train)

# Create a new feature with the predictions from the blended model on the test set
X_test_stacked = X_test.copy()
X_test_stacked['Blended_Predictions'] = base_model.predict(X_test)

X_stacked = pd.concat([X_train_stacked, X_test_stacked], axis=0)
y_stacked = pd.concat([y_train, y_test], axis=0)

print(X_stacked.shape)
print(y_stacked.shape)

X_stacked.head(3)

(1456, 7)
(1456,)


Unnamed: 0,ExterQual,GrLivArea,TotRmsAbvGrd,Total_Bathrooms,MedianHousePrice,OverallQual,Blended_Predictions
254,3,1314,5,2.0,140000.0,5,11.850917
1065,4,2260,7,3.5,200250.0,7,12.496154
637,3,1387,7,2.0,119000.0,5,11.737014


# Meta Layer Model Building

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_stacked, y_stacked, test_size=0.2, random_state=42)

In [18]:
gb_params = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [2,3,4,5],
    'subsample': [0.8, 0.9, 1.0]
}

grid_search = GridSearchCV(GradientBoostingRegressor(random_state=seed), 
                           param_grid = gb_params, 
                           cv=5, n_jobs=-1, scoring='neg_root_mean_squared_error')

grid_search.fit(X_stacked, y_stacked)
meta_model = grid_search.best_estimator_
meta_model

In [19]:
with open("../models/meta_model.pkl", "wb") as file:
    pickle.dump(meta_model, file)

In [20]:
pd.DataFrame(cross_validate(meta_model, X_stacked, y_stacked, cv=5, n_jobs=-1, scoring=['r2', 'neg_root_mean_squared_error'])).iloc[:,2:].mean()

test_r2                             0.866484
test_neg_root_mean_squared_error   -0.145308
dtype: float64