In [1]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import PredefinedSplit, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error

from scipy.stats import loguniform as sp_loguniform
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb

import statsmodels.api as sm
import statsmodels.formula.api as smf

import warnings
warnings.filterwarnings('ignore')

Load the Training, Validation, and Test datasets for:   

1. Multi - family Homes  
2. Industrial Type properties   
   

In [2]:
#MULTIFAMILY get training, validation and test data
X_train_mf = pd.read_csv('../processing/X_train_mf.csv')
X_train_mf.index = X_train_mf['Unnamed: 0']
X_train_mf = X_train_mf.drop(['Unnamed: 0'], axis=1)

X_val_mf = pd.read_csv('../processing/X_val_mf.csv')
X_val_mf.index = X_val_mf['Unnamed: 0']
X_val_mf = X_val_mf.drop(['Unnamed: 0'], axis=1)

X_test_mf = pd.read_csv('../processing/X_test_mf.csv')
X_test_mf.index = X_test_mf['Unnamed: 0']
X_test_mf = X_test_mf.drop(['Unnamed: 0'], axis=1)

y_train_mf = pd.read_csv('../processing/y_train_mf.csv')
y_train_mf.index = y_train_mf['Unnamed: 0']
y_train_mf = y_train_mf.drop(['Unnamed: 0'], axis=1)

y_val_mf = pd.read_csv('../processing/y_val_mf.csv')
y_val_mf.index = y_val_mf['Unnamed: 0']
y_val_mf = y_val_mf.drop(['Unnamed: 0'], axis=1)

y_test_mf = pd.read_csv('../processing/y_test_mf.csv')
y_test_mf.index = y_test_mf['Unnamed: 0']
y_test_mf = y_test_mf.drop(['Unnamed: 0'], axis=1)

In [3]:
#INDUSTRIAL get training, validation and test data
X_train_id = pd.read_csv('../processing/X_train_id.csv')
X_train_id.index = X_train_id['Unnamed: 0']
X_train_id = X_train_id.drop(['Unnamed: 0'], axis=1)

X_val_id = pd.read_csv('../processing/X_val_id.csv')
X_val_id.index = X_val_id['Unnamed: 0']
X_val_id = X_val_id.drop(['Unnamed: 0'], axis=1)

X_test_id = pd.read_csv('../processing/X_test_id.csv')
X_test_id.index = X_test_id['Unnamed: 0']
X_test_id = X_test_id.drop(['Unnamed: 0'], axis=1)

y_train_id = pd.read_csv('../processing/y_train_id.csv')
y_train_id.index = y_train_id['Unnamed: 0']
y_train_id = y_train_id.drop(['Unnamed: 0'], axis=1)

y_val_id = pd.read_csv('../processing/y_val_id.csv')
y_val_id.index = y_val_id['Unnamed: 0']
y_val_id = y_val_id.drop(['Unnamed: 0'], axis=1)

y_test_id = pd.read_csv('../processing/y_test_id.csv')
y_test_id.index = y_test_id['Unnamed: 0']
y_test_id = y_test_id.drop(['Unnamed: 0'], axis=1)

Create a predefined training plus validation dataset  

In [4]:
#Multifamily
y_train_plus_val_mf = pd.concat([y_train_mf, y_val_mf])
y_train_plus_val_mf_copy = y_train_plus_val_mf.copy()
y_train_plus_val_mf_copy.columns = ['train_val_split']
y_train_plus_val_mf_copy.loc[y_train_mf.index,'train_val_split'] = -1
y_train_plus_val_mf_copy.loc[y_val_mf.index,'train_val_split'] = 0
val_fold_mf = np.array(y_train_plus_val_mf_copy)
ps_mf = PredefinedSplit(val_fold_mf)

# Multifamily training plus validation set
X_train_plus_val_mf = pd.concat([X_train_mf, X_val_mf])

In [5]:
#Industrial
y_train_plus_val_id = pd.concat([y_train_id, y_val_id])
y_train_plus_val_id_copy = y_train_plus_val_id.copy()
y_train_plus_val_id_copy.columns = ['train_val_split']
y_train_plus_val_id_copy.loc[y_train_id.index,'train_val_split'] = -1
y_train_plus_val_id_copy.loc[y_val_id.index,'train_val_split'] = 0
val_fold_id = np.array(y_train_plus_val_id_copy)
ps_id = PredefinedSplit(val_fold_id)

# Industrial
X_train_plus_val_id = pd.concat([X_train_id, X_val_id])

# Multi - Family Home Modeling

## Linear Regression

In [10]:
lr = LinearRegression().fit(X_train_mf,y_train_mf)
lr.score(X_train_mf,y_train_mf)
mean_squared_error(y_test_mf, lr.predict(X_test_mf),squared=False)

5414.851988058862

## Linear regression with RandomSearch 

In [31]:
# random search for linear regression
param_lr_mf = {'fit_intercept': [True,False],
               'normalize':[True,False]}

lr_cv_mf = RandomizedSearchCV(
           LinearRegression(),
           param_distributions=param_lr_mf,
           n_iter=20,
           cv=ps_mf,
           scoring='neg_root_mean_squared_error', 
           n_jobs=-1,
           random_state=123
).fit(X_train_plus_val_mf, np.array(y_train_plus_val_mf).ravel())

# find best model hyperparameters 
print(lr_cv_mf.best_params_)
print(lr_cv_mf.best_estimator_)
print('MULTIFAMILY Linear min RMSE is: {}'.format(-lr_cv_mf.best_score_))

{'normalize': True, 'fit_intercept': False}
LinearRegression(fit_intercept=False, normalize=True)
MULTIFAMILY Linear min RMSE is: 11493975458447.715


In [38]:
lr_r2_mf = r2_score(y_train_plus_val_mf, 
                    lr_cv_mf.best_estimator_.predict(X_train_plus_val_mf))
print(lr_r2_mf)

0.21042001646250008


## LASSO regression 

In [30]:
# random search for lasso
param_la_mf = {'alpha': sp_uniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

la_cv_mf = RandomizedSearchCV(
        Lasso(),
        param_distributions=param_la_mf,
        n_iter=10,
        cv=ps_mf,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_mf, np.array(y_train_plus_val_mf).ravel())

# find best model hyperparameters 
print(la_cv_mf.best_params_)
print(la_cv_mf.best_estimator_)
print('MULTI FAMILY LASSO min RMSE is: {}'.format(-la_cv_mf.best_score_))

{'alpha': 69.64701855978616, 'fit_intercept': True, 'normalize': True}
Lasso(alpha=69.64701855978616, normalize=True)
MULTI FAMILY LASSO min RMSE is: 587.1247692629556


In [39]:
la_r2 = r2_score(y_train_plus_val_mf, 
                 la_cv_mf.best_estimator_.predict(X_train_plus_val_mf))
print(la_r2)

0.0


## Ridge Regression  

In [29]:
# random search for ridge
param_rd = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

rd_cv = RandomizedSearchCV(
        Ridge(),
        param_distributions=param_rd,
        n_iter=10,
        cv=ps_mf,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_mf, np.array(y_train_plus_val_mf).ravel())

# Find best model hyperparameters 
print(rd_cv.best_params_)
print(rd_cv.best_estimator_)
print('MULTIFAMILY RIDGE min RMSE is: {}'.format(-rd_cv.best_score_))

{'alpha': 1.5094374246471327, 'fit_intercept': True, 'normalize': True}
Ridge(alpha=1.5094374246471327, normalize=True)
MULTIFAMILY RIDGE min RMSE is: 1655.6082440221574


In [40]:
rd_r2 = r2_score(y_train_plus_val_mf, 
                 rd_cv.best_estimator_.predict(X_train_plus_val_mf))
print(rd_r2)

0.15085012723399194


## KNN  Regressor 

In [32]:
# random search for KNeighborsRegressor
param_knn = {'n_neighbors': sp_randint(1,21),
             'weights': ['uniform', 'distance'],
             'leaf_size': sp_randint(1,21)}

knn_cv = RandomizedSearchCV(
         KNeighborsRegressor(),
         param_distributions=param_knn,
         n_iter=10,
         cv=ps_mf,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_mf, np.array(y_train_plus_val_mf).ravel())

# find best model hyperparameters 
print(knn_cv.best_params_)
print(knn_cv.best_estimator_)
print('MULTIFAMILY KNN min RMSE is: {}'.format(-knn_cv.best_score_))

{'leaf_size': 3, 'n_neighbors': 7, 'weights': 'distance'}
KNeighborsRegressor(leaf_size=3, n_neighbors=7, weights='distance')
MULTIFAMILY KNN min RMSE is: 410.45699249138124


In [41]:
knn_r2 = knn_cv.best_estimator_.score(X_train_plus_val_mf,
                                      y_train_plus_val_mf)
print(knn_r2)

1.0


## Random Forest  

In [34]:
# random search for RandomForestRegressor
param_rf = {'n_estimators': [100, 120, 200, 300, 500, 800, 1200],
            'max_depth': [None, 5, 8, 15, 25, 30],
            'min_samples_split': [1, 2, 5, 10 ,15, 100],
            'min_samples_leaf': [1, 2, 5, 10],
            'max_features': ['auto', 'sqrt', 'log2']}

rf_cv = RandomizedSearchCV(
        RandomForestRegressor(),
        param_distributions=param_rf,
        n_iter=10,
        cv=ps_mf,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_mf, np.array(y_train_plus_val_mf).ravel())

# find best model hyperparameters 
print(rf_cv.best_params_)
print(rf_cv.best_estimator_)
print('RF MULTIFAMILY min RMSE is: {}'.format(-rf_cv.best_score_))

{'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': None}
RandomForestRegressor(min_samples_leaf=5, n_estimators=800)
RF MULTIFAMILY min RMSE is: 367.1724158866464


In [42]:
rf_r2 = r2_score(y_train_plus_val_mf, 
                 rf_cv.best_estimator_.predict(X_train_plus_val_mf))
print(rf_r2)

0.8260040239921591


## MLP Regressor  

In [36]:
# random search for MLPRegressor
hl = []
for i in [1, 2, 3, 4, 5, 10, 15, 25, 30]:
    hl.append((i))
    hl.append((i,i))

param_mlp = {'hidden_layer_sizes': hl,
             'learning_rate_init': sp_loguniform(1e-4, 1e2),
             'alpha': sp_loguniform(1e-4, 1e2),
             'batch_size': [1, 5, 10, 25, 100, 500]}

mlp_cv = RandomizedSearchCV(
         MLPRegressor(),
         param_distributions=param_mlp,
         n_iter=10,
         cv=ps_mf,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_mf, np.array(y_train_plus_val_mf).ravel())

# find best model hyperparameters 
print(mlp_cv.best_params_)
print(mlp_cv.best_estimator_)
print('MLPRegressor MULTIFAMILY min RMSE is: {}'.format(-mlp_cv.best_score_))

{'alpha': 2.1610275095525036, 'batch_size': 5, 'hidden_layer_sizes': 2, 'learning_rate_init': 0.005784745785308777}
MLPRegressor(alpha=2.1610275095525036, batch_size=5, hidden_layer_sizes=2,
             learning_rate_init=0.005784745785308777)
MLPRegressor MULTIFAMILY min RMSE is: 477.14378094561096


In [43]:
mlp_r2 = r2_score(y_train_plus_val_mf, 
                  mlp_cv.best_estimator_.predict(X_train_plus_val_mf))
print(mlp_r2)

0.30809743995068795


## XGBoost  

In [10]:
xgparams = {
    'n_estimators':[100, 500 ,1000],
               'max_depth': sp_randint(3, 15),
               'learning_rate_init': sp_loguniform(1e-4, 1e2),
               'min_child_weight': [1, 3, 5, 7]    }

# Set up the random search with cross validation
random_cv = RandomizedSearchCV(
            estimator=xgb.XGBRegressor(nthread=-1),
            param_distributions=xgparams,
            cv=ps_mf,
            scoring='neg_root_mean_squared_error',
            n_iter=10,      
            random_state= 123).fit(X_train_plus_val_mf,  np.array(y_train_plus_val_mf).ravel())

Parameters: { learning_rate_init } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { learning_rate_init } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { learning_rate_init } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { learning_rate_init } might not be used.

  This may not be accurate due to some parameters are only used

In [11]:
# find best model hyperparameters 
print(random_cv.best_params_)
print(random_cv.best_estimator_)
print('XGBoost MULTIFAMILY min RMSE is: {}'.format(-random_cv.best_score_))
xg_r2 = r2_score(y_train_plus_val_mf, 
                  random_cv.best_estimator_.predict(X_train_plus_val_mf))
print(xg_r2)

{'learning_rate_init': 2.1610275095525036, 'max_depth': 4, 'min_child_weight': 3, 'n_estimators': 1000}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, learning_rate_init=2.1610275095525036,
             max_delta_step=0, max_depth=4, min_child_weight=3, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=-1,
             nthread=-1, num_parallel_tree=1, random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)
XGBoost MULTIFAMILY min RMSE is: 339.8281396661046
0.986315073488825



# 'Industrial' Modeling

We perform the same model fitting process for the "Industrial" Property Type.   
Note however that the training set has 1002 rows, validation set has 251 rows, and the test set has 128 rows.  
Since the data is sparse for the "Industrial" property type, we will use a K-fold cross validation where k = 3.  

## Industrial: Linear Regression 

In [53]:
lr_id = LinearRegression().fit(X_train_id, y_train_id)
lr_id.score(X_train_id, y_train_id)
mean_squared_error(y_test_id, lr_id.predict(X_test_id),squared=False)

1210.0662507998659

## Industrial: Linear Regression with RandomSearch  

In [67]:
# random search for linear regression
param_lr_id = {'fit_intercept': [True,False],
               'normalize':[True,False]}

lr_cv_id = RandomizedSearchCV(
           LinearRegression(),
           param_distributions=param_lr_id,
           n_iter=20,
           cv=3,
           scoring='neg_root_mean_squared_error', 
           n_jobs=-1,
           random_state=123
).fit(X_train_plus_val_id, np.array(y_train_plus_val_id).ravel())

# find best model hyperparameters 
print(lr_cv_id.best_params_)
print(lr_cv_id.best_estimator_)
print('INDUSTRIAL Linear min RMSE is: {}'.format(-lr_cv_id.best_score_))

{'normalize': True, 'fit_intercept': False}
LinearRegression(fit_intercept=False, normalize=True)
INDUSTRIAL Linear min RMSE is: 99659602926215.62


In [58]:
lr_r2_id = r2_score(y_train_plus_val_id, 
                    lr_cv_id.best_estimator_.predict(X_train_plus_val_id))
print(lr_r2_id)

0.46799377626407235


## Industrial: LASSO

In [66]:
# random search for lasso
param_la_id = {'alpha': sp_uniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

la_cv_id = RandomizedSearchCV(
        Lasso(),
        param_distributions=param_la_id,
        n_iter=10,
        cv=3,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_id, np.array(y_train_plus_val_id).ravel())

# find best model hyperparameters 
print(la_cv_id.best_params_)
print(la_cv_id.best_estimator_)
print('INDUSTRIAL LASSO min RMSE is: {}'.format(-la_cv_id.best_score_))

{'alpha': 22.68524535642031, 'fit_intercept': True, 'normalize': False}
Lasso(alpha=22.68524535642031)
INDUSTRIAL LASSO min RMSE is: 734.3963170575006


In [60]:
la_r2_id = r2_score(y_train_plus_val_id, 
                 la_cv_id.best_estimator_.predict(X_train_plus_val_id))
print(la_r2_id)

0.40048410902029197


## Industrial: Ridge  

In [65]:
# random search for ridge
param_id = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

rd_cv_id = RandomizedSearchCV(
        Ridge(),
        param_distributions=param_id,
        n_iter=10,
        cv= 3,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_id, np.array(y_train_plus_val_id).ravel())

# Find best model hyperparameters 
print(rd_cv_id.best_params_)
print(rd_cv_id.best_estimator_)
print('INDUSTRIAL RIDGE min RMSE is: {}'.format(-rd_cv_id.best_score_))

{'alpha': 2.074024196289186, 'fit_intercept': False, 'normalize': False}
Ridge(alpha=2.074024196289186, fit_intercept=False)
INDUSTRIAL RIDGE min RMSE is: 715.6038223254658


In [62]:
rd_r2_id = r2_score(y_train_plus_val_id, 
                 rd_cv_id.best_estimator_.predict(X_train_plus_val_id))
print(rd_r2)

0.15085012723399194


## Industrial: KNN 

In [63]:
# random search for KNeighborsRegressor
param_knn_id = {'n_neighbors': sp_randint(1,21),
             'weights': ['uniform', 'distance'],
             'leaf_size': sp_randint(1,21)}

knn_cv_id = RandomizedSearchCV(
         KNeighborsRegressor(),
         param_distributions=param_knn_id,
         n_iter=10,
         cv= 3,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_id, np.array(y_train_plus_val_id).ravel())

# find best model hyperparameters 
print(knn_cv_id.best_params_)
print(knn_cv_id.best_estimator_)
print('INDUSTRIAL KNN min RMSE is: {}'.format(-knn_cv_id.best_score_))

{'leaf_size': 3, 'n_neighbors': 7, 'weights': 'distance'}
KNeighborsRegressor(leaf_size=3, n_neighbors=7, weights='distance')
INDUSTRIAL KNN min RMSE is: 713.66993870114


In [64]:
knn_r2_id = knn_cv_id.best_estimator_.score(X_train_plus_val_id,
                                      y_train_plus_val_id)
print(knn_r2_id)

1.0


## Industrial: Random Forest  

In [68]:
# random search for RandomForestRegressor
param_rf_id = {'n_estimators': [100, 120, 200, 300, 500, 800, 1200],
            'max_depth': [None, 5, 8, 15, 25, 30],
            'min_samples_split': [1, 2, 5, 10 ,15, 100],
            'min_samples_leaf': [1, 2, 5, 10],
            'max_features': ['auto', 'sqrt', 'log2']}

rf_cv_id = RandomizedSearchCV(
        RandomForestRegressor(),
        param_distributions=param_rf_id,
        n_iter=10,
        cv= 3,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_id, np.array(y_train_plus_val_id).ravel())

# find best model hyperparameters 
print(rf_cv_id.best_params_)
print(rf_cv_id.best_estimator_)
print('RF MULTIFAMILY min RMSE is: {}'.format(-rf_cv_id.best_score_))

{'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': None}
RandomForestRegressor(min_samples_leaf=5, n_estimators=800)
RF MULTIFAMILY min RMSE is: 641.7616705564853


In [69]:
rf_r2_id = r2_score(y_train_plus_val_id, 
                 rf_cv_id.best_estimator_.predict(X_train_plus_val_id))
print(rf_r2_id)

0.8042309862546281


## Industrial: MLP Regressor  

In [70]:
# random search for MLPRegressor
hl = []
for i in [1, 2, 3, 4, 5, 10, 15, 25, 30]:
    hl.append((i))
    hl.append((i,i))

param_mlp_id = {'hidden_layer_sizes': hl,
             'learning_rate_init': sp_loguniform(1e-4, 1e2),
             'alpha': sp_loguniform(1e-4, 1e2),
             'batch_size': [1, 5, 10, 25, 100, 500]}

mlp_cv_id = RandomizedSearchCV(
         MLPRegressor(),
         param_distributions=param_mlp_id,
         n_iter=10,
         cv= 3,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_id, np.array(y_train_plus_val_id).ravel())

# find best model hyperparameters 
print(mlp_cv_id.best_params_)
print(mlp_cv_id.best_estimator_)
print('MLPRegressor INDUSTRIAL min RMSE is: {}'.format(-mlp_cv_id.best_score_))

{'alpha': 0.02444924947328454, 'batch_size': 100, 'hidden_layer_sizes': (30, 30), 'learning_rate_init': 0.0012443724365791028}
MLPRegressor(alpha=0.02444924947328454, batch_size=100,
             hidden_layer_sizes=(30, 30),
             learning_rate_init=0.0012443724365791028)
MLPRegressor INDUSTRIAL min RMSE is: 698.7729699890768


In [71]:
mlp_r2_id = r2_score(y_train_plus_val_id, 
                  mlp_cv_id.best_estimator_.predict(X_train_plus_val_id))
print(mlp_r2_id)

0.5280989761105339


## Industrial: XGBoost  

In [12]:
xgparams_id = {
               'n_estimators':[100, 500 ,1000],
               'max_depth': sp_randint(3, 15),
               'learning_rate_init': sp_loguniform(1e-4, 1e2),
               'min_child_weight': [1, 3, 5, 7] }

# Set up the random search with cross validation
xgb_cv_id = RandomizedSearchCV(
            estimator=xgb.XGBRegressor(nthread=-1),
            param_distributions=xgparams,
            cv=3,
            scoring='neg_root_mean_squared_error',
            n_iter=10,      
            random_state= 123).fit(X_train_plus_val_id,  np.array(y_train_plus_val_id).ravel())

Parameters: { learning_rate_init } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { learning_rate_init } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { learning_rate_init } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { learning_rate_init } might not be used.

  This may not be accurate due to some parameters are only used

Parameters: { learning_rate_init } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { learning_rate_init } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { learning_rate_init } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { learning_rate_init } might not be used.

  This may not be accurate due to some parameters are only used

In [13]:
# find best model hyperparameters 
print(xgb_cv_id.best_params_)
print(xgb_cv_id.best_estimator_)
print('XGBoost INDUSTRIAL min RMSE is: {}'.format(-xgb_cv_id.best_score_))
xg_r2_id = r2_score(y_train_plus_val_id, 
                  xgb_cv_id.best_estimator_.predict(X_train_plus_val_id))
print(xg_r2_id)

{'learning_rate_init': 1.5094374246471327, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 1000}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, learning_rate_init=1.5094374246471327,
             max_delta_step=0, max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=-1,
             nthread=-1, num_parallel_tree=1, random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)
XGBoost INDUSTRIAL min RMSE is: 651.899892600659
0.9999999999944897


# Retrain the two best models with the tuned hyper parameters on the train + validation sets   

### Multi-Family  

In [18]:
# train with best hyper parameters on 
#both training and validation sets
xf_xgb = random_cv.fit(X_train_plus_val_mf,  
                       np.array(y_train_plus_val_mf).ravel())

Parameters: { learning_rate_init } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { learning_rate_init } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { learning_rate_init } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { learning_rate_init } might not be used.

  This may not be accurate due to some parameters are only used

In [23]:
#predict on the hold out test set
mf_preds = xf_xgb.predict(X_test_mf)
np.sqrt(mean_squared_error(y_test_mf, mf_preds)) 

360.5010705665698

## Industrial

In [25]:
#fit the model on train + val
id_xgb = xgb_cv_id.fit(X_train_plus_val_id,  
                       np.array(y_train_plus_val_id).ravel())

Parameters: { learning_rate_init } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { learning_rate_init } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { learning_rate_init } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { learning_rate_init } might not be used.

  This may not be accurate due to some parameters are only used

Parameters: { learning_rate_init } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { learning_rate_init } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { learning_rate_init } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { learning_rate_init } might not be used.

  This may not be accurate due to some parameters are only used

In [26]:
#predict on the hold out test set
id_preds = xf_xgb.predict(X_test_id)
np.sqrt(mean_squared_error(y_test_id, id_preds)) 

ValueError: feature_names mismatch: ['basements_None', 'basements_full', 'basements_partial', 'central_air_N', 'central_air_Y', 'exterior_condition_above average', 'exterior_condition_average', 'exterior_condition_below average', 'exterior_condition_new', 'exterior_condition_none', 'exterior_condition_rehabbed', 'exterior_condition_vacant', 'garage_type_attached', 'garage_type_builtin', 'garage_type_converted', 'garage_type_detached', 'garage_type_none', 'house_number_North or East', 'house_number_South or West', 'interior_condition_above average', 'interior_condition_average', 'interior_condition_below average', 'interior_condition_new', 'interior_condition_none', 'interior_condition_sealed', 'interior_condition_vacant', 'site_type_A', 'site_type_B', 'site_type_C', 'site_type_D', 'site_type_F', 'street_designation_AVE', 'street_designation_BLV', 'street_designation_DR', 'street_designation_LA', 'street_designation_Other', 'street_designation_PL', 'street_designation_RD', 'street_designation_ST', 'topography_A', 'topography_C', 'topography_D', 'topography_E', 'topography_F', 'type_heater_A', 'type_heater_B', 'type_heater_C', 'type_heater_D', 'type_heater_E', 'type_heater_G', 'type_heater_H', 'type_heater_None', 'view_type_A', 'view_type_B', 'view_type_C', 'view_type_D', 'view_type_E', 'view_type_H', 'view_type_I', 'view_type_None', 'depth', 'fireplaces', 'frontage', 'garage_spaces', 'geographic_ward', 'number_of_bathrooms', 'number_of_bedrooms', 'number_of_rooms', 'number_stories', 'total_area', 'total_livable_area', 'zip_code', 'lat', 'lng', 'sale_year', 'sale_month', 'sale_week', 'sale_dow', 'age', 'bath_total_ratio', 'bed_total_ratio', 'livable_area_ratio'] ['basements_None', 'basements_full', 'basements_partial', 'central_air_N', 'central_air_Y', 'exterior_condition_above average', 'exterior_condition_average', 'exterior_condition_below average', 'exterior_condition_new', 'exterior_condition_none', 'exterior_condition_rehabbed', 'exterior_condition_vacant', 'garage_type_attached', 'garage_type_builtin', 'garage_type_detached', 'garage_type_none', 'house_number_North or East', 'house_number_South or West', 'interior_condition_above average', 'interior_condition_average', 'interior_condition_below average', 'interior_condition_new', 'interior_condition_none', 'interior_condition_sealed', 'interior_condition_vacant', 'site_type_A', 'site_type_B', 'site_type_D', 'site_type_E', 'site_type_F', 'street_designation_AVE', 'street_designation_BLV', 'street_designation_DR', 'street_designation_LA', 'street_designation_Other', 'street_designation_PL', 'street_designation_RD', 'street_designation_ST', 'topography_A', 'topography_E', 'topography_F', 'type_heater_A', 'type_heater_B', 'type_heater_H', 'type_heater_None', 'view_type_A', 'view_type_B', 'view_type_C', 'view_type_D', 'view_type_E', 'view_type_I', 'view_type_None', 'depth', 'fireplaces', 'frontage', 'garage_spaces', 'geographic_ward', 'number_of_bathrooms', 'number_of_bedrooms', 'number_of_rooms', 'number_stories', 'total_area', 'total_livable_area', 'zip_code', 'lat', 'lng', 'sale_year', 'sale_month', 'sale_week', 'sale_dow', 'age', 'bath_total_ratio', 'bed_total_ratio', 'livable_area_ratio']
expected view_type_H, topography_C, type_heater_G, topography_D, type_heater_D, type_heater_E, site_type_C, garage_type_converted, type_heater_C in input data
training data did not have the following fields: site_type_E