In [1]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb

from sklearn.model_selection import PredefinedSplit, RandomizedSearchCV, KFold, StratifiedKFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from scipy.stats import loguniform as sp_loguniform
from scipy.stats import randint as sp_randint

import statsmodels.api as sm
import statsmodels.formula.api as smf

import warnings
warnings.filterwarnings('ignore')

### Experiment on Single Family

In [2]:
# get training, validation and test data for single family
X_train_sf = pd.read_csv('X_train_sf.csv')
X_train_sf.index = X_train_sf['Unnamed: 0']
X_train_sf = X_train_sf.drop(['Unnamed: 0'], axis=1)
X_val_sf = pd.read_csv('X_val_sf.csv')
X_val_sf.index = X_val_sf['Unnamed: 0']
X_val_sf = X_val_sf.drop(['Unnamed: 0'], axis=1)
X_test_sf = pd.read_csv('X_test_sf.csv')
X_test_sf.index = X_test_sf['Unnamed: 0']
X_test_sf = X_test_sf.drop(['Unnamed: 0'], axis=1)
y_train_sf = pd.read_csv('y_train_sf.csv')
y_train_sf.index = y_train_sf['Unnamed: 0']
y_train_sf = y_train_sf.drop(['Unnamed: 0'], axis=1)
y_val_sf = pd.read_csv('y_val_sf.csv')
y_val_sf.index = y_val_sf['Unnamed: 0']
y_val_sf = y_val_sf.drop(['Unnamed: 0'], axis=1)
y_test_sf = pd.read_csv('y_test_sf.csv')
y_test_sf.index = y_test_sf['Unnamed: 0']
y_test_sf = y_test_sf.drop(['Unnamed: 0'], axis=1)

In [3]:
# create a predefined validation set for random search
y_train_plus_val_sf = pd.concat([y_train_sf, y_val_sf])
y_train_plus_val_sf_copy = y_train_plus_val_sf.copy()
y_train_plus_val_sf_copy.columns = ['train_val_split']
y_train_plus_val_sf_copy.loc[y_train_sf.index,'train_val_split'] = -1
y_train_plus_val_sf_copy.loc[y_val_sf.index,'train_val_split'] = 0
val_fold_sf = np.array(y_train_plus_val_sf_copy)
ps_sf = PredefinedSplit(val_fold_sf)

# get training plus validation set
X_train_plus_val_sf = pd.concat([X_train_sf, X_val_sf])

In [10]:
# create a function to calculate baseline RMSE and MAE
def rmse(l):
    return np.sqrt((np.sum((l-np.mean(l))**2))/len(l))
def mae(l):
    return np.sum(np.abs(l-np.mean(l)))/len(l)

### Single Family: Linear regression

In [4]:
# random search for linear regression
param_lr_sf = {'fit_intercept': [True,False],
               'normalize':[True,False]}

lr_cv_sf = RandomizedSearchCV(
           LinearRegression(),
           param_distributions=param_lr_sf,
           n_iter=10,
           cv=ps_sf,
           scoring='neg_root_mean_squared_error', 
           n_jobs=-1,
           random_state=123
).fit(X_train_plus_val_sf, np.array(y_train_plus_val_sf).ravel())

# find best model hyperparameters 
print(lr_cv_sf.best_params_)
print(lr_cv_sf.best_estimator_)
print('Min RMSE for linear regression on Single Family is: {}'.format(-lr_cv_sf.best_score_))

{'normalize': True, 'fit_intercept': False}
LinearRegression(fit_intercept=False, normalize=True)
Min RMSE for linear regression on Single Family is: 23479006242.85987


In [11]:
# MAE of best model
print(mean_absolute_error(y_val_sf, lr_cv_sf.best_estimator_.predict(X_val_sf)))

# RMSE and MAE of baseline model
print(rmse(y_val_sf))
print(mae(y_val_sf))

106.65993138395191
sale_price    290.504546
dtype: float64
sale_price    155.263946
dtype: float64


### Single Family: Lasso

In [12]:
# random search for lasso
param_la_sf = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

la_cv_sf = RandomizedSearchCV(
        Lasso(),
        param_distributions=param_la_sf,
        n_iter=10,
        cv=ps_sf,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_sf, np.array(y_train_plus_val_sf).ravel())

# find best model hyperparameters 
print(la_cv_sf.best_params_)
print(la_cv_sf.best_estimator_)
print('Min RMSE for lasso on Single Family is: {}'.format(-la_cv_sf.best_score_))

{'alpha': 0.07684071705306554, 'fit_intercept': False, 'normalize': True}
Lasso(alpha=0.07684071705306554, fit_intercept=False, normalize=True)
Min RMSE for lasso on Single Family is: 237.02107253707626


In [13]:
# MAE of best model
print(mean_absolute_error(y_val_sf, la_cv_sf.best_estimator_.predict(X_val_sf)))

# RMSE and MAE of baseline model
print(rmse(y_val_sf))
print(mae(y_val_sf))

106.50077190341187
sale_price    290.504546
dtype: float64
sale_price    155.263946
dtype: float64


### Single Family: Ridge

In [14]:
# random search for ridge
param_rd_sf = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

rd_cv_sf = RandomizedSearchCV(
        Ridge(),
        param_distributions=param_rd_sf,
        n_iter=10,
        cv=ps_sf,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_sf, np.array(y_train_plus_val_sf).ravel())

# Find best model hyperparameters 
print(rd_cv_sf.best_params_)
print(rd_cv_sf.best_estimator_)
print('Min RMSE fort ridge on Single Family is: {}'.format(-rd_cv_sf.best_score_))

{'alpha': 76.66289057556017, 'fit_intercept': True, 'normalize': False}
Ridge(alpha=76.66289057556017)
Min RMSE fort ridge on Single Family is: 236.94456670464658


In [15]:
# MAE of best model
print(mean_absolute_error(y_val_sf, rd_cv_sf.best_estimator_.predict(X_val_sf)))

# RMSE and MAE of baseline model
print(rmse(y_val_sf))
print(mae(y_val_sf))

106.58155691099871
sale_price    290.504546
dtype: float64
sale_price    155.263946
dtype: float64


### Single Family: KNeighborsRegressor

In [16]:
# random search for KNeighborsRegressor
param_knn_sf = {'n_neighbors': sp_randint(1,21),
             'weights': ['uniform', 'distance'],
             'leaf_size': sp_randint(1,21)}

knn_cv_sf = RandomizedSearchCV(
         KNeighborsRegressor(),
         param_distributions=param_knn_sf,
         n_iter=10,
         cv=ps_sf,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_sf, np.array(y_train_plus_val_sf).ravel())

# find best model hyperparameters 
print(knn_cv_sf.best_params_)
print(knn_cv_sf.best_estimator_)
print('Min RMSE for Knn regressor on Single Family is: {}'.format(-knn_cv_sf.best_score_))

{'leaf_size': 5, 'n_neighbors': 18, 'weights': 'distance'}
KNeighborsRegressor(leaf_size=5, n_neighbors=18, weights='distance')
Min RMSE for Knn regressor on Single Family is: 170.7286967851921


In [17]:
# MAE of best model
print(mean_absolute_error(y_val_sf, knn_cv_sf.best_estimator_.predict(X_val_sf)))

# RMSE and MAE of baseline model
print(rmse(y_val_sf))
print(mae(y_val_sf))

0.07647382492493975
sale_price    290.504546
dtype: float64
sale_price    155.263946
dtype: float64


### Single Family: RandomForestRegressor

In [18]:
# random search for RandomForestRegressor
param_rf_sf = {'n_estimators': [100, 120, 200, 300, 500, 800, 1200],
            'max_depth': [None, 5, 8, 15, 25, 30],
            'min_samples_split': [1, 2, 5, 10 ,15, 100],
            'min_samples_leaf': [1, 2, 5, 10],
            'max_features': ['auto', 'sqrt', 'log2']}

rf_cv_sf = RandomizedSearchCV(
        RandomForestRegressor(),
        param_distributions=param_rf_sf,
        n_iter=10,
        cv=ps_sf,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_sf, np.array(y_train_plus_val_sf).ravel())

# find best model hyperparameters 
print(rf_cv_sf.best_params_)
print(rf_cv_sf.best_estimator_)
print('Min RMSE is for random forest regressor on Single Family is: {}'.format(-rf_cv_sf.best_score_))

{'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': None}
RandomForestRegressor(min_samples_leaf=5, n_estimators=800)
Min RMSE is for random forest regressor on Single Family is: 136.8469916397392


In [19]:
# MAE of best model
print(mean_absolute_error(y_val_sf, rf_cv_sf.best_estimator_.predict(X_val_sf)))

# RMSE and MAE of baseline model
print(rmse(y_val_sf))
print(mae(y_val_sf))

34.78543529085287
sale_price    290.504546
dtype: float64
sale_price    155.263946
dtype: float64


### Single Family: MLPRegressor

In [None]:
# random search for MLPRegressor
hl = []
for i in [1, 2, 3, 4, 5, 10, 15, 25, 30]:
    hl.append((i))
    hl.append((i,i))

param_mlp_sf = {'hidden_layer_sizes': hl,
             'learning_rate_init': sp_loguniform(1e-4, 1e2),
             'alpha': sp_loguniform(1e-4, 1e2),
             'batch_size': [1, 3, 5, 10, 20, 50, 100, 250, 500]}

mlp_cv_sf = RandomizedSearchCV(
         MLPRegressor(),
         param_distributions=param_mlp_sf,
         n_iter=10,
         cv=ps_sf,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_sf, np.array(y_train_plus_val_sf).ravel())

# find best model hyperparameters 
print(mlp_cv_sf.best_params_)
print(mlp_cv_sf.best_estimator_)
print('Min RMSE for MLP regressor on Single Family is: {}'.format(-mlp_cv_sf.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_sf, mlp_cv_sf.best_estimator_.predict(X_val_sf)))

# RMSE and MAE of baseline model
print(rmse(y_val_sf))
print(mae(y_val_sf))

### Single Family: XGBoost

In [None]:
# random search for XGBoost
param_xg_sf = {'n_estimators':[100, 500 ,1000],
               'max_depth': sp_randint(3, 15),
               'learning_rate_init': sp_loguniform(1e-4, 1e2),
               'min_child_weight': [1, 3, 5, 7]}

xg_cv_sf = RandomizedSearchCV(
            xgb.XGBRegressor(nthread=-1),
            param_distributions=param_xg_sf,
            n_iter=10,
            cv=ps_sf,
            scoring='neg_root_mean_squared_error',
            n_jobs=-1,
            random_state=123
).fit(X_train_plus_val_sf, np.array(y_train_plus_val_sf).ravel())

# find best model hyperparameters 
print(xg_cv_sf.best_params_)
print(xg_cv_sf.best_estimator_)
print('Min RMSE for XGBoost on Single Family is: {}'.format(-xg_cv_sf.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_sf, xg_cv_sf.best_estimator_.predict(X_val_sf)))

# RMSE and MAE of baseline model
print(rmse(y_val_sf))
print(mae(y_val_sf))

### Experiment on Mixed Use

In [12]:
# get training, validation and test data for mixed use
X_train_mx = pd.read_csv('X_train_mx.csv')
X_train_mx.index = X_train_mx['Unnamed: 0']
X_train_mx = X_train_mx.drop(['Unnamed: 0'], axis=1)
X_val_mx = pd.read_csv('X_val_mx.csv')
X_val_mx.index = X_val_mx['Unnamed: 0']
X_val_mx = X_val_mx.drop(['Unnamed: 0'], axis=1)
X_test_mx = pd.read_csv('X_test_mx.csv')
X_test_mx.index = X_test_mx['Unnamed: 0']
X_test_mx = X_test_mx.drop(['Unnamed: 0'], axis=1)
y_train_mx = pd.read_csv('y_train_mx.csv')
y_train_mx.index = y_train_mx['Unnamed: 0']
y_train_mx = y_train_mx.drop(['Unnamed: 0'], axis=1)
y_val_mx = pd.read_csv('y_val_mx.csv')
y_val_mx.index = y_val_mx['Unnamed: 0']
y_val_mx = y_val_mx.drop(['Unnamed: 0'], axis=1)
y_test_mx = pd.read_csv('y_test_mx.csv')
y_test_mx.index = y_test_mx['Unnamed: 0']
y_test_mx = y_test_mx.drop(['Unnamed: 0'], axis=1)

In [13]:
# get training plus validation set
X_train_plus_val_mx = pd.concat([X_train_mx, X_val_mx])
y_train_plus_val_mx = pd.concat([y_train_mx, y_val_mx])

### Mixed Use: Linear regression

In [14]:
# random search for linear regression
param_lr_mx = {'fit_intercept': [True,False],
               'normalize':[True,False]}

lr_cv_mx = RandomizedSearchCV(
           LinearRegression(),
           param_distributions=param_lr_mx,
           n_iter=10,
           cv=5,
           scoring='neg_root_mean_squared_error', 
           n_jobs=-1,
           random_state=123
).fit(X_train_plus_val_mx, np.array(y_train_plus_val_mx).ravel())

# find best model hyperparameters 
print(lr_cv_mx.best_params_)
print(lr_cv_mx.best_estimator_)
print('Min RMSE for linear regression is: {}'.format(-lr_cv_mx.best_score_))

{'normalize': True, 'fit_intercept': False}
LinearRegression(fit_intercept=False, normalize=True)
Min RMSE for linear regression is: 10122191127754.484


In [None]:
# MAE of best model
print(mean_absolute_error(y_val_mx, lr_cv_mx.best_estimator_.predict(X_val_mx)))

# RMSE and MAE of baseline model
print(rmse(y_val_mx))
print(mae(y_val_mx))

### Mixed Use: Lasso

In [15]:
# random search for lasso
param_la_mx = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

la_cv_mx = RandomizedSearchCV(
        Lasso(),
        param_distributions=param_la_mx,
        n_iter=10,
        cv=5,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_mx, np.array(y_train_plus_val_mx).ravel())

# find best model hyperparameters 
print(la_cv_mx.best_params_)
print(la_cv_mx.best_estimator_)
print('Min RMSE for lasso fro Mixed Use is: {}'.format(-la_cv_mx.best_score_))

{'alpha': 0.15463515822289586, 'fit_intercept': True, 'normalize': False}
Lasso(alpha=0.15463515822289586)
Min RMSE for lasso fro Mixed Use is: 261.77672139034263


In [None]:
# MAE of best model
print(mean_absolute_error(y_val_mx, la_cv_mx.best_estimator_.predict(X_val_mx)))

# RMSE and MAE of baseline model
print(rmse(y_val_mx))
print(mae(y_val_mx))

### Mixed Use: Ridge

In [16]:
# random search for ridge
param_rd_mx = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

rd_cv_mx = RandomizedSearchCV(
        Ridge(),
        param_distributions=param_rd_mx,
        n_iter=10,
        cv=5,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_mx, np.array(y_train_plus_val_mx).ravel())

# Find best model hyperparameters 
print(rd_cv_mx.best_params_)
print(rd_cv_mx.best_estimator_)
print('Min RMSE for ridge on Mixed Use is: {}'.format(-rd_cv_mx.best_score_))

{'alpha': 76.66289057556017, 'fit_intercept': True, 'normalize': False}
Ridge(alpha=76.66289057556017)
Min RMSE for ridge on Mixed Use is: 261.77449765235895


In [None]:
# MAE of best model
print(mean_absolute_error(y_val_mx, rd_cv_mx.best_estimator_.predict(X_val_mx)))

# RMSE and MAE of baseline model
print(rmse(y_val_mx))
print(mae(y_val_mx))

### Mixed Use: KNeighborsRegressor

In [17]:
# random search for KNeighborsRegressor
param_knn_mx = {'n_neighbors': sp_randint(1,21),
             'weights': ['uniform', 'distance'],
             'leaf_size': sp_randint(1,21)}

knn_cv_mx = RandomizedSearchCV(
         KNeighborsRegressor(),
         param_distributions=param_knn_mx,
         n_iter=10,
         cv=5,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_mx, np.array(y_train_plus_val_mx).ravel())

# find best model hyperparameters 
print(knn_cv_mx.best_params_)
print(knn_cv_mx.best_estimator_)
print('Min RMSE for Knn regressor on Mixed Use is: {}'.format(-knn_cv_mx.best_score_))

{'leaf_size': 20, 'n_neighbors': 11, 'weights': 'distance'}
KNeighborsRegressor(leaf_size=20, n_neighbors=11, weights='distance')
Min RMSE for Knn regressor on Mixed Use is: 240.56197162525683


In [None]:
# MAE of best model
print(mean_absolute_error(y_val_mx, knn_cv_mx.best_estimator_.predict(X_val_mx)))

# RMSE and MAE of baseline model
print(rmse(y_val_mx))
print(mae(y_val_mx))

### Mixed Use: RandomForestRegressor

In [18]:

# random search for RandomForestRegressor
param_rf_mx = {'n_estimators': [100, 120, 200, 300, 500, 800, 1200],
            'max_depth': [None, 5, 8, 15, 25, 30],
            'min_samples_split': [1, 2, 5, 10 ,15, 100],
            'min_samples_leaf': [1, 2, 5, 10],
            'max_features': ['auto', 'sqrt', 'log2']}

rf_cv_mx = RandomizedSearchCV(
        RandomForestRegressor(),
        param_distributions=param_rf_mx,
        n_iter=10,
        cv=5,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_mx, np.array(y_train_plus_val_mx).ravel())

# find best model hyperparameters 
print(rf_cv_mx.best_params_)
print(rf_cv_mx.best_estimator_)
print('Min RMSE is for random forest regressor on Mixed Use is: {}'.format(-rf_cv_mx.best_score_))

{'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': None}
RandomForestRegressor(min_samples_leaf=5, n_estimators=800)
Min RMSE is for random forest regressor on Mixed Use is: 207.44661783558658


In [None]:
# MAE of best model
print(mean_absolute_error(y_val_mx, rf_cv_mx.best_estimator_.predict(X_val_mx)))

# RMSE and MAE of baseline model
print(rmse(y_val_mx))
print(mae(y_val_mx))

### Mixed Use: MLPRegressor

In [19]:
# random search for MLPRegressor
hl = []
for i in [1, 2, 3, 4, 5, 10, 15, 25, 30]:
    hl.append((i))
    hl.append((i,i))

param_mlp_mx = {'hidden_layer_sizes': hl,
             'learning_rate_init': sp_loguniform(1e-4, 1e2),
             'alpha': sp_loguniform(1e-4, 1e2),
             'batch_size': [1, 3, 5, 10, 20, 50, 100, 250, 500]}

mlp_cv_mx = RandomizedSearchCV(
         MLPRegressor(),
         param_distributions=param_mlp_mx,
         n_iter=10,
         cv=5,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_mx, np.array(y_train_plus_val_mx).ravel())

# find best model hyperparameters 
print(mlp_cv_mx.best_params_)
print(mlp_cv_mx.best_estimator_)
print('Min RMSE for MLP regressor on Mixed Use is: {}'.format(-mlp_cv_mx.best_score_))

{'alpha': 2.765529811671396, 'batch_size': 20, 'hidden_layer_sizes': 4, 'learning_rate_init': 0.008664699052148592}
MLPRegressor(alpha=2.765529811671396, batch_size=20, hidden_layer_sizes=4,
             learning_rate_init=0.008664699052148592)
Min RMSE for MLP regressor on Mixed Use is: 235.52184352925582


In [None]:
# MAE of best model
print(mean_absolute_error(y_val_mx, mlp_cv_mx.best_estimator_.predict(X_val_mx)))

# RMSE and MAE of baseline model
print(rmse(y_val_mx))
print(mae(y_val_mx))

### Mixed Use: XGBoost

In [20]:
# random search for XGBoost
param_xg_mx = {'n_estimators':[100, 500 ,1000],
               'max_depth': sp_randint(3, 15),
               'learning_rate_init': sp_loguniform(1e-4, 1e2),
               'min_child_weight': [1, 3, 5, 7]}

xg_cv_mx = RandomizedSearchCV(
            xgb.XGBRegressor(nthread=-1),
            param_distributions=param_xg_mx,
            n_iter=10,
            cv=5,
            scoring='neg_root_mean_squared_error',
            n_jobs=-1,
            random_state=123
).fit(X_train_plus_val_mx, np.array(y_train_plus_val_mx).ravel())

# find best model hyperparameters 
print(xg_cv_mx.best_params_)
print(xg_cv_mx.best_estimator_)
print('Min RMSE for XGBoost on Mixed Use is: {}'.format(-xg_cv_mx.best_score_))

Parameters: { "learning_rate_init" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


{'learning_rate_init': 0.20318358298265976, 'max_depth': 6, 'min_child_weight': 5, 'n_estimators': 500}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             learning_rate_init=0.20318358298265976, max_delta_step=0,
             max_depth=6, min_child_weight=5, missing=nan,
             monotone_constraints='()', n_estimators=500, n_jobs=16, nthread=-1,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
         

In [22]:
# MAE of best model
print(mean_absolute_error(y_val_mx, xg_cv_mx.best_estimator_.predict(X_val_mx)))

# RMSE and MAE of baseline model
print(rmse(y_val_mx))
print(mae(y_val_mx))

NameError: name 'X_train_va' is not defined