In [1]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb

from sklearn.model_selection import PredefinedSplit, RandomizedSearchCV, KFold, StratifiedKFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from scipy.stats import loguniform as sp_loguniform
from scipy.stats import randint as sp_randint

import statsmodels.api as sm
import statsmodels.formula.api as smf

import warnings
warnings.filterwarnings('ignore')

### Experiment on Single Family

In [2]:
# get training, validation and test data for single family
X_train_sf = pd.read_csv('X_train_sf.csv')
X_train_sf.index = X_train_sf['Unnamed: 0']
X_train_sf = X_train_sf.drop(['Unnamed: 0'], axis=1)
X_val_sf = pd.read_csv('X_val_sf.csv')
X_val_sf.index = X_val_sf['Unnamed: 0']
X_val_sf = X_val_sf.drop(['Unnamed: 0'], axis=1)
X_test_sf = pd.read_csv('X_test_sf.csv')
X_test_sf.index = X_test_sf['Unnamed: 0']
X_test_sf = X_test_sf.drop(['Unnamed: 0'], axis=1)
y_train_sf = pd.read_csv('y_train_sf.csv')
y_train_sf.index = y_train_sf['Unnamed: 0']
y_train_sf = y_train_sf.drop(['Unnamed: 0'], axis=1)
y_val_sf = pd.read_csv('y_val_sf.csv')
y_val_sf.index = y_val_sf['Unnamed: 0']
y_val_sf = y_val_sf.drop(['Unnamed: 0'], axis=1)
y_test_sf = pd.read_csv('y_test_sf.csv')
y_test_sf.index = y_test_sf['Unnamed: 0']
y_test_sf = y_test_sf.drop(['Unnamed: 0'], axis=1)

In [3]:
# create a predefined validation set for random search
y_train_plus_val_sf = pd.concat([y_train_sf, y_val_sf])
y_train_plus_val_sf_copy = y_train_plus_val_sf.copy()
y_train_plus_val_sf_copy.columns = ['train_val_split']
y_train_plus_val_sf_copy.loc[y_train_sf.index,'train_val_split'] = -1
y_train_plus_val_sf_copy.loc[y_val_sf.index,'train_val_split'] = 0
val_fold_sf = np.array(y_train_plus_val_sf_copy)
ps_sf = PredefinedSplit(val_fold_sf)

# get training plus validation set
X_train_plus_val_sf = pd.concat([X_train_sf, X_val_sf])

In [10]:
# create a function to calculate baseline RMSE and MAE
def rmse(l):
    return np.sqrt((np.sum((l-np.mean(l))**2))/len(l))
def mae(l):
    return np.sum(np.abs(l-np.mean(l)))/len(l)

### Single Family: Linear regression

In [4]:
# random search for linear regression
param_lr_sf = {'fit_intercept': [True,False],
               'normalize':[True,False]}

lr_cv_sf = RandomizedSearchCV(
           LinearRegression(),
           param_distributions=param_lr_sf,
           n_iter=10,
           cv=ps_sf,
           scoring='neg_root_mean_squared_error', 
           n_jobs=-1,
           random_state=123
).fit(X_train_plus_val_sf, np.array(y_train_plus_val_sf).ravel())

# find best model hyperparameters 
print(lr_cv_sf.best_params_)
print(lr_cv_sf.best_estimator_)
print('Min RMSE for linear regression on Single Family is: {}'.format(-lr_cv_sf.best_score_))

{'normalize': True, 'fit_intercept': False}
LinearRegression(fit_intercept=False, normalize=True)
Min RMSE for linear regression on Single Family is: 23479006242.85987


In [11]:
# MAE of best model
print(mean_absolute_error(y_val_sf, lr_cv_sf.best_estimator_.predict(X_val_sf)))

# RMSE and MAE of baseline model
print(rmse(y_val_sf))
print(mae(y_val_sf))

106.65993138395191
sale_price    290.504546
dtype: float64
sale_price    155.263946
dtype: float64


### Single Family: Lasso

In [12]:
# random search for lasso
param_la_sf = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

la_cv_sf = RandomizedSearchCV(
        Lasso(),
        param_distributions=param_la_sf,
        n_iter=10,
        cv=ps_sf,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_sf, np.array(y_train_plus_val_sf).ravel())

# find best model hyperparameters 
print(la_cv_sf.best_params_)
print(la_cv_sf.best_estimator_)
print('Min RMSE for lasso on Single Family is: {}'.format(-la_cv_sf.best_score_))

{'alpha': 0.07684071705306554, 'fit_intercept': False, 'normalize': True}
Lasso(alpha=0.07684071705306554, fit_intercept=False, normalize=True)
Min RMSE for lasso on Single Family is: 237.02107253707626


In [13]:
# MAE of best model
print(mean_absolute_error(y_val_sf, la_cv_sf.best_estimator_.predict(X_val_sf)))

# RMSE and MAE of baseline model
print(rmse(y_val_sf))
print(mae(y_val_sf))

106.50077190341187
sale_price    290.504546
dtype: float64
sale_price    155.263946
dtype: float64


### Single Family: Ridge

In [14]:
# random search for ridge
param_rd_sf = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

rd_cv_sf = RandomizedSearchCV(
        Ridge(),
        param_distributions=param_rd_sf,
        n_iter=10,
        cv=ps_sf,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_sf, np.array(y_train_plus_val_sf).ravel())

# Find best model hyperparameters 
print(rd_cv_sf.best_params_)
print(rd_cv_sf.best_estimator_)
print('Min RMSE fort ridge on Single Family is: {}'.format(-rd_cv_sf.best_score_))

{'alpha': 76.66289057556017, 'fit_intercept': True, 'normalize': False}
Ridge(alpha=76.66289057556017)
Min RMSE fort ridge on Single Family is: 236.94456670464658


In [15]:
# MAE of best model
print(mean_absolute_error(y_val_sf, rd_cv_sf.best_estimator_.predict(X_val_sf)))

# RMSE and MAE of baseline model
print(rmse(y_val_sf))
print(mae(y_val_sf))

106.58155691099871
sale_price    290.504546
dtype: float64
sale_price    155.263946
dtype: float64


### Single Family: KNeighborsRegressor

In [16]:
# random search for KNeighborsRegressor
param_knn_sf = {'n_neighbors': sp_randint(1,21),
             'weights': ['uniform', 'distance'],
             'leaf_size': sp_randint(1,21)}

knn_cv_sf = RandomizedSearchCV(
         KNeighborsRegressor(),
         param_distributions=param_knn_sf,
         n_iter=10,
         cv=ps_sf,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_sf, np.array(y_train_plus_val_sf).ravel())

# find best model hyperparameters 
print(knn_cv_sf.best_params_)
print(knn_cv_sf.best_estimator_)
print('Min RMSE for Knn regressor on Single Family is: {}'.format(-knn_cv_sf.best_score_))

{'leaf_size': 5, 'n_neighbors': 18, 'weights': 'distance'}
KNeighborsRegressor(leaf_size=5, n_neighbors=18, weights='distance')
Min RMSE for Knn regressor on Single Family is: 170.7286967851921


In [17]:
# MAE of best model
print(mean_absolute_error(y_val_sf, knn_cv_sf.best_estimator_.predict(X_val_sf)))

# RMSE and MAE of baseline model
print(rmse(y_val_sf))
print(mae(y_val_sf))

0.07647382492493975
sale_price    290.504546
dtype: float64
sale_price    155.263946
dtype: float64


### Single Family: RandomForestRegressor

In [18]:
# random search for RandomForestRegressor
param_rf_sf = {'n_estimators': [100, 120, 200, 300, 500, 800, 1200],
            'max_depth': [None, 5, 8, 15, 25, 30],
            'min_samples_split': [1, 2, 5, 10 ,15, 100],
            'min_samples_leaf': [1, 2, 5, 10],
            'max_features': ['auto', 'sqrt', 'log2']}

rf_cv_sf = RandomizedSearchCV(
        RandomForestRegressor(),
        param_distributions=param_rf_sf,
        n_iter=10,
        cv=ps_sf,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_sf, np.array(y_train_plus_val_sf).ravel())

# find best model hyperparameters 
print(rf_cv_sf.best_params_)
print(rf_cv_sf.best_estimator_)
print('Min RMSE is for random forest regressor on Single Family is: {}'.format(-rf_cv_sf.best_score_))

{'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': None}
RandomForestRegressor(min_samples_leaf=5, n_estimators=800)
Min RMSE is for random forest regressor on Single Family is: 136.8469916397392


In [19]:
# MAE of best model
print(mean_absolute_error(y_val_sf, rf_cv_sf.best_estimator_.predict(X_val_sf)))

# RMSE and MAE of baseline model
print(rmse(y_val_sf))
print(mae(y_val_sf))

34.78543529085287
sale_price    290.504546
dtype: float64
sale_price    155.263946
dtype: float64


### Single Family: MLPRegressor

In [20]:
# random search for MLPRegressor
hl = []
for i in [1, 2, 3, 4, 5, 10, 15, 25, 30]:
    hl.append((i))
    hl.append((i,i))

param_mlp_sf = {'hidden_layer_sizes': hl,
             'learning_rate_init': sp_loguniform(1e-4, 1e2),
             'alpha': sp_loguniform(1e-4, 1e2),
             'batch_size': [1, 3, 5, 10, 20, 50, 100, 250, 500]}

mlp_cv_sf = RandomizedSearchCV(
         MLPRegressor(),
         param_distributions=param_mlp_sf,
         n_iter=10,
         cv=ps_sf,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_sf, np.array(y_train_plus_val_sf).ravel())

# find best model hyperparameters 
print(mlp_cv_sf.best_params_)
print(mlp_cv_sf.best_estimator_)
print('Min RMSE for MLP regressor on Single Family is: {}'.format(-mlp_cv_sf.best_score_))

{'alpha': 1.8792349823745838, 'batch_size': 500, 'hidden_layer_sizes': 30, 'learning_rate_init': 0.08964140621713512}
MLPRegressor(alpha=1.8792349823745838, batch_size=500, hidden_layer_sizes=30,
             learning_rate_init=0.08964140621713512)
Min RMSE for MLP regressor on Single Family is: 178.02757159982698


In [21]:
# MAE of best model
print(mean_absolute_error(y_val_sf, mlp_cv_sf.best_estimator_.predict(X_val_sf)))

# RMSE and MAE of baseline model
print(rmse(y_val_sf))
print(mae(y_val_sf))

73.1474578825477
sale_price    290.504546
dtype: float64
sale_price    155.263946
dtype: float64


### Single Family: XGBoost

In [22]:
# random search for XGBoost
param_xg_sf = {'n_estimators':[100, 500 ,1000],
               'max_depth': sp_randint(3, 15),
               'learning_rate_init': sp_loguniform(1e-4, 1e2),
               'min_child_weight': [1, 3, 5, 7]}

xg_cv_sf = RandomizedSearchCV(
            xgb.XGBRegressor(nthread=-1),
            param_distributions=param_xg_sf,
            n_iter=10,
            cv=ps_sf,
            scoring='neg_root_mean_squared_error',
            n_jobs=-1,
            random_state=123
).fit(X_train_plus_val_sf, np.array(y_train_plus_val_sf).ravel())

# find best model hyperparameters 
print(xg_cv_sf.best_params_)
print(xg_cv_sf.best_estimator_)
print('Min RMSE for XGBoost on Single Family is: {}'.format(-xg_cv_sf.best_score_))

Parameters: { "learning_rate_init" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


{'learning_rate_init': 0.15522637752063606, 'max_depth': 11, 'min_child_weight': 3, 'n_estimators': 100}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             learning_rate_init=0.15522637752063606, max_delta_step=0,
             max_depth=11, min_child_weight=3, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=16, nthread=-1,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
       

In [23]:
# MAE of best model
print(mean_absolute_error(y_val_sf, xg_cv_sf.best_estimator_.predict(X_val_sf)))

# RMSE and MAE of baseline model
print(rmse(y_val_sf))
print(mae(y_val_sf))

31.23552542508914
sale_price    290.504546
dtype: float64
sale_price    155.263946
dtype: float64


### Experiment on Multi Family

In [None]:
# MULTIFAMILY get training, validation and test data
X_train_mf = pd.read_csv('X_train_mf.csv')
X_train_mf.index = X_train_mf['Unnamed: 0']
X_train_mf = X_train_mf.drop(['Unnamed: 0'], axis=1)

X_val_mf = pd.read_csv('X_val_mf.csv')
X_val_mf.index = X_val_mf['Unnamed: 0']
X_val_mf = X_val_mf.drop(['Unnamed: 0'], axis=1)

X_test_mf = pd.read_csv('X_test_mf.csv')
X_test_mf.index = X_test_mf['Unnamed: 0']
X_test_mf = X_test_mf.drop(['Unnamed: 0'], axis=1)

y_train_mf = pd.read_csv('y_train_mf.csv')
y_train_mf.index = y_train_mf['Unnamed: 0']
y_train_mf = y_train_mf.drop(['Unnamed: 0'], axis=1)

y_val_mf = pd.read_csv('y_val_mf.csv')
y_val_mf.index = y_val_mf['Unnamed: 0']
y_val_mf = y_val_mf.drop(['Unnamed: 0'], axis=1)

y_test_mf = pd.read_csv('y_test_mf.csv')
y_test_mf.index = y_test_mf['Unnamed: 0']
y_test_mf = y_test_mf.drop(['Unnamed: 0'], axis=1)

In [None]:
# Multifamily
y_train_plus_val_mf = pd.concat([y_train_mf, y_val_mf])
y_train_plus_val_mf_copy = y_train_plus_val_mf.copy()
y_train_plus_val_mf_copy.columns = ['train_val_split']
y_train_plus_val_mf_copy.loc[y_train_mf.index,'train_val_split'] = -1
y_train_plus_val_mf_copy.loc[y_val_mf.index,'train_val_split'] = 0
val_fold_mf = np.array(y_train_plus_val_mf_copy)
ps_mf = PredefinedSplit(val_fold_mf)

# Multifamily training plus validation set
X_train_plus_val_mf = pd.concat([X_train_mf, X_val_mf])

### Multi Family: Linear regression

In [None]:
# random search for linear regression
param_lr_mf = {'fit_intercept': [True,False],
               'normalize':[True,False]}

lr_cv_mf = RandomizedSearchCV(
           LinearRegression(),
           param_distributions=param_lr_mf,
           n_iter=20,
           cv=ps_mf,
           scoring='neg_root_mean_squared_error', 
           n_jobs=-1,
           random_state=123
).fit(X_train_plus_val_mf, np.array(y_train_plus_val_mf).ravel())

# find best model hyperparameters 
print(lr_cv_mf.best_params_)
print(lr_cv_mf.best_estimator_)
print('MULTIFAMILY Linear min RMSE is: {}'.format(-lr_cv_mf.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_mf, lr_cv_mf.best_estimator_.predict(X_val_mf)))

# RMSE and MAE of baseline model
print(rmse(y_val_mf))
print(mae(y_val_mf))

### Multi Family: Lasso

In [None]:
# random search for lasso
param_la_mf = {'alpha': sp_uniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

la_cv_mf = RandomizedSearchCV(
        Lasso(),
        param_distributions=param_la_mf,
        n_iter=10,
        cv=ps_mf,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_mf, np.array(y_train_plus_val_mf).ravel())

# find best model hyperparameters 
print(la_cv_mf.best_params_)
print(la_cv_mf.best_estimator_)
print('MULTI FAMILY LASSO min RMSE is: {}'.format(-la_cv_mf.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_mf, la_cv_mf.best_estimator_.predict(X_val_mf)))

# RMSE and MAE of baseline model
print(rmse(y_val_mf))
print(mae(y_val_mf))

### Multi Family: Ridge

In [None]:
# random search for ridge
param_rd = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

rd_cv = RandomizedSearchCV(
        Ridge(),
        param_distributions=param_rd,
        n_iter=10,
        cv=ps_mf,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_mf, np.array(y_train_plus_val_mf).ravel())

# Find best model hyperparameters 
print(rd_cv.best_params_)
print(rd_cv.best_estimator_)
print('MULTIFAMILY RIDGE min RMSE is: {}'.format(-rd_cv.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_mf, rd_cv_mf.best_estimator_.predict(X_val_mf)))

# RMSE and MAE of baseline model
print(rmse(y_val_mf))
print(mae(y_val_mf))

### Multi Family: KNN Regressor

In [None]:
# random search for KNeighborsRegressor
param_knn = {'n_neighbors': sp_randint(1,21),
             'weights': ['uniform', 'distance'],
             'leaf_size': sp_randint(1,21)}

knn_cv = RandomizedSearchCV(
         KNeighborsRegressor(),
         param_distributions=param_knn,
         n_iter=10,
         cv=ps_mf,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_mf, np.array(y_train_plus_val_mf).ravel())

# find best model hyperparameters 
print(knn_cv.best_params_)
print(knn_cv.best_estimator_)
print('MULTIFAMILY KNN min RMSE is: {}'.format(-knn_cv.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_mf, knn_cv_mf.best_estimator_.predict(X_val_mf)))

# RMSE and MAE of baseline model
print(rmse(y_val_mf))
print(mae(y_val_mf))

### Multi Family: Random Forest Regressor

In [None]:
# random search for RandomForestRegressor
param_rf = {'n_estimators': [100, 120, 200, 300, 500, 800, 1200],
            'max_depth': [None, 5, 8, 15, 25, 30],
            'min_samples_split': [1, 2, 5, 10 ,15, 100],
            'min_samples_leaf': [1, 2, 5, 10],
            'max_features': ['auto', 'sqrt', 'log2']}

rf_cv = RandomizedSearchCV(
        RandomForestRegressor(),
        param_distributions=param_rf,
        n_iter=10,
        cv=ps_mf,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_mf, np.array(y_train_plus_val_mf).ravel())

# find best model hyperparameters 
print(rf_cv.best_params_)
print(rf_cv.best_estimator_)
print('RF MULTIFAMILY min RMSE is: {}'.format(-rf_cv.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_mf, rf_cv_mf.best_estimator_.predict(X_val_mf)))

# RMSE and MAE of baseline model
print(rmse(y_val_mf))
print(mae(y_val_mf))

### Multi Family: MLP Regressor

In [None]:
# random search for MLPRegressor
hl = []
for i in [1, 2, 3, 4, 5, 10, 15, 25, 30]:
    hl.append((i))
    hl.append((i,i))

param_mlp = {'hidden_layer_sizes': hl,
             'learning_rate_init': sp_loguniform(1e-4, 1e2),
             'alpha': sp_loguniform(1e-4, 1e2),
             'batch_size': [1, 5, 10, 25, 100, 500]}

mlp_cv = RandomizedSearchCV(
         MLPRegressor(),
         param_distributions=param_mlp,
         n_iter=10,
         cv=ps_mf,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_mf, np.array(y_train_plus_val_mf).ravel())

# find best model hyperparameters 
print(mlp_cv.best_params_)
print(mlp_cv.best_estimator_)
print('MLPRegressor MULTIFAMILY min RMSE is: {}'.format(-mlp_cv.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_mf, mlp_cv_mf.best_estimator_.predict(X_val_mf)))

# RMSE and MAE of baseline model
print(rmse(y_val_mf))
print(mae(y_val_mf))

### Multi Family: XGboost Regressor

In [None]:
# random search for XGBoost
param_xg_mf = {'n_estimators':[100, 500 ,1000],
               'max_depth': sp_randint(3, 15),
               'learning_rate_init': sp_loguniform(1e-4, 1e2),
               'min_child_weight': [1, 3, 5, 7]}

xg_cv_mf = RandomizedSearchCV(
            xgb.XGBRegressor(nthread=-1),
            param_distributions=param_xg_mf,
            n_iter=10,
            cv=ps_sf,
            scoring='neg_root_mean_squared_error',
            n_jobs=-1,
            random_state=123
).fit(X_train_plus_val_mf, np.array(y_train_plus_val_mf).ravel())

# find best model hyperparameters 
print(xg_cv_mf.best_params_)
print(xg_cv_mf.best_estimator_)
print('Min RMSE for XGBoost on Multi Family is: {}'.format(-xg_cv_mf.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_mf, xg_cv_mf.best_estimator_.predict(X_val_mf)))

# RMSE and MAE of baseline model
print(rmse(y_val_mf))
print(mae(y_val_mf))

### Experiment on Commercial

In [None]:
# Commercial-- training, validation and test data
X_train_cm = pd.read_csv('X_train_cm.csv')
X_train_cm.index = X_train_cm['Unnamed: 0']
X_train_cm = X_train_cm.drop(['Unnamed: 0'], axis=1)

X_val_cm = pd.read_csv('X_val_cm.csv')
X_val_cm.index = X_val_cm['Unnamed: 0']
X_val_cm = X_val_cm.drop(['Unnamed: 0'], axis=1)

X_test_cm = pd.read_csv('X_test_cm.csv')
X_test_cm.index = X_test_cm['Unnamed: 0']
X_test_cm = X_test_cm.drop(['Unnamed: 0'], axis=1)

y_train_cm = pd.read_csv('y_train_cm.csv')
y_train_cm.index = y_train_cm['Unnamed: 0']
y_train_cm = y_train_cm.drop(['Unnamed: 0'], axis=1)

y_val_cm = pd.read_csv('y_val_cm.csv')
y_val_cm.index = y_val_cm['Unnamed: 0']
y_val_cm = y_val_cm.drop(['Unnamed: 0'], axis=1)

y_test_cm = pd.read_csv('y_test_cm.csv')
y_test_cm.index = y_test_cm['Unnamed: 0']
y_test_cm = y_test_cm.drop(['Unnamed: 0'], axis=1)

In [None]:
# Commercial Y train+val,--concate and prepare for "PredefinedSplit"
y_train_plus_val_cm = pd.concat([y_train_cm, y_val_cm])
y_train_plus_val_cm_copy = y_train_plus_val_cm.copy()
y_train_plus_val_cm_copy.columns = ['train_val_split']
y_train_plus_val_cm_copy.loc[y_train_cm.index,'train_val_split'] = -1
y_train_plus_val_cm_copy.loc[y_val_cm.index,'train_val_split'] = 0
val_fold_cm = np.array(y_train_plus_val_cm_copy)
ps_cm = PredefinedSplit(val_fold_cm)

# Commercial X train+val
X_train_plus_val_cm = pd.concat([X_train_cm, X_val_cm])

### Commercial: Linear Regression

In [None]:
# random search for linear regression
param_lr_cm = {'fit_intercept': [True,False],
               'normalize':[True,False]}

lr_cv_cm = RandomizedSearchCV(
           LinearRegression(),
           param_distributions=param_lr_cm,
           n_iter=20,
           cv=ps_cm,
           scoring='neg_root_mean_squared_error', 
           n_jobs=-1,
           random_state=123
).fit(X_train_plus_val_cm, np.array(y_train_plus_val_cm).ravel())

# find best model hyperparameters 
print(lr_cv_cm.best_params_)
print(lr_cv_cm.best_estimator_)
print('Commercial Linear min RMSE is: {}'.format(-lr_cv_cm.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_cm, lr_cv_cm.best_estimator_.predict(X_val_cm)))

# RMSE and MAE of baseline model
print(rmse(y_val_cm))
print(mae(y_val_cm))

### Commercial: Lasso

In [None]:
# random search for lasso
param_la_cm = {'alpha': sp_uniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

la_cv_cm = RandomizedSearchCV(
        Lasso(),
        param_distributions=param_la_cm,
        n_iter=10,
        cv=ps_cm,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_cm, np.array(y_train_plus_val_cm).ravel())

# find best model hyperparameters 
print(la_cv_cm.best_params_)
print(la_cv_cm.best_estimator_)
print('Commercial LASSO min RMSE is: {}'.format(-la_cv_cm.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_cm, la_cv_cm.best_estimator_.predict(X_val_cm)))

# RMSE and MAE of baseline model
print(rmse(y_val_cm))
print(mae(y_val_cm))

### Commercial: Ridge

In [None]:
# random search for ridge
param_rd = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

rd_cv = RandomizedSearchCV(
        Ridge(),
        param_distributions=param_rd,
        n_iter=10,
        cv=ps_cm,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_cm, np.array(y_train_plus_val_cm).ravel())

# Find best model hyperparameters 
print(rd_cv.best_params_)
print(rd_cv.best_estimator_)
print('Commercial RIDGE min RMSE is: {}'.format(-rd_cv.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_cm, rd_cv_cm.best_estimator_.predict(X_val_cm)))

# RMSE and MAE of baseline model
print(rmse(y_val_cm))
print(mae(y_val_cm))

### Commercial: KNN Regressor

In [None]:
# random search for KNeighborsRegressor
param_knn = {'n_neighbors': sp_randint(1,21),
             'weights': ['uniform', 'distance'],
             'leaf_size': sp_randint(1,21)}

knn_cv = RandomizedSearchCV(
         KNeighborsRegressor(),
         param_distributions=param_knn,
         n_iter=10,
         cv=ps_cm,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_cm, np.array(y_train_plus_val_cm).ravel())

# find best model hyperparameters 
print(knn_cv.best_params_)
print(knn_cv.best_estimator_)
print('Commercial KNN min RMSE is: {}'.format(-knn_cv.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_cm, knn_cv_cm.best_estimator_.predict(X_val_cm)))

# RMSE and MAE of baseline model
print(rmse(y_val_cm))
print(mae(y_val_cm))

### Commercial: Random Forest Regressor

In [None]:
# random search for RandomForestRegressor
param_rf = {'n_estimators': [100, 120, 200, 300, 500, 800, 1200],
            'max_depth': [None, 5, 8, 15, 25, 30],
            'min_samples_split': [1, 2, 5, 10 ,15, 100],
            'min_samples_leaf': [1, 2, 5, 10],
            'max_features': ['auto', 'sqrt', 'log2']}

rf_cv = RandomizedSearchCV(
        RandomForestRegressor(),
        param_distributions=param_rf,
        n_iter=10,
        cv=ps_cm,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_cm, np.array(y_train_plus_val_cm).ravel())

# find best model hyperparameters 
print(rf_cv.best_params_)
print(rf_cv.best_estimator_)
print('RF Commercial min RMSE is: {}'.format(-rf_cv.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_cm, rf_cv_cm.best_estimator_.predict(X_val_cm)))

# RMSE and MAE of baseline model
print(rmse(y_val_cm))
print(mae(y_val_cm))

### Commercial: MLP Regressor

In [None]:
# random search for MLPRegressor
hl = []
for i in [1, 2, 3, 4, 5, 10, 15, 25, 30]:
    hl.append((i))
    hl.append((i,i))

param_mlp = {'hidden_layer_sizes': hl,
             'learning_rate_init': sp_loguniform(1e-4, 1e2),
             'alpha': sp_loguniform(1e-4, 1e2),
             'batch_size': [1, 5, 10, 25, 100, 500]}

mlp_cv = RandomizedSearchCV(
         MLPRegressor(),
         param_distributions=param_mlp,
         n_iter=10,
         cv=ps_cm,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_cm, np.array(y_train_plus_val_cm).ravel())

# find best model hyperparameters 
print(mlp_cv.best_params_)
print(mlp_cv.best_estimator_)
print('MLPRegressor Commercial min RMSE is: {}'.format(-mlp_cv.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_cm, mlp_cv_cm.best_estimator_.predict(X_val_cm)))

# RMSE and MAE of baseline model
print(rmse(y_val_cm))
print(mae(y_val_cm))

### Commercial: XGBoost

In [None]:
# random search for XGBoost
param_xg_cm = {'n_estimators':[100, 500 ,1000],
               'max_depth': sp_randint(3, 15),
               'learning_rate_init': sp_loguniform(1e-4, 1e2),
               'min_child_weight': [1, 3, 5, 7]}

xg_cv_mf = RandomizedSearchCV(
            xgb.XGBRegressor(nthread=-1),
            param_distributions=param_xg_cm,
            n_iter=10,
            cv=ps_cm,
            scoring='neg_root_mean_squared_error',
            n_jobs=-1,
            random_state=123
).fit(X_train_plus_val_cm, np.array(y_train_plus_val_cm).ravel())

# find best model hyperparameters 
print(xg_cv_cm.best_params_)
print(xg_cv_cm.best_estimator_)
print('Min RMSE for XGBoost on Commercial is: {}'.format(-xg_cv_cm.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_cm, xg_cv_cm.best_estimator_.predict(X_val_cm)))

# RMSE and MAE of baseline model
print(rmse(y_val_cm))
print(mae(y_val_cm))

### Experiment on Industrial

In [None]:
#INDUSTRIAL get training, validation and test data
X_train_id = pd.read_csv('X_train_id.csv')
X_train_id.index = X_train_id['Unnamed: 0']
X_train_id = X_train_id.drop(['Unnamed: 0'], axis=1)

X_val_id = pd.read_csv('X_val_id.csv')
X_val_id.index = X_val_id['Unnamed: 0']
X_val_id = X_val_id.drop(['Unnamed: 0'], axis=1)

X_test_id = pd.read_csv('X_test_id.csv')
X_test_id.index = X_test_id['Unnamed: 0']
X_test_id = X_test_id.drop(['Unnamed: 0'], axis=1)

y_train_id = pd.read_csv('y_train_id.csv')
y_train_id.index = y_train_id['Unnamed: 0']
y_train_id = y_train_id.drop(['Unnamed: 0'], axis=1)

y_val_id = pd.read_csv('y_val_id.csv')
y_val_id.index = y_val_id['Unnamed: 0']
y_val_id = y_val_id.drop(['Unnamed: 0'], axis=1)

y_test_id = pd.read_csv('y_test_id.csv')
y_test_id.index = y_test_id['Unnamed: 0']
y_test_id = y_test_id.drop(['Unnamed: 0'], axis=1)

In [None]:
#Industrial
y_train_plus_val_id = pd.concat([y_train_id, y_val_id])
y_train_plus_val_id_copy = y_train_plus_val_id.copy()
y_train_plus_val_id_copy.columns = ['train_val_split']
y_train_plus_val_id_copy.loc[y_train_id.index,'train_val_split'] = -1
y_train_plus_val_id_copy.loc[y_val_id.index,'train_val_split'] = 0
val_fold_id = np.array(y_train_plus_val_id_copy)
ps_id = PredefinedSplit(val_fold_id)

# Industrial
X_train_plus_val_id = pd.concat([X_train_id, X_val_id])

### Industrial: Linear regression

In [None]:
# random search for linear regression
param_lr_id = {'fit_intercept': [True,False],
               'normalize':[True,False]}

lr_cv_id = RandomizedSearchCV(
           LinearRegression(),
           param_distributions=param_lr_id,
           n_iter=20,
           cv=3,
           scoring='neg_root_mean_squared_error', 
           n_jobs=-1,
           random_state=123
).fit(X_train_plus_val_id, np.array(y_train_plus_val_id).ravel())

# find best model hyperparameters 
print(lr_cv_id.best_params_)
print(lr_cv_id.best_estimator_)
print('INDUSTRIAL Linear min RMSE is: {}'.format(-lr_cv_id.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_id, lr_cv_id.best_estimator_.predict(X_val_id)))

# RMSE and MAE of baseline model
print(rmse(y_val_id))
print(mae(y_val_id))

### Industrial: Lasso

In [None]:
# random search for lasso
param_la_id = {'alpha': sp_uniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

la_cv_id = RandomizedSearchCV(
        Lasso(),
        param_distributions=param_la_id,
        n_iter=10,
        cv=3,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_id, np.array(y_train_plus_val_id).ravel())

# find best model hyperparameters 
print(la_cv_id.best_params_)
print(la_cv_id.best_estimator_)
print('INDUSTRIAL LASSO min RMSE is: {}'.format(-la_cv_id.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_id, la_cv_id.best_estimator_.predict(X_val_id)))

# RMSE and MAE of baseline model
print(rmse(y_val_id))
print(mae(y_val_id))

### Industrial: Ridge

In [None]:
# random search for ridge
param_id = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

rd_cv_id = RandomizedSearchCV(
        Ridge(),
        param_distributions=param_id,
        n_iter=10,
        cv= 3,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_id, np.array(y_train_plus_val_id).ravel())

# Find best model hyperparameters 
print(rd_cv_id.best_params_)
print(rd_cv_id.best_estimator_)
print('INDUSTRIAL RIDGE min RMSE is: {}'.format(-rd_cv_id.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_id, rd_cv_id.best_estimator_.predict(X_val_id)))

# RMSE and MAE of baseline model
print(rmse(y_val_id))
print(mae(y_val_id))

### Industria: KNN

In [None]:
# random search for KNeighborsRegressor
param_knn_id = {'n_neighbors': sp_randint(1,21),
             'weights': ['uniform', 'distance'],
             'leaf_size': sp_randint(1,21)}

knn_cv_id = RandomizedSearchCV(
         KNeighborsRegressor(),
         param_distributions=param_knn_id,
         n_iter=10,
         cv= 3,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_id, np.array(y_train_plus_val_id).ravel())

# find best model hyperparameters 
print(knn_cv_id.best_params_)
print(knn_cv_id.best_estimator_)
print('INDUSTRIAL KNN min RMSE is: {}'.format(-knn_cv_id.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_id, knn_cv_id.best_estimator_.predict(X_val_id)))

# RMSE and MAE of baseline model
print(rmse(y_val_id))
print(mae(y_val_id))

### Industrial: Random Forest

In [None]:
# random search for RandomForestRegressor
param_rf_id = {'n_estimators': [100, 120, 200, 300, 500, 800, 1200],
            'max_depth': [None, 5, 8, 15, 25, 30],
            'min_samples_split': [1, 2, 5, 10 ,15, 100],
            'min_samples_leaf': [1, 2, 5, 10],
            'max_features': ['auto', 'sqrt', 'log2']}

rf_cv_id = RandomizedSearchCV(
        RandomForestRegressor(),
        param_distributions=param_rf_id,
        n_iter=10,
        cv= 3,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_id, np.array(y_train_plus_val_id).ravel())

# find best model hyperparameters 
print(rf_cv_id.best_params_)
print(rf_cv_id.best_estimator_)
print('RF MULTIFAMILY min RMSE is: {}'.format(-rf_cv_id.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_id, rf_cv_id.best_estimator_.predict(X_val_id)))

# RMSE and MAE of baseline model
print(rmse(y_val_id))
print(mae(y_val_id))

### Industrial: MLP Regressor

In [None]:
# random search for MLPRegressor
hl = []
for i in [1, 2, 3, 4, 5, 10, 15, 25, 30]:
    hl.append((i))
    hl.append((i,i))

param_mlp_id = {'hidden_layer_sizes': hl,
             'learning_rate_init': sp_loguniform(1e-4, 1e2),
             'alpha': sp_loguniform(1e-4, 1e2),
             'batch_size': [1, 5, 10, 25, 100, 500]}

mlp_cv_id = RandomizedSearchCV(
         MLPRegressor(),
         param_distributions=param_mlp_id,
         n_iter=10,
         cv= 3,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_id, np.array(y_train_plus_val_id).ravel())

# find best model hyperparameters 
print(mlp_cv_id.best_params_)
print(mlp_cv_id.best_estimator_)
print('MLPRegressor INDUSTRIAL min RMSE is: {}'.format(-mlp_cv_id.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_id, mlp_cv_id.best_estimator_.predict(X_val_id)))

# RMSE and MAE of baseline model
print(rmse(y_val_id))
print(mae(y_val_id))

### Industrial: XGBoost

In [None]:
# random search for XGBoost
param_xg_id = {'n_estimators':[100, 500 ,1000],
               'max_depth': sp_randint(3, 15),
               'learning_rate_init': sp_loguniform(1e-4, 1e2),
               'min_child_weight': [1, 3, 5, 7]}

xg_cv_id = RandomizedSearchCV(
            xgb.XGBRegressor(nthread=-1),
            param_distributions=param_xg_id,
            n_iter=10,
            cv=3,
            scoring='neg_root_mean_squared_error',
            n_jobs=-1,
            random_state=123
).fit(X_train_plus_val_id, np.array(y_train_plus_val_id).ravel())

# find best model hyperparameters 
print(xg_cv_id.best_params_)
print(xg_cv_id.best_estimator_)
print('Min RMSE for XGBoost on Industral is: {}'.format(-xg_cv_id.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_id, xg_cv_id.best_estimator_.predict(X_val_id)))

# RMSE and MAE of baseline model
print(rmse(y_val_id))
print(mae(y_val_id))

### Experiment on Mixed Use

In [12]:
# get training, validation and test data for mixed use
X_train_mx = pd.read_csv('X_train_mx.csv')
X_train_mx.index = X_train_mx['Unnamed: 0']
X_train_mx = X_train_mx.drop(['Unnamed: 0'], axis=1)
X_val_mx = pd.read_csv('X_val_mx.csv')
X_val_mx.index = X_val_mx['Unnamed: 0']
X_val_mx = X_val_mx.drop(['Unnamed: 0'], axis=1)
X_test_mx = pd.read_csv('X_test_mx.csv')
X_test_mx.index = X_test_mx['Unnamed: 0']
X_test_mx = X_test_mx.drop(['Unnamed: 0'], axis=1)
y_train_mx = pd.read_csv('y_train_mx.csv')
y_train_mx.index = y_train_mx['Unnamed: 0']
y_train_mx = y_train_mx.drop(['Unnamed: 0'], axis=1)
y_val_mx = pd.read_csv('y_val_mx.csv')
y_val_mx.index = y_val_mx['Unnamed: 0']
y_val_mx = y_val_mx.drop(['Unnamed: 0'], axis=1)
y_test_mx = pd.read_csv('y_test_mx.csv')
y_test_mx.index = y_test_mx['Unnamed: 0']
y_test_mx = y_test_mx.drop(['Unnamed: 0'], axis=1)

In [13]:
# get training plus validation set
X_train_plus_val_mx = pd.concat([X_train_mx, X_val_mx])
y_train_plus_val_mx = pd.concat([y_train_mx, y_val_mx])

### Mixed Use: Linear regression

In [14]:
# random search for linear regression
param_lr_mx = {'fit_intercept': [True,False],
               'normalize':[True,False]}

lr_cv_mx = RandomizedSearchCV(
           LinearRegression(),
           param_distributions=param_lr_mx,
           n_iter=10,
           cv=5,
           scoring='neg_root_mean_squared_error', 
           n_jobs=-1,
           random_state=123
).fit(X_train_plus_val_mx, np.array(y_train_plus_val_mx).ravel())

# find best model hyperparameters 
print(lr_cv_mx.best_params_)
print(lr_cv_mx.best_estimator_)
print('Min RMSE for linear regression is: {}'.format(-lr_cv_mx.best_score_))

{'normalize': True, 'fit_intercept': False}
LinearRegression(fit_intercept=False, normalize=True)
Min RMSE for linear regression is: 10122191127754.484


In [None]:
# MAE of best model
print(mean_absolute_error(y_val_mx, lr_cv_mx.best_estimator_.predict(X_val_mx)))

# RMSE and MAE of baseline model
print(rmse(y_val_mx))
print(mae(y_val_mx))

### Mixed Use: Lasso

In [15]:
# random search for lasso
param_la_mx = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

la_cv_mx = RandomizedSearchCV(
        Lasso(),
        param_distributions=param_la_mx,
        n_iter=10,
        cv=5,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_mx, np.array(y_train_plus_val_mx).ravel())

# find best model hyperparameters 
print(la_cv_mx.best_params_)
print(la_cv_mx.best_estimator_)
print('Min RMSE for lasso fro Mixed Use is: {}'.format(-la_cv_mx.best_score_))

{'alpha': 0.15463515822289586, 'fit_intercept': True, 'normalize': False}
Lasso(alpha=0.15463515822289586)
Min RMSE for lasso fro Mixed Use is: 261.77672139034263


In [None]:
# MAE of best model
print(mean_absolute_error(y_val_mx, la_cv_mx.best_estimator_.predict(X_val_mx)))

# RMSE and MAE of baseline model
print(rmse(y_val_mx))
print(mae(y_val_mx))

### Mixed Use: Ridge

In [16]:
# random search for ridge
param_rd_mx = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

rd_cv_mx = RandomizedSearchCV(
        Ridge(),
        param_distributions=param_rd_mx,
        n_iter=10,
        cv=5,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_mx, np.array(y_train_plus_val_mx).ravel())

# Find best model hyperparameters 
print(rd_cv_mx.best_params_)
print(rd_cv_mx.best_estimator_)
print('Min RMSE for ridge on Mixed Use is: {}'.format(-rd_cv_mx.best_score_))

{'alpha': 76.66289057556017, 'fit_intercept': True, 'normalize': False}
Ridge(alpha=76.66289057556017)
Min RMSE for ridge on Mixed Use is: 261.77449765235895


In [None]:
# MAE of best model
print(mean_absolute_error(y_val_mx, rd_cv_mx.best_estimator_.predict(X_val_mx)))

# RMSE and MAE of baseline model
print(rmse(y_val_mx))
print(mae(y_val_mx))

### Mixed Use: KNeighborsRegressor

In [17]:
# random search for KNeighborsRegressor
param_knn_mx = {'n_neighbors': sp_randint(1,21),
             'weights': ['uniform', 'distance'],
             'leaf_size': sp_randint(1,21)}

knn_cv_mx = RandomizedSearchCV(
         KNeighborsRegressor(),
         param_distributions=param_knn_mx,
         n_iter=10,
         cv=5,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_mx, np.array(y_train_plus_val_mx).ravel())

# find best model hyperparameters 
print(knn_cv_mx.best_params_)
print(knn_cv_mx.best_estimator_)
print('Min RMSE for Knn regressor on Mixed Use is: {}'.format(-knn_cv_mx.best_score_))

{'leaf_size': 20, 'n_neighbors': 11, 'weights': 'distance'}
KNeighborsRegressor(leaf_size=20, n_neighbors=11, weights='distance')
Min RMSE for Knn regressor on Mixed Use is: 240.56197162525683


In [None]:
# MAE of best model
print(mean_absolute_error(y_val_mx, knn_cv_mx.best_estimator_.predict(X_val_mx)))

# RMSE and MAE of baseline model
print(rmse(y_val_mx))
print(mae(y_val_mx))

### Mixed Use: RandomForestRegressor

In [18]:
# random search for RandomForestRegressor
param_rf_mx = {'n_estimators': [100, 120, 200, 300, 500, 800, 1200],
            'max_depth': [None, 5, 8, 15, 25, 30],
            'min_samples_split': [1, 2, 5, 10 ,15, 100],
            'min_samples_leaf': [1, 2, 5, 10],
            'max_features': ['auto', 'sqrt', 'log2']}

rf_cv_mx = RandomizedSearchCV(
        RandomForestRegressor(),
        param_distributions=param_rf_mx,
        n_iter=10,
        cv=5,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_mx, np.array(y_train_plus_val_mx).ravel())

# find best model hyperparameters 
print(rf_cv_mx.best_params_)
print(rf_cv_mx.best_estimator_)
print('Min RMSE is for random forest regressor on Mixed Use is: {}'.format(-rf_cv_mx.best_score_))

{'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': None}
RandomForestRegressor(min_samples_leaf=5, n_estimators=800)
Min RMSE is for random forest regressor on Mixed Use is: 207.44661783558658


In [None]:
# MAE of best model
print(mean_absolute_error(y_val_mx, rf_cv_mx.best_estimator_.predict(X_val_mx)))

# RMSE and MAE of baseline model
print(rmse(y_val_mx))
print(mae(y_val_mx))

### Mixed Use: MLPRegressor

In [19]:
# random search for MLPRegressor
hl = []
for i in [1, 2, 3, 4, 5, 10, 15, 25, 30]:
    hl.append((i))
    hl.append((i,i))

param_mlp_mx = {'hidden_layer_sizes': hl,
             'learning_rate_init': sp_loguniform(1e-4, 1e2),
             'alpha': sp_loguniform(1e-4, 1e2),
             'batch_size': [1, 3, 5, 10, 20, 50, 100, 250, 500]}

mlp_cv_mx = RandomizedSearchCV(
         MLPRegressor(),
         param_distributions=param_mlp_mx,
         n_iter=10,
         cv=5,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_mx, np.array(y_train_plus_val_mx).ravel())

# find best model hyperparameters 
print(mlp_cv_mx.best_params_)
print(mlp_cv_mx.best_estimator_)
print('Min RMSE for MLP regressor on Mixed Use is: {}'.format(-mlp_cv_mx.best_score_))

{'alpha': 2.765529811671396, 'batch_size': 20, 'hidden_layer_sizes': 4, 'learning_rate_init': 0.008664699052148592}
MLPRegressor(alpha=2.765529811671396, batch_size=20, hidden_layer_sizes=4,
             learning_rate_init=0.008664699052148592)
Min RMSE for MLP regressor on Mixed Use is: 235.52184352925582


In [None]:
# MAE of best model
print(mean_absolute_error(y_val_mx, mlp_cv_mx.best_estimator_.predict(X_val_mx)))

# RMSE and MAE of baseline model
print(rmse(y_val_mx))
print(mae(y_val_mx))

### Mixed Use: XGBoost

In [20]:
# random search for XGBoost
param_xg_mx = {'n_estimators':[100, 500 ,1000],
               'max_depth': sp_randint(3, 15),
               'learning_rate_init': sp_loguniform(1e-4, 1e2),
               'min_child_weight': [1, 3, 5, 7]}

xg_cv_mx = RandomizedSearchCV(
            xgb.XGBRegressor(nthread=-1),
            param_distributions=param_xg_mx,
            n_iter=10,
            cv=5,
            scoring='neg_root_mean_squared_error',
            n_jobs=-1,
            random_state=123
).fit(X_train_plus_val_mx, np.array(y_train_plus_val_mx).ravel())

# find best model hyperparameters 
print(xg_cv_mx.best_params_)
print(xg_cv_mx.best_estimator_)
print('Min RMSE for XGBoost on Mixed Use is: {}'.format(-xg_cv_mx.best_score_))

Parameters: { "learning_rate_init" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


{'learning_rate_init': 0.20318358298265976, 'max_depth': 6, 'min_child_weight': 5, 'n_estimators': 500}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             learning_rate_init=0.20318358298265976, max_delta_step=0,
             max_depth=6, min_child_weight=5, missing=nan,
             monotone_constraints='()', n_estimators=500, n_jobs=16, nthread=-1,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
         

In [22]:
# MAE of best model
print(mean_absolute_error(y_val_mx, xg_cv_mx.best_estimator_.predict(X_val_mx)))

# RMSE and MAE of baseline model
print(rmse(y_val_mx))
print(mae(y_val_mx))

NameError: name 'X_train_va' is not defined

### Experiment on Vacant Land

In [None]:
# vacant-- training, validation and test data
X_train_va = pd.read_csv('X_train_va.csv')
X_train_va.index = X_train_va['Unnamed: 0']
X_train_va = X_train_va.drop(['Unnamed: 0'], axis=1)

X_val_va = pd.read_csv('X_val_va.csv')
X_val_va.index = X_val_va['Unnamed: 0']
X_val_va = X_val_va.drop(['Unnamed: 0'], axis=1)

X_test_va = pd.read_csv('X_test_va.csv')
X_test_va.index = X_test_va['Unnamed: 0']
X_test_va = X_test_va.drop(['Unnamed: 0'], axis=1)

y_train_va = pd.read_csv('y_train_va.csv')
y_train_va.index = y_train_va['Unnamed: 0']
y_train_va = y_train_va.drop(['Unnamed: 0'], axis=1)

y_val_va = pd.read_csv('y_val_va.csv')
y_val_va.index = y_val_va['Unnamed: 0']
y_val_va = y_val_va.drop(['Unnamed: 0'], axis=1)

y_test_va = pd.read_csv('y_test_va.csv')
y_test_va.index = y_test_va['Unnamed: 0']
y_test_va = y_test_va.drop(['Unnamed: 0'], axis=1)

In [None]:
# Vacant Y train+val,--concate and prepare for "PredefinedSplit"
y_train_plus_val_va = pd.concat([y_train_va, y_val_va])
y_train_plus_val_va_copy = y_train_plus_val_va.copy()
y_train_plus_val_va_copy.columns = ['train_val_split']
y_train_plus_val_va_copy.loc[y_train_va.index,'train_val_split'] = -1
y_train_plus_val_va_copy.loc[y_val_va.index,'train_val_split'] = 0
val_fold_va = np.array(y_train_plus_val_va_copy)
ps_va = PredefinedSplit(val_fold_va)

# Vacant X train+val
X_train_plus_val_va = pd.concat([X_train_va, X_val_va])

### Vacant Land: Linear regression

In [None]:
# random search for linear regression
param_lr_va = {'fit_intercept': [True,False],
               'normalize':[True,False]}

lr_cv_va = RandomizedSearchCV(
           LinearRegression(),
           param_distributions=param_lr_va,
           n_iter=10,
           cv=5,
           scoring='neg_root_mean_squared_error', 
           n_jobs=-1,
           random_state=123
).fit(X_train_plus_val_va, np.array(y_train_plus_val_va).ravel())

# find best model hyperparameters 
print(lr_cv_va.best_params_)
print(lr_cv_va.best_estimator_)
print('Min RMSE for linear regression is: {}'.format(-lr_cv_va.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_mx, xg_cv_mx.best_estimator_.predict(X_val_mx)))

# RMSE and MAE of baseline model
print(rmse(y_val_mx))
print(mae(y_val_mx))

### Vacant Land: Lasso

In [None]:
# random search for lasso
param_la_va = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

la_cv_va = RandomizedSearchCV(
        Lasso(),
        param_distributions=param_la_va,
        n_iter=10,
        cv=5,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_va, np.array(y_train_plus_val_va).ravel())

# find best model hyperparameters 
print(la_cv_va.best_params_)
print(la_cv_va.best_estimator_)
print('Min RMSE for lasso fro Mixed Use is: {}'.format(-la_cv_va.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_va, la_cv_va.best_estimator_.predict(X_val_va)))

# RMSE and MAE of baseline model
print(rmse(y_val_va))
print(mae(y_val_va))

### Vacant Land: Ridge

In [None]:
# random search for ridge
param_rd_va = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

rd_cv_va = RandomizedSearchCV(
        Ridge(),
        param_distributions=param_rd_va,
        n_iter=10,
        cv=5,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_va, np.array(y_train_plus_val_va).ravel())

# Find best model hyperparameters 
print(rd_cv_va.best_params_)
print(rd_cv_va.best_estimator_)
print('Min RMSE for ridge on Vacant Land is: {}'.format(-rd_cv_va.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_va, rd_cv_va.best_estimator_.predict(X_val_va)))

# RMSE and MAE of baseline model
print(rmse(y_val_va))
print(mae(y_val_va))

### Vacant Land: KNN Regressor

In [None]:
# random search for KNeighborsRegressor
param_knn_va = {'n_neighbors': sp_randint(1,21),
             'weights': ['uniform', 'distance'],
             'leaf_size': sp_randint(1,21)}

knn_cv_va = RandomizedSearchCV(
         KNeighborsRegressor(),
         param_distributions=param_knn_va,
         n_iter=10,
         cv=5,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_va, np.array(y_train_plus_val_va).ravel())

# find best model hyperparameters 
print(knn_cv_va.best_params_)
print(knn_cv_va.best_estimator_)
print('Min RMSE for Knn regressor on Mixed Use is: {}'.format(-knn_cv_va.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_va, knn_cv_va.best_estimator_.predict(X_val_va)))

# RMSE and MAE of baseline model
print(rmse(y_val_va))
print(mae(y_val_va))

### Vacant Land: Random Forest Regressor

In [None]:
# random search for RandomForestRegressor
param_rf_va = {'n_estimators': [100, 120, 200, 300, 500, 800, 1200],
            'max_depth': [None, 5, 8, 15, 25, 30],
            'min_samples_split': [1, 2, 5, 10 ,15, 100],
            'min_samples_leaf': [1, 2, 5, 10],
            'max_features': ['auto', 'sqrt', 'log2']}

rf_cv_va = RandomizedSearchCV(
        RandomForestRegressor(),
        param_distributions=param_rf_va,
        n_iter=10,
        cv=5,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_va, np.array(y_train_plus_val_va).ravel())

# find best model hyperparameters 
print(rf_cv_va.best_params_)
print(rf_cv_va.best_estimator_)
print('Min RMSE is for random forest regressor on Mixed Use is: {}'.format(-rf_cv_va.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_va, rf_cv_va.best_estimator_.predict(X_val_va)))

# RMSE and MAE of baseline model
print(rmse(y_val_va))
print(mae(y_val_va))

### Vacant Land: MLP Regressor

In [None]:
# random search for MLPRegressor
hl = []
for i in [1, 2, 3, 4, 5, 10, 15, 25, 30]:
    hl.append((i))
    hl.append((i,i))

param_mlp_va = {'hidden_layer_sizes': hl,
             'learning_rate_init': sp_loguniform(1e-4, 1e2),
             'alpha': sp_loguniform(1e-4, 1e2),
             'batch_size': [1, 3, 5, 10, 20, 50, 100, 250, 500]}

mlp_cv_va = RandomizedSearchCV(
         MLPRegressor(),
         param_distributions=param_mlp_mx,
         n_iter=10,
         cv=5,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val_va, np.array(y_train_plus_val_va).ravel())

# find best model hyperparameters 
print(mlp_cv_va.best_params_)
print(mlp_cv_va.best_estimator_)
print('Min RMSE for MLP regressor on Mixed Use is: {}'.format(-mlp_cv_va.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_va, mlp_cv_va.best_estimator_.predict(X_val_va)))

# RMSE and MAE of baseline model
print(rmse(y_val_va))
print(mae(y_val_va))

### Vacant Land: XGBoost

In [None]:
# random search for XGBoost
param_xg_va = {'n_estimators':[100, 500 ,1000],
               'max_depth': sp_randint(3, 15),
               'learning_rate_init': sp_loguniform(1e-4, 1e2),
               'min_child_weight': [1, 3, 5, 7]}

xg_cv_va = RandomizedSearchCV(
            xgb.XGBRegressor(nthread=-1),
            param_distributions=param_xg_mx,
            n_iter=10,
            cv=5,
            scoring='neg_root_mean_squared_error',
            n_jobs=-1,
            random_state=123
).fit(X_train_plus_val_va, np.array(y_train_plus_val_va).ravel())

# find best model hyperparameters 
print(xg_cv_va.best_params_)
print(xg_cv_va.best_estimator_)
print('Min RMSE for XGBoost on Mixed Use is: {}'.format(-xg_cv_va.best_score_))

In [None]:
# MAE of best model
print(mean_absolute_error(y_val_va, xg_cv_va.best_estimator_.predict(X_val_va)))

# RMSE and MAE of baseline model
print(rmse(y_val_va))
print(mae(y_val_va))