In [1]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import PredefinedSplit, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error

from scipy.stats import loguniform as sp_loguniform
from scipy.stats import randint as sp_randint

import statsmodels.api as sm
import statsmodels.formula.api as smf

import warnings
warnings.filterwarnings('ignore')

Load the Training, Validation, and Test datasets for:

- Commercial type properties
- Vacant Land type properties

In [2]:
# Commercial-- training, validation and test data
X_train_cm = pd.read_csv('../processing/X_train_cm.csv')
X_train_cm.index = X_train_cm['Unnamed: 0']
X_train_cm = X_train_cm.drop(['Unnamed: 0'], axis=1)

X_val_cm = pd.read_csv('../processing/X_val_cm.csv')
X_val_cm.index = X_val_cm['Unnamed: 0']
X_val_cm = X_val_cm.drop(['Unnamed: 0'], axis=1)

X_test_cm = pd.read_csv('../processing/X_test_cm.csv')
X_test_cm.index = X_test_cm['Unnamed: 0']
X_test_cm = X_test_cm.drop(['Unnamed: 0'], axis=1)

y_train_cm = pd.read_csv('../processing/y_train_cm.csv')
y_train_cm.index = y_train_cm['Unnamed: 0']
y_train_cm = y_train_cm.drop(['Unnamed: 0'], axis=1)

y_val_cm = pd.read_csv('../processing/y_val_cm.csv')
y_val_cm.index = y_val_cm['Unnamed: 0']
y_val_cm = y_val_cm.drop(['Unnamed: 0'], axis=1)

y_test_cm = pd.read_csv('../processing/y_test_cm.csv')
y_test_cm.index = y_test_cm['Unnamed: 0']
y_test_cm = y_test_cm.drop(['Unnamed: 0'], axis=1)

In [3]:
# vacant-- training, validation and test data
X_train_va = pd.read_csv('../processing/X_train_va.csv')
X_train_va.index = X_train_va['Unnamed: 0']
X_train_va = X_train_va.drop(['Unnamed: 0'], axis=1)

X_val_va = pd.read_csv('../processing/X_val_va.csv')
X_val_va.index = X_val_va['Unnamed: 0']
X_val_va = X_val_va.drop(['Unnamed: 0'], axis=1)

X_test_va = pd.read_csv('../processing/X_test_va.csv')
X_test_va.index = X_test_va['Unnamed: 0']
X_test_va = X_test_va.drop(['Unnamed: 0'], axis=1)

y_train_va = pd.read_csv('../processing/y_train_va.csv')
y_train_va.index = y_train_va['Unnamed: 0']
y_train_va = y_train_va.drop(['Unnamed: 0'], axis=1)

y_val_va = pd.read_csv('../processing/y_val_va.csv')
y_val_va.index = y_val_va['Unnamed: 0']
y_val_va = y_val_va.drop(['Unnamed: 0'], axis=1)

y_test_va = pd.read_csv('../processing/y_test_va.csv')
y_test_va.index = y_test_va['Unnamed: 0']
y_test_va = y_test_va.drop(['Unnamed: 0'], axis=1)

Create a predefined train+val dataset

In [4]:
# Commercial Y train+val,--concate and prepare for "PredefinedSplit"
y_train_plus_val_cm = pd.concat([y_train_cm, y_val_cm])
y_train_plus_val_cm_copy = y_train_plus_val_cm.copy()
y_train_plus_val_cm_copy.columns = ['train_val_split']
y_train_plus_val_cm_copy.loc[y_train_cm.index,'train_val_split'] = -1
y_train_plus_val_cm_copy.loc[y_val_cm.index,'train_val_split'] = 0
val_fold_cm = np.array(y_train_plus_val_cm_copy)
ps_cm = PredefinedSplit(val_fold_cm)

# Commercial X train+val
X_train_plus_val_cm = pd.concat([X_train_cm, X_val_cm])

In [5]:
# Vacant Y train+val,--concate and prepare for "PredefinedSplit"
y_train_plus_val_va = pd.concat([y_train_va, y_val_va])
y_train_plus_val_va_copy = y_train_plus_val_va.copy()
y_train_plus_val_va_copy.columns = ['train_val_split']
y_train_plus_val_va_copy.loc[y_train_va.index,'train_val_split'] = -1
y_train_plus_val_va_copy.loc[y_val_va.index,'train_val_split'] = 0
val_fold_va = np.array(y_train_plus_val_va_copy)
ps_va = PredefinedSplit(val_fold_va)

# Vacant X train+val
X_train_plus_val_va = pd.concat([X_train_va, X_val_va])

## Commercial type properties

### 1. model selection 

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, cross_val_score

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb

clfs = []
clfs.append(LinearRegression())
clfs.append(Lasso())
clfs.append(Ridge())
clfs.append(KNeighborsRegressor())
clfs.append(RandomForestRegressor())
clfs.append(MLPRegressor())
clfs.append(xgb.XGBRegressor())
clfs.append(ElasticNet())
clfs.append(GradientBoostingRegressor())


for model in clfs:
    # cross-val_score on X_train, y_train
    model.fit(X_train_cm, y_train_cm)    
    val_preds = model.predict(X_val_cm)
    r2 = r2_score(y_val_cm,val_preds)

    print('---------------------------------')
    print(str(model))
    print('R^2 for this classifier is', r2)

---------------------------------
LinearRegression()
R^2 for this classifier is 0.10553552910780706
---------------------------------
Lasso()
R^2 for this classifier is 0.11723603678544847
---------------------------------
Ridge()
R^2 for this classifier is 0.11288263975414259
---------------------------------
KNeighborsRegressor()
R^2 for this classifier is 0.23762580878510853
---------------------------------
RandomForestRegressor()
R^2 for this classifier is 0.4627189142187813
---------------------------------
MLPRegressor()
R^2 for this classifier is 0.16323282230382097
---------------------------------
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()'

### 2. Grid search for two best model: 
- RandomForestRegressor()
- XGBRegressor()

#### 2.1 Random Forest 

In [7]:
# random search for RandomForestRegressor
param_rf = {'n_estimators': [100, 120, 200, 300, 500, 800, 1200],
            'max_depth': [None, 5, 8, 15, 25, 30]}

rf_cv = RandomizedSearchCV(
        RandomForestRegressor(),
        param_distributions=param_rf,
        n_iter=10,
        cv=ps_cm,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_cm, np.array(y_train_plus_val_cm).ravel())

# find best model hyperparameters 
print(rf_cv.best_params_)
print(rf_cv.best_estimator_)
print('RF Commercial Average RMSE is: {}'.format(-rf_cv.best_score_))

{'n_estimators': 100, 'max_depth': 30}
RandomForestRegressor(max_depth=30)
RF Commercial Average RMSE is: 684.3245946356279


In [8]:
rf_r2 = r2_score(y_train_plus_val_cm, 
                 rf_cv.best_estimator_.predict(X_train_plus_val_cm))
print(rf_r2)

0.9305405843936732


#### 2.2 XGBoost

In [9]:
xgparams = {
    'max_depth': [2, 3, 5, 10, 15],
    'min_child_weight': [1,3,5,10,15,20,50,100],
    'n_estimators': [100, 500, 900, 1100, 1500]
    }

# Set up the random search with cross validation
random_cv = RandomizedSearchCV(
            estimator=xgb.XGBRegressor(eval_metric='mlogloss',nthread=-1),
            param_distributions=xgparams,
            cv=ps_cm,
            n_iter=10,      
            random_state= 123, 
    scoring='neg_root_mean_squared_error').fit(X_train_plus_val_cm,  np.array(y_train_plus_val_cm).ravel())



In [10]:
# find best model hyperparameters 
print(random_cv.best_params_)
print(random_cv.best_estimator_)
print('XGBoost Commercial Average RMSE is: {}'.format(-random_cv.best_score_))
xg_r2 = r2_score(y_train_plus_val_cm, 
                  random_cv.best_estimator_.predict(X_train_plus_val_cm))
print(xg_r2)

{'n_estimators': 1500, 'min_child_weight': 20, 'max_depth': 10}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             eval_metric='mlogloss', gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=10, min_child_weight=20, missing=nan,
             monotone_constraints='()', n_estimators=1500, n_jobs=8, nthread=-1,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)
XGBoost Commercial Average RMSE is: 715.3305032930913
0.9999999570754688


## Vacant Land type properties

### 1. Model Selection

In [11]:
clfs2 = []
clfs2.append(LinearRegression())
clfs2.append(Lasso())
clfs2.append(Ridge())
clfs2.append(KNeighborsRegressor())
clfs2.append(RandomForestRegressor())
clfs2.append(MLPRegressor())
clfs2.append(xgb.XGBRegressor())
clfs2.append(ElasticNet())
clfs2.append(GradientBoostingRegressor())


for model in clfs2:
    # cross-val_score on X_train, y_train
    model.fit(X_train_va, y_train_va)    
#     val_preds_va = model.predict(X_val_va)
#     r2 = r2_score(y_val_va,val_preds_va)
    
    cv_r2 = cross_val_score(model, X_train_va, y_train_va, 
                                 cv=5, scoring='r2')

    print('---------------------------------')
    print(str(model))
    print('R^2 for this classifier is', cv_r2.mean())

---------------------------------
LinearRegression()
R^2 for this classifier is -4.767804061799426e+23
---------------------------------
Lasso()
R^2 for this classifier is 0.024161640983541323
---------------------------------
Ridge()
R^2 for this classifier is 0.017336158154493873
---------------------------------
KNeighborsRegressor()
R^2 for this classifier is 0.22322883022117157
---------------------------------
RandomForestRegressor()
R^2 for this classifier is 0.470265913397545
---------------------------------
MLPRegressor()
R^2 for this classifier is 0.15264625976176135
---------------------------------
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints=

### 2. Grid search for two best model:
- RandomForestRegressor()
- XGBRegressor()

#### 2.1 Random Forest

In [12]:
# random search for RandomForestRegressor
param_rf = {'n_estimators': [100, 120, 200, 300, 500, 800, 1200],
            'max_depth': [None, 5, 8, 15, 25, 30]}

rf_cv = RandomizedSearchCV(
        RandomForestRegressor(),
        param_distributions=param_rf,
        n_iter=10,
        cv=ps_va,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_va, np.array(y_train_plus_val_va).ravel())

# find best model hyperparameters 
print(rf_cv.best_params_)
print(rf_cv.best_estimator_)
print('RF Vacant_Land min RMSE is: {}'.format(-rf_cv.best_score_))

{'n_estimators': 120, 'max_depth': None}
RandomForestRegressor(n_estimators=120)
RF Vacant_Land min RMSE is: 264.4754479446253


In [13]:
rf_r2 = r2_score(y_train_plus_val_va, 
                 rf_cv.best_estimator_.predict(X_train_plus_val_va))
print(rf_r2)

0.9314018597250768


#### 2.2 XGBoost

In [14]:
xgparams = {
    'max_depth': [2, 3, 5, 10, 15],
    'min_child_weight': [1,3,5,10,15,20,50,100],
    'n_estimators': [100, 500, 900, 1100, 1500]
    }

# Set up the random search with cross validation
random_cv = RandomizedSearchCV(
            estimator=xgb.XGBRegressor(eval_metric='mlogloss',nthread=-1),
            param_distributions=xgparams,
            cv=ps_va,
            n_iter=10,      
            random_state= 123, 
    scoring='neg_root_mean_squared_error').fit(X_train_plus_val_va,  np.array(y_train_plus_val_va).ravel())



In [15]:
# find best model hyperparameters 
print(random_cv.best_params_)
print(random_cv.best_estimator_)
print('XGBoost Vacant_Land min RMSE is: {}'.format(-random_cv.best_score_))
xg_r2 = r2_score(y_train_plus_val_va, 
                  random_cv.best_estimator_.predict(X_train_plus_val_va))
print(xg_r2)


{'n_estimators': 1500, 'min_child_weight': 15, 'max_depth': 5}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             eval_metric='mlogloss', gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=5, min_child_weight=15, missing=nan,
             monotone_constraints='()', n_estimators=1500, n_jobs=8, nthread=-1,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)
XGBoost Vacant_Land min RMSE is: 268.51915569656876
0.9917361983904
