In [1]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb

from sklearn.model_selection import PredefinedSplit, RandomizedSearchCV, KFold, StratifiedKFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from scipy.stats import loguniform as sp_loguniform
from scipy.stats import randint as sp_randint

import statsmodels.api as sm
import statsmodels.formula.api as smf

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation/FutureWarning messages
# raised by the libraries imported above are hidden as well.
warnings.filterwarnings('ignore')

In [2]:
# Single family: load the training, validation and test splits.
# The 'Unnamed: 0' column of every CSV is moved into the DataFrame index
# (presumably the row labels saved by an earlier to_csv -- confirm).
# features
X_train_sf = pd.read_csv('X_train_sf.csv').set_index('Unnamed: 0')
X_val_sf = pd.read_csv('X_val_sf.csv').set_index('Unnamed: 0')
X_test_sf = pd.read_csv('X_test_sf.csv').set_index('Unnamed: 0')
# targets
y_train_sf = pd.read_csv('y_train_sf.csv').set_index('Unnamed: 0')
y_val_sf = pd.read_csv('y_val_sf.csv').set_index('Unnamed: 0')
y_test_sf = pd.read_csv('y_test_sf.csv').set_index('Unnamed: 0')

In [3]:
# Multifamily: load the training, validation and test splits.
# The 'Unnamed: 0' column of every CSV is moved into the DataFrame index
# (presumably the row labels saved by an earlier to_csv -- confirm).
# features
X_train_mf = pd.read_csv('X_train_mf.csv').set_index('Unnamed: 0')
X_val_mf = pd.read_csv('X_val_mf.csv').set_index('Unnamed: 0')
X_test_mf = pd.read_csv('X_test_mf.csv').set_index('Unnamed: 0')
# targets
y_train_mf = pd.read_csv('y_train_mf.csv').set_index('Unnamed: 0')
y_val_mf = pd.read_csv('y_val_mf.csv').set_index('Unnamed: 0')
y_test_mf = pd.read_csv('y_test_mf.csv').set_index('Unnamed: 0')

In [4]:
# Commercial: load the training, validation and test splits.
# The 'Unnamed: 0' column of every CSV is moved into the DataFrame index
# (presumably the row labels saved by an earlier to_csv -- confirm).
# features
X_train_cm = pd.read_csv('X_train_cm.csv').set_index('Unnamed: 0')
X_val_cm = pd.read_csv('X_val_cm.csv').set_index('Unnamed: 0')
X_test_cm = pd.read_csv('X_test_cm.csv').set_index('Unnamed: 0')
# targets
y_train_cm = pd.read_csv('y_train_cm.csv').set_index('Unnamed: 0')
y_val_cm = pd.read_csv('y_val_cm.csv').set_index('Unnamed: 0')
y_test_cm = pd.read_csv('y_test_cm.csv').set_index('Unnamed: 0')

In [5]:
# Industrial: load the training, validation and test splits.
# The 'Unnamed: 0' column of every CSV is moved into the DataFrame index
# (presumably the row labels saved by an earlier to_csv -- confirm).
# features
X_train_id = pd.read_csv('X_train_id.csv').set_index('Unnamed: 0')
X_val_id = pd.read_csv('X_val_id.csv').set_index('Unnamed: 0')
X_test_id = pd.read_csv('X_test_id.csv').set_index('Unnamed: 0')
# targets
y_train_id = pd.read_csv('y_train_id.csv').set_index('Unnamed: 0')
y_val_id = pd.read_csv('y_val_id.csv').set_index('Unnamed: 0')
y_test_id = pd.read_csv('y_test_id.csv').set_index('Unnamed: 0')

In [6]:
# Mixed use: load the training, validation and test splits.
# The 'Unnamed: 0' column of every CSV is moved into the DataFrame index
# (presumably the row labels saved by an earlier to_csv -- confirm).
# features
X_train_mx = pd.read_csv('X_train_mx.csv').set_index('Unnamed: 0')
X_val_mx = pd.read_csv('X_val_mx.csv').set_index('Unnamed: 0')
X_test_mx = pd.read_csv('X_test_mx.csv').set_index('Unnamed: 0')
# targets
y_train_mx = pd.read_csv('y_train_mx.csv').set_index('Unnamed: 0')
y_val_mx = pd.read_csv('y_val_mx.csv').set_index('Unnamed: 0')
y_test_mx = pd.read_csv('y_test_mx.csv').set_index('Unnamed: 0')

In [7]:
# Vacant land: load the training, validation and test splits.
# The 'Unnamed: 0' column of every CSV is moved into the DataFrame index
# (presumably the row labels saved by an earlier to_csv -- confirm).
# features
X_train_va = pd.read_csv('X_train_va.csv').set_index('Unnamed: 0')
X_val_va = pd.read_csv('X_val_va.csv').set_index('Unnamed: 0')
X_test_va = pd.read_csv('X_test_va.csv').set_index('Unnamed: 0')
# targets
y_train_va = pd.read_csv('y_train_va.csv').set_index('Unnamed: 0')
y_val_va = pd.read_csv('y_val_va.csv').set_index('Unnamed: 0')
y_test_va = pd.read_csv('y_test_va.csv').set_index('Unnamed: 0')

In [8]:
# Training plus validation data: for every property segment, stack the
# training rows on top of the validation rows (features and targets alike).
# Each X_train_plus_val_* must be built from the same segment's X_train_* and
# X_val_* so that it stays row-aligned with the matching y_train_plus_val_*.
X_train_plus_val_sf = pd.concat([X_train_sf, X_val_sf])
y_train_plus_val_sf = pd.concat([y_train_sf, y_val_sf])
X_train_plus_val_mf = pd.concat([X_train_mf, X_val_mf])
y_train_plus_val_mf = pd.concat([y_train_mf, y_val_mf])
# commercial features come from the commercial training split (X_train_cm)
X_train_plus_val_cm = pd.concat([X_train_cm, X_val_cm])
y_train_plus_val_cm = pd.concat([y_train_cm, y_val_cm])
X_train_plus_val_id = pd.concat([X_train_id, X_val_id])
y_train_plus_val_id = pd.concat([y_train_id, y_val_id])
X_train_plus_val_mx = pd.concat([X_train_mx, X_val_mx])
y_train_plus_val_mx = pd.concat([y_train_mx, y_val_mx])
X_train_plus_val_va = pd.concat([X_train_va, X_val_va])
y_train_plus_val_va = pd.concat([y_train_va, y_val_va])

In [40]:
# baseline scorer: r2, rmse and mae of a fixed prediction
def baseline(pre, l):
    """Score the prediction ``pre`` against the observed values ``l``.

    Parameters
    ----------
    pre : prediction(s) to evaluate; the calls in this notebook pass the mean
        of a training target.
    l : observed target values; must support ``len`` and arithmetic with ``pre``.

    Returns
    -------
    tuple
        ``(r2, rmse, mae)`` where r2 is one minus the ratio of the squared
        error to the squared deviation of ``l`` from its own mean, rmse is the
        root of the mean squared error, and mae is the mean absolute error.
    """
    residual = pre - l
    n_obs = len(l)
    squared_error = np.sum(residual**2)
    # spread of the observations around their own mean (the r2 denominator)
    total_variation = np.sum((np.mean(l) - l)**2)
    r2 = 1 - squared_error/total_variation
    rmse = np.sqrt(squared_error/n_obs)
    mae = (np.sum(np.abs(residual)))/n_obs
    return r2, rmse, mae

In [44]:
# Baseline r2, rmse and mae on the validation and test data.
# Single family: validation is scored with the training mean,
# test with the mean of training plus validation.
baseline_val_sf = baseline(np.mean(y_train_sf), y_val_sf)
print(baseline_val_sf)
baseline_test_sf = baseline(np.mean(y_train_plus_val_sf), y_test_sf)
print(baseline_test_sf)

(sale_price   -0.000021
dtype: float64, sale_price    290.507662
dtype: float64, sale_price    155.754805
dtype: float64)
(sale_price   -0.029653
dtype: float64, sale_price    312.628615
dtype: float64, sale_price    150.877797
dtype: float64)


In [47]:
# Multifamily baseline: validation is scored with the training mean,
# test with the mean of training plus validation.
baseline_val_mf = baseline(np.mean(y_train_mf), y_val_mf)
print(baseline_val_mf)
baseline_test_mf = baseline(np.mean(y_train_plus_val_mf), y_test_mf)
print(baseline_test_mf)

(sale_price   -0.000321
dtype: float64, sale_price    587.124769
dtype: float64, sale_price    308.049347
dtype: float64)
(sale_price   -0.003041
dtype: float64, sale_price    501.711842
dtype: float64, sale_price    243.959655
dtype: float64)


In [48]:
# Commercial baseline: validation is scored with the training mean,
# test with the mean of training plus validation.
baseline_val_cm = baseline(np.mean(y_train_cm), y_val_cm)
print(baseline_val_cm)
baseline_test_cm = baseline(np.mean(y_train_plus_val_cm), y_test_cm)
print(baseline_test_cm)

(sale_price   -0.006
dtype: float64, sale_price    846.626526
dtype: float64, sale_price    587.716804
dtype: float64)
(sale_price   -0.003774
dtype: float64, sale_price    916.282516
dtype: float64, sale_price    596.427803
dtype: float64)


In [49]:
# Industrial baseline: validation is scored with the training mean,
# test with the mean of training plus validation.
baseline_val_id = baseline(np.mean(y_train_id), y_val_id)
print(baseline_val_id)
baseline_test_id = baseline(np.mean(y_train_plus_val_id), y_test_id)
print(baseline_test_id)

(sale_price   -0.000263
dtype: float64, sale_price    898.633108
dtype: float64, sale_price    620.167356
dtype: float64)
(sale_price   -0.042391
dtype: float64, sale_price    1026.3886
dtype: float64, sale_price    713.971641
dtype: float64)


In [50]:
# Mixed use baseline: validation is scored with the training mean,
# test with the mean of training plus validation.
baseline_val_mx = baseline(np.mean(y_train_mx), y_val_mx)
print(baseline_val_mx)
baseline_test_mx = baseline(np.mean(y_train_plus_val_mx), y_test_mx)
print(baseline_test_mx)

(sale_price   -8.186383e-09
dtype: float64, sale_price    329.708794
dtype: float64, sale_price    205.827053
dtype: float64)
(sale_price   -0.009273
dtype: float64, sale_price    329.24996
dtype: float64, sale_price    177.989375
dtype: float64)


In [51]:
# Vacant land baseline: validation is scored with the training mean,
# test with the mean of training plus validation.
baseline_val_va = baseline(np.mean(y_train_va), y_val_va)
print(baseline_val_va)
baseline_test_va = baseline(np.mean(y_train_plus_val_va), y_test_va)
print(baseline_test_va)

(sale_price   -0.000397
dtype: float64, sale_price    366.686874
dtype: float64, sale_price    166.425429
dtype: float64)
(sale_price   -0.038991
dtype: float64, sale_price    388.409471
dtype: float64, sale_price    188.229455
dtype: float64)
