In [22]:
# Import dependencies
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf
import numpy as np
from scipy import stats
from scipy.stats import norm
import seaborn as sns

In [23]:
# import SQL dependencies
from sqlalchemy import create_engine
import psycopg2
from pandas.io import sql

from config import db_password

db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/austin_housing"
engine = create_engine(db_string)

In [24]:
#pull 2020 data from pgAdmin

df = pd.read_sql('regressionzip_2020', db_string)
df.head()

Unnamed: 0,zpid,latestPrice,numOfBathrooms,livingAreaSqFt,numOfBedrooms,avgSchoolRating,numOfStories,MedianStudentsPerTeacher,numOfHighSchools,longitude,...,latitude,lotSizeSqFt,numOfPhotos,numPriceChanges,zipcode,propertyTaxRate,latest_saledate,latest_salemonth,latest_saleyear,averagePrice
0,120900430,295000.0,2.0,1768.0,4,2.666667,1,14,1,-97.661697,...,30.432673,6185.0,29,1,78660,1.98,2020-10-13,10,2020,303125.25
1,2080105342,309045.0,2.0,1446.0,3,4.0,1,14,1,-97.656181,...,30.437775,5161.0,2,2,78660,1.98,2020-08-05,8,2020,303125.25
2,241932337,315000.0,3.0,2432.0,4,3.666667,2,12,1,-97.643394,...,30.414684,12196.8,36,2,78660,1.98,2020-06-11,6,2020,303125.25
3,241932327,279900.0,2.0,1580.0,3,3.666667,1,12,1,-97.643288,...,30.414934,5401.0,32,2,78660,1.98,2020-08-28,8,2020,303125.25
4,69808966,239900.0,2.0,1762.0,4,3.333333,1,14,1,-97.623436,...,30.167768,6011.0,28,9,78617,1.98,2020-09-05,9,2020,198750.391304


In [5]:
# Import data
# df = pd.read_csv(Path('../resources/regressiondata_2020.csv'))
# df.head()

Unnamed: 0.1,Unnamed: 0,zpid,latestPrice,numOfBathrooms,livingAreaSqFt,numOfBedrooms,avgSchoolRating,numOfStories,MedianStudentsPerTeacher,numOfHighSchools,...,avgSchoolDistance,latitude,lotSizeSqFt,numOfPhotos,numPriceChanges,zipcode,propertyTaxRate,latest_saledate,latest_salemonth,latest_saleyear
0,1,120900430,295000.0,2.0,1768.0,4,2.666667,1,14,1,...,1.4,30.432673,6185.0,29,1,78660,1.98,2020-10-13,10,2020
1,5,2080105342,309045.0,2.0,1446.0,3,4.0,1,14,1,...,1.066667,30.437775,5161.0,2,2,78660,1.98,2020-08-05,8,2020
2,6,241932337,315000.0,3.0,2432.0,4,3.666667,2,12,1,...,1.233333,30.414684,12196.8,36,2,78660,1.98,2020-06-11,6,2020
3,10,241932327,279900.0,2.0,1580.0,3,3.666667,1,12,1,...,1.2,30.414934,5401.0,32,2,78660,1.98,2020-08-28,8,2020
4,12,69808966,239900.0,2.0,1762.0,4,3.333333,1,14,1,...,0.633333,30.167768,6011.0,28,9,78617,1.98,2020-09-05,9,2020


In [25]:
df.columns

Index(['zpid', 'latestPrice', 'numOfBathrooms', 'livingAreaSqFt',
       'numOfBedrooms', 'avgSchoolRating', 'numOfStories',
       'MedianStudentsPerTeacher', 'numOfHighSchools', 'longitude',
       'numOfPrimarySchools', 'avgSchoolDistance', 'latitude', 'lotSizeSqFt',
       'numOfPhotos', 'numPriceChanges', 'zipcode', 'propertyTaxRate',
       'latest_saledate', 'latest_salemonth', 'latest_saleyear',
       'averagePrice'],
      dtype='object')

In [26]:
# Save 'zpid'
id_df = df['zpid']

# Drop 'zpid' column for CSV
# df.drop(["zpid","latest_saledate","Unnamed: 0",'latest_salemonth','latest_saleyear'], axis = 1, inplace = True)

# Drop 'zpid' column for SQL
df.drop(["zpid","latest_saledate",'latest_salemonth','latest_saleyear'], axis = 1, inplace = True)

# Check data size after dropping the 'Id' variable
print("\nData size: {} ".format(df.shape)) 


Data size: (5412, 18) 


In [27]:
# Create features and target
y = df["latestPrice"]
X = df.drop(columns=['latestPrice'])

In [28]:
# Split data into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train.shape

(4059, 17)

In [29]:
# Create DMatrices

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [30]:
from sklearn.metrics import mean_absolute_error
import numpy as np

# "Learn" the mean from the training data
mean_train = np.mean(y_train)

# Get predictions on the test set
baseline_predictions = np.ones(y_test.shape) * mean_train

# Compute MAE
mae_baseline = mean_absolute_error(y_test, baseline_predictions)
print("Baseline MAE is {:.2f}".format(mae_baseline))

Baseline MAE is 246150.68


In [31]:
# Create parameters dictionary
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'reg:squarederror',
}

In [32]:
params['eval_metric'] = "mae"

num_boost_round = 999

In [33]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

print("Best MAE: {:.2f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))

[0]	Test-mae:383274
Will train until Test-mae hasn't improved in 10 rounds.
[1]	Test-mae:272451
[2]	Test-mae:203042
[3]	Test-mae:161584
[4]	Test-mae:138146
[5]	Test-mae:124579
[6]	Test-mae:117350
[7]	Test-mae:113377
[8]	Test-mae:111977
[9]	Test-mae:111741
[10]	Test-mae:111102
[11]	Test-mae:111393
[12]	Test-mae:110895
[13]	Test-mae:110515
[14]	Test-mae:110589
[15]	Test-mae:110081
[16]	Test-mae:110173
[17]	Test-mae:109917
[18]	Test-mae:110147
[19]	Test-mae:109457
[20]	Test-mae:109375
[21]	Test-mae:109264
[22]	Test-mae:109035
[23]	Test-mae:108990
[24]	Test-mae:109139
[25]	Test-mae:109080
[26]	Test-mae:108981
[27]	Test-mae:109104
[28]	Test-mae:108756
[29]	Test-mae:108728
[30]	Test-mae:108896
[31]	Test-mae:108629
[32]	Test-mae:108793
[33]	Test-mae:108936
[34]	Test-mae:109138
[35]	Test-mae:109008
[36]	Test-mae:109131
[37]	Test-mae:109084
[38]	Test-mae:109228
[39]	Test-mae:109210
[40]	Test-mae:109370
[41]	Test-mae:109425
Stopping. Best iteration:
[31]	Test-mae:108629

Best MAE: 108629.25 with

In [34]:
# Get cross validation score with current params
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'mae'},
    early_stopping_rounds=10
)
cv_results

Unnamed: 0,train-mae-mean,train-mae-std,test-mae-mean,test-mae-std
0,389425.0875,1906.160096,390801.4625,9845.446578
1,277973.0125,1452.681422,282800.775,8735.461389
2,203114.890625,1530.677586,213286.540625,6967.223573
3,154297.925,1031.889099,170710.375,6355.469271
4,124027.335937,1479.611121,145977.7125,4893.280554
5,105676.96875,950.222492,132873.348438,4700.059555
6,94271.535937,789.986973,125384.798438,4930.008256
7,87419.803125,766.581401,122147.334375,4843.248918
8,83022.528125,604.309243,120297.564062,5571.121666
9,79840.651563,550.853557,119333.78125,5734.396507


In [35]:
cv_results['test-mae-mean'].min()

115886.2078128

In [36]:
# You can try wider intervals with a larger step between
# each value and then narrow it down. Here after several
# iteration I found that the optimal value was in the
# following ranges.
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(4,12)
    for min_child_weight in range(1,10)
]

In [37]:
# Define initial best params and MAE
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with max_depth=4, min_child_weight=1
	MAE 115242.7234376 for 66 rounds
CV with max_depth=4, min_child_weight=2
	MAE 116611.2031248 for 41 rounds
CV with max_depth=4, min_child_weight=3
	MAE 115676.7750002 for 55 rounds
CV with max_depth=4, min_child_weight=4
	MAE 116255.5812502 for 44 rounds
CV with max_depth=4, min_child_weight=5
	MAE 118125.8859376 for 35 rounds
CV with max_depth=4, min_child_weight=6
	MAE 116696.6218748 for 53 rounds
CV with max_depth=4, min_child_weight=7
	MAE 116855.6671876 for 43 rounds
CV with max_depth=4, min_child_weight=8
	MAE 118216.6703124 for 35 rounds
CV with max_depth=4, min_child_weight=9
	MAE 117948.6375 for 52 rounds
CV with max_depth=5, min_child_weight=1
	MAE 113495.4328128 for 38 rounds
CV with max_depth=5, min_child_weight=2
	MAE 114875.396875 for 42 rounds
CV with max_depth=5, min_child_weight=3
	MAE 114290.8296876 for 61 rounds
CV with max_depth=5, min_child_weight=4
	MAE 114718.8390628 for 53 rounds
CV with max_depth=5, min_child_weight=5
	M

In [38]:
# Update parameters with best found parameters
params['max_depth'] = 7
params['min_child_weight'] = 1

In [39]:
# Tuning 'subsample' and 'colsample_bytree' parameters
# Create list of possible params
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [40]:
min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with subsample=1.0, colsample=1.0
	MAE 115823.4843752 for 19 rounds
CV with subsample=1.0, colsample=0.9
	MAE 114166.35625 for 26 rounds
CV with subsample=1.0, colsample=0.8
	MAE 115792.9859378 for 28 rounds
CV with subsample=1.0, colsample=0.7
	MAE 116559.7265626 for 21 rounds
CV with subsample=0.9, colsample=1.0
	MAE 114194.21875 for 23 rounds
CV with subsample=0.9, colsample=0.9
	MAE 115207.7515626 for 18 rounds
CV with subsample=0.9, colsample=0.8
	MAE 118034.7859378 for 22 rounds
CV with subsample=0.9, colsample=0.7
	MAE 116583.07656280001 for 26 rounds
CV with subsample=0.8, colsample=1.0
	MAE 114615.73125 for 30 rounds
CV with subsample=0.8, colsample=0.9
	MAE 115727.1624998 for 31 rounds
CV with subsample=0.8, colsample=0.8
	MAE 117488.3 for 29 rounds
CV with subsample=0.8, colsample=0.7
	MAE 118172.2734374 for 29 rounds
CV with subsample=0.7, colsample=1.0
	MAE 115864.6281248 for 11 rounds
CV with subsample=0.7, colsample=0.9
	MAE 119256.6656252 for 34 rounds
CV with subsam

In [41]:
# Update params dictionary
params['subsample'] = 0.9
params['colsample_bytree'] = 0.9

In [42]:
min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    cv_results = xgb.cv(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            seed=42,
            nfold=5,
            metrics=['mae'],
            early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))

CV with eta=0.3
	MAE 115207.75 for 18 rounds

CV with eta=0.2
	MAE 112186.909375 for 22 rounds

CV with eta=0.1
	MAE 110293.128125 for 144 rounds

CV with eta=0.05
	MAE 110494.1484376 for 122 rounds

CV with eta=0.01
	MAE 108706.53281219999 for 841 rounds

CV with eta=0.005
	MAE 109527.2546876 for 998 rounds

Best params: 0.01, MAE: 108706.53281219999


In [43]:
# Update parameters dictionary
params['eta'] = .05

In [44]:
params

{'max_depth': 7,
 'min_child_weight': 1,
 'eta': 0.05,
 'subsample': 0.9,
 'colsample_bytree': 0.9,
 'objective': 'reg:squarederror',
 'eval_metric': 'mae'}

In [45]:
# Train the model with the tuned parameters and use test data
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

print("Best MAE: {:.2f} in {} rounds".format(model.best_score, model.best_iteration+1))

[0]	Test-mae:517491
Will train until Test-mae hasn't improved in 10 rounds.
[1]	Test-mae:492782
[2]	Test-mae:468979
[3]	Test-mae:445692
[4]	Test-mae:424089
[5]	Test-mae:403462
[6]	Test-mae:383938
[7]	Test-mae:365142
[8]	Test-mae:347249
[9]	Test-mae:329853
[10]	Test-mae:314106
[11]	Test-mae:298744
[12]	Test-mae:284940
[13]	Test-mae:271704
[14]	Test-mae:259281
[15]	Test-mae:247243
[16]	Test-mae:236546
[17]	Test-mae:226652
[18]	Test-mae:217189
[19]	Test-mae:208557
[20]	Test-mae:199986
[21]	Test-mae:192409
[22]	Test-mae:185479
[23]	Test-mae:178830
[24]	Test-mae:172571
[25]	Test-mae:166711
[26]	Test-mae:161261
[27]	Test-mae:156002
[28]	Test-mae:151353
[29]	Test-mae:146666
[30]	Test-mae:142740
[31]	Test-mae:138819
[32]	Test-mae:135468
[33]	Test-mae:132352
[34]	Test-mae:129649
[35]	Test-mae:127088
[36]	Test-mae:124720
[37]	Test-mae:122439
[38]	Test-mae:120359
[39]	Test-mae:118524
[40]	Test-mae:116981
[41]	Test-mae:115315
[42]	Test-mae:114099
[43]	Test-mae:112908
[44]	Test-mae:111824
[45]	Test

In [46]:
# Save model with best parameters
num_boost_round = model.best_iteration + 1
best_model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")]
)

[0]	Test-mae:517491
[1]	Test-mae:492782
[2]	Test-mae:468979
[3]	Test-mae:445692
[4]	Test-mae:424089
[5]	Test-mae:403462
[6]	Test-mae:383939
[7]	Test-mae:365142
[8]	Test-mae:347249
[9]	Test-mae:329853
[10]	Test-mae:314106
[11]	Test-mae:298744
[12]	Test-mae:284940
[13]	Test-mae:271704
[14]	Test-mae:259281
[15]	Test-mae:247243
[16]	Test-mae:236546
[17]	Test-mae:226652
[18]	Test-mae:217189
[19]	Test-mae:208557
[20]	Test-mae:199986
[21]	Test-mae:192409
[22]	Test-mae:185479
[23]	Test-mae:178830
[24]	Test-mae:172571
[25]	Test-mae:166711
[26]	Test-mae:161261
[27]	Test-mae:156002
[28]	Test-mae:151353
[29]	Test-mae:146666
[30]	Test-mae:142740
[31]	Test-mae:138819
[32]	Test-mae:135468
[33]	Test-mae:132352
[34]	Test-mae:129649
[35]	Test-mae:127088
[36]	Test-mae:124720
[37]	Test-mae:122439
[38]	Test-mae:120359
[39]	Test-mae:118524
[40]	Test-mae:116981
[41]	Test-mae:115315
[42]	Test-mae:114099
[43]	Test-mae:112908
[44]	Test-mae:111824
[45]	Test-mae:110870
[46]	Test-mae:109977
[47]	Test-mae:109222
[4

In [47]:
# Create prediction
y_pred = best_model.predict(dtest)

In [48]:
# Save model
best_model.save_model("xgboost_optimal_2020.model")

In [49]:
# Calculate R squared and Adjusted R Square
import statsmodels.api as sm
result = sm.OLS(y_pred, y_test).fit()
print(result.rsquared, result.rsquared_adj)

0.8965441835219592 0.8964676629476411


In [50]:
# Calculate Mean Squared Error
from sklearn.metrics import mean_squared_error, mean_squared_log_error
import math
print(mean_squared_error(y_test, y_pred))
print(math.sqrt(mean_squared_error(y_test, y_pred)))
print(mean_squared_log_error(y_test, y_pred))

49164830140.78594
221731.43697001095
0.06222501411706528


In [51]:
# Calculate Mean Absolute Error(MAE)
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test, y_pred))

102670.7606591833


In [None]:
import matplotlib.pyplot as plt

# extra step to allow graphviz to be found 
import os
os.environ["PATH"] += os.pathsep + 'C:/Users/danny/.conda/envs/mlenv/lib/site-packages/graphviz'

xgb.plot_tree(model, num_trees=0)
plt.rcParams['figure.figsize'] = [50, 10]
plt.show()

In [None]:
xgb.plot_importance(best_model)
plt.rcParams['figure.figsize'] = [5, 5]
plt.show()

In [None]:
# # Code to load model for other datasets:
# loaded_model = xgb.Booster()
# loaded_model.load_model("my_model.model")
# # And use it for predictions.
# loaded_model.predict(dtest)