In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf
import numpy as np
from scipy import stats
from scipy.stats import norm
import seaborn as sns

In [2]:
# import SQL dependencies
from sqlalchemy import create_engine
import psycopg2
from pandas.io import sql

from config import db_password

db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/austin_housing"
engine = create_engine(db_string)

In [3]:
#pull 2020 data from pgAdmin

df = pd.read_sql('regressionzip_2020', db_string)
df.head()

Unnamed: 0,zpid,latestPrice,numOfBathrooms,livingAreaSqFt,numOfBedrooms,avgSchoolRating,numOfStories,MedianStudentsPerTeacher,numOfHighSchools,longitude,...,lotSizeSqFt,numOfPhotos,numPriceChanges,zipcode,yearBuilt,propertyTaxRate,latest_saledate,latest_salemonth,latest_saleyear,averagePrice
0,120900430,295000.0,2.0,1768.0,4,2.666667,1,14,1,-97.661697,...,6185.0,29,1,78660,2013,1.98,2020-10-13,10,2020,303125.25
1,2080105342,309045.0,2.0,1446.0,3,4.0,1,14,1,-97.656181,...,5161.0,2,2,78660,2020,1.98,2020-08-05,8,2020,303125.25
2,241932337,315000.0,3.0,2432.0,4,3.666667,2,12,1,-97.643394,...,12196.8,36,2,78660,2016,1.98,2020-06-11,6,2020,303125.25
3,241932327,279900.0,2.0,1580.0,3,3.666667,1,12,1,-97.643288,...,5401.0,32,2,78660,2016,1.98,2020-08-28,8,2020,303125.25
4,69808966,239900.0,2.0,1762.0,4,3.333333,1,14,1,-97.623436,...,6011.0,28,9,78617,2005,1.98,2020-09-05,9,2020,198750.391304


In [4]:
# Import data
# df = pd.read_csv(Path('../resources/regressiondata_2020.csv'))
# df.head()

In [5]:
df.columns

Index(['zpid', 'latestPrice', 'numOfBathrooms', 'livingAreaSqFt',
       'numOfBedrooms', 'avgSchoolRating', 'numOfStories',
       'MedianStudentsPerTeacher', 'numOfHighSchools', 'longitude',
       'numOfPrimarySchools', 'avgSchoolDistance', 'latitude', 'lotSizeSqFt',
       'numOfPhotos', 'numPriceChanges', 'zipcode', 'yearBuilt',
       'propertyTaxRate', 'latest_saledate', 'latest_salemonth',
       'latest_saleyear', 'averagePrice'],
      dtype='object')

In [6]:
# Save 'zpid'
id_df = df['zpid']

# Drop 'zpid' column for CSV
# df.drop(["zpid","latest_saledate","Unnamed: 0",'latest_salemonth','latest_saleyear'], axis = 1, inplace = True)

# Drop 'zpid' column for SQL
df.drop(["zpid","latest_saledate",'latest_salemonth','latest_saleyear'], axis = 1, inplace = True)

# Check data size after dropping the 'Id' variable
print("\nData size: {} ".format(df.shape)) 


Data size: (5412, 19) 


In [7]:
# Create features and target
y = df["latestPrice"]
X = df.drop(columns=['latestPrice'])

In [8]:
# Split data into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train.shape

(4059, 18)

In [9]:
# Create DMatrices

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [10]:
from sklearn.metrics import mean_absolute_error
import numpy as np

# "Learn" the mean from the training data
mean_train = np.mean(y_train)

# Get predictions on the test set
baseline_predictions = np.ones(y_test.shape) * mean_train

# Compute MAE
mae_baseline = mean_absolute_error(y_test, baseline_predictions)
print("Baseline MAE is {:.2f}".format(mae_baseline))

Baseline MAE is 246150.68


In [11]:
# Create parameters dictionary
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'reg:squarederror',
}

In [12]:
params['eval_metric'] = "mae"

num_boost_round = 999

In [13]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

print("Best MAE: {:.2f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))

[0]	Test-mae:383477
Will train until Test-mae hasn't improved in 10 rounds.
[1]	Test-mae:272696
[2]	Test-mae:202781
[3]	Test-mae:162918
[4]	Test-mae:139655
[5]	Test-mae:126078
[6]	Test-mae:118320
[7]	Test-mae:115663
[8]	Test-mae:113834
[9]	Test-mae:112197
[10]	Test-mae:111753
[11]	Test-mae:110739
[12]	Test-mae:110092
[13]	Test-mae:108941
[14]	Test-mae:109203
[15]	Test-mae:108840
[16]	Test-mae:108284
[17]	Test-mae:107688
[18]	Test-mae:107162
[19]	Test-mae:107380
[20]	Test-mae:107554
[21]	Test-mae:107317
[22]	Test-mae:106899
[23]	Test-mae:106975
[24]	Test-mae:106868
[25]	Test-mae:106388
[26]	Test-mae:106068
[27]	Test-mae:106214
[28]	Test-mae:106146
[29]	Test-mae:106302
[30]	Test-mae:106122
[31]	Test-mae:106023
[32]	Test-mae:106056
[33]	Test-mae:106226
[34]	Test-mae:106145
[35]	Test-mae:106277
[36]	Test-mae:106252
[37]	Test-mae:106415
[38]	Test-mae:106399
[39]	Test-mae:106460
[40]	Test-mae:106145
[41]	Test-mae:106287
Stopping. Best iteration:
[31]	Test-mae:106023

Best MAE: 106022.81 with

In [14]:
# Get cross validation score with current params
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'mae'},
    early_stopping_rounds=10
)
cv_results

Unnamed: 0,train-mae-mean,train-mae-std,test-mae-mean,test-mae-std
0,389436.94375,1921.203125,390748.6,9920.454535
1,277874.91875,1501.409024,282379.25,9089.020237
2,203070.928125,1290.052596,212782.028125,7592.573464
3,154278.83125,1562.886175,170259.559375,6458.533707
4,123748.009375,1427.488828,145883.08125,5566.04156
5,104997.9125,1169.058721,131975.285937,5487.189468
6,93543.771875,1434.763486,124157.217187,5364.237533
7,86405.832812,1091.129784,120088.75,5563.06245
8,81662.909375,1028.8349,117912.8375,5723.502061
9,78191.96875,921.82068,116704.409375,5416.723706


In [15]:
cv_results['test-mae-mean'].min()

113072.4656252

In [16]:
# You can try wider intervals with a larger step between
# each value and then narrow it down. Here after several
# iteration I found that the optimal value was in the
# following ranges.
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(4,12)
    for min_child_weight in range(1,10)
]

In [17]:
# Define initial best params and MAE
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with max_depth=4, min_child_weight=1
	MAE 115247.0015628 for 81 rounds
CV with max_depth=4, min_child_weight=2
	MAE 115058.5796874 for 68 rounds
CV with max_depth=4, min_child_weight=3
	MAE 115358.2999998 for 49 rounds
CV with max_depth=4, min_child_weight=4
	MAE 116028.3390624 for 60 rounds
CV with max_depth=4, min_child_weight=5
	MAE 115177.04375 for 52 rounds
CV with max_depth=4, min_child_weight=6
	MAE 118011.1531254 for 28 rounds
CV with max_depth=4, min_child_weight=7
	MAE 116277.3625 for 65 rounds
CV with max_depth=4, min_child_weight=8
	MAE 115077.0609376 for 63 rounds
CV with max_depth=4, min_child_weight=9
	MAE 117279.5124998 for 66 rounds
CV with max_depth=5, min_child_weight=1
	MAE 113077.6234376 for 32 rounds
CV with max_depth=5, min_child_weight=2
	MAE 112978.7906252 for 30 rounds
CV with max_depth=5, min_child_weight=3
	MAE 115124.6609376 for 43 rounds
CV with max_depth=5, min_child_weight=4
	MAE 114529.6875 for 39 rounds
CV with max_depth=5, min_child_weight=5
	MAE 1

In [18]:
# Update parameters with best found parameters
params['max_depth'] = 11
params['min_child_weight'] = 8

In [19]:
# Tuning 'subsample' and 'colsample_bytree' parameters
# Create list of possible params
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [20]:
min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with subsample=1.0, colsample=1.0
	MAE 110651.4812498 for 10 rounds
CV with subsample=1.0, colsample=0.9
	MAE 112758.3328124 for 10 rounds
CV with subsample=1.0, colsample=0.8
	MAE 114558.8156248 for 10 rounds
CV with subsample=1.0, colsample=0.7
	MAE 115659.2468754 for 10 rounds
CV with subsample=0.9, colsample=1.0
	MAE 114113.3484376 for 13 rounds
CV with subsample=0.9, colsample=0.9
	MAE 112887.915625 for 10 rounds
CV with subsample=0.9, colsample=0.8
	MAE 116811.8687504 for 11 rounds
CV with subsample=0.9, colsample=0.7
	MAE 115325.984375 for 12 rounds
CV with subsample=0.8, colsample=1.0
	MAE 113114.7000002 for 13 rounds
CV with subsample=0.8, colsample=0.9
	MAE 114355.378125 for 13 rounds
CV with subsample=0.8, colsample=0.8
	MAE 117632.6453124 for 13 rounds
CV with subsample=0.8, colsample=0.7
	MAE 116756.0609376 for 13 rounds
CV with subsample=0.7, colsample=1.0
	MAE 113617.7937498 for 12 rounds
CV with subsample=0.7, colsample=0.9
	MAE 114126.2078128 for 12 rounds
CV with s

In [21]:
# Update params dictionary
params['subsample'] = 1.0
params['colsample_bytree'] = 1.0

In [22]:
min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    cv_results = xgb.cv(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            seed=42,
            nfold=5,
            metrics=['mae'],
            early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))

CV with eta=0.3
	MAE 110651.4781252 for 10 rounds

CV with eta=0.2
	MAE 110093.86874979999 for 18 rounds

CV with eta=0.1
	MAE 108928.3296876 for 37 rounds

CV with eta=0.05
	MAE 107974.1359374 for 78 rounds

CV with eta=0.01
	MAE 107952.53125 for 399 rounds

CV with eta=0.005
	MAE 108024.8484372 for 821 rounds

Best params: 0.01, MAE: 107952.53125


In [23]:
# Update parameters dictionary
params['eta'] = .01

In [24]:
params

{'max_depth': 11,
 'min_child_weight': 8,
 'eta': 0.01,
 'subsample': 1.0,
 'colsample_bytree': 1.0,
 'objective': 'reg:squarederror',
 'eval_metric': 'mae'}

In [25]:
# Train the model with the tuned parameters and use test data
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

print("Best MAE: {:.2f} in {} rounds".format(model.best_score, model.best_iteration+1))

[0]	Test-mae:539524
Will train until Test-mae hasn't improved in 10 rounds.
[1]	Test-mae:534229
[2]	Test-mae:529012
[3]	Test-mae:523848
[4]	Test-mae:518734
[5]	Test-mae:513667
[6]	Test-mae:508623
[7]	Test-mae:503610
[8]	Test-mae:498689
[9]	Test-mae:493743
[10]	Test-mae:488938
[11]	Test-mae:484132
[12]	Test-mae:479392
[13]	Test-mae:474683
[14]	Test-mae:470068
[15]	Test-mae:465453
[16]	Test-mae:460948
[17]	Test-mae:456422
[18]	Test-mae:451970
[19]	Test-mae:447590
[20]	Test-mae:443233
[21]	Test-mae:438897
[22]	Test-mae:434630
[23]	Test-mae:430388
[24]	Test-mae:426200
[25]	Test-mae:421968
[26]	Test-mae:417880
[27]	Test-mae:413721
[28]	Test-mae:409653
[29]	Test-mae:405676
[30]	Test-mae:401704
[31]	Test-mae:397755
[32]	Test-mae:393870
[33]	Test-mae:389961
[34]	Test-mae:386185
[35]	Test-mae:382442
[36]	Test-mae:378653
[37]	Test-mae:374971
[38]	Test-mae:371357
[39]	Test-mae:367722
[40]	Test-mae:364181
[41]	Test-mae:360653
[42]	Test-mae:357159
[43]	Test-mae:353729
[44]	Test-mae:350288
[45]	Test

Best MAE: 99564.87 in 361 rounds


In [26]:
# Save model with best parameters
num_boost_round = model.best_iteration + 1
best_model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")]
)

[0]	Test-mae:539524
[1]	Test-mae:534229
[2]	Test-mae:529012
[3]	Test-mae:523848
[4]	Test-mae:518735
[5]	Test-mae:513667
[6]	Test-mae:508623
[7]	Test-mae:503610
[8]	Test-mae:498689
[9]	Test-mae:493743
[10]	Test-mae:488938
[11]	Test-mae:484132
[12]	Test-mae:479393
[13]	Test-mae:474683
[14]	Test-mae:470068
[15]	Test-mae:465453
[16]	Test-mae:460948
[17]	Test-mae:456422
[18]	Test-mae:451970
[19]	Test-mae:447590
[20]	Test-mae:443233
[21]	Test-mae:438897
[22]	Test-mae:434630
[23]	Test-mae:430388
[24]	Test-mae:426200
[25]	Test-mae:421968
[26]	Test-mae:417880
[27]	Test-mae:413721
[28]	Test-mae:409653
[29]	Test-mae:405676
[30]	Test-mae:401704
[31]	Test-mae:397755
[32]	Test-mae:393870
[33]	Test-mae:389961
[34]	Test-mae:386185
[35]	Test-mae:382442
[36]	Test-mae:378653
[37]	Test-mae:374971
[38]	Test-mae:371357
[39]	Test-mae:367722
[40]	Test-mae:364181
[41]	Test-mae:360653
[42]	Test-mae:357159
[43]	Test-mae:353729
[44]	Test-mae:350288
[45]	Test-mae:346937
[46]	Test-mae:343653
[47]	Test-mae:340479
[4

In [27]:
# Create prediction
y_pred = best_model.predict(dtest)

In [28]:
# Save model
best_model.save_model("xgboost_optimal_2020.model")

In [29]:
# Calculate R squared and Adjusted R Square
import statsmodels.api as sm
result = sm.OLS(y_pred, y_test).fit()
print(result.rsquared, result.rsquared_adj)

0.8954463408211364 0.8953690082329864


In [30]:
# Calculate Mean Squared Error
from sklearn.metrics import mean_squared_error, mean_squared_log_error
import math
print(mean_squared_error(y_test, y_pred))
print(math.sqrt(mean_squared_error(y_test, y_pred)))
print(mean_squared_log_error(y_test, y_pred))

49667940026.47127
222863.05217884653
0.06019869223871155


In [31]:
# Calculate Mean Absolute Error(MAE)
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test, y_pred))

99564.86546101257


In [None]:
import matplotlib.pyplot as plt

# extra step to allow graphviz to be found 
import os
os.environ["PATH"] += os.pathsep + 'C:/Users/danny/.conda/envs/mlenv/lib/site-packages/graphviz'

xgb.plot_tree(model, num_trees=0)
plt.rcParams['figure.figsize'] = [50, 10]
plt.show()

In [None]:
xgb.plot_importance(best_model)
plt.rcParams['figure.figsize'] = [5, 5]
plt.show()

In [None]:
# # Code to load model for other datasets:
# loaded_model = xgb.Booster()
# loaded_model.load_model("my_model.model")
# # And use it for predictions.
# loaded_model.predict(dtest)