In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf
import numpy as np
from scipy import stats
from scipy.stats import norm
import seaborn as sns

In [2]:
# import SQL dependencies
from sqlalchemy import create_engine
import psycopg2
from pandas.io import sql

from config import db_password

db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/austin_housing"
engine = create_engine(db_string)

In [3]:
#pull 2020 data from pgAdmin

df = pd.read_sql('regressionzip_2020', db_string)
df.head()

Unnamed: 0,zpid,latestPrice,numOfBathrooms,livingAreaSqFt,numOfBedrooms,avgSchoolRating,numOfStories,MedianStudentsPerTeacher,numOfHighSchools,longitude,...,lotSizeSqFt,numOfPhotos,numPriceChanges,zipcode,yearBuilt,propertyTaxRate,latest_saledate,latest_salemonth,latest_saleyear,averagePrice
0,120900430,295000.0,2.0,1768.0,4,2.666667,1,14,1,-97.661697,...,6185.0,29,1,78660,2013,1.98,2020-10-13,10,2020,303125.25
1,2080105342,309045.0,2.0,1446.0,3,4.0,1,14,1,-97.656181,...,5161.0,2,2,78660,2020,1.98,2020-08-05,8,2020,303125.25
2,241932337,315000.0,3.0,2432.0,4,3.666667,2,12,1,-97.643394,...,12196.8,36,2,78660,2016,1.98,2020-06-11,6,2020,303125.25
3,241932327,279900.0,2.0,1580.0,3,3.666667,1,12,1,-97.643288,...,5401.0,32,2,78660,2016,1.98,2020-08-28,8,2020,303125.25
4,69808966,239900.0,2.0,1762.0,4,3.333333,1,14,1,-97.623436,...,6011.0,28,9,78617,2005,1.98,2020-09-05,9,2020,198750.391304


In [4]:
# Import data
# df = pd.read_csv(Path('../resources/regressiondata_2020.csv'))
# df.head()

In [5]:
df.columns

Index(['zpid', 'latestPrice', 'numOfBathrooms', 'livingAreaSqFt',
       'numOfBedrooms', 'avgSchoolRating', 'numOfStories',
       'MedianStudentsPerTeacher', 'numOfHighSchools', 'longitude',
       'numOfPrimarySchools', 'avgSchoolDistance', 'latitude', 'lotSizeSqFt',
       'numOfPhotos', 'numPriceChanges', 'zipcode', 'yearBuilt',
       'propertyTaxRate', 'latest_saledate', 'latest_salemonth',
       'latest_saleyear', 'averagePrice'],
      dtype='object')

In [6]:
# Save 'zpid'
id_df = df['zpid']

# Drop 'zpid' column for CSV
# df.drop(["zpid","latest_saledate","Unnamed: 0",'latest_salemonth','latest_saleyear'], axis = 1, inplace = True)

# Drop 'zpid' column for SQL
df.drop(["zpid","latest_saledate",'latest_salemonth','latest_saleyear'], axis = 1, inplace = True)

# Check data size after dropping the 'Id' variable
print("\nData size: {} ".format(df.shape)) 


Data size: (5408, 19) 


In [7]:
# Create features and target
y = df["latestPrice"]
X = df.drop(columns=['latestPrice'])

In [8]:
# Split data into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train.shape

(4056, 18)

In [9]:
# Create DMatrices

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [10]:
from sklearn.metrics import mean_absolute_error
import numpy as np

# "Learn" the mean from the training data
mean_train = np.mean(y_train)

# Get predictions on the test set
baseline_predictions = np.ones(y_test.shape) * mean_train

# Compute MAE
mae_baseline = mean_absolute_error(y_test, baseline_predictions)
print("Baseline MAE is {:.2f}".format(mae_baseline))

Baseline MAE is 249930.58


In [11]:
# Create parameters dictionary
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'reg:squarederror',
}

In [12]:
params['eval_metric'] = "mae"

num_boost_round = 999

In [13]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

print("Best MAE: {:.2f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))

[0]	Test-mae:384410
Will train until Test-mae hasn't improved in 10 rounds.
[1]	Test-mae:277526
[2]	Test-mae:207377
[3]	Test-mae:166760
[4]	Test-mae:142971
[5]	Test-mae:129057
[6]	Test-mae:120791
[7]	Test-mae:116987
[8]	Test-mae:114026
[9]	Test-mae:114196
[10]	Test-mae:112931
[11]	Test-mae:112096
[12]	Test-mae:111731
[13]	Test-mae:111357
[14]	Test-mae:110997
[15]	Test-mae:110803
[16]	Test-mae:110617
[17]	Test-mae:110579
[18]	Test-mae:110283
[19]	Test-mae:109961
[20]	Test-mae:110126
[21]	Test-mae:109979
[22]	Test-mae:110053
[23]	Test-mae:110085
[24]	Test-mae:109996
[25]	Test-mae:110002
[26]	Test-mae:110006
[27]	Test-mae:109762
[28]	Test-mae:109612
[29]	Test-mae:109358
[30]	Test-mae:109124
[31]	Test-mae:108930
[32]	Test-mae:109115
[33]	Test-mae:109482
[34]	Test-mae:109448
[35]	Test-mae:109564
[36]	Test-mae:109531
[37]	Test-mae:109518
[38]	Test-mae:109751
[39]	Test-mae:109815
[40]	Test-mae:109872
[41]	Test-mae:109491
Stopping. Best iteration:
[31]	Test-mae:108930

Best MAE: 108929.79 with

In [14]:
# Get cross validation score with current params
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'mae'},
    early_stopping_rounds=10
)
cv_results

Unnamed: 0,train-mae-mean,train-mae-std,test-mae-mean,test-mae-std
0,389378.9625,2143.697696,391467.575,12464.652105
1,277606.7,1550.447046,282213.20625,11418.659801
2,202703.196875,1009.789931,211372.9875,9365.078289
3,153949.465625,556.743834,168403.184375,8145.93781
4,123216.821875,256.615107,143018.453125,8449.572745
5,104048.370313,410.683178,128264.2625,7483.883347
6,92640.760937,795.527612,121012.94375,6475.507619
7,85619.914062,1188.072354,118011.7625,6006.335735
8,81038.31875,1309.177185,115693.14375,6105.455149
9,77235.867187,1492.767709,114215.935937,6205.700045


In [15]:
cv_results['test-mae-mean'].min()

109532.6578126

In [16]:
# You can try wider intervals with a larger step between
# each value and then narrow it down. Here after several
# iteration I found that the optimal value was in the
# following ranges.
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(4,12)
    for min_child_weight in range(1,10)
]

In [17]:
# Define initial best params and MAE
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with max_depth=4, min_child_weight=1
	MAE 109232.6875 for 81 rounds
CV with max_depth=4, min_child_weight=2
	MAE 112063.5515628 for 65 rounds
CV with max_depth=4, min_child_weight=3
	MAE 112420.64375 for 50 rounds
CV with max_depth=4, min_child_weight=4
	MAE 112191.1390624 for 38 rounds
CV with max_depth=4, min_child_weight=5
	MAE 112432.9093752 for 43 rounds
CV with max_depth=4, min_child_weight=6
	MAE 113804.5171874 for 49 rounds
CV with max_depth=4, min_child_weight=7
	MAE 113843.925 for 37 rounds
CV with max_depth=4, min_child_weight=8
	MAE 112949.2984376 for 54 rounds
CV with max_depth=4, min_child_weight=9
	MAE 113793.7093748 for 73 rounds
CV with max_depth=5, min_child_weight=1
	MAE 109091.0359376 for 50 rounds
CV with max_depth=5, min_child_weight=2
	MAE 110930.4218746 for 54 rounds
CV with max_depth=5, min_child_weight=3
	MAE 110268.82968719999 for 46 rounds
CV with max_depth=5, min_child_weight=4
	MAE 113385.2031246 for 39 rounds
CV with max_depth=5, min_child_weight=5
	MA

In [18]:
# Update parameters with best found parameters
params['max_depth'] = 11
params['min_child_weight'] = 9

In [19]:
# Tuning 'subsample' and 'colsample_bytree' parameters
# Create list of possible params
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [20]:
min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with subsample=1.0, colsample=1.0
	MAE 108575.3703124 for 11 rounds
CV with subsample=1.0, colsample=0.9
	MAE 110579.5453126 for 10 rounds
CV with subsample=1.0, colsample=0.8
	MAE 114294.63125 for 9 rounds
CV with subsample=1.0, colsample=0.7
	MAE 113844.4875002 for 9 rounds
CV with subsample=0.9, colsample=1.0
	MAE 110024.6125 for 11 rounds
CV with subsample=0.9, colsample=0.9
	MAE 110079.0578126 for 11 rounds
CV with subsample=0.9, colsample=0.8
	MAE 114867.4687498 for 11 rounds
CV with subsample=0.9, colsample=0.7
	MAE 113576.3796876 for 10 rounds
CV with subsample=0.8, colsample=1.0
	MAE 110223.0734376 for 10 rounds
CV with subsample=0.8, colsample=0.9
	MAE 109768.0687498 for 10 rounds
CV with subsample=0.8, colsample=0.8
	MAE 110960.4343746 for 11 rounds
CV with subsample=0.8, colsample=0.7
	MAE 113558.5906248 for 9 rounds
CV with subsample=0.7, colsample=1.0
	MAE 110565.5468752 for 10 rounds
CV with subsample=0.7, colsample=0.9
	MAE 111232.8093748 for 9 rounds
CV with subsamp

In [21]:
# Update params dictionary
params['subsample'] = 1.0
params['colsample_bytree'] = 1.0

In [22]:
min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    cv_results = xgb.cv(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            seed=42,
            nfold=5,
            metrics=['mae'],
            early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))

CV with eta=0.3
	MAE 108575.3703124 for 11 rounds

CV with eta=0.2
	MAE 108778.7812496 for 18 rounds

CV with eta=0.1
	MAE 106174.81093780001 for 39 rounds

CV with eta=0.05
	MAE 105950.6015626 for 85 rounds

CV with eta=0.01
	MAE 105987.56875020001 for 412 rounds

CV with eta=0.005
	MAE 106266.52499979999 for 843 rounds

Best params: 0.05, MAE: 105950.6015626


In [23]:
# Update parameters dictionary
params['eta'] = .05

In [24]:
params

{'max_depth': 11,
 'min_child_weight': 9,
 'eta': 0.05,
 'subsample': 1.0,
 'colsample_bytree': 1.0,
 'objective': 'reg:squarederror',
 'eval_metric': 'mae'}

In [25]:
# Train the model with the tuned parameters and use test data
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

print("Best MAE: {:.2f} in {} rounds".format(model.best_score, model.best_iteration+1))

[0]	Test-mae:517493
Will train until Test-mae hasn't improved in 10 rounds.
[1]	Test-mae:492054
[2]	Test-mae:467775
[3]	Test-mae:445184
[4]	Test-mae:423530
[5]	Test-mae:402852
[6]	Test-mae:383571
[7]	Test-mae:364851
[8]	Test-mae:347346
[9]	Test-mae:330964
[10]	Test-mae:315542
[11]	Test-mae:300972
[12]	Test-mae:287399
[13]	Test-mae:274857
[14]	Test-mae:262431
[15]	Test-mae:250840
[16]	Test-mae:240127
[17]	Test-mae:230223
[18]	Test-mae:221043
[19]	Test-mae:212445
[20]	Test-mae:204247
[21]	Test-mae:196543
[22]	Test-mae:189563
[23]	Test-mae:183166
[24]	Test-mae:177113
[25]	Test-mae:171428
[26]	Test-mae:165986
[27]	Test-mae:160993
[28]	Test-mae:156266
[29]	Test-mae:151837
[30]	Test-mae:147665
[31]	Test-mae:143754
[32]	Test-mae:140115
[33]	Test-mae:136569
[34]	Test-mae:133473
[35]	Test-mae:130600
[36]	Test-mae:128000
[37]	Test-mae:125552
[38]	Test-mae:123185
[39]	Test-mae:120996
[40]	Test-mae:119416
[41]	Test-mae:117933
[42]	Test-mae:116524
[43]	Test-mae:115310
[44]	Test-mae:114135
[45]	Test

In [26]:
# Save model with best parameters
num_boost_round = model.best_iteration + 1
best_model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")]
)

[0]	Test-mae:517493
[1]	Test-mae:492054
[2]	Test-mae:467775
[3]	Test-mae:445184
[4]	Test-mae:423530
[5]	Test-mae:402852
[6]	Test-mae:383571
[7]	Test-mae:364851
[8]	Test-mae:347346
[9]	Test-mae:330964
[10]	Test-mae:315542
[11]	Test-mae:300972
[12]	Test-mae:287399
[13]	Test-mae:274857
[14]	Test-mae:262431
[15]	Test-mae:250840
[16]	Test-mae:240127
[17]	Test-mae:230223
[18]	Test-mae:221043
[19]	Test-mae:212445
[20]	Test-mae:204247
[21]	Test-mae:196543
[22]	Test-mae:189563
[23]	Test-mae:183166
[24]	Test-mae:177113
[25]	Test-mae:171428
[26]	Test-mae:165986
[27]	Test-mae:160993
[28]	Test-mae:156266
[29]	Test-mae:151837
[30]	Test-mae:147665
[31]	Test-mae:143754
[32]	Test-mae:140115
[33]	Test-mae:136569
[34]	Test-mae:133473
[35]	Test-mae:130600
[36]	Test-mae:128000
[37]	Test-mae:125552
[38]	Test-mae:123185
[39]	Test-mae:120996
[40]	Test-mae:119416
[41]	Test-mae:117933
[42]	Test-mae:116524
[43]	Test-mae:115310
[44]	Test-mae:114135
[45]	Test-mae:113052
[46]	Test-mae:112186
[47]	Test-mae:111240
[4

In [27]:
# Create prediction
y_pred = best_model.predict(dtest)

In [28]:
# Save model
best_model.save_model("xgboost_optimal_2020.model")

In [29]:
# Calculate R squared and Adjusted R Square
import statsmodels.api as sm
result = sm.OLS(y_pred, y_test).fit()
print(result.rsquared, result.rsquared_adj)

0.8795607376797697 0.8794715894471122


In [30]:
# Calculate Mean Squared Error
from sklearn.metrics import mean_squared_error, mean_squared_log_error
import math
print(mean_squared_error(y_test, y_pred))
print(math.sqrt(mean_squared_error(y_test, y_pred)))
print(mean_squared_log_error(y_test, y_pred))

59636412900.195786
244205.6774528303
0.06288311531430633


In [31]:
# Calculate Mean Absolute Error(MAE)
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test, y_pred))

104758.1213133321


In [None]:
xgb.plot_importance(best_model)
plt.rcParams['figure.figsize'] = [5, 5]
plt.show()

In [None]:
# # Code to load model for other datasets:
# loaded_model = xgb.Booster()
# loaded_model.load_model("my_model.model")
# # And use it for predictions.
# loaded_model.predict(dtest)