In [2]:
# Import dependencies
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf
import numpy as np
from scipy import stats
from scipy.stats import norm
import seaborn as sns

In [3]:
# Import data
df = pd.read_csv(Path('../resources/regressiondata.csv'))
df.head()

Unnamed: 0.1,Unnamed: 0,zpid,zipcode,latitude,longitude,propertyTaxRate,garageSpaces,hasAssociation,hasCooling,hasGarage,...,homeType_Apartment,homeType_Condo,homeType_Mobile / Manufactured,homeType_MultiFamily,homeType_Multiple Occupancy,homeType_Other,homeType_Residential,homeType_Single Family,homeType_Townhouse,homeType_Vacant Land
0,0,111373431,78660,30.430632,-97.663078,1.98,2,1,1,1,...,0,0,0,0,0,0,0,1,0,0
1,1,120900430,78660,30.432673,-97.661697,1.98,2,1,1,1,...,0,0,0,0,0,0,0,1,0,0
2,2,2084491383,78660,30.409748,-97.639771,1.98,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0
3,3,120901374,78660,30.432112,-97.661659,1.98,2,1,1,1,...,0,0,0,0,0,0,0,1,0,0
4,4,60134862,78660,30.437368,-97.65686,1.98,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
df.columns

Index(['Unnamed: 0', 'zpid', 'zipcode', 'latitude', 'longitude',
       'propertyTaxRate', 'garageSpaces', 'hasAssociation', 'hasCooling',
       'hasGarage', 'hasHeating', 'hasSpa', 'hasView', 'parkingSpaces',
       'yearBuilt', 'latestPrice', 'numPriceChanges', 'latest_saledate',
       'latest_salemonth', 'latest_saleyear', 'numOfPhotos',
       'numOfAccessibilityFeatures', 'numOfAppliances', 'numOfParkingFeatures',
       'numOfPatioAndPorchFeatures', 'numOfSecurityFeatures',
       'numOfWaterfrontFeatures', 'numOfWindowFeatures',
       'numOfCommunityFeatures', 'lotSizeSqFt', 'livingAreaSqFt',
       'numOfPrimarySchools', 'numOfElementarySchools', 'numOfMiddleSchools',
       'numOfHighSchools', 'avgSchoolDistance', 'avgSchoolRating',
       'avgSchoolSize', 'MedianStudentsPerTeacher', 'numOfBathrooms',
       'numOfBedrooms', 'numOfStories', 'city_austin', 'city_del valle',
       'city_driftwood', 'city_dripping springs', 'city_manchaca',
       'city_manor', 'city_pflugerv

In [5]:
df.dtypes

Unnamed: 0                  int64
zpid                        int64
zipcode                     int64
latitude                  float64
longitude                 float64
                           ...   
homeType_Other              int64
homeType_Residential        int64
homeType_Single Family      int64
homeType_Townhouse          int64
homeType_Vacant Land        int64
Length: 61, dtype: object

In [6]:
df.isnull().sum()

Unnamed: 0                0
zpid                      0
zipcode                   0
latitude                  0
longitude                 0
                         ..
homeType_Other            0
homeType_Residential      0
homeType_Single Family    0
homeType_Townhouse        0
homeType_Vacant Land      0
Length: 61, dtype: int64

In [7]:
# Save 'zpid'
id_df = df['zpid']

# Drop 'zpid' column
df.drop("zpid", axis = 1, inplace = True)
df.drop("latest_saledate", axis = 1, inplace = True)

# Check data size after dropping the 'Id' variable
print("\nData size: {} ".format(df.shape)) 


Data size: (15171, 59) 


In [8]:
# Create features and target
y = df["latestPrice"]
X = df.drop(columns=['latestPrice'])

In [9]:
# Split data into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train.shape

(11378, 58)

In [10]:
# Create DMatrices

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [13]:
from sklearn.metrics import mean_absolute_error
import numpy as np

# "Learn" the mean from the training data
mean_train = np.mean(y_train)

# Get predictions on the test set
baseline_predictions = np.ones(y_test.shape) * mean_train

# Compute MAE
mae_baseline = mean_absolute_error(y_test, baseline_predictions)
print("Baseline MAE is {:.2f}".format(mae_baseline))

Baseline MAE is 240070.42


In [34]:
# Create parameters dictionary
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'reg:squarederror',
}

In [35]:
params['eval_metric'] = "mae"

num_boost_round = 999

In [36]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

print("Best MAE: {:.2f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))

[0]	Test-mae:371802
Will train until Test-mae hasn't improved in 10 rounds.
[1]	Test-mae:268838
[2]	Test-mae:203009
[3]	Test-mae:160872
[4]	Test-mae:135725
[5]	Test-mae:122136
[6]	Test-mae:115514
[7]	Test-mae:110654
[8]	Test-mae:108049
[9]	Test-mae:107020
[10]	Test-mae:104511
[11]	Test-mae:103698
[12]	Test-mae:103037
[13]	Test-mae:102574
[14]	Test-mae:102304
[15]	Test-mae:101277
[16]	Test-mae:101317
[17]	Test-mae:100788
[18]	Test-mae:101055
[19]	Test-mae:100115
[20]	Test-mae:99930.2
[21]	Test-mae:99685.7
[22]	Test-mae:99054.4
[23]	Test-mae:99138.3
[24]	Test-mae:99013.4
[25]	Test-mae:98618.7
[26]	Test-mae:98556.2
[27]	Test-mae:98478.4
[28]	Test-mae:98488.6
[29]	Test-mae:98262.2
[30]	Test-mae:97792.7
[31]	Test-mae:97801.1
[32]	Test-mae:97781.5
[33]	Test-mae:97510.8
[34]	Test-mae:97379.4
[35]	Test-mae:97334.6
[36]	Test-mae:97123.7
[37]	Test-mae:96989.1
[38]	Test-mae:96866.6
[39]	Test-mae:96873.8
[40]	Test-mae:96841.8
[41]	Test-mae:96841.5
[42]	Test-mae:96724.1
[43]	Test-mae:96763.6
[44]	T

In [37]:
# Get cross validation score with current params
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'mae'},
    early_stopping_rounds=10
)
cv_results

Unnamed: 0,train-mae-mean,train-mae-std,test-mae-mean,test-mae-std
0,358610.325000,1707.238965,359592.568750,7811.389060
1,255523.503125,1271.582085,258328.128125,7006.336147
2,187628.384375,1207.269649,193490.743750,6573.368819
3,144842.118750,929.494355,154674.618750,5344.360701
4,119046.017187,1034.061794,131827.896875,4904.250330
...,...,...,...,...
91,41196.150781,694.527787,95722.159375,4604.913034
92,40958.022656,710.583519,95717.184375,4576.112625
93,40787.506250,756.763426,95718.754688,4562.247619
94,40485.134375,729.145007,95732.106250,4608.051938


In [38]:
cv_results['test-mae-mean'].min()

95708.3515622

In [39]:
# You can try wider intervals with a larger step between
# each value and then narrow it down. Here after several
# iteration I found that the optimal value was in the
# following ranges.
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(4,12)
    for min_child_weight in range(1,10)
]

In [40]:
# Define initial best params and MAE
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with max_depth=4, min_child_weight=1
	MAE 96424.3203124 for 119 rounds
CV with max_depth=4, min_child_weight=2
	MAE 96391.4609376 for 131 rounds
CV with max_depth=4, min_child_weight=3
	MAE 98744.8171874 for 101 rounds
CV with max_depth=4, min_child_weight=4
	MAE 99430.6187502 for 59 rounds
CV with max_depth=4, min_child_weight=5
	MAE 98179.7906252 for 169 rounds
CV with max_depth=4, min_child_weight=6
	MAE 98784.07656280001 for 144 rounds
CV with max_depth=4, min_child_weight=7
	MAE 98475.4140626 for 132 rounds
CV with max_depth=4, min_child_weight=8
	MAE 99025.3421874 for 104 rounds
CV with max_depth=4, min_child_weight=9
	MAE 98789.96249979999 for 144 rounds
CV with max_depth=5, min_child_weight=1
	MAE 95734.9296876 for 156 rounds
CV with max_depth=5, min_child_weight=2
	MAE 95970.5671876 for 94 rounds
CV with max_depth=5, min_child_weight=3
	MAE 99001.1484374 for 45 rounds
CV with max_depth=5, min_child_weight=4
	MAE 98770.2984376 for 81 rounds
CV with max_depth=5, min_child_wei

In [41]:
# Update parameters with best found parameters
params['max_depth'] = 6
params['min_child_weight'] = 1

In [42]:
# Tuning 'subsample' and 'colsample_bytree' parameters
# Create list of possible params
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [43]:
min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with subsample=1.0, colsample=1.0
	MAE 95708.35312479999 for 95 rounds
CV with subsample=1.0, colsample=0.9
	MAE 95012.9187498 for 49 rounds
CV with subsample=1.0, colsample=0.8
	MAE 95078.2625 for 98 rounds
CV with subsample=1.0, colsample=0.7
	MAE 97832.4484376 for 67 rounds
CV with subsample=0.9, colsample=1.0
	MAE 96267.5937502 for 69 rounds
CV with subsample=0.9, colsample=0.9
	MAE 97180.0859376 for 42 rounds
CV with subsample=0.9, colsample=0.8
	MAE 97104.3625 for 68 rounds
CV with subsample=0.9, colsample=0.7
	MAE 96206.1968752 for 96 rounds
CV with subsample=0.8, colsample=1.0
	MAE 98440.3359376 for 64 rounds
CV with subsample=0.8, colsample=0.9
	MAE 97914.2078126 for 58 rounds
CV with subsample=0.8, colsample=0.8
	MAE 99066.8109376 for 50 rounds
CV with subsample=0.8, colsample=0.7
	MAE 98151.4031252 for 70 rounds
CV with subsample=0.7, colsample=1.0
	MAE 100677.95 for 45 rounds
CV with subsample=0.7, colsample=0.9
	MAE 99133.7234372 for 73 rounds
CV with subsample=0.7, col

In [33]:
# Update params dictionary
params['subsample'] = 1.0
params['colsample_bytree'] = 0.9

In [49]:
min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    cv_results = xgb.cv(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            seed=42,
            nfold=5,
            metrics=['mae'],
            early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))

CV with eta=0.3
	MAE 101182.45625 for 59 rounds

CV with eta=0.2
	MAE 95436.4203126 for 104 rounds

CV with eta=0.1
	MAE 91620.69375020001 for 248 rounds

CV with eta=0.05
	MAE 89973.8687502 for 602 rounds

CV with eta=0.01
	MAE 90535.7859374 for 998 rounds

CV with eta=0.005
	MAE 94873.246875 for 998 rounds

Best params: 0.05, MAE: 89973.8687502


In [50]:
# Update parameters dictionary
params['eta'] = .05

In [51]:
params

{'max_depth': 6,
 'min_child_weight': 1,
 'eta': 0.05,
 'subsample': 0.7,
 'colsample_bytree': 0.7,
 'objective': 'reg:squarederror',
 'eval_metric': 'mae'}

In [55]:
# Train the model with the tuned parameters and use test data
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

print("Best MAE: {:.2f} in {} rounds".format(model.best_score, model.best_iteration+1))

[0]	Test-mae:499657
Will train until Test-mae hasn't improved in 10 rounds.
[1]	Test-mae:475327
[2]	Test-mae:452366
[3]	Test-mae:430500
[4]	Test-mae:409607
[5]	Test-mae:389680
[6]	Test-mae:371367
[7]	Test-mae:353558
[8]	Test-mae:336810
[9]	Test-mae:320925
[10]	Test-mae:305914
[11]	Test-mae:291957
[12]	Test-mae:278973
[13]	Test-mae:266606
[14]	Test-mae:254978
[15]	Test-mae:243957
[16]	Test-mae:233571
[17]	Test-mae:223879
[18]	Test-mae:214773
[19]	Test-mae:206400
[20]	Test-mae:198475
[21]	Test-mae:190944
[22]	Test-mae:184038
[23]	Test-mae:177532
[24]	Test-mae:171641
[25]	Test-mae:165971
[26]	Test-mae:160715
[27]	Test-mae:155811
[28]	Test-mae:151256
[29]	Test-mae:147138
[30]	Test-mae:143425
[31]	Test-mae:139988
[32]	Test-mae:136805
[33]	Test-mae:133998
[34]	Test-mae:131235
[35]	Test-mae:128767
[36]	Test-mae:126146
[37]	Test-mae:124040
[38]	Test-mae:121991
[39]	Test-mae:120240
[40]	Test-mae:118495
[41]	Test-mae:117326
[42]	Test-mae:115908
[43]	Test-mae:114686
[44]	Test-mae:113610
[45]	Test

In [56]:
# Save model with best parameters
num_boost_round = model.best_iteration + 1
best_model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")]
)

[0]	Test-mae:499657
[1]	Test-mae:475327
[2]	Test-mae:452366
[3]	Test-mae:430500
[4]	Test-mae:409607
[5]	Test-mae:389680
[6]	Test-mae:371367
[7]	Test-mae:353558
[8]	Test-mae:336810
[9]	Test-mae:320925
[10]	Test-mae:305914
[11]	Test-mae:291957
[12]	Test-mae:278973
[13]	Test-mae:266606
[14]	Test-mae:254978
[15]	Test-mae:243957
[16]	Test-mae:233571
[17]	Test-mae:223879
[18]	Test-mae:214773
[19]	Test-mae:206400
[20]	Test-mae:198475
[21]	Test-mae:190944
[22]	Test-mae:184038
[23]	Test-mae:177532
[24]	Test-mae:171641
[25]	Test-mae:165971
[26]	Test-mae:160715
[27]	Test-mae:155811
[28]	Test-mae:151256
[29]	Test-mae:147138
[30]	Test-mae:143425
[31]	Test-mae:139988
[32]	Test-mae:136805
[33]	Test-mae:133998
[34]	Test-mae:131235
[35]	Test-mae:128767
[36]	Test-mae:126146
[37]	Test-mae:124040
[38]	Test-mae:121991
[39]	Test-mae:120240
[40]	Test-mae:118495
[41]	Test-mae:117326
[42]	Test-mae:115908
[43]	Test-mae:114686
[44]	Test-mae:113610
[45]	Test-mae:112641
[46]	Test-mae:111698
[47]	Test-mae:110788
[4

In [60]:
# Create prediction
y_pred = best_model.predict(dtest)

In [59]:
# Save model
best_model.save_model("xgboost_optimal.model")

In [61]:
# Calculate R squared and Adjusted R Square
import statsmodels.api as sm
result = sm.OLS(y_pred, y_test).fit()
print(result.rsquared, result.rsquared_adj)

0.8758082482435438 0.8757754972541565


In [62]:
# Calculate Mean Squared Error
from sklearn.metrics import mean_squared_error, mean_squared_log_error
import math
print(mean_squared_error(y_test, y_pred))
print(math.sqrt(mean_squared_error(y_test, y_pred)))
print(mean_squared_log_error(y_test, y_pred))

61564188340.977
248121.3177882485
0.06847117233998666


In [63]:
# Calculate Mean Absolute Error(MAE)
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test, y_pred))

89670.59285196415


In [None]:
# # Code to load model for other datasets:
# loaded_model = xgb.Booster()
# loaded_model.load_model("my_model.model")
# # And use it for predictions.
# loaded_model.predict(dtest)