In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf
import numpy as np
from scipy import stats
from scipy.stats import norm
import seaborn as sns

In [2]:
# Import data
df = pd.read_csv(Path('../resources/regressiondata.csv'))
df.head()

Unnamed: 0.1,Unnamed: 0,zpid,zipcode,latitude,longitude,propertyTaxRate,garageSpaces,hasAssociation,hasCooling,hasGarage,...,homeType_Apartment,homeType_Condo,homeType_Mobile / Manufactured,homeType_MultiFamily,homeType_Multiple Occupancy,homeType_Other,homeType_Residential,homeType_Single Family,homeType_Townhouse,homeType_Vacant Land
0,0,111373431,78660,30.430632,-97.663078,1.98,2,1,1,1,...,0,0,0,0,0,0,0,1,0,0
1,1,120900430,78660,30.432673,-97.661697,1.98,2,1,1,1,...,0,0,0,0,0,0,0,1,0,0
2,2,2084491383,78660,30.409748,-97.639771,1.98,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0
3,3,120901374,78660,30.432112,-97.661659,1.98,2,1,1,1,...,0,0,0,0,0,0,0,1,0,0
4,4,60134862,78660,30.437368,-97.65686,1.98,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0


In [3]:
df.columns

Index(['Unnamed: 0', 'zpid', 'zipcode', 'latitude', 'longitude',
       'propertyTaxRate', 'garageSpaces', 'hasAssociation', 'hasCooling',
       'hasGarage', 'hasHeating', 'hasSpa', 'hasView', 'parkingSpaces',
       'yearBuilt', 'latestPrice', 'numPriceChanges', 'latest_saledate',
       'latest_salemonth', 'latest_saleyear', 'numOfPhotos',
       'numOfAccessibilityFeatures', 'numOfAppliances', 'numOfParkingFeatures',
       'numOfPatioAndPorchFeatures', 'numOfSecurityFeatures',
       'numOfWaterfrontFeatures', 'numOfWindowFeatures',
       'numOfCommunityFeatures', 'lotSizeSqFt', 'livingAreaSqFt',
       'numOfPrimarySchools', 'numOfElementarySchools', 'numOfMiddleSchools',
       'numOfHighSchools', 'avgSchoolDistance', 'avgSchoolRating',
       'avgSchoolSize', 'MedianStudentsPerTeacher', 'numOfBathrooms',
       'numOfBedrooms', 'numOfStories', 'city_austin', 'city_del valle',
       'city_driftwood', 'city_dripping springs', 'city_manchaca',
       'city_manor', 'city_pflugerv

In [4]:
df.dtypes

Unnamed: 0                  int64
zpid                        int64
zipcode                     int64
latitude                  float64
longitude                 float64
                           ...   
homeType_Other              int64
homeType_Residential        int64
homeType_Single Family      int64
homeType_Townhouse          int64
homeType_Vacant Land        int64
Length: 61, dtype: object

In [5]:
df.isnull().sum()

Unnamed: 0                0
zpid                      0
zipcode                   0
latitude                  0
longitude                 0
                         ..
homeType_Other            0
homeType_Residential      0
homeType_Single Family    0
homeType_Townhouse        0
homeType_Vacant Land      0
Length: 61, dtype: int64

In [6]:
# Save 'zpid'
id_df = df['zpid']

# Drop 'zpid' column
df.drop("zpid", axis = 1, inplace = True)
df.drop("latest_saledate", axis = 1, inplace = True)

# Check data size after dropping the 'Id' variable
print("\nData size: {} ".format(df.shape)) 


Data size: (15171, 59) 


In [7]:
# Create features and target
y = df["latestPrice"]
X = df.drop(columns=['latestPrice'])

In [8]:
# Split data into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train.shape

(11378, 58)

In [9]:
model_xgb = xgb.XGBRegressor(colsample_bytree=0.2, gamma=0.0, 
                             learning_rate=0.05, max_depth=6, 
                             min_child_weight=1.5, n_estimators=100,
                             reg_alpha=0.9, reg_lambda=0.6,
                             subsample=0.2,seed=42, silent=1,
                             random_state =7)

In [11]:
# Fitting the model
model_xgb = model_xgb.fit(X_train, y_train)

In [12]:
# Create prediction
y_pred = model_xgb.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,399509.2,439900.0
1,536282.2,560000.0
2,378341.6,450000.0
3,431315.8,375000.0
4,1482342.0,2100000.0
5,705700.2,775000.0
6,455681.6,433000.0
7,293935.3,279500.0
8,668182.2,575000.0
9,2787602.0,2199000.0


In [13]:
# Calculate R squared and Adjusted R Square
import statsmodels.api as sm
result = sm.OLS(y_pred, y_test).fit()
print(result.rsquared, result.rsquared_adj)

0.8522453872484721 0.8522064224244342


In [14]:
# Calculate Mean Squared Error
from sklearn.metrics import mean_squared_error, mean_squared_log_error
import math
print(mean_squared_error(y_test, y_pred))
print(math.sqrt(mean_squared_error(y_test, y_pred)))
print(mean_squared_log_error(y_test, y_pred))

74819280843.0606
273531.13322446606
0.09003668587549966


In [15]:
# Calculate Mean Absolute Error(MAE)
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test, y_pred))

109245.37396602293
