In [1]:
import joblib
import pandas as pd

from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split



In [2]:
# Load the housing data set from the CSV file in the local data folder
df = pd.read_csv(filepath_or_buffer="./data/ml_house_data_set_updated.csv")

In [3]:
# Remove the fields from the data set that we don't want to include in our model.
# Rebinding df via drop(..., errors='ignore') instead of `del df[...]` keeps this
# cell re-runnable: executing it again after the columns are already gone is a
# no-op rather than a KeyError.
df = df.drop(
    columns=['house_number', 'unit_number', 'street_name', 'zip_code'],
    errors='ignore',
)

In [4]:
# Expand the categorical columns (garage_type, city) into one-hot indicator columns
features_df = pd.get_dummies(
    data=df,
    columns=['garage_type', 'city'],
)

In [5]:
# Remove the target column from the feature frame so the model cannot see it.
# drop(..., errors='ignore') makes the cell safe to re-run; the original
# `del features_df['sale_price']` raised a KeyError on a second execution.
features_df = features_df.drop(columns=['sale_price'], errors='ignore')

In [6]:
# X is the feature frame; y is the sale_price column of the loaded data set
X = features_df
y = df['sale_price']

# Hold out 30% of the rows for testing. A fixed random_state makes the shuffle
# deterministic, so the split is identical after every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [7]:
# Configure the gradient boosting regressor.
# max_features=0.1 means only a random fraction of the features is considered
# at each split, so training is stochastic; random_state pins that randomness
# and makes the fitted model reproducible between runs.
model = ensemble.GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=6,
    min_samples_leaf=9,
    max_features=0.1,
    loss='huber',
    random_state=42,
)

# Train on the training split only; the test split stays unseen for evaluation
model.fit(X_train, y_train)

In [8]:
# Persist the fitted model to disk so it can be loaded later without retraining.
# (joblib is imported with the other dependencies in the first cell.)
# NOTE(review): the file name says "classifier" although the model is a
# regressor; it is kept unchanged because other code may load this exact path.
joblib.dump(model, 'trained_house_classifier_model.pkl')

['trained_house_classifier_model.pkl']

In [9]:
# Mean absolute error on the training set (same units as sale_price).
# The variable is named train_mae because the metric is MAE, not MSE.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % train_mae)

Training Set Mean Absolute Error: 52383.3066


In [10]:
# Mean absolute error on the held-out test set (same units as sale_price).
# The variable is named test_mae because the metric is MAE, not MSE.
test_mae = mean_absolute_error(y_test, model.predict(X_test))
print("Test Set Mean Absolute Error: %.4f" % test_mae)

Test Set Mean Absolute Error: 61372.8658
