In [3]:
!pip install joblib




In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
import joblib

In [6]:
# Load the data set
df = pd.read_csv("ml_house_data_set.csv")

In [16]:
# Display the first five rows of the DataFrame
print(df.head())

   year_built  stories  num_bedrooms  full_bathrooms  half_bathrooms  \
0        1978        1             4               1               1   
1        1958        1             3               1               1   
2        2002        1             3               2               0   
3        2004        1             4               2               0   
4        2006        1             4               2               0   

   livable_sqft  total_sqft garage_type  garage_sqft  carport_sqft  \
0          1689        1859    attached          508             0   
1          1984        2002    attached          462             0   
2          1581        1578        none            0           625   
3          1829        2277    attached          479             0   
4          1580        1749    attached          430             0   

   has_fireplace  has_pool  has_central_heating  has_central_cooling  \
0           True     False                 True                 True   
1 

In [7]:
# Remove the fields from the data set that we don't want to include in our model
del df['house_number']
del df['unit_number']
del df['street_name']
del df['zip_code']

In [8]:
# Replace categorical data with one-hot encoded data
features_df = pd.get_dummies(df, columns=['garage_type', 'city'])

# Remove the sale price from the feature data
del features_df['sale_price']

In [10]:
# Create the X and y arrays
X = features_df.to_numpy()
y = df['sale_price'].to_numpy()

In [11]:
# Split the data set in a training set (70%) and a test set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [12]:
# Fit regression model
model = ensemble.GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=6,
    min_samples_leaf=9,
    max_features=0.1,
    loss='huber',
    random_state=0
)
model.fit(X_train, y_train)

In [13]:
# Save the trained model to a file so we can use it in other programs
joblib.dump(model, 'trained_house_classifier_model.pkl')

['trained_house_classifier_model.pkl']

In [14]:
# Find the error rate on the training set
mse = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % mse)

Training Set Mean Absolute Error: 48727.0015


In [15]:
# Find the error rate on the test set
mse = mean_absolute_error(y_test, model.predict(X_test))
print("Test Set Mean Absolute Error: %.4f" % mse)

Test Set Mean Absolute Error: 59225.2075


In [18]:
# These are the feature labels from our data set
import numpy as np
feature_labels = np.array(['year_built', 'stories', 'num_bedrooms', 'full_bathrooms', 'half_bathrooms', 'livable_sqft', 'total_sqft', 'garage_sqft', 'carport_sqft', 'has_fireplace', 'has_pool', 'has_central_heating', 'has_central_cooling', 'garage_type_attached', 'garage_type_detached', 'garage_type_none', 'city_Amystad', 'city_Brownport', 'city_Chadstad', 'city_Clarkberg', 'city_Coletown', 'city_Davidfort', 'city_Davidtown', 'city_East Amychester', 'city_East Janiceville', 'city_East Justin', 'city_East Lucas', 'city_Fosterberg', 'city_Hallfort', 'city_Jeffreyhaven', 'city_Jenniferberg', 'city_Joshuafurt', 'city_Julieberg', 'city_Justinport', 'city_Lake Carolyn', 'city_Lake Christinaport', 'city_Lake Dariusborough', 'city_Lake Jack', 'city_Lake Jennifer', 'city_Leahview', 'city_Lewishaven', 'city_Martinezfort', 'city_Morrisport', 'city_New Michele', 'city_New Robinton', 'city_North Erinville', 'city_Port Adamtown', 'city_Port Andrealand', 'city_Port Daniel', 'city_Port Jonathanborough', 'city_Richardport', 'city_Rickytown', 'city_Scottberg', 'city_South Anthony', 'city_South Stevenfurt', 'city_Toddshire', 'city_Wendybury', 'city_West Ann', 'city_West Brittanyview', 'city_West Gerald', 'city_West Gregoryview', 'city_West Lydia', 'city_West Terrence'])


In [19]:
# Load the trained model created with train_model.py
model = joblib.load('trained_house_classifier_model.pkl')

In [20]:
# Create a numpy array based on the model's feature importances
importance = model.feature_importances_

In [21]:
# Sort the feature labels based on the feature importance rankings from the model
feauture_indexes_by_importance = importance.argsort()

In [22]:
# Print each feature label, from most important to least important (reverse order)
for index in feauture_indexes_by_importance:
    print("{} - {:.2f}%".format(feature_labels[index], (importance[index] * 100.0)))

city_New Robinton - 0.00%
city_New Michele - 0.00%
city_Martinezfort - 0.00%
city_Julieberg - 0.00%
city_Davidtown - 0.00%
city_Lake Jennifer - 0.00%
city_Rickytown - 0.01%
city_Fosterberg - 0.01%
city_East Justin - 0.01%
city_West Terrence - 0.01%
city_West Brittanyview - 0.01%
city_South Stevenfurt - 0.01%
city_Joshuafurt - 0.02%
city_Leahview - 0.02%
city_East Janiceville - 0.02%
city_Brownport - 0.03%
city_Amystad - 0.03%
city_Toddshire - 0.03%
city_Wendybury - 0.05%
city_Port Adamtown - 0.05%
city_Port Daniel - 0.05%
city_Clarkberg - 0.09%
city_Davidfort - 0.09%
city_West Lydia - 0.12%
city_Port Jonathanborough - 0.12%
garage_type_detached - 0.18%
city_Jenniferberg - 0.19%
city_East Amychester - 0.20%
city_Morrisport - 0.21%
city_Lewishaven - 0.22%
city_West Gerald - 0.23%
city_Richardport - 0.25%
city_North Erinville - 0.25%
city_East Lucas - 0.27%
city_Lake Carolyn - 0.30%
has_central_heating - 0.33%
city_West Gregoryview - 0.33%
city_West Ann - 0.39%
city_Lake Dariusborough - 0