In [1]:
# Import dependencies
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from path import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Load the housing sales data from the CSV export.
source = "Resources/final_housing.csv"
df_raw = pd.read_csv(source)

# Keep the raw frame untouched and derive the cleaned frame from it, so this
# cell can be re-run without re-reading the file and the unfiltered data stays
# available for inspection. dropna() discards every row that has a missing value.
df_data = df_raw.dropna()
df_data

Unnamed: 0,Sale Date,City,Zip Code,Year Built,Bed,Bath,Sale Price,Square Feet,Lot Size,$/SF,Zip Population,Zip SqMi,Zip Pop Density,Zip Mean HHI
0,2021-06-23,Portland,97212,2013,3,4.0,740000,2030,2613,364.53,26991,2.775,9726.486486,146186
1,2021-05-21,Portland,97212,2013,5,4.0,1200000,3557,4791,337.36,26991,2.775,9726.486486,146186
2,2021-05-21,Portland,97212,1952,4,3.0,846000,3122,6098,270.98,26991,2.775,9726.486486,146186
3,2021-05-28,Portland,97212,2015,4,4.0,1300000,3358,4791,387.14,26991,2.775,9726.486486,146186
4,2021-06-14,Portland,97212,1957,3,3.0,1300000,3912,7405,332.31,26991,2.775,9726.486486,146186
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14108,2020-07-28,Portland,97231,2011,4,3.0,610000,2300,435600,265.22,2630,77.670,33.861208,161621
14109,2020-08-13,Portland,97231,1997,5,3.0,649950,3104,41817,209.39,2630,77.670,33.861208,161621
14110,2020-12-07,Portland,97231,1993,2,2.0,508000,3918,1742400,129.66,2630,77.670,33.861208,161621
14111,2021-05-26,Portland,97231,1960,5,2.0,575000,2688,77536,213.91,2630,77.670,33.861208,161621


In [3]:
# Show the dtype of every column to see which ones are numeric before
# choosing the model features.
df_data.dtypes

Sale Date           object
City                object
Zip Code             int64
Year Built           int64
Bed                  int64
Bath               float64
Sale Price           int64
Square Feet          int64
Lot Size             int64
$/SF               float64
Zip Population       int64
Zip SqMi           float64
Zip Pop Density    float64
Zip Mean HHI         int64
dtype: object

In [4]:
# Build the feature matrix by removing, in a single call, every column that
# must not be fed to the model.
non_feature_columns = [
    "Sale Price",       # the prediction target itself
    "Sale Date",
    "City",
    "Zip Code",
    "Zip Population",
    "$/SF",             # equals Sale Price / Square Feet in the displayed rows
    "Zip SqMi",
]
X = df_data.drop(columns=non_feature_columns)

# Remember the feature names for later use (e.g. labelling a plotted tree).
X_list = list(X.columns)
X

Unnamed: 0,Year Built,Bed,Bath,Square Feet,Lot Size,Zip Pop Density,Zip Mean HHI
0,2013,3,4.0,2030,2613,9726.486486,146186
1,2013,5,4.0,3557,4791,9726.486486,146186
2,1952,4,3.0,3122,6098,9726.486486,146186
3,2015,4,4.0,3358,4791,9726.486486,146186
4,1957,3,3.0,3912,7405,9726.486486,146186
...,...,...,...,...,...,...,...
14108,2011,4,3.0,2300,435600,33.861208,161621
14109,1997,5,3.0,3104,41817,33.861208,161621
14110,1993,2,2.0,3918,1742400,33.861208,161621
14111,1960,5,2.0,2688,77536,33.861208,161621


In [5]:
# Define the target set: the sale price of every remaining row as a NumPy array.
y = df_data["Sale Price"].to_numpy()

# Display the array itself rather than slicing with a hard-coded row count,
# which would silently show only part of the data if the dataset grew.
y


array([ 740000, 1200000,  846000, ...,  508000,  575000, 1842500],
      dtype=int64)

In [6]:
# Partition features and target into training and held-out test subsets.
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

# Report how many rows and columns landed in each partition.
(X_train.shape, X_test.shape)

((10584, 7), (3529, 7))

In [7]:
# Standardise the features. The scaler is fitted on the training rows only,
# so the transform carries no statistics computed from the test rows.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the one fitted transform to both partitions.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [8]:
# Configure the random forest regressor: 20 trees, each capped at depth 5,
# with a fixed seed so repeated fits build the same forest.
rf_model = RandomForestRegressor(
    n_estimators=20,
    max_depth=5,
    random_state=42,
)

In [9]:
# Train the forest on the standardised training features and their sale prices.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [10]:
# Predict sale prices for the held-out rows, using the same scaled
# representation the model was trained on.
predictions = rf_model.predict(X=X_test_scaled)

In [11]:
# Display the predicted sale prices for the test rows (NumPy abbreviates
# long arrays with an ellipsis).
predictions

array([526086.99166749, 659852.12136821, 913864.62200549, ...,
       600110.06028897, 679615.59293315, 430077.52860047])

In [12]:
# Per-row absolute error: the size of the gap between the actual and the
# predicted sale price, regardless of direction.
errors = np.abs(y_test - predictions)

In [13]:
# Mean absolute error over the test rows, in the same units as the sale price.
mean_abs_error = np.mean(errors)
print('Average error: ', round(mean_abs_error, 2))

Average error:  81053.58


In [14]:
# Absolute percentage error of each test row. Despite its name, `mape` holds
# the per-row values; the mean is taken on the next step.
# NOTE(review): a sale price of 0 in y_test would make this division blow up
# to inf -- confirm the data contains no zero prices.
mape = (errors / y_test) * 100

# "Accuracy" here is defined as 100 minus the mean absolute percentage error.
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 86.4 %.


In [15]:
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot

# Pull out one tree from the forest (index 5 of the fitted estimators).
tree = rf_model.estimators_[5]

# Export the tree to a Graphviz dot file, labelling splits with the feature names.
# The forest was fitted on standardised features, so the split thresholds shown
# in the rendered tree are in scaled units, not the original column units.
export_graphviz(
    tree,
    out_file='tree.dot',
    feature_names=X_list,
    rounded=True,
    precision=1,
)

# Read the dot file back; graph_from_dot_file returns a list, and exactly one
# graph is expected here.
(graph,) = pydot.graph_from_dot_file('tree.dot')

# Render the graph to a PNG image next to the notebook.
graph.write_png('tree.png')