In [1]:
# Import dependencies
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Load Data
source = "Resources/final_housing.csv"
df_data = pd.read_csv(source)
df_data.tail()
df_data = df_data.dropna()
df_data

Unnamed: 0,Sale Date,City,Zip Code,Year Built,Bed,Bath,Sale Price,Square Feet,Lot Size,$/SF,Zip Population,Zip SqMi,Zip Pop Density,Zip Mean HHI
0,2021-06-23,Portland,97212,2013,3,4.0,740000,2030,2613,364.53,26991,2.775,9726.486486,146186
1,2021-05-21,Portland,97212,2013,5,4.0,1200000,3557,4791,337.36,26991,2.775,9726.486486,146186
2,2021-05-21,Portland,97212,1952,4,3.0,846000,3122,6098,270.98,26991,2.775,9726.486486,146186
3,2021-05-28,Portland,97212,2015,4,4.0,1300000,3358,4791,387.14,26991,2.775,9726.486486,146186
4,2021-06-14,Portland,97212,1957,3,3.0,1300000,3912,7405,332.31,26991,2.775,9726.486486,146186
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14108,2020-07-28,Portland,97231,2011,4,3.0,610000,2300,435600,265.22,2630,77.670,33.861208,161621
14109,2020-08-13,Portland,97231,1997,5,3.0,649950,3104,41817,209.39,2630,77.670,33.861208,161621
14110,2020-12-07,Portland,97231,1993,2,2.0,508000,3918,1742400,129.66,2630,77.670,33.861208,161621
14111,2021-05-26,Portland,97231,1960,5,2.0,575000,2688,77536,213.91,2630,77.670,33.861208,161621


In [3]:
# Establish the spending bins and group names.
sale_bins = [0, 250000, 400000, 600000, 800000, 1000000, 1500000, 2000000]
bin_names = ["<250k", "250k-400k", "400k-600k", "600k-800k", "800k-1M", "1M-1.5M" , "1.5M-2M"]
# Categorize spending based on the bins.
df_data["Sale Price Ranges"] = pd.cut(df_data['Sale Price'], sale_bins, labels=bin_names)

df_data

Unnamed: 0,Sale Date,City,Zip Code,Year Built,Bed,Bath,Sale Price,Square Feet,Lot Size,$/SF,Zip Population,Zip SqMi,Zip Pop Density,Zip Mean HHI,Sale Price Ranges
0,2021-06-23,Portland,97212,2013,3,4.0,740000,2030,2613,364.53,26991,2.775,9726.486486,146186,600k-800k
1,2021-05-21,Portland,97212,2013,5,4.0,1200000,3557,4791,337.36,26991,2.775,9726.486486,146186,1M-1.5M
2,2021-05-21,Portland,97212,1952,4,3.0,846000,3122,6098,270.98,26991,2.775,9726.486486,146186,800k-1M
3,2021-05-28,Portland,97212,2015,4,4.0,1300000,3358,4791,387.14,26991,2.775,9726.486486,146186,1M-1.5M
4,2021-06-14,Portland,97212,1957,3,3.0,1300000,3912,7405,332.31,26991,2.775,9726.486486,146186,1M-1.5M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14108,2020-07-28,Portland,97231,2011,4,3.0,610000,2300,435600,265.22,2630,77.670,33.861208,161621,600k-800k
14109,2020-08-13,Portland,97231,1997,5,3.0,649950,3104,41817,209.39,2630,77.670,33.861208,161621,600k-800k
14110,2020-12-07,Portland,97231,1993,2,2.0,508000,3918,1742400,129.66,2630,77.670,33.861208,161621,400k-600k
14111,2021-05-26,Portland,97231,1960,5,2.0,575000,2688,77536,213.91,2630,77.670,33.861208,161621,400k-600k


In [4]:
# Define the features set.
X = df_data.copy()
X = X.drop("Sale Price", axis=1)
X = X.drop("Sale Date", axis=1)
X = X.drop("City", axis=1)
X = X.drop("Zip Code", axis=1)
X = X.drop("Zip Population", axis=1)
X = X.drop("Zip SqMi", axis=1)
X = X.drop("Sale Price Ranges", axis=1)
X_list = list(X.columns)
X

Unnamed: 0,Year Built,Bed,Bath,Square Feet,Lot Size,$/SF,Zip Pop Density,Zip Mean HHI
0,2013,3,4.0,2030,2613,364.53,9726.486486,146186
1,2013,5,4.0,3557,4791,337.36,9726.486486,146186
2,1952,4,3.0,3122,6098,270.98,9726.486486,146186
3,2015,4,4.0,3358,4791,387.14,9726.486486,146186
4,1957,3,3.0,3912,7405,332.31,9726.486486,146186
...,...,...,...,...,...,...,...,...
14108,2011,4,3.0,2300,435600,265.22,33.861208,161621
14109,1997,5,3.0,3104,41817,209.39,33.861208,161621
14110,1993,2,2.0,3918,1742400,129.66,33.861208,161621
14111,1960,5,2.0,2688,77536,213.91,33.861208,161621


In [5]:
# Define the target set.
y = df_data["Sale Price Ranges"].values
y[:13956]


['600k-800k', '1M-1.5M', '800k-1M', '1M-1.5M', '1M-1.5M', ..., '600k-800k', '800k-1M', '400k-600k', '250k-400k', '400k-600k']
Length: 13956
Categories (7, object): ['<250k' < '250k-400k' < '400k-600k' < '600k-800k' < '800k-1M' < '1M-1.5M' < '1.5M-2M']

In [6]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
X_train.shape, X_test.shape

((10584, 8), (3529, 8))

In [7]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [8]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [9]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [10]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [11]:
predictions

array(['400k-600k', '400k-600k', '600k-800k', ..., '400k-600k',
       '250k-400k', '600k-800k'], dtype=object)

In [12]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix
cm_df = pd.DataFrame(
    cm, index=["<250k", "250k-400k", "400k-600k", "600k-800k", "800k-1M", "1M-1.5M" , "1.5M-2M"], columns=["P<250k", "P250k-400k", "P400k-600k", "P600k-800k", "P800k-1M", "P1M-1.5M" , "P1.5M-2M"])

cm_df

Unnamed: 0,P<250k,P250k-400k,P400k-600k,P600k-800k,P800k-1M,P1M-1.5M,P1.5M-2M
<250k,17,26,0,0,0,0,0
250k-400k,1,129,0,0,3,19,0
400k-600k,0,0,500,63,0,0,0
600k-800k,0,0,26,1559,32,0,0
800k-1M,0,1,0,65,750,10,0
1M-1.5M,0,9,0,0,66,225,0
1.5M-2M,0,0,20,0,0,0,8


In [13]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
print (acc_score)

0.9033720600736752


In [14]:
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot

# Pull out one tree from the forest
tree = rf_model.estimators_[5]
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot
# Pull out one tree from the forest
tree = rf_model.estimators_[5]
# Export the image to a dot file
export_graphviz(tree, out_file = 'tree2.dot', feature_names = X_list, rounded = True, precision = 1)
# Use dot file to create a graph
(graph, ) = pydot.graph_from_dot_file('tree2.dot')
# Write graph to a png file
graph.write_png('tree2.png')