# Random Forest Regression

#### Load the packages and import the data

In [1]:
import pandas as pd
import numpy as np

# Read the USA housing dataset from the local data folder and preview its first rows.
housing_csv_path = "./Data Files/USA_Housing.csv"
data = pd.read_csv(housing_csv_path)
data.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.642455,6.0029,6.730821,3.09,40173.072174,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.067179,5.86589,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1260617.0,USS Barnett\nFPO AP 44820
4,59982.197226,5.040555,7.839388,4.23,26354.109472,630943.5,USNS Raymond\nFPO AE 09386


In [2]:
# List the column labels of the loaded DataFrame; the next cell picks the
# feature columns and the target column from these names.
data.columns

Index(['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
       'Avg. Area Number of Bedrooms', 'Area Population', 'Price', 'Address'],
      dtype='object')

#### Split data into an X DataFrame and y vector

In [3]:
# Predictors: the five numeric area-level columns ("Address" is left out).
feature_columns = [
    "Avg. Area Income",
    "Avg. Area House Age",
    "Avg. Area Number of Rooms",
    "Avg. Area Number of Bedrooms",
    "Area Population",
]
X = data[feature_columns]

# Target: the house price the model will learn to predict.
y = data["Price"]

#### Split the data into a train_set and test_set

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1111,
)

#### Fit the Random Forest Regression Model and Inspect Feature Importances

In [8]:
from sklearn.ensemble import RandomForestRegressor

# Train a forest of 200 trees on the training split; the fixed random_state
# makes the fit repeatable.
rfr_model = RandomForestRegressor(n_estimators=200, random_state=1111)
rfr_model.fit(X_train, y_train)

# Tabulate the fitted model's feature importances, one row per feature,
# sorted from most to least important.
importance_table = pd.DataFrame(
    rfr_model.feature_importances_,
    index=X.columns,
    columns=["Importance"],
)
importance_table.sort_values(by="Importance", ascending=False)

Unnamed: 0,Importance
Avg. Area Income,0.434578
Avg. Area House Age,0.22725
Area Population,0.190775
Avg. Area Number of Rooms,0.130401
Avg. Area Number of Bedrooms,0.016995


#### Predict on Test Set

In [12]:
# Predict prices for the held-out rows.
y_pred = rfr_model.predict(X_test)

# Build a side-by-side view: the test features, the true target (aligned on the
# row index) and the model's prediction. `assign` returns a new frame, so
# X_test itself is left untouched.
pred_summary = X_test.assign(**{y.name: y_test, "y_pred": y_pred})
pred_summary.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,y_pred
3652,65966.017208,7.876933,5.524962,3.3,42710.821809,1342819.0,1338320.0
1862,60288.475915,6.170239,7.014315,3.28,34651.072317,1144938.0,1089905.0
2766,53664.077704,4.415997,5.938396,2.19,57110.648936,996243.4,888118.8
4120,70169.710552,6.227945,7.651813,3.36,35197.384961,1217022.0,1423955.0
3447,73092.741315,5.61546,6.524657,2.21,43509.458399,1336172.0,1340157.0


#### Evaluate the Model

In [13]:
from sklearn import metrics

# Compute each score once; the RMSE is derived from the stored MSE.
mse = metrics.mean_squared_error(y_test, y_pred)
explained_variance = metrics.explained_variance_score(y_test, y_pred)

print("MSE:", mse)
print("RMSE:", np.sqrt(mse))
print("Explained Variance:", round(explained_variance, 2))

MSE: 13402800937.8
RMSE: 115770.466604
Explained Variance: 0.89
