### Data Preprocessing based on Casey's code

In [16]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

In [17]:
# Load the prepared dataset and discard the extra 'Unnamed: 0' column
# (presumably an index written out by an earlier to_csv -- confirm) in one chain.
df = pd.read_csv('final_data.csv').drop(columns='Unnamed: 0')

In [18]:
# Split the frame into the prediction target (y) and the predictor columns (X).
target_col = 'SalePrice'
response_var = df[[target_col]]      # list selector keeps y as a one-column DataFrame
feats = df.drop(columns=target_col)  # every remaining column is used as a feature

### Setting up the decision tree

In [87]:
# Two regression trees of different depth, fitted on the full dataset.
#
# `criterion` is deliberately left at its default, which is the mean-squared-error
# criterion. The literal 'mse' was renamed to 'squared_error' in scikit-learn 1.0
# and removed in 1.2, so passing it makes this cell fail on current releases,
# while 'squared_error' is rejected by older ones. Omitting it works everywhere.
#
# NOTE(review): regr_1 / regr_2 are re-created and re-fitted on the training
# split in a later cell, so these full-data fits are not what gets evaluated.
regr_1 = DecisionTreeRegressor(max_depth=2)
regr_2 = DecisionTreeRegressor(max_depth=5)
regr_1.fit(feats, response_var)
regr_2.fit(feats, response_var)

DecisionTreeRegressor(max_depth=5)

In [88]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error
import seaborn as sns

# Seed NumPy's global random generator for reproducibility of results.
# train_test_split is called without random_state, so it draws its shuffle
# from this global generator; the split sizes are the library defaults.
SEED = 1
np.random.seed(SEED)
Xtrain, Xtest, ytrain, ytest = train_test_split(feats, response_var)

In [89]:
# Shallow (depth 2) and deeper (depth 7) regression trees, fitted on the
# training split only so the held-out rows can be used for evaluation.
#
# `criterion` is left at its default (mean squared error): the literal 'mse'
# was renamed to 'squared_error' in scikit-learn 1.0 and removed in 1.2, so
# passing it breaks this cell on current releases. The default is the same
# criterion on every version.
regr_1 = DecisionTreeRegressor(max_depth=2)
regr_2 = DecisionTreeRegressor(max_depth=7)
regr_1.fit(Xtrain, ytrain)
regr_2.fit(Xtrain, ytrain)

DecisionTreeRegressor(max_depth=7)

### Calculating Predicted Sales Price based on regression decision tree features

In [90]:
# Held-out predictions from the depth-2 tree.
# NOTE(review): the recorded values sit around 11.6-12.7, which suggests
# SalePrice was log-transformed upstream -- confirm before reading them as prices.
y_pred1 = regr_1.predict(X=Xtest)
y_pred1

array([12.25701661, 11.67314835, 11.67314835, 11.67314835, 11.95305136,
       12.64784125, 12.64784125, 12.25701661, 12.25701661, 12.25701661,
       11.95305136, 11.67314835, 12.64784125, 12.64784125, 12.64784125,
       11.67314835, 11.67314835, 11.95305136, 12.25701661, 11.67314835,
       11.67314835, 11.95305136, 12.25701661, 12.64784125, 11.95305136,
       11.95305136, 11.95305136, 12.25701661, 12.64784125, 11.95305136,
       11.95305136, 11.67314835, 11.67314835, 11.67314835, 11.95305136,
       12.64784125, 11.67314835, 11.67314835, 12.25701661, 11.67314835,
       11.67314835, 11.67314835, 11.95305136, 11.95305136, 11.95305136,
       11.95305136, 12.25701661, 12.25701661, 12.64784125, 12.25701661,
       11.67314835, 12.64784125, 11.67314835, 12.25701661, 12.64784125,
       11.67314835, 11.67314835, 12.25701661, 11.67314835, 11.95305136,
       11.95305136, 12.25701661, 11.67314835, 11.67314835, 11.95305136,
       11.67314835, 11.67314835, 12.25701661, 11.95305136, 11.67

In [91]:
# Held-out predictions from the deeper tree (regr_2), on the same test rows
# that were passed to regr_1 above.
y_pred2 = regr_2.predict(X=Xtest)
y_pred2

array([12.16954797, 11.84536519, 11.69356162, 11.2598616 , 11.99655448,
       12.61032963, 12.61274572, 12.10406554, 12.33848433, 12.3079557 ,
       12.11159394, 11.53966707, 11.9879294 , 12.85465073, 12.45265364,
       11.72785723, 11.69356162, 11.99655448, 12.16954797, 11.69356162,
       11.84536519, 11.88494576, 12.58304588, 12.85465073, 11.60077314,
       12.16904109, 11.60077314, 12.10406554, 13.25269776, 11.99655448,
       12.31619553, 11.72785723, 11.69356162, 11.53966707, 11.86396844,
       12.85465073, 11.69356162, 11.69356162, 12.33848433, 11.59141331,
       11.59141331, 11.84536519, 11.72176127, 11.72176127, 12.11159394,
       12.11159394, 12.09368751, 12.10406554, 12.45265364, 12.33848433,
       11.53776623, 12.89808133, 11.53776623, 12.45540299, 11.9879294 ,
       11.53776623, 11.69356162, 12.10406554, 11.84536519, 12.11159394,
       12.11159394, 12.45540299, 11.69356162, 11.59906108, 12.11159394,
       11.69356162, 11.72785723, 12.33848433, 11.99655448, 11.59

### Calculating root mean squared errors

In [92]:
# Mean squared error of each tree's predictions against the held-out targets;
# both are computed in one pass, tree 1 first and tree 2 second.
mse1, mse2 = (
    mean_squared_error(y_true=ytest, y_pred=predicted)
    for predicted in (y_pred1, y_pred2)
)

In [93]:
# Root mean squared error = square root of the MSE computed in the previous cell.
rmse1 = np.sqrt(mse1)
rmse2 = np.sqrt(mse2)
# Read the depth from each estimator instead of hard-coding it in the message:
# the second line used to say "maximum depth = 5" although regr_2 is built with
# max_depth=7, so the reported depth did not match the evaluated model.
# %s (not %d) keeps the message valid even if max_depth is None.
print('Root mean squared for decision tree with maximum depth = %s: %.4f' % (regr_1.max_depth, rmse1))
print('Root mean squared for decision tree with maximum depth = %s: %.4f' % (regr_2.max_depth, rmse2))

Root mean squared for decision tree with maximum depth = 2: 0.2607
Root mean squared for decision tree with maximum depth = 5: 0.2006


### Applying 10-fold cross-validation


In [94]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation for both trees. cross_val_score refits a fresh copy
# of the estimator on each fold and returns one score per fold (the estimator's
# default score, which is R^2 for regressors).
# Call order is kept as: tree 1 / tree 2 on the training split, then
# tree 1 / tree 2 on the test split.
# NOTE(review): the "Testing" scores below come from refitting the trees on
# folds of the test split; they do not score the trees trained on Xtrain.
cv_folds = 10
scoret1, scoret2 = (
    cross_val_score(tree, Xtrain, ytrain, cv=cv_folds) for tree in (regr_1, regr_2)
)
score1, score2 = (
    cross_val_score(tree, Xtest, ytest, cv=cv_folds) for tree in (regr_1, regr_2)
)
print(f"Training score based on tree 1: {scoret1}")
print(f"Training score based on tree 2: {scoret2}")
print(f"Testing score based on tree 1: {score1}")
print(f"Testing score based on tree 2: {score2}")

Training score based on tree 1: [0.54837561 0.53017289 0.60066327 0.56617855 0.56166654 0.53726761
 0.56279964 0.5196128  0.57945824 0.61526907]
Training score based on tree 2: [0.67966016 0.62556156 0.61978493 0.72090598 0.78584017 0.69339675
 0.52767943 0.67506019 0.74960248 0.79708889]
Testing score based on tree 1: [ 0.67440387  0.57686695 -0.01500865  0.49378895  0.3863423   0.67445925
  0.47805461  0.56406095  0.49131686  0.63030423]
Testing score based on tree 2: [0.8524907  0.70922636 0.36625236 0.65474665 0.4989576  0.84794976
 0.45639187 0.64032843 0.54835688 0.81320228]


In [96]:
# Summarise each set of per-fold scores by its mean across the 10 folds.
mean_train_1 = scoret1.mean()
mean_train_2 = scoret2.mean()
mean_test_1 = score1.mean()
mean_test_2 = score2.mean()
print(f"Average training score based on tree 1: {mean_train_1}")
print(f"Average training score based on tree 2: {mean_train_2}")
print(f"Average testing score based on tree 1: {mean_test_1}")
print(f"Average testing score based on tree 2: {mean_test_2}")

Average training score based on tree 1: 0.5621464239680554
Average training score based on tree 2: 0.6874580528560676
Average testing score based on tree 1: 0.4954589330914815
Average testing score based on tree 2: 0.638790287474648


### Visualizing the results