### Data Preprocessing based on Casey's code

In [16]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

In [17]:
# Load the prepared dataset and discard the 'Unnamed: 0' column
# (presumably an index written out by an earlier `to_csv` call - confirm).
df = pd.read_csv('final_data.csv').drop(columns=['Unnamed: 0'])

In [18]:
# Split the frame into predictors and target.
feats = df.drop(columns=['SalePrice'])   # every column except the target
response_var = df.loc[:, ['SalePrice']]  # list selector keeps y as a one-column DataFrame

### Setting up the decision tree

In [20]:
# Two regression trees of different depth, fitted here on the full dataset.
# `criterion` is left at its default (squared error): the 'mse' spelling was
# deprecated in scikit-learn 1.0 and removed in 1.2, where it raises an error,
# whereas the default selects the same criterion on both old and new releases.
# NOTE(review): both models are re-created and refit on the training split in a
# later cell, so this full-data fit is only a first look.
regr_1 = DecisionTreeRegressor(max_depth=2)
regr_2 = DecisionTreeRegressor(max_depth=5)
regr_1.fit(feats, response_var)
regr_2.fit(feats, response_var)  # last expression: the cell echoes the fitted estimator

DecisionTreeRegressor(max_depth=5)

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error
import seaborn as sns

# Seed NumPy's global RNG so later cells that draw from it are reproducible.
np.random.seed(1)
# Hold out a test set using train_test_split's default proportions.
# random_state is passed explicitly (same seed value) so the split itself does
# not depend on the state of the global RNG or on cell execution order.
Xtrain, Xtest, ytrain, ytest = train_test_split(feats, response_var, random_state=1)

In [31]:
# Re-create both trees and fit them on the training split only, so the
# held-out test rows stay unseen.
# `criterion` is left at its default (squared error): the 'mse' spelling was
# deprecated in scikit-learn 1.0 and removed in 1.2, where it raises an error,
# whereas the default selects the same criterion on both old and new releases.
regr_1 = DecisionTreeRegressor(max_depth=2)
regr_2 = DecisionTreeRegressor(max_depth=5)
regr_1.fit(Xtrain, ytrain)
regr_2.fit(Xtrain, ytrain)  # last expression: the cell echoes the fitted estimator

DecisionTreeRegressor(max_depth=5)

### Calculating predicted sale prices with the fitted regression trees

In [36]:
# Test-set predictions from the shallow tree (max_depth=2).
# NOTE(review): values around 11-13 suggest SalePrice is log-transformed - confirm.
y_pred1 = regr_1.predict(X=Xtest)
y_pred1

array([12.25701661, 11.67314835, 11.67314835, 11.67314835, 11.95305136,
       12.64784125, 12.64784125, 12.25701661, 12.25701661, 12.25701661,
       11.95305136, 11.67314835, 12.64784125, 12.64784125, 12.64784125,
       11.67314835, 11.67314835, 11.95305136, 12.25701661, 11.67314835,
       11.67314835, 11.95305136, 12.25701661, 12.64784125, 11.95305136,
       11.95305136, 11.95305136, 12.25701661, 12.64784125, 11.95305136,
       11.95305136, 11.67314835, 11.67314835, 11.67314835, 11.95305136,
       12.64784125, 11.67314835, 11.67314835, 12.25701661, 11.67314835,
       11.67314835, 11.67314835, 11.95305136, 11.95305136, 11.95305136,
       11.95305136, 12.25701661, 12.25701661, 12.64784125, 12.25701661,
       11.67314835, 12.64784125, 11.67314835, 12.25701661, 12.64784125,
       11.67314835, 11.67314835, 12.25701661, 11.67314835, 11.95305136,
       11.95305136, 12.25701661, 11.67314835, 11.67314835, 11.95305136,
       11.67314835, 11.67314835, 12.25701661, 11.95305136, 11.67

In [33]:
# Test-set predictions from the deeper tree (max_depth=5).
y_pred2 = regr_2.predict(X=Xtest)
y_pred2

array([12.15887003, 11.82410259, 11.64172974, 11.27075234, 11.96925   ,
       12.63324347, 12.63324347, 12.15887003, 12.3611113 , 12.15887003,
       12.06318059, 11.64172974, 12.41345235, 12.84723522, 12.41345235,
       11.66393765, 11.64172974, 11.96925   , 12.15887003, 11.64172974,
       11.82410259, 11.96382847, 12.48471834, 12.84723522, 11.72163036,
       12.22680507, 11.72163036, 12.15887003, 13.27773466, 11.96925   ,
       12.22680507, 11.66393765, 11.64172974, 11.64172974, 11.72163036,
       12.84723522, 11.64172974, 11.64172974, 12.3611113 , 11.82410259,
       11.82410259, 11.82410259, 11.72163036, 11.72163036, 12.06318059,
       12.06318059, 11.90817354, 12.15887003, 12.41345235, 12.3611113 ,
       11.43924886, 12.91066924, 11.43924886, 12.50689578, 12.41345235,
       11.43924886, 11.64172974, 12.15887003, 11.82410259, 12.06318059,
       12.06318059, 12.50689578, 11.64172974, 11.62792629, 12.06318059,
       11.64172974, 11.66393765, 12.3611113 , 11.96925   , 11.82

### Calculating root mean squared errors

In [37]:
# Mean squared error of each tree's predictions against the held-out targets.
mse1 = mean_squared_error(y_true=ytest, y_pred=y_pred1)
mse2 = mean_squared_error(y_true=ytest, y_pred=y_pred2)

In [49]:
# RMSE is the square root of the MSE, i.e. the error in the units of the target.
rmse1, rmse2 = (np.sqrt(err) for err in (mse1, mse2))
print('Root mean squared for decision tree with maximum depth = 2: %.4f' % rmse1)
print('Root mean squared for decision tree with maximum depth = 5: %.4f' % rmse2)

Root mean squared for decision tree with maximum depth = 2: 0.2607
Root mean squared for decision tree with maximum depth = 5: 0.2037


### Applying 10-fold cross-validation


In [63]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of both trees, first on the training split and then
# on the test split. No `scoring` is given, so each fold is scored with the
# estimator's own default score.
# NOTE(review): cross_val_score fits fresh clones on every fold, so the
# "Testing" numbers come from trees trained on 9/10 of the test split, not from
# the regr_1 / regr_2 fitted earlier. Confirm this is intended; a plain
# hold-out score would be regr_1.score(Xtest, ytest).
scoret1, scoret2 = (
    cross_val_score(tree, Xtrain, ytrain, cv=10) for tree in (regr_1, regr_2)
)
score1, score2 = (
    cross_val_score(tree, Xtest, ytest, cv=10) for tree in (regr_1, regr_2)
)

print(f"Training score based on tree 1: {scoret1}")
print(f"Training score based on tree 2: {scoret2}")
print(f"Testing score based on tree 1: {score1}")
print(f"Testing score based on tree 2: {score2}")

Training score based on tree 1: [0.54837561 0.53017289 0.60066327 0.56617855 0.56166654 0.53726761
 0.56279964 0.5196128  0.57945824 0.61526907]
Training score based on tree 2: [0.73497673 0.60782579 0.68857935 0.74095998 0.77183245 0.67801573
 0.66794353 0.70792635 0.72418223 0.81441621]
Testing score based on tree 1: [ 0.67440387  0.57686695 -0.01500865  0.49378895  0.3863423   0.67445925
  0.47805461  0.56406095  0.49131686  0.63030423]
Testing score based on tree 2: [0.84901009 0.74493298 0.46593197 0.68052882 0.45159022 0.82265259
 0.55814978 0.65879362 0.72642373 0.74703749]


In [57]:
# Summarise each set of 10 fold scores by its mean.
# scoret* were computed on the training split, score* on the test split.
print(f"Average training score based on tree 1: {scoret1.mean()}")
print(f"Average training score based on tree 2: {scoret2.mean()}")
print(f"Average testing score based on tree 1: {score1.mean()}")
print(f"Average testing score based on tree 2: {score2.mean()}")

Average training score based on tree 1: 0.562146423968055
Average training score based on tree 2: 0.7100441126607342
Average testing score based on tree 1: 0.495458933091481
Average testing score based on tree 2: 0.6290113801869202


### Visualizing the results