In [29]:
import numpy as np
import pandas as pd

## 4. Decision Tree

In [30]:
#reading in training and testing datasets
testing = pd.read_csv('project_test.csv')
training = pd.read_csv('project_train.csv')

In [31]:
testing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1260 entries, 0 to 1259
Data columns (total 26 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  1260 non-null   int64  
 1   County                      1260 non-null   object 
 2   MonthNumeric                1260 non-null   int64  
 3   Year                        1260 non-null   int64  
 4   MedianPrice                 1260 non-null   float64
 5   PercentageOfBachelorDegree  1260 non-null   float64
 6   Per_capita_income           1260 non-null   int64  
 7   total_law_enforcement       1260 non-null   float64
 8   Precipitation               1260 non-null   float64
 9   Population                  1260 non-null   float64
 10  NumAirports                 1260 non-null   float64
 11  GradRates                   1260 non-null   float64
 12  CollegeEnrollmentRate       1260 non-null   float64
 13  AvgHouseholdSize            1260 

In [32]:
#choosing features to include
x_train = training[['Per_capita_income','Precipitation','Population', 'Year','MonthNumeric'
                    ,"PercentageOfBachelorDegree", 'NumAirports', 'GradRates', 'total_law_enforcement'
                    ,'AvgHouseholdSize',  'ViolentCrimeRate', 'HasCostco','CollegeEnrollmentRate'
                    , 'Avg18Younger', 'Avg18Older','PropertyCrimeRate', 'NationalRiskIndex','LocalTaxRate'
                    ,'HasWalMart', 'Urban','Suburban', 'Rural']]

y_train = training['MedianPrice']

x_test = testing[['Per_capita_income','Precipitation','Population', 'Year','MonthNumeric'
                    ,"PercentageOfBachelorDegree", 'NumAirports', 'GradRates', 'total_law_enforcement'
                    ,'AvgHouseholdSize',  'ViolentCrimeRate', 'HasCostco','CollegeEnrollmentRate'
                    , 'Avg18Younger', 'Avg18Older','PropertyCrimeRate', 'NationalRiskIndex','LocalTaxRate'
                    ,'HasWalMart', 'Urban','Suburban', 'Rural']]

y_test = testing['MedianPrice']


### 4.1 Decision Tree Regressor

In [33]:
#Building decision tree regressor without CV

from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor(min_samples_leaf=2, 
                             ccp_alpha=0.001,
                             random_state = 88)

dtr = dtr.fit(x_train, y_train)

In [34]:
#Calculating OSR^2

def OSR2(model, X_test, y_test, y_train):
    
    y_pred = model.predict(X_test)
    SSE = (np.sum((y_test - y_pred)**2))
    SST = (np.sum((y_test - np.mean(y_train))**2))
                 
    return (1 - SSE/SST)

y_pred = dtr.predict(x_test)
osr2 = OSR2(dtr, x_test, y_test, y_train)
osr2

0.8815539289341042

In [35]:
# r^2 score of model on training set
dtr.score(x_train, y_train)

0.996833420369575

In [37]:
# calculating MSE for decision tree
from sklearn.metrics import mean_squared_error
decision_tree_mse = mean_squared_error(y_pred, y_test)
print("Test MSE for decision tree:", decision_tree_mse)

Test MSE for decision tree: 13780485977.697178


### 4.2 Decision Tree Regressor with cross validation

In [38]:
#building decision tree with cross validation on min_sample_leaf

from sklearn.model_selection import GridSearchCV

grid_values = {'min_samples_leaf': np.arange(1,31),
               'min_samples_split': [2],
               'max_depth':[5],
               'random_state': [88]}

            
dtr2 = DecisionTreeRegressor()
dtr2_cv= GridSearchCV(dtr2, param_grid = grid_values, scoring = 'r2', cv=10, verbose=0) 
dtr2_cv.fit(x_train, y_train)

GridSearchCV(cv=10, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': [5],
                         'min_samples_leaf': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]),
                         'min_samples_split': [2], 'random_state': [88]},
             scoring='r2')

In [39]:
# Optimal value for min_samples_leaf
dtr2_cv.best_params_['min_samples_leaf']

25

In [50]:
# OSR^2, has increased after tuning tree
y_pred2 = dtr2_cv.predict(x_test)
cv_osr2 = OSR2(dtr2_cv, x_test, y_test, y_train)
cv_osr2

0.896141320680433

In [41]:
# MSE on test set, has decreased after tuning the tree
cv_mse = mean_squared_error(y_pred2, y_test)
print("Test MSE for decision tree:", cv_mse)

Test MSE for decision tree: 12083330929.813627


In [42]:
# r^2 score of model on training set, has decreased after tuning the tree
dtr2_cv.score(x_train, y_train)

0.9423086840010003

In [43]:
MAE(y_pred2, y_test)

77996.01071220846

In [44]:
#recreating tree with chosen optimal parameters and creating feature importance chart
dtr3 = DecisionTreeRegressor(min_samples_leaf = 19,
               min_samples_split= 2,
               max_depth=5,
               random_state= 88)

dtr3 = dtr3.fit(x_train, y_train)

pd.DataFrame({'Feature': x_train.columns, 'Importance': dtr3.feature_importances_})

Unnamed: 0,Feature,Importance
0,Per_capita_income,0.882721
1,Precipitation,0.0
2,Population,0.023942
3,Year,0.026037
4,MonthNumeric,0.001605
5,PercentageOfBachelorDegree,0.0
6,NumAirports,0.00222
7,GradRates,0.008424
8,total_law_enforcement,0.000252
9,AvgHouseholdSize,0.0


The features that affect median house price the most are per capita income, year and national risk index. There are a number of features that have a feature importance of 0, meaning they have little effect on the target variable, including percentage of bachelor's degreees, whether or not there is a costco, average amount of people under 18, local tax rate, etc. 

In [45]:
def MAE(y_test, y_pred):
    return (np.mean(abs(y_test - y_pred)))

In [46]:
def RMSE(y_test, y_pred):
    return np.sqrt(np.mean((y_test - y_pred)**2))

In [52]:
performance = {'Decision Tree':[osr2, RMSE(y_test, y_pred), MAE(y_test, y_pred)],
               'Decision Tree with cv':[cv_osr2, RMSE(y_test, y_pred2), MAE(y_test, y_pred2)]}

performance_tbl = pd.DataFrame(performance, columns=['Decision Tree','Decision Tree with cv'])

performance_tbl.index = ['OSR2', 'RMSE', 'MAE']
performance_tbl

Unnamed: 0,Decision Tree,Decision Tree with cv
OSR2,0.896141,0.896141
RMSE,117390.314667,109924.205386
MAE,81429.549735,77996.010712


In [53]:
performance_tbl.to_csv('performance2.csv')