In [8]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor

In [3]:
df = pd.read_csv('../data/housing.csv')

In [4]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [5]:
tree = DecisionTreeRegressor()

In [20]:
left = df[df.LSTAT < 9.67]
right = df[df.LSTAT >= 9.67]

In [21]:
root_error = np.mean((df['PRICE']-df['PRICE'].mean())**2)

In [30]:
root_error

84.4195561561656

In [22]:
right_error = np.mean((right['PRICE']-right['PRICE'].mean())**2)

In [23]:
left_error = np.mean((left['PRICE']-left['PRICE'].mean())**2)

In [24]:
right_error

24.32308736092969

In [25]:
left_error

79.95297868897926

In [26]:
num_samples = df.shape[0]
right_samples = right.shape[0]
left_samples = left.shape[0]

In [27]:
right_samples/num_samples * right_error

14.32466409793883

In [28]:
left_samples/num_samples * left_error

32.86604657570689

In [29]:
root_error - (right_samples/num_samples * right_error) - (left_samples/num_samples * left_error)

37.22884548251987

In [33]:
tree = DecisionTreeRegressor(max_depth=4)

In [34]:
X = df.drop('PRICE', axis=1)
y = df['PRICE']

tree.fit(X,y)

DecisionTreeRegressor(max_depth=4)

In [35]:
from prep import draw_tree

draw_tree(tree, X)

ModuleNotFoundError: No module named 'graphviz'

In [37]:
df['Prediction'] = tree.predict(X)

In [38]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE,Prediction
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0,27.427273
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6,21.629744
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,32.74878
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,32.74878
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2,32.74878


In [41]:
# score for R squared:
tree.score(X,y)

0.8857396443908376

In [40]:
tree.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': 4,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': None,
 'splitter': 'best'}

In [43]:
#boosting starts with a naive prediction
naive_guess = y.mean()

In [44]:
#get the error column
gradient = y-y.mean()

In [45]:
gradient

0       1.467194
1      -0.932806
2      12.167194
3      10.867194
4      13.667194
         ...    
501    -0.132806
502    -1.932806
503     1.367194
504    -0.532806
505   -10.632806
Name: PRICE, Length: 506, dtype: float64

In [47]:
#initialize a decision tree
tree = DecisionTreeRegressor(max_depth=4)

In [49]:
# fit a tree on X and the gradient
tree.fit(X, gradient)

DecisionTreeRegressor(max_depth=4)

#### The decision tree in this scenario is predicting the **error** of y, not y itself

In [51]:
# predicting the ERROR of y, not y
tree.predict(X)

array([  4.8944664 ,  -0.90306273,  10.21597416,  10.21597416,
        10.21597416,  -0.90306273,  -0.90306273,  -2.51197299,
        -2.51197299,  -2.51197299,  -2.51197299,  -0.90306273,
        -2.51197299,  -0.90306273,  -0.90306273,  -0.90306273,
        -0.90306273,  -6.29384529,  -0.90306273,  -0.90306273,
        -6.29384529,  -0.90306273,  -6.29384529,  -6.29384529,
        -6.29384529,  -6.29384529,  -6.29384529,  -6.29384529,
        -0.90306273,   4.8944664 ,  -6.29384529,  -0.90306273,
        -6.29384529,  -6.29384529,  -6.29384529,  -0.90306273,
        -0.90306273,  -0.90306273,  -0.90306273,   4.8944664 ,
        10.21597416,   4.8944664 ,  -0.90306273,  -0.90306273,
        -0.90306273,  -0.90306273,  -0.90306273,  -2.51197299,
        -2.51197299,  -2.51197299,  -0.90306273,  -0.90306273,
        -0.90306273,  -0.90306273,  -2.51197299,  10.21597416,
        -0.90306273,   4.8944664 ,  -0.90306273,  -0.90306273,
        -0.90306273,  -2.51197299,  -0.90306273,   4.89

In [52]:
naive_guess += tree.predict(X)

In [53]:
naive_guess

array([27.42727273, 21.62974359, 32.74878049, 32.74878049, 32.74878049,
       21.62974359, 21.62974359, 20.02083333, 20.02083333, 20.02083333,
       20.02083333, 21.62974359, 20.02083333, 21.62974359, 21.62974359,
       21.62974359, 21.62974359, 16.23896104, 21.62974359, 21.62974359,
       16.23896104, 21.62974359, 16.23896104, 16.23896104, 16.23896104,
       16.23896104, 16.23896104, 16.23896104, 21.62974359, 27.42727273,
       16.23896104, 21.62974359, 16.23896104, 16.23896104, 16.23896104,
       21.62974359, 21.62974359, 21.62974359, 21.62974359, 27.42727273,
       32.74878049, 27.42727273, 21.62974359, 21.62974359, 21.62974359,
       21.62974359, 21.62974359, 20.02083333, 20.02083333, 20.02083333,
       21.62974359, 21.62974359, 21.62974359, 21.62974359, 20.02083333,
       32.74878049, 21.62974359, 27.42727273, 21.62974359, 21.62974359,
       21.62974359, 20.02083333, 21.62974359, 27.42727273, 32.74878049,
       21.62974359, 21.62974359, 21.62974359, 21.62974359, 21.62

In [54]:
gradient = y - naive_guess

In [55]:
gradient

0     -3.427273
1     -0.029744
2      1.951220
3      0.651220
4      3.451220
         ...   
501   -5.027273
502   -1.029744
503   -8.848780
504   -5.427273
505   -9.729744
Name: PRICE, Length: 506, dtype: float64

In [56]:
tree.fit(X, gradient)

DecisionTreeRegressor(max_depth=4)

In [57]:
tree.predict(X)

array([-0.7878807 , -0.7878807 , -0.7878807 ,  1.44307887,  1.44307887,
        3.88472594, -0.7878807 ,  0.54781545, -1.93156374,  0.54781545,
        0.54781545, -0.7878807 ,  0.54781545, -0.7878807 , -0.7878807 ,
       -0.7878807 ,  3.844535  ,  0.54781545, -0.7878807 , -0.7878807 ,
        0.54781545, -0.7878807 ,  0.54781545,  0.54781545,  0.54781545,
        0.54781545,  0.54781545,  0.54781545, -0.7878807 , -0.7878807 ,
        0.54781545, -0.7878807 , -1.93156374,  0.54781545,  0.54781545,
       -0.7878807 , -0.7878807 , -0.7878807 ,  3.844535  ,  1.73639029,
        1.73639029,  1.73639029,  1.73639029,  1.73639029, -0.7878807 ,
       -0.7878807 , -0.7878807 ,  0.54781545, -1.93156374,  0.54781545,
       -0.7878807 , -0.7878807 ,  1.15555972,  1.15555972,  0.54781545,
        1.15555972, -0.7878807 , -0.7878807 , -0.98539764, -0.7878807 ,
       -0.7878807 , -0.7878807 , -0.7878807 , -0.7878807 ,  1.44307887,
        1.15555972,  1.15555972,  1.15555972, -0.7878807 ,  1.15

In [58]:
naive_guess += tree.predict(X)

In [59]:
gradient = y - naive_guess

In [60]:
tree.fit(X, gradient)

DecisionTreeRegressor(max_depth=4)

In [61]:
tree.predict(X)

array([-0.04749562,  1.31749276, -0.04749562, -0.04749562, -0.04749562,
        1.31749276, -1.68604327,  1.31749276, -1.68604327, -1.68604327,
        1.31749276, -1.68604327, -1.68604327, -1.56022518, -1.56022518,
       -1.56022518, -3.50683531, -1.56022518, -1.56022518, -1.56022518,
       -3.61531683, -1.56022518, -3.61531683, -1.56022518, -1.56022518,
       -1.56022518, -1.56022518, -1.56022518, -1.56022518, -3.61531683,
       -3.61531683, -3.61531683, -3.61531683, -3.61531683, -3.61531683,
        0.16784939,  0.16784939,  0.16784939,  0.16784939, -0.04749562,
       -0.04749562, -0.04749562,  1.31749276,  1.31749276,  1.31749276,
       -1.68604327, -1.68604327, -1.68604327, -1.68604327, -1.68604327,
       -1.68604327,  1.31749276,  1.31749276, -1.68604327, -1.56022518,
       -0.04749562,  1.31749276, -0.04749562,  1.31749276, -1.68604327,
       -1.68604327, -1.68604327,  1.31749276, -0.04749562, -0.04749562,
        1.31749276, -1.68604327, -1.68604327, -1.68604327, -1.68

In [62]:
naive_guess += tree.predict(X)

In [63]:
naive_guess

array([26.59189641, 22.15935565, 31.91340417, 34.14436374, 34.14436374,
       26.83196229, 19.15581962, 21.88614154, 16.40322632, 18.88260551,
       21.88614154, 19.15581962, 18.88260551, 19.28163771, 19.28163771,
       19.28163771, 21.96744328, 15.22655131, 19.28163771, 19.28163771,
       13.17145966, 19.28163771, 13.17145966, 15.22655131, 15.22655131,
       15.22655131, 15.22655131, 15.22655131, 19.28163771, 23.0240752 ,
       13.17145966, 17.22654606, 10.69208047, 13.17145966, 13.17145966,
       21.00971228, 21.00971228, 21.00971228, 25.64212798, 29.1161674 ,
       34.43767516, 29.1161674 , 24.68362664, 24.68362664, 22.15935565,
       19.15581962, 19.15581962, 18.88260551, 16.40322632, 18.88260551,
       19.15581962, 22.15935565, 24.10279608, 21.09926005, 19.0084236 ,
       33.8568446 , 22.15935565, 26.59189641, 21.96183871, 19.15581962,
       19.15581962, 17.54690936, 22.15935565, 26.59189641, 34.14436374,
       24.10279608, 21.09926005, 21.09926005, 19.15581962, 21.09

In [64]:
naive_guess[:5]

array([26.59189641, 22.15935565, 31.91340417, 34.14436374, 34.14436374])

In [65]:
y[:5]

0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: PRICE, dtype: float64

In [70]:
#smoothing parameter
learning_rate = .1

In [68]:
naive_guess += tree.predict(X) * learning_rate

In [69]:
naive_guess

array([26.58714685, 22.29110493, 31.90865461, 34.13961418, 34.13961418,
       26.96371157, 18.98721529, 22.01789082, 16.234622  , 18.71400119,
       22.01789082, 18.98721529, 18.71400119, 19.12561519, 19.12561519,
       19.12561519, 21.61675975, 15.07052879, 19.12561519, 19.12561519,
       12.80992797, 19.12561519, 12.80992797, 15.07052879, 15.07052879,
       15.07052879, 15.07052879, 15.07052879, 19.12561519, 22.66254351,
       12.80992797, 16.86501438, 10.33054878, 12.80992797, 12.80992797,
       21.02649722, 21.02649722, 21.02649722, 25.65891292, 29.11141784,
       34.4329256 , 29.11141784, 24.81537592, 24.81537592, 22.29110493,
       18.98721529, 18.98721529, 18.71400119, 16.234622  , 18.71400119,
       18.98721529, 22.29110493, 24.23454535, 20.93065572, 18.85240109,
       33.85209504, 22.29110493, 26.58714685, 22.09358799, 18.98721529,
       18.98721529, 17.37830504, 22.29110493, 26.58714685, 34.13961418,
       24.23454535, 20.93065572, 20.93065572, 18.98721529, 20.93

In [71]:
from sklearn.ensemble import GradientBoostingRegressor

In [72]:
gbm = GradientBoostingRegressor()

In [74]:
# n_estimators is the number of boosting rounds
gbm.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'presort': 'deprecated',
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [76]:
tree = DecisionTreeRegressor(max_depth=4)

In [78]:
gbm.fit(X,y)

GradientBoostingRegressor()

In [79]:
gbm.score(X,y)

0.9761405838418584