In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the concrete dataset from the project's datasets folder (path is relative
# to the notebook's working directory).
concrete = pd.read_csv(filepath_or_buffer='datasets/concrete_data.csv')

# Preview the first five rows to sanity-check the columns that were read.
concrete.head(5)

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
# Target: the 'csMPa' column of the loaded frame.
Y = concrete['csMPa']
# Features: every remaining column once the target has been dropped.
X = concrete.drop(columns='csMPa')

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The split is random, so the seed is
# pinned: without `random_state` every Restart & Run All produces a different
# train/test partition and therefore different residuals and R² scores below.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

Fit a DecisionTreeRegressor to the training set

In [5]:
from sklearn.tree import DecisionTreeRegressor

In [6]:
# Stage 1: a depth-3 tree fitted directly on the training targets.
# `fit` returns the estimator, so construction and fitting are chained.
tree_reg1 = DecisionTreeRegressor(max_depth=3).fit(x_train, y_train)

# Echo the fitted estimator as the cell output.
tree_reg1

DecisionTreeRegressor(max_depth=3)

Now train a second DecisionTreeRegressor on the residual errors made by the first predictor

In [7]:
# Residuals of stage 1 on the training set: actual target minus the first
# tree's prediction. These become the target for the second tree.
y2 = y_train.sub(tree_reg1.predict(x_train))

# Show the first ten residuals.
y2.head(10)

350      5.640682
507     -1.183704
49       3.382414
982    -15.321048
41      -3.937255
806     -5.437586
215     -5.339318
662      7.053519
1021    -8.541048
646      7.288952
Name: csMPa, dtype: float64

In [8]:
# Stage 2: a depth-4 tree fitted on the stage-1 residuals (y2), not on the
# original targets.
tree_reg2 = DecisionTreeRegressor(max_depth=4).fit(x_train, y2)

# Echo the fitted estimator as the cell output.
tree_reg2

DecisionTreeRegressor(max_depth=4)

Now we train a third regressor on the residual errors made by the second predictor

In [9]:
# Residuals left after stage 2: what the second tree failed to explain of y2.
y3 = y2.sub(tree_reg2.predict(x_train))

# Show the first ten residuals.
y3.head(10)

350      7.274036
507      0.449650
49      -0.240309
982    -13.687694
41      -7.559977
806     -0.153889
215     -3.705964
662      8.140812
1021    -3.257351
646     -0.458873
Name: csMPa, dtype: float64

In [10]:
# Stage 3: a depth-5 tree fitted on the stage-2 residuals (y3).
tree_reg3 = DecisionTreeRegressor(max_depth=5).fit(x_train, y3)

# Echo the fitted estimator as the cell output.
tree_reg3

DecisionTreeRegressor(max_depth=5)

Now we have an ensemble containing three trees. It can make predictions on a new instance simply by adding up the predictions of all the trees.

In [11]:
# Ensemble prediction on the held-out set: the element-wise sum of the three
# stage predictions (base fit plus two residual corrections).
y_pred = sum(
    [
        stage.predict(x_test)
        for stage in (tree_reg1, tree_reg2, tree_reg3)
    ]
)

In [12]:
from sklearn.metrics import r2_score

# R² of the three-tree ensemble on the held-out test set.
r2_score(y_true=y_test, y_pred=y_pred)

0.7690600485748594

In [13]:
# Residuals left after stage 3; these are the target for a fourth tree.
y4 = y3.sub(tree_reg3.predict(x_train))

# Show the first ten residuals.
y4.head(10)

350     3.718908
507     4.837352
49     -1.130000
982    -9.299992
41      0.806000
806    -3.709017
215    -7.261092
662     8.222914
1021    1.130351
646    -7.423774
Name: csMPa, dtype: float64

In [14]:
# Stage 4: another depth-5 tree, fitted on the stage-3 residuals (y4).
tree_reg4 = DecisionTreeRegressor(max_depth=5).fit(x_train, y4)

# Echo the fitted estimator as the cell output.
tree_reg4

DecisionTreeRegressor(max_depth=5)

In [15]:
# Ensemble prediction with the fourth tree included: element-wise sum of all
# four stage predictions on the held-out set.
y_pred = sum(
    [
        stage.predict(x_test)
        for stage in (tree_reg1, tree_reg2, tree_reg3, tree_reg4)
    ]
)

# R² of the four-tree ensemble on the test set.
r2_score(y_true=y_test, y_pred=y_pred)

0.8230969262238206

### GradientBoostingRegressor
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html

In [16]:
from sklearn.ensemble import GradientBoostingRegressor

In [40]:
# Library implementation of the procedure built by hand above: 10000 boosting
# stages of depth-3 trees.
# NOTE(review): a learning rate above 1 amplifies each tree's correction
# instead of shrinking it — confirm this aggressive setting is intentional.
gbr = GradientBoostingRegressor(
    n_estimators=10000,
    learning_rate=1.5,
    max_depth=3,
).fit(x_train, y_train)

# Echo the fitted estimator as the cell output.
gbr

GradientBoostingRegressor(learning_rate=1.5, n_estimators=10000)

In [41]:
# Score whichever `gbr` was fitted most recently on the held-out test set.
y_pred = gbr.predict(x_test)

r2_score(y_true=y_test, y_pred=y_pred)

0.7645375425113543

In [39]:
# Very small ensemble: only 3 boosting stages of depth-3 trees at a 0.1
# learning rate. Rebinds `gbr`, replacing the previously fitted model.
gbr = GradientBoostingRegressor(
    n_estimators=3,
    learning_rate=0.1,
    max_depth=3,
).fit(x_train, y_train)

# Predict on the held-out set and report R² as the cell output.
y_pred = gbr.predict(x_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.2735295389367275

In [30]:
# Same configuration with ten times as many stages: 30 depth-3 trees at a 0.1
# learning rate. Rebinds `gbr`, replacing the previously fitted model.
gbr = GradientBoostingRegressor(
    n_estimators=30,
    learning_rate=0.1,
    max_depth=3,
).fit(x_train, y_train)

# Predict on the held-out set and report R² as the cell output.
y_pred = gbr.predict(x_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.84540934383657