In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
concrete = pd.read_csv('datasets/concrete_data.csv')

concrete.head()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
X = concrete.drop('csMPa', axis=1)

Y = concrete['csMPa']

In [4]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

### RandomForestRegressor
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

In [5]:
from sklearn.ensemble import RandomForestRegressor

In [6]:
rnd_reg = RandomForestRegressor(n_estimators=600, max_leaf_nodes=12, n_jobs=-1)

rnd_reg.fit(x_train, y_train)

RandomForestRegressor(max_leaf_nodes=12, n_estimators=600, n_jobs=-1)

In [7]:
y_pred = rnd_reg.predict(x_test)

In [8]:
from sklearn.metrics import r2_score

r2_score(y_test, y_pred)

0.7970450022518372

In [9]:
important_features = pd.Series(rnd_reg.feature_importances_, index=X.columns).\
                        sort_values(ascending=False)

important_features

age                 0.399367
cement              0.319255
superplasticizer    0.104268
water               0.093542
slag                0.058108
fineaggregate       0.013064
flyash              0.009472
coarseaggregate     0.002924
dtype: float64

#### BaggingRegressor + DecisionTreeRegressor(with splitter='random') = RandomForestRegressor

In [10]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

In [11]:
bag_reg = BaggingRegressor(DecisionTreeRegressor(splitter='random', max_leaf_nodes=12), 
                                                 n_estimators=600,
                                                 bootstrap=True,
                                                 max_samples=1.0,
                                                 n_jobs=-1)
bag_reg.fit(x_train, y_train)

BaggingRegressor(base_estimator=DecisionTreeRegressor(max_leaf_nodes=12,
                                                      splitter='random'),
                 n_estimators=600, n_jobs=-1)

In [12]:
y_pred = bag_reg.predict(x_test)

r2_score(y_test, y_pred)

0.7367800736013519

### ExtraTreesRegressor
https://scikit-learn.org/stable/modules/generated/sklearn.tree.ExtraTreeRegressor.html     
Extra-Tree Regressor is growing tree even more random by using random thresholds for each features rather than searching for best possible thresholds (like the regular decision tree).

In [13]:
from sklearn.ensemble import ExtraTreesRegressor 

In [14]:
extra_reg = ExtraTreesRegressor(n_estimators=600, max_leaf_nodes=12, n_jobs=-1)

extra_reg.fit(x_train, y_train)

ExtraTreesRegressor(max_leaf_nodes=12, n_estimators=600, n_jobs=-1)

In [15]:
y_pred = extra_reg.predict(x_test)

r2_score(y_test, y_pred)

0.7387807759695106

In [16]:
important_features = pd.Series(extra_reg.feature_importances_, index=X.columns).\
                        sort_values(ascending=False)

important_features

age                 0.379397
cement              0.301075
water               0.103183
superplasticizer    0.102918
slag                0.051447
flyash              0.036693
fineaggregate       0.019219
coarseaggregate     0.006069
dtype: float64