In [1]:
from sklearn.datasets import make_moons

dataset = make_moons(n_samples=10000)

X = dataset[0]
y = dataset[1]

In [2]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

voting_clf = VotingClassifier(
    estimators = [('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard'
)

voting_clf.fit(X_train, y_train)

In [4]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.8932
RandomForestClassifier 0.9996
SVC 1.0
VotingClassifier 0.9996



### Bagging and Pasting in Scikit-Learn

In [5]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1,
)

bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [6]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    bootstrap=True, n_jobs=-1, oob_score=True,
)

bag_clf.fit(X_train, y_train)

In [7]:
bag_clf.oob_score_

0.9994666666666666

In [8]:
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9988

In [9]:
bag_clf.oob_decision_function_

array([[1., 0.],
       [0., 1.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]])

## Random Forests

In [10]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

In [11]:
bag_clg = BaggingClassifier(
    DecisionTreeClassifier(splitter='random', max_leaf_nodes=16),
    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1,
)

In [12]:
print(f'RF score: {rnd_clf.score(X_test, y_test)}')
print(f'Bag score: {bag_clf.score(X_test, y_test)}')

RF score: 0.9996
Bag score: 0.9988


In [13]:
rnd_clf.feature_importances_

array([0.47555038, 0.52444962])

In [14]:
from sklearn.datasets import load_iris
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris['data'], iris['target'])
for name, score in zip(iris['feature_names'], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.1070982184384432
sepal width (cm) 0.022632308304673675
petal length (cm) 0.45213959470380805
petal width (cm) 0.41812987855307515


## Boosting

### AdaBoost
#### Weighted error rate of the $j^{th}$ predictor
$r_{j} = \dfrac{\hat{y}^{(i)}_{j} \neq y^{(i)}}{\sum _{i=1}^{m} w^{(i)}}$

#### Predictor weight
$\alpha_{j} = \eta \cdot log(\dfrac{1 - r_{j}}{r_{j}})$

#### Weight update rule
for $i = 1,2,...,m$
$w^{(i)} = w^{(i)} ~~~~~~~~~~~~~~~~~ if~ \hat{y_j}^{(i)} = y^{(i)} \\$
$w^{(i)} = w^{(i)}\cdot exp(\alpha_{j}) ~~ if ~ \hat{y_j}^{(i)} \neq y^{(i)}$

#### AdaBoost predictions
$\hat{y}(x) = argmax_k ~ \sum _{j=1, \hat{y}_j(x) = k}^{N} \alpha_j \newline$
where N is the number of predictors

In [15]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5,
)
ada_clf.fit(X_train, y_train)

In [16]:
ada_clf.score(X_test, y_test)

0.9988

## Gradient Boosting

In [17]:
from sklearn.tree import DecisionTreeRegressor

tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)

In [18]:
import math
import numpy as np

In [19]:
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)

In [20]:
y3 = y - np.ceil(tree_reg2.predict(X))
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)

In [21]:
y_pred = sum(tree.predict(X_test) for tree in (tree_reg1, tree_reg2, tree_reg3))

In [22]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X, y)

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X, y)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors)

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)

gbrt_best.fit(X_train, y_train)

In [24]:
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break

In [25]:
gbrt.get_params

<bound method BaseEstimator.get_params of GradientBoostingRegressor(max_depth=2, n_estimators=119, warm_start=True)>

In [26]:
import xgboost

xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_val)

In [27]:
xgb_reg.fit(X_train, y_train,
            eval_set = [(X_val, y_val)], early_stopping_rounds=2)
y_pred = xgb_reg.predict(X_val)

[0]	validation_0-rmse:0.35021
[1]	validation_0-rmse:0.24531
[2]	validation_0-rmse:0.17183
[3]	validation_0-rmse:0.12037
[4]	validation_0-rmse:0.08433
[5]	validation_0-rmse:0.05908
[6]	validation_0-rmse:0.04140
[7]	validation_0-rmse:0.02901
[8]	validation_0-rmse:0.02033
[9]	validation_0-rmse:0.01425
[10]	validation_0-rmse:0.00999
[11]	validation_0-rmse:0.00700
[12]	validation_0-rmse:0.00491
[13]	validation_0-rmse:0.00344
[14]	validation_0-rmse:0.00242
[15]	validation_0-rmse:0.00170
[16]	validation_0-rmse:0.00119
[17]	validation_0-rmse:0.00084
[18]	validation_0-rmse:0.00059
[19]	validation_0-rmse:0.00041
[20]	validation_0-rmse:0.00029
[21]	validation_0-rmse:0.00020
[22]	validation_0-rmse:0.00014
[23]	validation_0-rmse:0.00010
[24]	validation_0-rmse:0.00007
[25]	validation_0-rmse:0.00005
[26]	validation_0-rmse:0.00005
[27]	validation_0-rmse:0.00004
[28]	validation_0-rmse:0.00004
[29]	validation_0-rmse:0.00003
[30]	validation_0-rmse:0.00003




## Exercises

#### Question 1
    Yes, by using a Voting Classifier, i.e. comparing their individual results and outputing the most frequent (for classification) or the mean (for regression).

#### Question 2
    The hard voting uses the frequency of the classes to give a prediction. On the other hand the soft voting uses the probabilities of each class to weight its decision.  

#### Question 3
    For bagging and pasting yes. For boosting, no, since they need the previous result. And Stacking depends on how you train the individual predictors before the blender.

#### Question 4
    By using it, it's possible to see how well your model is generalizing.

#### Question 5
    Extra-Trees not only uses random features (as Random Forest), but also selects random tresholds for these features, which helps to prevent overfitting. Considering the randomness of the parameters, it can take more time to train then the Random Forest, but I don't think it's a considerably greater time. 

#### Question 6
    You can try increasing the number of estimators or their max_depth hyperparameter.

#### Question 7 
    Decrease it and using more tree (not too much), using the Regularization technique called shrinkage.

#### Question 8

In [33]:
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', version=1)
mnist.data.shape

(70000, 784)

In [34]:
X_train, X_test, y_train, y_test = train_test_split(mnist.data, mnist.target, test_size=1/7, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=1/6, random_state=42)

In [37]:
# Random Forest
rnd_clf = RandomForestClassifier()
rnd_clf.fit(X_train, y_train)
rnd_clf.score(X_val, y_val)

0.9693

In [38]:
# Extra Trees
from sklearn.ensemble import ExtraTreesClassifier 
ext_clf = ExtraTreesClassifier()
ext_clf.fit(X_train, y_train)
ext_clf.score(X_val, y_val)

0.9717

In [44]:
# SVM Classifier
svc_clf = SVC(probability=True)
svc_clf.fit(X_train, y_train)
svc_clf.score(X_val, y_val)

0.9788

In [41]:
# Hard Voting Classifier
voting_clf = VotingClassifier(
    estimators = [('ext', ext_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard',
    n_jobs=-1,
)
voting_clf.fit(X_train, y_train)
voting_clf.score(X_val, y_val)

0.9759

In [45]:
# Soft Voting Classifier
svoting_clf = VotingClassifier(
    estimators = [('ext', ext_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft',
    n_jobs=-1,
)
svoting_clf.fit(X_train, y_train)
svoting_clf.score(X_val, y_val)

AttributeError: predict_proba is not available when  probability=False