In [88]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

dataset = make_moons(n_samples=5000, noise=0.3)

X_train, X_test, y_train, y_test = train_test_split(dataset[0], dataset[1],test_size=0.2)

In [42]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# log_clf = LogisticRegression()
# rnd_clf = RandomForestClassifier()
# svm_clf = SVC()
log_clf = LogisticRegression(solver="lbfgs", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma="scale", probability=True,random_state=42)

voting_clf = VotingClassifier(estimators=
                            #   [('lr', log_clf), 
                               [('rf', rnd_clf), 
                               ('svc', svm_clf)],
                              voting='soft')

In [43]:
from sklearn.metrics import accuracy_score

for clf in [log_clf,rnd_clf, svm_clf, voting_clf]:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.8573
RandomForestClassifier 0.9055
SVC 0.9178
VotingClassifier 0.9146


In [91]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1, oob_score=True)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

accuracy_score(y_test, y_pred)

0.918

In [71]:
bag_clf.oob_score_

0.910325

In [90]:
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)

y_pred = rnd_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.914

In [80]:
# Equivalent Bagging Classifier
bag_clf = BaggingClassifier(DecisionTreeClassifier(splitter="random", max_leaf_nodes=16),
                            n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1)


In [94]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=500, algorithm="SAMME.R", 
    learning_rate=0.3)
ada_clf.fit(X_train, y_train)

y_pred = ada_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.912

In [92]:
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'n_estimators':[100,200,300,400,500,600], 'learning_rate': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]}
]
gs = GridSearchCV(ada_clf, param_grid, cv=5, n_jobs=-1)
gs.fit(X_train, y_train)

In [93]:
gs.best_params_

{'learning_rate': 0.3, 'n_estimators': 500}

In [96]:
# Gradient Boosting
import numpy as np
from sklearn.tree import DecisionTreeRegressor

np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)


tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg1.fit(X, y)

y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg2.fit(X, y2)

y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg3.fit(X, y3)

X_new = np.array([[0.8]])
y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))
y_pred


array([0.75026781])

In [97]:
from sklearn.ensemble import GradientBoostingRegressor

# Equivalent
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X,y)

In [106]:
from sklearn.ensemble import GradientBoostingClassifier

gb_clf = GradientBoostingClassifier(max_depth=2, n_estimators=50)

gb_clf.fit(X_train,y_train)

y_pred = gb_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.916

In [109]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X,y)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

errors = [mean_squared_error(y_val,y_pred) for y_pred in gbrt.staged_predict(X_val)]

best_n_estimators = np.argmin(errors) + 1
print(errors[np.argmin(errors)])

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=best_n_estimators)
gbrt_best.fit(X_train, y_train)

mean_squared_error(y_val,gbrt_best.predict(X_val))

0.001453240550054879


0.001453240550054879

In [122]:
# Early stopping
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float("inf")
gbrt_best = gbrt
best_n_estimators = 1
error_going_up = 0

for n_estimators in range(1,120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        best_n_estimators = n_estimators
        error_going_up = 0
    else:
        error_going_up += 1
        print(f"Best is {best_n_estimators}, current is {n_estimators}")
        if error_going_up == 5:
            print("early stopping..")
            break


print("OLD MODEL: ", mean_squared_error(y_val,gbrt.predict(X_val)))
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=best_n_estimators)
gbrt.fit(X_train,y_train)
print("NEW MODEL: ",mean_squared_error(y_val,gbrt.predict(X_val)))

Best is 38, current is 39
Best is 42, current is 43
Best is 50, current is 51
Best is 53, current is 54
Best is 53, current is 55
Best is 56, current is 57
Best is 56, current is 58
Best is 56, current is 59
Best is 56, current is 60
Best is 56, current is 61
early stopping..
OLD MODEL:  0.0014905867558970042
NEW MODEL:  0.0014771657215577891


In [123]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-1.7.5-py3-none-manylinux2014_x86_64.whl (200.3 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.3/200.3 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.7.5


In [133]:
import xgboost

xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train,y_train, eval_set=[(X_val,y_val)], early_stopping_rounds=2)
y_pred = xgb_reg.predict(X_val)

mean_squared_error(y_val, y_pred)

[0]	validation_0-rmse:0.23648
[1]	validation_0-rmse:0.17121
[2]	validation_0-rmse:0.12496
[3]	validation_0-rmse:0.09196
[4]	validation_0-rmse:0.06929
[5]	validation_0-rmse:0.05547
[6]	validation_0-rmse:0.04698
[7]	validation_0-rmse:0.04159
[8]	validation_0-rmse:0.03950
[9]	validation_0-rmse:0.03959
[10]	validation_0-rmse:0.03875
[11]	validation_0-rmse:0.03917




0.0015012255680689807

In [134]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1,as_frame=False)
mnist.keys()

  warn(


dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [159]:
X, y = mnist["data"], mnist["target"]

In [160]:
X.shape

(70000, 784)

In [161]:
y.shape

(70000,)

In [162]:
X_train = X[:50000]
X_val = X[50000:60000]
X_test = X[60000:]
y_train = y[:50000]
y_val = y[50000:60000]
y_test = y[60000:]

(X_train.shape, X_val.shape, X_test.shape)

((50000, 784), (10000, 784), (10000, 784))

In [144]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

svm_clf = LinearSVC()
rf_clf = RandomForestClassifier()
et_clf = ExtraTreesClassifier()

ensemble = VotingClassifier([('svm', svm_clf), ('rf', rf_clf), ('et', et_clf)], voting="soft", n_jobs=-1)


for clf in [svm_clf, rf_clf, et_clf, ensemble]:
    clf.fit(X_train, y_train)
    acc = accuracy_score(y_val, clf.predict(X_val))
    print(f"{clf.__class__.__name__}: ", acc)



SVC:  0.9802
RandomForestClassifier:  0.9706
ExtraTreesClassifier:  0.9753
VotingClassifier:  0.9812


In [145]:
for clf in [svm_clf, rf_clf, et_clf, ensemble]:
    acc = accuracy_score(y_test, clf.predict(X_test))
    print(f"{clf.__class__.__name__}: ", acc)

SVC:  0.9785
RandomForestClassifier:  0.9683
ExtraTreesClassifier:  0.9713
VotingClassifier:  0.9783


In [165]:
predictions = []

for clf in [svm_clf, rf_clf, et_clf]:
    predictions.append(clf.predict(X_val))

features = np.c_[predictions[0], predictions[1], predictions[2]]

In [170]:
from sklearn.ensemble import GradientBoostingClassifier
blender = GradientBoostingClassifier(n_estimators=200)
blender.fit(features,y_val)

In [169]:
predictions = []

for clf in [svm_clf, rf_clf, et_clf]:
    predictions.append(clf.predict(X_test))

X_test_blender = np.c_[predictions[0], predictions[1], predictions[2]]
y_test_blender = y_test

accuracy_score(y_test_blender, blender.predict(X_test_blender))



0.9743

In [171]:

accuracy_score(y_test_blender, blender.predict(X_test_blender))

0.974

In [172]:
rf_clf.predict_proba([X[0]])

array([[0.  , 0.  , 0.02, 0.08, 0.  , 0.87, 0.01, 0.  , 0.02, 0.  ]])

In [173]:
rf_clf.predict_log_proba([X[0]])

  return np.log(proba)


array([[       -inf,        -inf, -3.91202301, -2.52572864,        -inf,
        -0.13926207, -4.60517019,        -inf, -3.91202301,        -inf]])