In [2]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression(solver='liblinear', random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)
svm_clf = SVC(gamma='auto', random_state=42)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard')
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(random_state=42,
                                                 solver='liblinear')),
                             ('rf',
                              RandomForestClassifier(n_estimators=10,
                                                     random_state=42)),
                             ('svc', SVC(gamma='auto', random_state=42))])

In [4]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.872
SVC 0.888
VotingClassifier 0.896


In [5]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1, random_state=42)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [6]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500, bootstrap=True, n_jobs=-1, oob_score = True)

In [7]:
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.896

In [8]:
from sklearn.metrics import accuracy_score
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test,y_pred)

0.904

In [9]:
bag_clf.oob_decision_function_

array([[0.41666667, 0.58333333],
       [0.38333333, 0.61666667],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.00546448, 0.99453552],
       [0.10784314, 0.89215686],
       [0.355     , 0.645     ],
       [0.02030457, 0.97969543],
       [1.        , 0.        ],
       [0.97512438, 0.02487562],
       [0.76767677, 0.23232323],
       [0.        , 1.        ],
       [0.73684211, 0.26315789],
       [0.8627451 , 0.1372549 ],
       [0.96825397, 0.03174603],
       [0.06358382, 0.93641618],
       [0.        , 1.        ],
       [0.9800995 , 0.0199005 ],
       [0.93478261, 0.06521739],
       [0.99393939, 0.00606061],
       [0.01111111, 0.98888889],
       [0.38341969, 0.61658031],
       [0.84895833, 0.15104167],
       [1.        , 0.        ],
       [0.9744898 , 0.0255102 ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.61666667, 0.38333333],
       [0.

In [10]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

In [12]:
bag_clf = BaggingClassifier(DecisionTreeClassifier(max_features="auto",max_leaf_nodes=16),
                           n_estimators = 500, max_samples=1.0, bootstrap=True, n_jobs=-1)

In [13]:
from sklearn.datasets import load_iris
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators = 500, n_jobs = -1)
rnd_clf.fit(iris["data"], iris["target"])
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name,score)

sepal length (cm) 0.0920291672212496
sepal width (cm) 0.023839698236347413
petal length (cm) 0.442264300460902
petal width (cm) 0.44186683408150107


In [14]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5, random_state=42)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200, random_state=42)

In [16]:
from sklearn.tree import DecisionTreeRegressor

tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)

DecisionTreeRegressor(max_depth=2)

In [17]:
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)

DecisionTreeRegressor(max_depth=2)

In [18]:
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)

DecisionTreeRegressor(max_depth=2)

In [22]:
#y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))

In [23]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=0.1, random_state=42)
gbrt.fit(X, y)

GradientBoostingRegressor(max_depth=2, n_estimators=3, random_state=42)

In [24]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=49)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120, random_state=42)
gbrt.fit(X_train, y_train)

errors = [mean_squared_error(y_val, y_pred)
          for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors)

gbrt_best = GradientBoostingRegressor(max_depth=2,n_estimators=bst_n_estimators, random_state=42)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=104, random_state=42)

In [25]:
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True, random_state=42)

min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break  # 조기 종료

In [26]:
import xgboost

xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train,y_train)
y_pred = xgb_reg.predict(X_val)



In [28]:
xgb_reg.fit(X_train, y_train, eval_set = [(X_val,y_val)],early_stopping_rounds=2)
y_pred = xgb_reg.predict(X_val)

[0]	validation_0-rmse:0.468589
Will train until validation_0-rmse hasn't improved in 2 rounds.
[1]	validation_0-rmse:0.441452
[2]	validation_0-rmse:0.418135
[3]	validation_0-rmse:0.397813
[4]	validation_0-rmse:0.380752
[5]	validation_0-rmse:0.363952
[6]	validation_0-rmse:0.351659
[7]	validation_0-rmse:0.338365
[8]	validation_0-rmse:0.327066
[9]	validation_0-rmse:0.317068
[10]	validation_0-rmse:0.310236
[11]	validation_0-rmse:0.302789
[12]	validation_0-rmse:0.296803
[13]	validation_0-rmse:0.290898
[14]	validation_0-rmse:0.28784
[15]	validation_0-rmse:0.283445
[16]	validation_0-rmse:0.28044
[17]	validation_0-rmse:0.27629
[18]	validation_0-rmse:0.27392
[19]	validation_0-rmse:0.270562
[20]	validation_0-rmse:0.268778
[21]	validation_0-rmse:0.266086
[22]	validation_0-rmse:0.264676
[23]	validation_0-rmse:0.262436
[24]	validation_0-rmse:0.261475
[25]	validation_0-rmse:0.259624
[26]	validation_0-rmse:0.258968
[27]	validation_0-rmse:0.257185
[28]	validation_0-rmse:0.256753
[29]	validation_0-rmse