In [1]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

X, y = make_moons(n_samples=1000, noise=0.2)

In [2]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC(probability=True)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft'
)
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()),
                             ('svc', SVC(probability=True))],
                 voting='soft')

In [4]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__,accuracy_score(y_test, y_pred))

LogisticRegression 0.865
RandomForestClassifier 0.98
SVC 0.985
VotingClassifier 0.97


In [5]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=50, max_samples=50, bootstrap=True, n_jobs=-1, oob_score=True)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.935


In [6]:
print(bag_clf.oob_score_) # the mean of the leftover scores

0.945


In [7]:
bag_clf.oob_decision_function_.shape

(800, 2)

In [8]:
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)

RandomForestClassifier(max_leaf_nodes=16, n_estimators=500, n_jobs=-1)

In [9]:
print(accuracy_score(y_test, rnd_clf.predict(X_test)))

0.975


In [10]:
from sklearn.datasets import load_iris
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"], iris["target"])

for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)
    

sepal length (cm) 0.09746584265462906
sepal width (cm) 0.02464035514868448
petal length (cm) 0.44510131177155615
petal width (cm) 0.4327924904251303


In [11]:
from sklearn.ensemble import AdaBoostClassifier

In [12]:
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=200, algorithm="SAMME.R", learning_rate=0.5)
ada_clf.fit(X_train, y_train)
y_pred = ada_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.985


In [13]:
from sklearn.tree import DecisionTreeRegressor
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)

DecisionTreeRegressor(max_depth=2)

In [14]:
y2 = y - tree_reg1.predict(X) # MAKE ERROR DATA!

In [15]:
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)

DecisionTreeRegressor(max_depth=2)

In [18]:
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)

DecisionTreeRegressor(max_depth=2)

In [21]:
y_pred = sum(tree.predict(X) for tree in (tree_reg1, tree_reg2, tree_reg3))

In [22]:
from sklearn.ensemble import GradientBoostingRegressor
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)

In [23]:
gbrt.fit(X, y)

GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=3)

In [39]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
gnrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gnrt.fit(X_train, y_train)

errors = [mean_squared_error(y_val, y_pred) for y_pred in gnrt.staged_predict(X_val)] # GradientBoostringRegressor().staged_predict() returns array or predicted values for numbers of estimators
best_number_estimator = np.argmin(errors)

gnrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=best_number_estimator)
gnrt_best.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=116)

In [40]:
print(best_number_estimator)

116


In [50]:
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True) #setting warm_start to true allows users to train a model after training it -> in a complete way ## no need to set the estimator size! alling fit will grow new one?
min_value_error = float("inf")
error_going_up = 0 # this will be our error counter
count = 0 
for n_estimators in range(1, 1000):
    gbrt.n_estimators = n_estimators # this is how we can modify the class's size?
    gbrt.fit(X_train, y_train) # using warm_start to true lets you keep the original shape -> doesn't train from head to toe
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    count += 1
    if(val_error < min_value_error):
        error_going_up = 0
        min_value_error = val_error
    else:
        error_going_up +=1 
        if ( error_going_up == 5):
            print("count: ", count)
            break
            
print(min_value_error)

count:  126
0.036327423362923446


In [51]:
import xgboost
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train, eval_set=[(X_val, y_val)],early_stopping_rounds=5)
y_pred = xgb_reg.predict(X_val)

[0]	validation_0-rmse:0.36840
[1]	validation_0-rmse:0.27971
[2]	validation_0-rmse:0.22169
[3]	validation_0-rmse:0.19126
[4]	validation_0-rmse:0.16939
[5]	validation_0-rmse:0.16082
[6]	validation_0-rmse:0.15395
[7]	validation_0-rmse:0.15260
[8]	validation_0-rmse:0.15139
[9]	validation_0-rmse:0.15043
[10]	validation_0-rmse:0.15022
[11]	validation_0-rmse:0.14978
[12]	validation_0-rmse:0.14999
[13]	validation_0-rmse:0.14971
[14]	validation_0-rmse:0.14900
[15]	validation_0-rmse:0.14969
[16]	validation_0-rmse:0.15024
[17]	validation_0-rmse:0.15070
[18]	validation_0-rmse:0.15032
[19]	validation_0-rmse:0.14999
