In [2]:
#2-3-6
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
%matplotlib notebook

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_moons

In [4]:
# Synthetic two-class "moons" data: 100 noisy samples, seeded so reruns match.
x, y = make_moons(
    n_samples=100,
    noise=0.25,
    random_state=3,
)

In [7]:
import sklearn.model_selection as ms
# Split into train/test parts; stratify on y and seed the shuffle.
x_train, x_test, y_train, y_test = ms.train_test_split(
    x,
    y,
    stratify=y,
    random_state=42,
)

In [8]:
# A deliberately small forest (five trees) with a fixed seed.
forest = RandomForestClassifier(
    n_estimators=5,
    random_state=2,
)
# Kept as the cell's last expression so the fitted estimator's repr is shown.
forest.fit(X=x_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=5, n_jobs=1, oob_score=False, random_state=2,
            verbose=0, warm_start=False)

In [11]:
# One panel per tree in the forest, plus a final (bottom-right) panel for the
# decision boundary of the forest as a whole.
fig, axes = plt.subplots(2, 3, figsize=(20, 10))

for i, (ax, tree) in enumerate(zip(axes.ravel(), forest.estimators_)):
    ax.set_title("Tree {}".format(i))
    mglearn.plots.plot_tree_partition(x_train, y_train, tree, ax=ax)

forest_ax = axes[-1, -1]
mglearn.plots.plot_2d_separator(forest, x_train, fill=True, ax=forest_ax, alpha=.4)
forest_ax.set_title("Random Forest")
# Target the forest panel explicitly instead of relying on pyplot's implicit
# "current axes" happening to be the last subplot that was created.
mglearn.discrete_scatter(x_train[:, 0], x_train[:, 1], y_train, ax=forest_ax)

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x11dd8b6d8>,
 <matplotlib.lines.Line2D at 0x11dd94588>]

In [15]:
from sklearn.datasets import load_breast_cancer
# Load the breast cancer dataset through the sklearn.datasets loader; later
# cells read its .data, .target and .feature_names attributes.
cancer = load_breast_cancer()

In [18]:
# Re-bind the train/test names to the breast cancer data (seeded split).
x_train, x_test, y_train, y_test = ms.train_test_split(
    cancer.data,
    cancer.target,
    random_state=0,
)

# Configure a 100-tree forest; it is fitted in the next cell.
forest = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
)

In [19]:
# Fit the forest on the training split. As the cell's last expression, the
# returned estimator's repr is what the notebook displays below.
forest.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [21]:
# Report the forest's score on the data it was fitted on and on the held-out
# data (label spelling corrected from "accurary").
print("accuracy on training set: {:.3f}".format(forest.score(x_train, y_train)))
print("accuracy on test set: {:.3f}".format(forest.score(x_test, y_test)))

accurary on training set: 1.000
accurary on test set: 0.972


In [41]:
def plot_feature_importances_cancer(model, file_name=None):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    One bar is drawn per column of ``cancer.data`` and each bar is labelled
    with the matching entry of ``cancer.feature_names``; ``cancer`` is the
    notebook-level dataset object, not a parameter.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a ``feature_importances_`` attribute.
    file_name : str or None, optional
        Where to save the chart. When omitted (``None``) the chart is drawn
        but not written to disk.
    """
    n_features = cancer.data.shape[1]
    # Open a fresh figure: with an interactive backend the previous figure
    # stays current between cells, so without this the bars would be drawn
    # on top of whatever was plotted last.
    plt.figure()
    plt.barh(range(n_features), model.feature_importances_, align="center")
    plt.yticks(np.arange(n_features), cancer.feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    # Leave a one-bar margin below the first bar and above the last bar.
    plt.ylim(-1, n_features)
    if file_name is not None:
        plt.savefig(file_name)

In [31]:
# Save the forest's feature-importance chart so the image can be checked later.
# The helper's second argument is the output file name.
plot_feature_importances_cancer(forest, "RandomForestClassifier.png")

In [32]:
from sklearn.ensemble import GradientBoostingClassifier

In [33]:
# Same seeded split of the breast cancer data, repeated for the boosting section.
x_train, x_test, y_train, y_test = ms.train_test_split(
    cancer.data,
    cancer.target,
    random_state=0,
)

In [36]:
# Gradient boosting restricted to depth-1 trees, with a fixed seed.
gbrt = GradientBoostingClassifier(
    max_depth=1,
    random_state=0,
)
# Last expression of the cell, so the fitted estimator's repr is displayed.
gbrt.fit(X=x_train, y=y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=1,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=0,
              subsample=1.0, verbose=0, warm_start=False)

In [37]:
# Report the depth-1 boosted model's score on the training and held-out data
# (label spelling corrected from "accurary").
print("accuracy on training set: {:.3f}".format(gbrt.score(x_train, y_train)))
print("accuracy on test set: {:.3f}".format(gbrt.score(x_test, y_test)))

accurary on training set: 0.991
accurary on test set: 0.972


In [38]:
# Gradient boosting again, this time lowering the learning rate to 0.01
# instead of limiting the tree depth.
gbrt = GradientBoostingClassifier(
    learning_rate=0.01,
    random_state=0,
)
# Last expression of the cell, so the fitted estimator's repr is displayed.
gbrt.fit(X=x_train, y=y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=0,
              subsample=1.0, verbose=0, warm_start=False)

In [39]:
# Report the low-learning-rate boosted model's score on the training and
# held-out data (label spelling corrected from "accurary").
print("accuracy on training set: {:.3f}".format(gbrt.score(x_train, y_train)))
print("accuracy on test set: {:.3f}".format(gbrt.score(x_test, y_test)))

accurary on training set: 0.988
accurary on test set: 0.965


In [43]:
# Refit the depth-1 boosted model, then chart its feature importances and
# write the chart to a PNG file.
gbrt = GradientBoostingClassifier(
    max_depth=1,
    random_state=0,
)
gbrt.fit(X=x_train, y=y_train)

plot_feature_importances_cancer(
    model=gbrt,
    file_name="GradientBoostingClassifier.png",
)