# Ensemble Learning and Random Forests

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

# Build a noisy 500-sample "two moons" dataset, then split it into train and
# test sets. Both calls are seeded (random_state=42) so the data and the split
# are the same on every run.
X, y = make_moons(
    n_samples=500,
    noise=0.30,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [3]:
# Base estimators that are evaluated individually and combined by voting below.
log_clf = LogisticRegression()
# A random forest is stochastic (bootstrap samples, random feature subsets), so
# seed it with the same value used for the dataset and split; otherwise the
# reported accuracies change on every run.
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC()

In [4]:
# Hard-voting ensemble over the three base estimators defined earlier.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('rf', rnd_clf),
        ('svc', svm_clf),
    ],
    voting='hard',
)

voting_clf.fit(X_train, y_train)

In [5]:
from sklearn.metrics import accuracy_score

# Fit every base model plus the hard-voting ensemble on the training set and
# print each one's class name alongside its test-set accuracy.
hard_voting_models = (log_clf, rnd_clf, svm_clf, voting_clf)
for clf in hard_voting_models:
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    model_name = type(clf).__name__
    print(model_name, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.896


In [6]:
# probability=True makes the SVC expose predict_proba, which soft voting needs.
# Those probability estimates come from an internal cross-validation that
# shuffles the data, so seed it to keep the soft-voting result repeatable.
svm_clf_prob = SVC(probability=True, random_state=42)

In [7]:
# Soft-voting ensemble: averages the predicted class probabilities of the base
# estimators, hence the probability-enabled SVC.
voting_clf_prob = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf_prob)],
    voting='soft',
)

# Fit the ensemble that was just built. The cell previously refit the
# hard-voting `voting_clf`, leaving `voting_clf_prob` unfitted.
voting_clf_prob.fit(X_train, y_train)

In [8]:
from sklearn.metrics import accuracy_score

# Same comparison as for hard voting, now with the probability-enabled SVC and
# the soft-voting ensemble: fit each model, then print its test-set accuracy.
soft_voting_models = (log_clf, rnd_clf, svm_clf_prob, voting_clf_prob)
for clf in soft_voting_models:
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    model_name = type(clf).__name__
    print(model_name, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.904


# Bagging and Pasting

In [9]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging ensemble of 500 decision trees, each trained on 100 instances drawn
# with replacement (bootstrap=True). n_jobs=-1 lets scikit-learn use all cores.
# random_state seeds the resampling so predictions are repeatable across runs,
# matching the seeded dataset and split.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)

bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [10]:
# Same bagging setup, with oob_score=True so each tree is also evaluated on the
# training instances it did not sample (out-of-bag evaluation).
# random_state seeds the resampling; without it oob_score_ and the test
# accuracy differ on every run.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True,
    random_state=42,
)

In [11]:
# Train the OOB-enabled bagging ensemble; the OOB attributes read in the
# following cells are only available after this fit.
bag_clf.fit(X_train, y_train)

In [12]:
# Out-of-bag score of the fitted ensemble (requires oob_score=True at
# construction time).
bag_clf.oob_score_

0.9253333333333333

In [13]:
from sklearn.metrics import accuracy_score
# Predict the held-out test set so the OOB score can be compared with the
# actual test accuracy.
y_pred = bag_clf.predict(X_test)

In [14]:
# Test-set accuracy of the bagging ensemble, using y_pred from the previous cell.
accuracy_score(y_test, y_pred)

0.92

In [15]:
# Out-of-bag decision function: per-class probability estimates, presumably one
# row per training instance (see the scikit-learn docs to confirm the shape).
# NOTE(review): this displays the entire array; consider showing only a slice
# (e.g. the first few rows) to keep the notebook output short.
bag_clf.oob_decision_function_

array([[0.35233161, 0.64766839],
       [0.40526316, 0.59473684],
       [0.99738903, 0.00261097],
       [0.01591512, 0.98408488],
       [0.02077922, 0.97922078],
       [0.0874036 , 0.9125964 ],
       [0.41012658, 0.58987342],
       [0.05835544, 0.94164456],
       [0.93516209, 0.06483791],
       [0.86772487, 0.13227513],
       [0.57908847, 0.42091153],
       [0.04896907, 0.95103093],
       [0.78036176, 0.21963824],
       [0.83113456, 0.16886544],
       [0.91556728, 0.08443272],
       [0.10621762, 0.89378238],
       [0.03674541, 0.96325459],
       [0.94573643, 0.05426357],
       [0.66161616, 0.33838384],
       [0.92761394, 0.07238606],
       [0.05221932, 0.94778068],
       [0.21578947, 0.78421053],
       [0.87631579, 0.12368421],
       [0.99485861, 0.00514139],
       [0.96221662, 0.03778338],
       [0.00789474, 0.99210526],
       [0.94117647, 0.05882353],
       [1.        , 0.        ],
       [0.02056555, 0.97943445],
       [0.73036649, 0.26963351],
       [0.

# Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 500 trees, each limited to 16 leaf nodes, trained on all
# cores. random_state seeds the bootstrap and feature sampling so the accuracy
# and feature importances shown below are repeatable.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=42,
)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

In [17]:
# Test-set accuracy of the random forest predictions.
accuracy_score(y_test, y_pred_rf)

0.912

In [18]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble with the same size limits as the random forest above.
# random_state seeds its randomized splits so the accuracy is repeatable.
xtr_clf = ExtraTreesClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=42,
)
xtr_clf.fit(X_train, y_train)

# NOTE(review): this overwrites the random forest's `y_pred_rf` with the
# extra-trees predictions. The name is kept because the next cell reads it;
# consider renaming both uses (e.g. to `y_pred_xtr`) together.
y_pred_rf = xtr_clf.predict(X_test)

In [19]:
# Test-set accuracy of the extra-trees model: despite its name, `y_pred_rf`
# holds the extra-trees predictions assigned in the previous cell.
accuracy_score(y_test, y_pred_rf)

0.912

In [20]:
# Feature importances of `rnd_clf`. When cells run in order this is the random
# forest fitted on the moons training data, so there is one score per input
# feature.
rnd_clf.feature_importances_

array([0.42203461, 0.57796539])

In [21]:
from sklearn.datasets import load_iris
iris = load_iris()
# Refit a fresh random forest on the full iris dataset. This rebinds `rnd_clf`,
# and random_state keeps the importance scores repeatable across runs.
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)
rnd_clf.fit(iris['data'], iris['target'])
# Pair each feature name with its importance score and print one per line.
for name, score in zip(iris['feature_names'], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.10395151961655495
sepal width (cm) 0.02354472202876671
petal length (cm) 0.4596703414940854
petal width (cm) 0.41283341686059294
