# Ensemble Methods

## Voting Classifiers

In [37]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# Build a noisy "moons" toy dataset; the fixed seed keeps the generated
# points identical on every run.
X, y = make_moons(n_samples=10_000, noise=0.3, random_state=42)

# Split into train and test partitions. No test_size is passed, so
# scikit-learn's default split proportion applies; the seed fixes the shuffle.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    random_state=42,
)

In [38]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [40]:
# Hard voting: every base estimator predicts a class label for an instance
# and the ensemble returns the label predicted most often (the mode).

log_clf = LogisticRegression(random_state=42)
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC(random_state=42)

voting_clf = VotingClassifier(
    estimators=[("lr", log_clf), ("rf", rnd_clf), ("svc", svm_clf)],
    voting="hard",
)

# Kept as the cell's final expression so the fitted ensemble is displayed.
voting_clf.fit(xtrain, ytrain)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', RandomFor...f',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [42]:
from sklearn.metrics import accuracy_score

# Fit each base classifier and the ensemble on the training split, then
# report test-set accuracy, one right-aligned line per model.
print("..Hard Voting Classifier Results..")
for model in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf_name = type(model).__name__
    ypred = model.fit(xtrain, ytrain).predict(xtest)
    print(f"{clf_name:>22}: {accuracy_score(ytest, ypred):0.3%}")

..Hard Voting Classifier Results..
    LogisticRegression: 85.880%
RandomForestClassifier: 90.640%
                   SVC: 91.960%
      VotingClassifier: 91.720%


**Note:** In this example, whether the `VotingClassifier` surpasses all other classifiers is merely probabilistic. Modifying how the data is split, or how the data is generated, can yield a lower `VotingClassifier` accuracy.

In [45]:
# Soft voting: average the class probabilities predicted by the individual
# classifiers and return the class with the highest averaged probability.

log_clf = LogisticRegression(random_state=42)
rnd_clf = RandomForestClassifier(random_state=42)
# probability=True is set so the SVC can supply class probabilities.
svm_clf = SVC(probability=True, random_state=42)

voting_clf = VotingClassifier(
    estimators=[("lr", log_clf), ("rf", rnd_clf), ("svc", svm_clf)],
    voting="soft",
)

# Kept as the cell's final expression so the fitted ensemble is displayed.
voting_clf.fit(xtrain, ytrain)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', RandomFor...bf',
  max_iter=-1, probability=True, random_state=42, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [47]:
from sklearn.metrics import accuracy_score

# Fit each base classifier and the soft-voting ensemble on the training
# split, then report test-set accuracy, one right-aligned line per model.
print("..Soft Classifier Results..")
for model in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf_name = type(model).__name__
    ypred = model.fit(xtrain, ytrain).predict(xtest)
    print(f"{clf_name:>22}: {accuracy_score(ytest, ypred):0.3%}")

..Soft Classifier Results..
    LogisticRegression: 85.880%
RandomForestClassifier: 90.640%
                   SVC: 91.960%
      VotingClassifier: 91.760%


## Bagging and Pasting

In [48]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging ensemble of decision trees: each tree is trained on its own random
# subset of the training set and the ensemble aggregates their predictions.
# NOTE: the ensemble is now seeded like the rest of the notebook, so a re-run
# reproduces the same score; a previously recorded score may differ slightly.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),  # The model to fit (one copy per ensemble member)
    n_estimators=500,  # The number of trees in the ensemble
    max_samples=100,  # Number of training instances drawn for each tree
    bootstrap=True,  # Sample with replacement (bagging); False would be pasting
    n_jobs=-1,  # -1 means: use all available CPU cores
    random_state=42,  # Seed the resampling so results are reproducible
)

bag_clf.fit(xtrain, ytrain)
ypred = bag_clf.predict(xtest)

In [50]:
# Test-set accuracy of the most recent predictions stored in `ypred`.
accuracy_score(y_true=ytest, y_pred=ypred)

0.92079999999999995