# Ensemble Methods

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np

# Models
from sklearn.linear_model import LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier

# Reporting
from sklearn.metrics import confusion_matrix, f1_score, classification_report, accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn import tree

# compare standalone models for binary classification
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from matplotlib import pyplot
from sklearn.ensemble import StackingClassifier


So far we have built and trained a selection of machine learning models. Each using a different algorithm. We tested them for accuracy and chose the best one.

In [3]:
iris = sns.load_dataset('iris')

In [4]:
y = iris.iloc[:,4]
X = iris.iloc[:,0:4]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

### Individual Models

In [24]:
clf = DecisionTreeClassifier().fit(X_train,y_train).predict(X_test)
accuracy_score(y_test, clf, )

0.9666666666666667

In [25]:
knn = KNeighborsClassifier().fit(X_train,y_train).predict(X_test)
accuracy_score(y_test, knn)

0.9333333333333333

In [26]:
gnb = GaussianNB().fit(X_train,y_train).predict(X_test)
accuracy_score(y_test, gnb)

1.0

## Parallel Learning

We can now look at combining these models together into an ensemble. Firstly, we will look at combining models from different algorithms. These are known as heterogenous ensembles.

### Heterogenous Ensembles

#### Hard Voting (Majority Voting)

In [27]:
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()
gnb = GaussianNB()

In [28]:
estimators=[('KNN', knn), ('DT', dt), ('GNB', gnb)]

In [29]:
clf_hard = VotingClassifier(estimators)
clf_hard.fit(X_train, y_train)

VotingClassifier(estimators=[('KNN', KNeighborsClassifier()),
                             ('DT', DecisionTreeClassifier()),
                             ('GNB', GaussianNB())])

In [31]:
pred_vote = clf_hard.predict(X_test)

# Calculate the F1-Score of the voting classifier
accuracy_score(y_test, pred_vote)

0.9666666666666667

#### Soft Voting (Averaging)

In [32]:
clf_soft = VotingClassifier(estimators, voting='soft')
clf_soft.fit(X_train, y_train)

VotingClassifier(estimators=[('KNN', KNeighborsClassifier()),
                             ('DT', DecisionTreeClassifier()),
                             ('GNB', GaussianNB())],
                 voting='soft')

In [33]:
pred_vote = clf_soft.predict(X_test)

# Calculate the F1-Score of the voting classifier
accuracy_score(y_test, pred_vote)

0.9666666666666667

### Homogenous Ensembles

#### Bagging

In [34]:
# Instantiate the base model
clf_dt = DecisionTreeClassifier(max_depth=4)

# Build and train the Bagging classifier
clf_bag = BaggingClassifier()
clf_bag.fit(X_train, y_train)

# Predict the labels of the test set
pred = clf_bag.predict(X_test)

In [35]:
accuracy_score(y_test, pred)

0.9666666666666667

#### Random Forest

A random forest is a type of bagging ensemble method. The difference here is that it also randomises a subset of the feature to train on.

In [6]:
# Instantiate the base model
clf_rf = RandomForestClassifier(max_depth=4)

clf_rf.fit(X_train, y_train)

# Predict the labels of the test set
pred = clf_rf.predict(X_test)

In [7]:
accuracy_score(y_test, pred)

0.9333333333333333

## Sequential Learning

### Adaptive Boosting (AdaBoost)

In [36]:
# Instantiate a normalized linear regression model
reg_lm = DecisionTreeClassifier()

# Build and fit an AdaBoost regressor
reg_ada = AdaBoostClassifier()
reg_ada.fit(X_train, y_train)

# Calculate the predictions on the test set
pred = reg_ada.predict(X_test)
accuracy_score(y_test, pred)

0.9

### Gradient Boosting

In [37]:
# Instantiate a normalized linear regression model
reg_lm = DecisionTreeClassifier()

# Build and fit an AdaBoost regressor
reg_ada = GradientBoostingClassifier()
reg_ada.fit(X_train, y_train)

# Calculate the predictions on the test set
pred = reg_ada.predict(X_test)
accuracy_score(y_test, pred)

0.9666666666666667

### Stochastic Gradient Boosting

For stochastic gradient boosting, each tree is trained on a random subset of the training data. We do this by setting the `subsample` paramter to be a value less than `1`. Values between `0.5` and `0.8` have been shown to be best.

In [46]:
# Instantiate a normalized linear regression model
reg_lm = DecisionTreeClassifier()

# Build and fit an AdaBoost regressor
reg_ada = GradientBoostingClassifier(subsample=0.4)
reg_ada.fit(X_train, y_train)

# Calculate the predictions on the test set
pred = reg_ada.predict(X_test)
accuracy_score(y_test, pred)

0.9666666666666667

## Stacking

In [69]:
# define the base models
level0 = []

level0.append(('knn', KNeighborsClassifier()))
level0.append(('cart', DecisionTreeClassifier()))
level0.append(('svm', SVC()))
level0.append(('bayes', GaussianNB()))

# define meta learner model
level1 = KNeighborsClassifier()

# define the stacking ensemble
model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)

# fit the model on all available data
model.fit(X_train, y_train)

# predict the test data
pred = model.predict(X_test)

# test for accuracy
accuracy_score(y_test, pred)

0.9666666666666667