# Ensemble Methods

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np

# Models
from sklearn.linear_model import LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier

# Reporting
from sklearn.metrics import confusion_matrix, f1_score, classification_report, accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn import tree

# compare standalone models for binary classification
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from matplotlib import pyplot
from sklearn.ensemble import StackingClassifier

# Ignore warnings
# NOTE: with no category or module filter this suppresses every warning for
# the rest of the kernel session, not just the ones raised in this cell.
import warnings
warnings.filterwarnings('ignore')


So far we have built and trained a selection of machine learning models, each using a different algorithm. We tested them for accuracy and chose the best one.

In [2]:
# Load the iris dataset through seaborn and preview the first rows.
iris = sns.load_dataset('iris')
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
# Select the target and the features by column name instead of by position,
# so the selection stays correct even if the column order changes.
y = iris["species"]
X = iris.drop(columns="species")

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=3,
)

In [5]:
# Number of rows in the dataset.
iris.shape[0]

150

In [6]:
# Summarise column dtypes and non-null counts.
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


# Can we guess the species?

### Individual Models

In [7]:
# Fit a single decision tree and score it on the held-out test set.
# random_state makes the fitted tree (and therefore the score) repeatable, and
# the result is named for what it holds: predicted labels, not a classifier.
dt_pred = DecisionTreeClassifier(random_state=3).fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, dt_pred)

1.0

In [8]:
# k-nearest neighbours with default settings, scored on the test set.
# Stored as knn_pred because this is the array of predicted labels; the name
# `knn` is used for the estimator object later in the notebook.
knn_pred = KNeighborsClassifier().fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, knn_pred)

0.9666666666666667

In [9]:
# Gaussian naive Bayes with default settings, scored on the test set.
# Stored as gnb_pred because this is the array of predicted labels; the name
# `gnb` is used for the estimator object later in the notebook.
gnb_pred = GaussianNB().fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, gnb_pred)

0.9666666666666667

## Parallel Learning

We can now combine these models into an ensemble. First, we will combine models built with different algorithms; these are known as heterogeneous ensembles.

### Heterogeneous Ensembles

#### Hard Voting (Majority Voting)

In [10]:
# Unfitted base estimators, one per algorithm.
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier(random_state=3)  # seeded so repeated runs build the same tree
gnb = GaussianNB()

In [11]:
# (name, estimator) pairs passed to the VotingClassifier.
estimators=[('KNN', knn), ('DT', dt), ('GNB', gnb)]

In [12]:
# Hard voting: every estimator casts one vote and the majority label wins.
# "hard" is also the default voting mode.
clf_hard = VotingClassifier(estimators=estimators, voting="hard")
clf_hard.fit(X_train, y_train)

VotingClassifier(estimators=[('KNN', KNeighborsClassifier()),
                             ('DT', DecisionTreeClassifier()),
                             ('GNB', GaussianNB())])

In [13]:
pred_vote = clf_hard.predict(X_test)

# Calculate the accuracy of the hard-voting classifier on the test set
accuracy_score(y_test, pred_vote)

0.9666666666666667

#### Soft Voting (Averaging)

In [14]:
# Soft voting: average the estimators' predicted class probabilities and
# choose the class with the highest mean probability.
clf_soft = VotingClassifier(estimators=estimators, voting='soft')
clf_soft.fit(X_train, y_train)

VotingClassifier(estimators=[('KNN', KNeighborsClassifier()),
                             ('DT', DecisionTreeClassifier()),
                             ('GNB', GaussianNB())],
                 voting='soft')

In [15]:
# Predict with the soft-voting ensemble (pred_vote is rebound to these predictions)
pred_vote = clf_soft.predict(X_test)

# Test the accuracy
accuracy_score(y_test, pred_vote)

1.0

### Homogeneous Ensembles

#### Bagging

In [16]:
# Build and train the bagging classifier. With no base estimator given,
# BaggingClassifier bags decision trees; random_state fixes the bootstrap
# samples so the resulting score is reproducible.
clf_bag = BaggingClassifier(random_state=3)
clf_bag.fit(X_train, y_train)

# Predict the labels of the test set
pred = clf_bag.predict(X_test)

In [17]:
# Accuracy of the predictions currently held in `pred` against the test labels.
accuracy_score(y_test, pred)

1.0

## Sequential Learning

### Adaptive Boosting (AdaBoost)

In [18]:
# Build and fit an AdaBoost classifier. No base estimator is passed, so
# AdaBoost falls back to its default weak learner (a depth-1 decision tree).
# random_state makes the boosted model reproducible between runs.
clf_ada = AdaBoostClassifier(random_state=3)
clf_ada.fit(X_train, y_train)

# Calculate the predictions on the test set and score them
pred = clf_ada.predict(X_test)
accuracy_score(y_test, pred)

0.9666666666666667

### Gradient Boosting

In [19]:
# Build and fit a gradient boosting classifier; random_state keeps the
# fitted model reproducible between runs.
clf_gb = GradientBoostingClassifier(random_state=3)
clf_gb.fit(X_train, y_train)

# Calculate the predictions on the test set and score them
pred = clf_gb.predict(X_test)
accuracy_score(y_test, pred)

0.9666666666666667

### XGBoost

In [24]:
import xgboost as xgb

In [25]:
# Current XGBoost releases require integer class labels 0..n_classes-1 and
# reject string labels, so encode the species names before fitting.
# sort=True gives a stable, alphabetical code assignment.
y_train_codes, class_labels = pd.factorize(y_train, sort=True)

# Init classifier
xgb_cl = xgb.XGBClassifier()

# Fit
xgb_cl.fit(X_train, y_train_codes)

# Predict, then map the integer codes back to species names so the
# predictions are directly comparable with y_test
pred_codes = np.asarray(xgb_cl.predict(X_test)).astype(int)
preds = np.asarray(class_labels)[pred_codes]

# Score
accuracy_score(y_test, preds)



1.0

## Stacking

In [26]:
# define the base (level-0) models
level0 = [
    ('knn', KNeighborsClassifier()),
    ('cart', DecisionTreeClassifier(random_state=3)),  # seeded for repeatable results
    ('svm', SVC()),
    ('bayes', GaussianNB()),
]

# define meta learner model
level1 = KNeighborsClassifier()

# define the stacking ensemble; the meta learner is trained on 5-fold
# cross-validated predictions of the base models
model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)

# fit the model on the training split only, so the test set stays unseen
model.fit(X_train, y_train)

# predict the test data
pred = model.predict(X_test)

# test for accuracy
accuracy_score(y_test, pred)

1.0

In [23]:
#you can change things if you want