# Ensemble Algorithms

Uses Pima Indian Diabetes Dataset from https://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes

### Bagging

The three bagging models covered in this section are as follows:

   1. Bagged Decision Trees
   2. Random Forest
   3. Extra Trees

In [8]:
# Bagged decision tree classification
# Scores a bagging ensemble of decision trees with 10-fold cross-validation
# and prints the mean of the per-fold scores.
import pandas as pd
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# NOTE(review): confirm this URL is still served by the UCI archive.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
cols = ['npreg', 'plaglu', 'dpres', 'skin', 'insulin', 'bmi', 'pedg', 'age', 'class']
dataset = pd.read_csv(url, names=cols)
array = dataset.values
# Columns 0-7 are the input features; column 8 ('class') is the target.
X = array[:, 0:8]
Y = array[:, 8]
seed = 7
# random_state is intentionally not passed: KFold only uses it when
# shuffle=True, and recent scikit-learn releases raise ValueError when it is
# set while shuffle is left at its default of False.
kfold = model_selection.KFold(n_splits=10)
cart = DecisionTreeClassifier()
num_trees = 100
# The base estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in newer scikit-learn releases.
model = BaggingClassifier(cart, n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

0.770745044429


2. Random Forest

In [14]:
# Random forest classification
# Scores a random forest with 10-fold cross-validation and prints the mean
# of the per-fold scores.
# pandas is imported under the `pd` alias because that is the name
# read_csv is called through below.
import pandas as pd
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
cols = ['npreg', 'plaglu', 'dpres', 'skin', 'insulin', 'bmi', 'pedg', 'age', 'class']
dataset = pd.read_csv(url, names=cols)
array = dataset.values
# Columns 0-7 are the input features; column 8 ('class') is the target.
X = array[:, 0:8]
Y = array[:, 8]
seed = 7
# random_state is intentionally not passed: KFold only uses it when
# shuffle=True, and recent scikit-learn releases raise ValueError when it is
# set while shuffle is left at its default of False.
kfold = model_selection.KFold(n_splits=10)
num_trees = 100
max_features = 3
model = RandomForestClassifier(n_estimators=num_trees, random_state=seed, max_features=max_features)
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

0.773376623377


3. Extra Trees

In [24]:
# Extra trees classification
# Scores an extra-trees ensemble with 10-fold cross-validation and prints
# the mean of the per-fold scores.
# pandas is imported under the `pd` alias because that is the name
# read_csv is called through below.
import pandas as pd
from sklearn import model_selection
from sklearn.ensemble import ExtraTreesClassifier

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
names = ['preg', 'plas', 'pres', 'skin', 'insulin', 'bmi', 'pedi', 'age', 'class']
dataframe = pd.read_csv(url, names=names)
array = dataframe.values
# Columns 0-7 are the input features; column 8 ('class') is the target.
X = array[:, 0:8]
Y = array[:, 8]
seed = 7
# random_state is intentionally not passed: KFold only uses it when
# shuffle=True, and recent scikit-learn releases raise ValueError when it is
# set while shuffle is left at its default of False.
kfold = model_selection.KFold(n_splits=10)
num_trees = 100
max_features = 7
model = ExtraTreesClassifier(n_estimators=num_trees, random_state=seed, max_features=max_features)
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

0.762987012987


### Boosting

The two most common boosting ensemble machine learning algorithms are:

   1. AdaBoost
   2. Stochastic Gradient Boosting

In [25]:
# AdaBoost classification
# Scores an AdaBoost ensemble with 10-fold cross-validation and prints the
# mean of the per-fold scores.
import pandas as pd
from sklearn import model_selection
from sklearn.ensemble import AdaBoostClassifier
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
names = ['preg', 'plas', 'pres', 'skin', 'insulin', 'bmi', 'pedi', 'age', 'class']
dataframe = pd.read_csv(url, names=names)
array = dataframe.values
# Columns 0-7 are the input features; column 8 ('class') is the target.
X = array[:, 0:8]
Y = array[:, 8]
seed = 7
num_trees = 30
# random_state is intentionally not passed: KFold only uses it when
# shuffle=True, and recent scikit-learn releases raise ValueError when it is
# set while shuffle is left at its default of False.
kfold = model_selection.KFold(n_splits=10)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

0.76045796309


In [26]:
# Stochastic gradient boosting classification
# Uses GradientBoostingClassifier
# Often regarded as one of the strongest ensemble techniques.
# Scores the model with 10-fold cross-validation and prints the mean of the
# per-fold scores.
import pandas as pd
from sklearn import model_selection
from sklearn.ensemble import GradientBoostingClassifier
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
names = ['preg', 'plas', 'pres', 'skin', 'insulin', 'bmi', 'pedi', 'age', 'class']
dataframe = pd.read_csv(url, names=names)
array = dataframe.values
# Columns 0-7 are the input features; column 8 ('class') is the target.
X = array[:, 0:8]
Y = array[:, 8]
seed = 7
num_trees = 100
# random_state is intentionally not passed: KFold only uses it when
# shuffle=True, and recent scikit-learn releases raise ValueError when it is
# set while shuffle is left at its default of False.
kfold = model_selection.KFold(n_splits=10)
model = GradientBoostingClassifier(n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

0.766900205058


### Voting Ensemble

It works by creating two or more standalone models from the training set. A voting classifier is then used to wrap
the models and combine the predictions of the sub-models (by majority vote or by averaging predicted probabilities).

1. Voting Classifier

In [28]:
# Voting Ensemble Classifier
# Uses VotingClassifier
# Wraps an SVM, a decision tree and a logistic regression in one voting
# ensemble, scores it with 10-fold cross-validation and prints the mean of
# the per-fold scores.
import pandas as pd
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
names = ['preg', 'plas', 'pres', 'skin', 'insulin', 'bmi', 'pedi', 'age', 'class']
dataframe = pd.read_csv(url, names=names)
array = dataframe.values
# Columns 0-7 are the input features; column 8 ('class') is the target.
X = array[:, 0:8]
Y = array[:, 8]
seed = 7
# random_state is intentionally not passed: KFold only uses it when
# shuffle=True, and recent scikit-learn releases raise ValueError when it is
# set while shuffle is left at its default of False.
kfold = model_selection.KFold(n_splits=10)
# Create the submodels as (name, estimator) pairs
# The decision tree is seeded so repeated runs give the same score.
estimators = [
    ('svm', SVC()),
    ('cart', DecisionTreeClassifier(random_state=seed)),
    ('logistic', LogisticRegression()),
]
# Create ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, X, Y, cv=kfold)
print(results.mean())

0.73950786056
