In [1]:
# - Ensembles can give you a boost in accuracy on your dataset.

# - Bagging ensemble methods: bagged decision trees, random forest and extra trees.
# - Boosting ensemble methods: AdaBoost and stochastic gradient boosting. 
# - Voting ensemble methods: combine the predictions from multiple algorithms.

In [2]:
# 15.1 Combine Models into Ensemble Predictions

In [3]:
# - The three most popular methods for combining the predictions from different models are:
    
    # - Bagging: Building multiple models (typically of the same type) from different subsamples of the 
    # training dataset.
    
    # - Boosting: Building multiple models (typically of the same type) each of which learns to fix the 
    # prediction errors of a prior model in the sequence of models.
    
    # - Voting: Building multiple models (typically of differing types) and using simple statistics 
    # (like calculating the mean) to combine their predictions.

In [4]:
from pandas import read_csv

In [5]:
import numpy

In [6]:
import sys

In [7]:
def print_data(_data, _rows=5):
    """Write the leading rows of ``_data`` to stdout, each value formatted as '%5.3f'.

    Parameters
    ----------
    _data : numpy.ndarray
        1-D or 2-D numeric array to preview.
    _rows : int, optional
        Number of leading rows to print. Defaults to 5, the value that was
        previously hard-coded.

    Returns
    -------
    None
        The result of ``numpy.savetxt`` is passed through; the text itself is
        written to ``sys.stdout``.
    """
    # Slice only the first axis: for 2-D input this selects the same rows as
    # ``_data[:_rows, :]``, and it also accepts 1-D input, which the two-axis
    # index rejected with an IndexError.
    return numpy.savetxt(sys.stdout, _data[:_rows], '%5.3f')

In [8]:
# Remote location of the CSV that read_csv downloads in the next cells.
# NOTE(review): this dataset appears to have been withdrawn from the UCI archive -- confirm the
# URL still resolves, or point it at a local copy of the file.
_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'

In [9]:
# Column labels handed to read_csv through names=: eight feature columns followed by 'class'.
_col_names = ['preg','plas','pres','skin','test','mass','pedi','age','class']

In [10]:
# Load the remote CSV into a DataFrame. header=None spells out what supplying
# names= already implies: the first line of the file is data, not a header row.
_dataframe = read_csv(_uri, header=None, names=_col_names)

In [11]:
# Work on the frame's underlying NumPy array so columns can be sliced by position.
_array = _dataframe.values

In [12]:
# Preview the leading rows of the full array (all nine columns).
print_data(_array)

6.000 148.000 72.000 35.000 0.000 33.600 0.627 50.000 1.000
1.000 85.000 66.000 29.000 0.000 26.600 0.351 31.000 0.000
8.000 183.000 64.000 0.000 0.000 23.300 0.672 32.000 1.000
1.000 89.000 66.000 23.000 94.000 28.100 0.167 21.000 0.000
0.000 137.000 40.000 35.000 168.000 43.100 2.288 33.000 1.000


In [13]:
# Feature matrix: every row, columns 0 through 7.
_X = _array[:, :8]

In [14]:
# Preview the leading rows of the feature matrix.
print_data(_X)

6.000 148.000 72.000 35.000 0.000 33.600 0.627 50.000
1.000 85.000 66.000 29.000 0.000 26.600 0.351 31.000
8.000 183.000 64.000 0.000 0.000 23.300 0.672 32.000
1.000 89.000 66.000 23.000 94.000 28.100 0.167 21.000
0.000 137.000 40.000 35.000 168.000 43.100 2.288 33.000


In [15]:
# Target column. Slicing with 8: (instead of indexing with 8) keeps the result
# two-dimensional with a single column; it is flattened a few cells later.
_Y = _array[:,8:]

In [16]:
# Preview the leading rows of the still two-dimensional target column.
print_data(_Y)

1.000
0.000
1.000
0.000
1.000


In [17]:
# Flatten the single-column target into a 1-D vector of labels.
_Y = _Y.ravel()

In [18]:
# Show the first five labels of the flattened target.
print(_Y[:5])

[ 1.  0.  1.  0.  1.]


In [19]:
from sklearn.model_selection import KFold

In [20]:
# 10-fold splitter reused by every cross_val_score call in this notebook.
# shuffle is left at its default (off), so the folds are contiguous and
# deterministic; the former random_state=7 argument therefore had no effect,
# and scikit-learn >= 0.24 rejects that combination with a ValueError.
_kfold = KFold(n_splits=10)

In [21]:
# 15.2 Bagging Algorithms

In [22]:
# - Bootstrap Aggregation (or Bagging) involves taking multiple samples from your training dataset 
# (with replacement) and training a model for each sample. 

# - The final output prediction is averaged across the predictions of all of the sub-models.

In [23]:
# 15.2.1 Bagged Decision Trees

In [24]:
# - Bagging performs best with algorithms that have high variance. 

# - A popular example is decision trees, often constructed without pruning.

In [25]:
from sklearn.model_selection import cross_val_score

In [26]:
from sklearn.ensemble import BaggingClassifier

In [27]:
from sklearn.tree import DecisionTreeClassifier

In [28]:
# Base learner for the bagging ensemble: a decision tree with default settings.
_cart = DecisionTreeClassifier()

In [29]:
# Number of bagged trees, passed as n_estimators to BaggingClassifier.
_num_trees = 100

In [30]:
# Bag _num_trees copies of the CART base learner. The base learner is passed
# positionally because its keyword was renamed from `base_estimator` to
# `estimator` in scikit-learn 1.2 and the old name was later removed; the
# first positional slot is accepted under either naming.
_model = BaggingClassifier(_cart, n_estimators=_num_trees, random_state=7)

In [31]:
# Score the bagged trees on each of the folds produced by _kfold.
_score = cross_val_score(estimator=_model, X=_X, y=_Y, cv=_kfold)

In [32]:
# Mean of the per-fold scores, rendered as a percentage with three decimals.
format(_score.mean(), '.3%')

'77.075%'

In [33]:
# 15.2.2 Random Forest

In [34]:
# - Random Forests is an extension of bagged decision trees. 

# - Samples of the training dataset are taken with replacement, but the trees are constructed in a 
# way that reduces the correlation between individual classifiers. 

# - Specifically, rather than greedily choosing the best split point in the construction of each tree, 
# only a random subset of features is considered for each split.

In [35]:
from sklearn.ensemble import RandomForestClassifier

In [36]:
# Random forest of 100 trees, each split drawn from 3 candidate features.
# Seeded with the same value as the other ensembles in this notebook so a
# Restart & Run All reproduces the score; the score shown below was recorded
# from an unseeded run and may differ slightly.
_model = RandomForestClassifier(n_estimators=100, max_features=3, random_state=7)

In [37]:
# Score the random forest on each of the folds produced by _kfold.
_score = cross_val_score(estimator=_model, X=_X, y=_Y, cv=_kfold)

In [38]:
# Mean of the per-fold scores, rendered as a percentage with three decimals.
format(_score.mean(), '.3%')

'76.816%'

In [39]:
# 15.2.3 Extra Trees

In [40]:
# - Extra Trees are another modification of bagging where random trees are constructed from samples of 
# the training dataset.

In [41]:
from sklearn.ensemble import ExtraTreesClassifier

In [42]:
# Extra-trees ensemble of 100 trees, each split drawn from 7 candidate features.
# Seeded with the same value as the other ensembles in this notebook so a
# Restart & Run All reproduces the score; the score shown below was recorded
# from an unseeded run and may differ slightly.
_model = ExtraTreesClassifier(n_estimators=100, max_features=7, random_state=7)

In [43]:
# Score the extra-trees ensemble on each of the folds produced by _kfold.
_score = cross_val_score(estimator=_model, X=_X, y=_Y, cv=_kfold)

In [44]:
# Mean of the per-fold scores, rendered as a percentage with three decimals.
format(_score.mean(), '.3%')

'75.907%'

In [45]:
# 15.3 Boosting Algorithms

In [46]:
# - Boosting ensemble algorithms create a sequence of models that attempt to correct the mistakes of 
# the models before them in the sequence. 

# - Once created, the models make predictions which may be weighted by their demonstrated accuracy and 
# the results are combined to create a final output prediction.

In [47]:
# 15.3.1 AdaBoost

In [48]:
# - AdaBoost was perhaps the first successful boosting ensemble algorithm. 

# - It generally works by weighting instances in the dataset by how easy or difficult they are to classify, 
# allowing the algorithm to pay more or less attention to them in the construction of subsequent models.

In [49]:
from sklearn.ensemble import AdaBoostClassifier

In [50]:
# AdaBoost ensemble of 30 boosting rounds, seeded for repeatable results.
_model = AdaBoostClassifier(n_estimators=30, random_state=7)

In [51]:
# Score the AdaBoost ensemble on each of the folds produced by _kfold.
_score = cross_val_score(estimator=_model, X=_X, y=_Y, cv=_kfold)

In [52]:
# Mean of the per-fold scores, rendered as a percentage with three decimals.
format(_score.mean(), '.3%')

'76.046%'

In [53]:
# 15.3.2 Stochastic Gradient Boosting

In [54]:
# - Stochastic Gradient Boosting (also called Gradient Boosting Machines) is one of the most sophisticated 
# ensemble techniques. 

# - It is also a technique that is proving to be perhaps one of the best techniques available for improving 
# performance via ensembles.

In [55]:
from sklearn.ensemble import GradientBoostingClassifier

In [56]:
# Gradient boosting ensemble of 100 stages, seeded for repeatable results.
# NOTE(review): subsample is not set here, so row subsampling (the "stochastic"
# part) presumably stays at the library default -- confirm that is intended.
_model = GradientBoostingClassifier(n_estimators=100, random_state=7)

In [57]:
# Score the gradient boosting ensemble on each of the folds produced by _kfold.
_score = cross_val_score(estimator=_model, X=_X, y=_Y, cv=_kfold)

In [58]:
# Mean of the per-fold scores, rendered as a percentage with three decimals.
format(_score.mean(), '.3%')

'76.690%'

In [59]:
# 15.4 Voting Ensemble

In [60]:
# - Voting is one of the simplest ways of combining the predictions from multiple machine learning algorithms. 

# - It works by first creating two or more standalone models from your training dataset. 

# - A Voting Classifier can then be used to wrap your models and average the predictions of the sub-models 
# when asked to make predictions for new data. 

# - The predictions of the sub-models can be weighted, but specifying the weights for classifiers manually 
# or even heuristically is difficult. 

# - More advanced methods can learn how to best weight the predictions from sub-models; this is called 
# stacking (stacked generalization) and is available in scikit-learn 0.22 and later as StackingClassifier.

In [61]:
from sklearn.linear_model import LogisticRegression

In [62]:
from sklearn.tree import DecisionTreeClassifier

In [63]:
from sklearn.svm import SVC

In [64]:
from sklearn.ensemble import VotingClassifier

In [65]:
# - create the sub models

In [66]:
# Collects the (name, estimator) pairs that are handed to VotingClassifier.
# Re-run this cell before re-running the append cells, otherwise the list
# accumulates duplicate entries.
_estimators = []

In [67]:
# First voter: logistic regression with default settings.
_model1 = LogisticRegression()

In [68]:
# Register the logistic regression under the name 'logistic'.
_estimators.append(('logistic', _model1))

In [69]:
# Second voter: a decision tree. Seeded with the same value used by the other
# models in this notebook so the voting score is repeatable across runs; the
# score recorded below came from an unseeded tree and may differ slightly.
_model2 = DecisionTreeClassifier(random_state=7)

In [70]:
# Register the decision tree under the name 'cart'.
_estimators.append(('cart', _model2))

In [71]:
# Third voter: a support vector classifier with default settings.
_model3 = SVC()

In [72]:
# Register the support vector classifier under the name 'svm'.
_estimators.append(('svm', _model3))

In [73]:
# - create the ensemble model

In [74]:
# Wrap the named sub-models in a voting ensemble; every other option is left
# at its default.
_model_ensemble = VotingClassifier(estimators=_estimators)

In [75]:
# Score the voting ensemble on each of the folds produced by _kfold.
_score = cross_val_score(estimator=_model_ensemble, X=_X, y=_Y, cv=_kfold)

In [76]:
# Mean of the per-fold scores, rendered as a percentage with three decimals.
format(_score.mean(), '.3%')

'73.693%'