In [5]:
#Bootstrap Aggregation (or Bagging) involves taking multiple samples from your training dataset (with replacement) 
#and training a model for each sample. The final output prediction is averaged across the predictions of all of the sub-models.
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
filename = "/Users/fengxu/AI/Data/pima-indians-diabetes.data.txt"
names=["preg", "plas", "pres", "skin", "test", "mass", "pedi", "age", "class"]
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
seed = 7
kfold = KFold(n_splits=10, random_state=seed)
cart = DecisionTreeClassifier()
num_trees = 100

#Bagging performs best with algorithms that have high variance. 
#A popular example are decision trees, often constructed without pruning.
#Bagged Decision Trees
model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=seed) 
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())


## Random Forest Classification
from sklearn.ensemble import RandomForestClassifier
num_trees = 100
max_features = 3
model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features) 
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())


#Extra Trees
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier(n_estimators=num_trees, max_features=max_features)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

0.770745044429
0.760338345865
0.76038961039


In [7]:
#Boosting ensemble algorithms creates a sequence of models that attempt to correct the mistakes of 
#the models before them in the sequence. Once created, the models make predictions which may be 
#weighted by their demonstrated accuracy and the results are combined to create a final output prediction.

#AdaBoost
from sklearn.ensemble import AdaBoostClassifier
num_trees = 30
seed=7
kfold = KFold(n_splits=10, random_state=seed)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())


#Stochastic Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier(n_estimators=num_trees, random_state=seed) 
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

0.76045796309
0.773325358852


In [8]:
#Voting is one of the simplest ways of combining the predictions from multiple machine learning algorithms. 
#It works by first creating two or more standalone models from your training dataset. 
#A Voting Classifier can then be used to wrap your models and average the predictions of the sub-models when asked to make predictions for new data.
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
kfold = KFold(n_splits=10, random_state=7)
# create the sub models
estimators = []
model1 = LogisticRegression()
estimators.append(( 'logistic' , model1))
model2 = DecisionTreeClassifier()
estimators.append(( 'cart' , model2))
model3 = SVC()
estimators.append(( 'svm' , model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = cross_val_score(ensemble, X, Y, cv=kfold)
print(results.mean())

0.730434039645
