### Voting Classifier
* Combine conceptually differnt machine learning classifiers & use majority vote
* Calculate average predicted probabilities
* voting == "hard" means majority wins

In [1]:
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

In [2]:
iris = datasets.load_iris()

In [3]:
X, y = iris.data[:, 1:3], iris.target

In [5]:
clf1 = LogisticRegression(random_state=1)

In [6]:
clf2 = RandomForestClassifier(random_state=1)

In [7]:
clf3 = GaussianNB()

In [8]:
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')

In [9]:
for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']):
    scores = cross_val_score(clf,X,y,cv=5, scoring='accuracy')
    print ("Accuracy : ", scores.mean(), scores.std(), label)

Accuracy :  0.9 0.0471404520791 Logistic Regression
Accuracy :  0.933333333333 0.0471404520791 Random Forest
Accuracy :  0.913333333333 0.04 naive Bayes
Accuracy :  0.946666666667 0.049888765157 Ensemble


### Soft Voting
* Each estimator returns probability for each class
* Different weights are associated for each estimator.
* Final class is derived is derived from 

* w1, w2, w3 = 1
* classifier	   class 1	    class 2	    class 3
* classifier 1	w1 * 0.2	w1 * 0.5	w1 * 0.3
* classifier 2	w2 * 0.6	w2 * 0.3	w2 * 0.1
* classifier 3	w3 * 0.3	w3 * 0.4	w3 * 0.3
* weighted average	0.37	0.4	        0.23

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from itertools import product
from sklearn.ensemble import VotingClassifier

In [11]:
iris = datasets.load_iris()

In [12]:
X = iris.data[:, [0,2]]

In [14]:
y = iris.target

In [18]:
# Training classifiers
clf1 = DecisionTreeClassifier(max_depth=4)
clf2 = KNeighborsClassifier(n_neighbors=7)
clf3 = SVC(kernel='rbf', probability=True)
eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2), ('svc', clf3)], voting='soft', weights=[1,2,2])

In [19]:
for clf, label in zip([clf1, clf2, clf3, eclf], ['Decision Tree', 'Nearest Neighbour', 'Support Vector', 'Ensemble']):
    scores = cross_val_score(clf,X,y,cv=5, scoring='accuracy')
    print ("Accuracy : ", scores.mean(), scores.std(), label)

Accuracy :  0.946666666667 0.033993463424 Decision Tree
Accuracy :  0.94 0.0442216638714 Nearest Neighbour
Accuracy :  0.946666666667 0.033993463424 Support Vector
Accuracy :  0.953333333333 0.033993463424 Ensemble


### Using the VotingClassifier with GridSearch

In [20]:
from sklearn.model_selection import GridSearchCV

In [21]:
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()

In [22]:
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft')

In [31]:
params = {'lr__C': [1.0, 100.0], 'rf__n_estimators': [20, 200], 'weights':[[2,5,1],[3,4,1]]}

In [32]:
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
grid = grid.fit(iris.data, iris.target)

In [33]:
cross_val_score(grid.best_estimator_, X,y , cv=5)

array([ 0.93333333,  1.        ,  0.83333333,  0.93333333,  1.        ])

In [34]:
grid.best_params_

{'lr__C': 1.0, 'rf__n_estimators': 20, 'weights': [2, 5, 1]}