### Overview 

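A demo of how to set up a classifier ensemble in Python with scikit-learn's `VotingClassifier`, using hard voting (majority vote on the predicted labels) and soft voting (weighted average of the predicted class probabilities).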

### Data in

In [1]:
import pandas as pd

# Location of the preprocessed dataset, relative to the current working directory.
DIABETES_CSV = "../dataset/diabetes_processed.csv"

# The first CSV column becomes the row index instead of a feature column.
diabetes_data = pd.read_csv(DIABETES_CSV, index_col=0)

In [2]:
# Preview the loaded frame; its "Outcome" column is the label the classifiers are trained on.
diabetes_data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.639947,0.848324,0.149641,0.907270,-0.692891,0.204013,0.468492,1.425995,1
1,-0.844885,-1.123396,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,-0.190672,0
2,1.233880,1.943724,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,-0.105584,1
3,-0.844885,-0.998208,-0.160546,0.154533,0.123302,-0.494043,-0.920763,-1.041549,0
4,-1.141852,0.504055,-1.504687,0.907270,0.765836,1.409746,5.484909,-0.020496,1
...,...,...,...,...,...,...,...,...,...
763,1.827813,-0.622642,0.356432,1.722735,0.870031,0.115169,-0.908682,2.532136,0
764,-0.547919,0.034598,0.046245,0.405445,-0.692891,0.610154,-0.398282,-0.531023,0
765,0.342981,0.003301,0.149641,0.154533,0.279594,-0.735190,-0.685193,-0.275760,0
766,-0.844885,0.159787,-0.470732,-1.288212,-0.692891,-0.240205,-0.371101,1.170732,1


### Fit

In [3]:
# Target vector: the "Outcome" label column.
Y = diabetes_data["Outcome"]

# Feature matrix: every remaining column.
X = diabetes_data.drop(columns=["Outcome"])

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation.
# The seed is fixed so that the split is identical on every Restart & Run All;
# without it each run evaluates on a different random subset of rows.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

In [5]:
from sklearn.ensemble import VotingClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [6]:
# Base learners that are later combined by the voting ensembles.

# Logistic regression fitted with the liblinear solver.
log_clf = LogisticRegression(C=1, solver="liblinear")

# Linear-kernel support vector classifier. `gamma` is only a coefficient of the
# "rbf", "poly" and "sigmoid" kernels, so it is deliberately not passed here.
svc_clf = SVC(C=1, kernel="linear")

# Gaussian naive Bayes with default settings.
naive_cls = GaussianNB()

#### Hard voting

In [7]:
# (label, estimator) pairs that make up the ensemble.
hard_voting_members = [
    ("linear", log_clf),
    ("SVC", svc_clf),
    ("naive", naive_cls),
]

# Hard voting: the ensemble predicts the label chosen by the majority of its members.
voting_clf_hard = VotingClassifier(estimators=hard_voting_members, voting="hard")

In [8]:
# Train the hard-voting ensemble on the training split.
voting_clf_hard.fit(X=x_train, y=y_train)

VotingClassifier(estimators=[('linear',
                              LogisticRegression(C=1, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='liblinear', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('SVC',
                              SVC(C=1, break_ties=False, cache_size=200,
                                  class_weight=None, coef0=0.0,
                                  decision_function_shape='ovr', degree=3,
                                  g

In [9]:
# Predicted labels of the hard-voting ensemble for the held-out rows.
y_pred = voting_clf_hard.predict(X=x_test)

In [10]:
from sklearn.metrics import accuracy_score

# Share of held-out rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7272727272727273

In [11]:
# Fit each base learner and the hard-voting ensemble on the same training split,
# then print every model's accuracy on the held-out split for comparison.
for clf_hard in (log_clf, svc_clf, naive_cls, voting_clf_hard):
    # fit() returns the fitted estimator, so the prediction can be chained onto it.
    y_pred = clf_hard.fit(x_train, y_train).predict(x_test)
    print(type(clf_hard).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.7316017316017316
SVC 0.7402597402597403
GaussianNB 0.7229437229437229
VotingClassifier 0.7272727272727273


#### Soft voting

In [12]:
# Soft voting needs class probabilities from every member, so this SVC is built
# with probability=True. Those probability estimates come from an internal
# cross-validation that shuffles the data, so random_state is fixed to keep them
# reproducible. `gamma` is not passed because the linear kernel does not use it.
svc_soft = SVC(C=1, kernel="linear", probability=True, random_state=42)

# Soft voting: the ensemble predicts the class with the highest weighted average
# of the members' predicted probabilities.
voting_clf_soft = VotingClassifier(
    estimators=[
        ("linear", log_clf),
        ("SVC", svc_soft),
        ("naive", naive_cls),
    ],
    voting="soft",
    # One weight per estimator, in the same order as `estimators`: the SVC's
    # probabilities count twice as much as those of each other member.
    weights=[0.25, 0.5, 0.25],
)



In [13]:
# Same comparison as for hard voting, now with the probability-enabled SVC and
# the soft-voting ensemble: fit on the training split, score on the held-out split.
for clf_soft in (log_clf, svc_soft, naive_cls, voting_clf_soft):
    # fit() returns the fitted estimator, so the prediction can be chained onto it.
    y_pred = clf_soft.fit(x_train, y_train).predict(x_test)
    print(type(clf_soft).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.7316017316017316
SVC 0.7402597402597403
GaussianNB 0.7229437229437229
VotingClassifier 0.7316017316017316
