In [2]:
import pandas as pd

In [3]:
# Read the processed Pima Indians diabetes table from disk.
# NOTE(review): the feature values look already standardized — confirm upstream.
diabetes_data = pd.read_csv('dataset/PimaIndians_processed.csv')

# Preview the first ten rows as a quick sanity check of columns and values.
diabetes_data.head(n=10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.639947,0.848324,0.149641,0.90727,-0.692891,0.204013,0.468492,1.425995,1
1,-0.844885,-1.123396,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,-0.190672,0
2,1.23388,1.943724,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,-0.105584,1
3,-0.844885,-0.998208,-0.160546,0.154533,0.123302,-0.494043,-0.920763,-1.041549,0
4,-1.141852,0.504055,-1.504687,0.90727,0.765836,1.409746,5.484909,-0.020496,1
5,0.342981,-0.153185,0.253036,-1.288212,-0.692891,-0.811341,-0.818079,-0.27576,0
6,-0.250952,-1.342476,-0.98771,0.719086,0.071204,-0.125977,-0.676133,-0.616111,1
7,1.827813,-0.184482,-3.572597,-1.288212,-0.692891,0.419775,-1.020427,-0.360847,0
8,-0.547919,2.381884,0.046245,1.534551,4.021922,-0.189437,-0.947944,1.681259,1
9,1.23388,0.128489,1.390387,-1.288212,-0.692891,-4.060474,-0.724455,1.766346,1


In [4]:
# Target vector: the 'Outcome' column.
Y = diabetes_data['Outcome']

# Feature matrix: every column except the target.
X = diabetes_data.drop(columns='Outcome')

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the same rows land in the train and test
# sets on every run; without it each Restart & Run All produces a different
# split and therefore different scores.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [6]:
# Sanity check: training features and labels must have the same number of rows.
(x_train.shape, y_train.shape)

((614, 8), (614,))

In [7]:
# Sanity check: held-out features and labels must have the same number of rows.
(x_test.shape, y_test.shape)

((154, 8), (154,))

In [8]:
from sklearn.ensemble import VotingClassifier

# The base learners should be as different from one another as possible,
# each fitting the data with a different learning algorithm.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [9]:
# Three base learners for the ensemble, one per imported model family.
naive_clf = GaussianNB()

# gamma is a coefficient for the 'rbf', 'poly' and 'sigmoid' kernels; it is
# passed through unchanged here but plays no role with kernel='linear'.
svc_clf = SVC(kernel='linear', C=1, gamma='auto')

log_clf = LogisticRegression(solver='liblinear', C=1)

In [11]:
# Hard voting: the ensemble predicts the class label chosen by the majority
# of the base learners.
voting_clf_hard = VotingClassifier(
    voting='hard',
    estimators=[
        ('lr', log_clf),
        ('svc', svc_clf),
        ('naive', naive_clf),
    ],
)

In [12]:
# Train the hard-voting ensemble on the training split; the fitted estimator
# is returned and displayed as the cell output.
voting_clf_hard.fit(X=x_train, y=y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='liblinear', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('svc',
                              SVC(C=1, cache_size=200, class_weight=None,
                                  coef0=0.0, decision_function_shape='ovr',
                                  degree=3, gamma='auto', kernel='linear',
                          

In [13]:
# Majority-vote class predictions for the held-out rows.
y_pred = voting_clf_hard.predict(X=x_test)

In [15]:
from sklearn.metrics import accuracy_score

# Fraction of held-out samples the hard-voting ensemble labels correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6883116883116883

In [16]:
# Score each base learner, then the hard-voting ensemble, on the same
# train/test split so the accuracies are directly comparable.
for clf_hard in [log_clf, svc_clf, naive_clf, voting_clf_hard]:
    # fit() hands back the fitted estimator, so predict() can be chained on it.
    y_pred = clf_hard.fit(x_train, y_train).predict(x_test)
    print(type(clf_hard).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.6948051948051948
SVC 0.6948051948051948
GaussianNB 0.6688311688311688
VotingClassifier 0.6883116883116883


In [17]:
# Soft voting needs class probabilities from every base learner, so this SVC
# is built with probability=True.
# The probability estimates are calibrated with an internal cross-validation
# that shuffles the data; random_state pins that shuffle so the soft-voting
# results are reproducible from run to run.
svc_clf_soft = SVC(
    C=1,
    kernel='linear',
    gamma='auto',
    probability=True,
    random_state=42,
)

In [19]:
# Soft voting: the ensemble combines the base learners' predicted class
# probabilities. The weights follow the order of `estimators`, so the SVC
# counts twice as much as each of the other two learners.
voting_clf_soft = VotingClassifier(
    voting='soft',
    weights=[0.25, 0.5, 0.25],
    estimators=[
        ('lr', log_clf),
        ('svc', svc_clf_soft),
        ('naive', naive_clf),
    ],
)

In [20]:
# Score each base learner, then the soft-voting ensemble, on the same
# train/test split so the accuracies are directly comparable.
for clf_soft in [log_clf, svc_clf_soft, naive_clf, voting_clf_soft]:
    # fit() hands back the fitted estimator, so predict() can be chained on it.
    y_pred = clf_soft.fit(x_train, y_train).predict(x_test)
    print(type(clf_soft).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.6948051948051948
SVC 0.6948051948051948
GaussianNB 0.6688311688311688
VotingClassifier 0.6818181818181818
