# How To Achieve a Deep Understanding of Soft and Hard Voting in Ensemble Machine Learning Methods

## How to fully understand what is going on inside soft and hard voting by building the algorithm that performs the votes from scratch

![element5-digital-T9CXBZLUvic-unsplash.jpg](attachment:element5-digital-T9CXBZLUvic-unsplash.jpg)
Photo by <a href="https://unsplash.com/@element5digital?utm_source=unsplash&utm_medium=referral&utm_content=creditCopyText">Element5 Digital</a> on <a href="https://unsplash.com/s/photos/vote?utm_source=unsplash&utm_medium=referral&utm_content=creditCopyText">Unsplash</a>

### Background


In [1]:
import pandas as pd
import numpy as np
import copy as cp

from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from typing import Tuple
from statistics import mode
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier

# Seed passed to the data generator, the KFold splitter and every classifier below.
RANDOM_STATE : int = 42
# Size and shape of the synthetic dataset requested from make_classification_dataframe.
N_SAMPLES : int = 10000
N_FEATURES : int = 25
N_CLASSES : int = 3
N_CLUSTERS_PER_CLASS : int = 2
    
# Column naming used when the generated arrays are wrapped in a DataFrame.
FEATURE_NAME_PREFIX : str = "Feature"
TARGET_NAME : str = "Target"
    
# Number of folds handed to the KFold cross-validation splitter.
N_SPLITS : int = 5

In [2]:
def make_classification_dataframe(n_samples : int = 10000, n_features : int = 25, n_classes : int = 2, n_clusters_per_class : int = 2, feature_name_prefix : str = "Feature", target_name : str = "Target", random_state : int = 42) -> pd.DataFrame:
    """Generate a synthetic classification dataset and return it as a single DataFrame.

    Thin wrapper around ``sklearn.datasets.make_classification`` that labels the
    feature columns and appends the target as the last column.

    Args:
        n_samples: Number of rows to generate.
        n_features: Total number of feature columns.
        n_classes: Number of distinct target classes.
        n_clusters_per_class: Number of clusters generated for each class.
        feature_name_prefix: Prefix for feature column names; columns are named
            ``"<prefix> 1"`` .. ``"<prefix> n_features"``.
        target_name: Name of the target column.
        random_state: Seed forwarded to ``make_classification``.

    Returns:
        A DataFrame with ``n_features`` feature columns followed by the target column.
    """
    # Every argument is taken from the parameters (not from notebook-level
    # constants) so the function honours whatever the caller passes in.
    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_classes=n_classes,
        n_clusters_per_class=n_clusters_per_class,
        # One informative feature per cluster across all classes.
        n_informative=n_classes * n_clusters_per_class,
        random_state=random_state,
    )

    # Column labels must match the width of X, i.e. n_features.
    feature_names = [feature_name_prefix + " " + str(v) for v in np.arange(1, n_features + 1)]
    return pd.concat([pd.DataFrame(X, columns=feature_names), pd.DataFrame(y, columns=[target_name])], axis=1)

# Build the synthetic dataset from the notebook-level constants.
df_data = make_classification_dataframe(n_samples=N_SAMPLES, n_features=N_FEATURES, n_classes=N_CLASSES, n_clusters_per_class=N_CLUSTERS_PER_CLASS, feature_name_prefix=FEATURE_NAME_PREFIX, target_name=TARGET_NAME, random_state=RANDOM_STATE)

# Split the frame into a feature matrix and a target vector held as plain numpy arrays.
X = df_data.drop([TARGET_NAME], axis=1).to_numpy()
y = df_data[TARGET_NAME].to_numpy()

# Display the first rows as a quick sanity check.
df_data.head()

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,...,Feature 17,Feature 18,Feature 19,Feature 20,Feature 21,Feature 22,Feature 23,Feature 24,Feature 25,Target
0,-0.131637,2.281512,0.46881,0.707735,1.628051,0.622273,-0.434003,-0.992722,0.053795,-1.764985,...,-1.673779,0.276305,-1.685462,-0.801336,0.806151,0.369108,-0.843748,0.966868,-0.547149,2
1,-1.231544,-1.58088,0.684543,-0.343771,0.498176,-0.008396,-0.859592,-0.666477,-0.832989,-0.287655,...,0.341136,1.116596,1.134896,1.232907,1.295312,-0.253926,-0.528711,0.502124,0.896065,1
2,-1.301585,-1.922563,-0.623878,-0.740534,-0.723667,1.484827,1.227018,-0.050878,0.164059,0.301672,...,-0.90029,0.682905,0.680959,-0.02355,0.932216,0.109495,0.500366,0.956182,-2.268742,0
3,-0.899385,0.991619,0.494529,-0.672954,0.421605,-0.271674,1.245351,0.146567,0.389313,1.479558,...,-0.285753,-1.446158,-0.062296,0.583408,1.588965,0.412651,-1.891714,-0.575163,0.786847,0
4,-3.026721,0.745777,0.18845,-0.794256,1.40257,1.057481,0.454773,-0.174391,0.951417,-0.403872,...,0.959229,-1.964891,-0.296422,-0.755737,-0.489769,0.516726,-4.807225,1.215506,0.799321,1


In [3]:
def cross_val_predict(model, kfold : KFold, X : np.array, y : np.array) -> Tuple[np.array, np.array, np.array]:
    """Collect out-of-fold predictions for ``model`` over every split of ``kfold``.

    A deep copy of ``model`` is fitted on each training split and used to
    predict the matching test split. Results are concatenated in the order the
    folds are produced, so rows follow fold order rather than the order of ``X``.

    Args:
        model: Estimator exposing ``fit`` and ``predict`` and, optionally,
            ``predict_proba``. The object passed in is never fitted.
        kfold: Splitter whose ``split(X)`` yields ``(train_idx, test_idx)`` pairs.
        X: Feature matrix, indexable by an array of row indices.
        y: Target vector aligned with ``X``.

    Returns:
        A tuple ``(actual_classes, predicted_classes, predicted_proba)``.
        ``predicted_proba`` has one column per distinct value in ``y``; folds
        for which the model offers no probability estimates are filled with zeros.

    Raises:
        ValueError: If a fold's probability matrix does not have one column per
            distinct value in ``y`` (raised when the folds are concatenated).
    """
    # Work on a private copy so the caller's estimator is left untouched.
    model_ = cp.deepcopy(model)

    no_classes = len(np.unique(y))

    # Collect per-fold pieces and concatenate once at the end; the empty seed
    # arrays keep the result shapes well-defined even if no fold is produced.
    actual_parts = [np.empty([0], dtype=int)]
    predicted_parts = [np.empty([0], dtype=int)]
    proba_parts = [np.empty([0, no_classes])]

    for train_ndx, test_ndx in kfold.split(X):
        train_X, train_y = X[train_ndx], y[train_ndx]
        test_X, test_y = X[test_ndx], y[test_ndx]

        actual_parts.append(np.ravel(test_y))

        model_.fit(train_X, train_y)
        predicted_parts.append(np.ravel(model_.predict(test_X)))

        try:
            fold_proba = model_.predict_proba(test_X)
        except (AttributeError, NotImplementedError):
            # Best-effort fallback for estimators without probability output:
            # keep the row count aligned by inserting a block of zeros.
            fold_proba = np.zeros((len(test_X), no_classes), dtype=float)
        proba_parts.append(fold_proba)

    return np.concatenate(actual_parts), np.concatenate(predicted_parts), np.concatenate(proba_parts, axis=0)

In [4]:
# Baseline model and the shuffled, seeded splitter that later cells reuse.
lr = LogisticRegression(random_state=RANDOM_STATE)
kfold = KFold(n_splits=N_SPLITS, random_state=RANDOM_STATE, shuffle=True)

# %time is an IPython magic, so this line only runs inside Jupyter/IPython.
%time actual, lr_predicted, lr_predicted_proba = cross_val_predict(lr, kfold, X, y)
print(f"Accuracy of Logistic Regression: {accuracy_score(actual, lr_predicted)}")
# Display the out-of-fold class predictions.
lr_predicted

Wall time: 284 ms
Accuracy of Logistic Regression: 0.6821


array([0, 0, 1, ..., 0, 2, 1])

In [5]:
# The three tree-based voters, keyed by the display name used in the printed results.
classifiers = dict()
classifiers["Random Forrest"] = RandomForestClassifier(random_state=RANDOM_STATE)
classifiers["XG Boost"] = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE)
classifiers["Extra Random Trees"] = ExtraTreesClassifier(random_state=RANDOM_STATE)

# One slot per classifier, filled by index inside the loop below.
predictions = [None] * len(classifiers)
predicted_probas = [None] * len(classifiers)

# Cross-validate each classifier with the shared kfold splitter and report its accuracy.
for i, (name, classifier) in enumerate(classifiers.items()):
    %time actual, predictions[i], predicted_probas[i] = cross_val_predict(classifier, kfold, X, y)
    print(f"Accuracy of {name}: {accuracy_score(actual, predictions[i])}")

# Display the list of per-classifier prediction arrays.
predictions    

Wall time: 13.8 s
Accuracy of Random Forrest: 0.8742
Wall time: 15.6 s
Accuracy of XG Boost: 0.8838
Wall time: 5.14 s
Accuracy of Extra Random Trees: 0.8754


[array([2, 0, 0, ..., 0, 2, 1]),
 array([2, 0, 2, ..., 0, 2, 1], dtype=int64),
 array([2, 0, 0, ..., 0, 2, 1])]

In [6]:
def soft_voting(predicted_probas : np.array) -> Tuple[np.ndarray, np.ndarray]:
    """Combine per-model class probabilities by soft voting.

    Args:
        predicted_probas: Array-like of shape ``(n_voters, n_rows, n_classes)``
            holding each voter's class probabilities. A plain list of
            per-model 2-D arrays is accepted as well as a 3-D ndarray.

    Returns:
        A tuple ``(sv_predicted_proba, sv_predicted)`` where
        ``sv_predicted_proba`` is the ``(n_rows, n_classes)`` matrix of averaged
        probabilities and ``sv_predicted`` is the index of the highest
        averaged probability in each row.

    Raises:
        ValueError: If ``predicted_probas`` is not 3-dimensional.
    """
    # float64 matches the accumulator the averages are stored in.
    probas = np.asarray(predicted_probas, dtype=float)
    if probas.ndim != 3:
        raise ValueError(
            f"predicted_probas must have shape (n_voters, n_rows, n_classes); got {probas.shape}"
        )
    no_voters, no_rows, no_cols = probas.shape

    sv_predicted_proba = np.zeros((no_rows, no_cols))

    # Average every class column except the last across the voters.
    sv_predicted_proba[:, :-1] = probas[:, :, :-1].sum(axis=0) / no_voters

    # The last column is derived as the remainder rather than averaged, so
    # each row sums to one despite floating-point rounding. It is still
    # zero at this point, so the row sum covers only the averaged columns.
    sv_predicted_proba[:, -1] = 1 - sv_predicted_proba.sum(axis=1)
    sv_predicted = sv_predicted_proba.argmax(axis=1)

    return sv_predicted_proba, sv_predicted

def hard_voting(predictions : np.array) -> list:
    """Combine per-model class predictions by majority (hard) voting.

    Args:
        predictions: Array-like of shape ``(n_voters, n_rows)`` holding each
            voter's predicted class per row. A plain list of per-model
            sequences is accepted as well as a 2-D ndarray.

    Returns:
        A list with one entry per row: the class predicted by the most voters.
        Ties are resolved by ``statistics.mode``; per its documentation it
        returns the first mode encountered on Python 3.8+ and raises
        ``StatisticsError`` on earlier versions.
    """
    # Convert first so list input also supports the transpose below.
    votes = np.asarray(predictions)
    # Transposing yields one row of votes (one per voter) for each data point.
    return [mode(row_votes) for row_votes in votes.T]

In [88]:
def soft_voting_v2(predicted_probas : np.array) -> Tuple[np.ndarray, np.ndarray]:
    """Vectorised soft voting: average the voters' probabilities with ``np.mean``.

    Args:
        predicted_probas: Array-like of shape ``(n_voters, n_rows, n_classes)``
            holding each voter's class probabilities.

    Returns:
        A tuple ``(sv_predicted_proba, sv_predicted)`` where
        ``sv_predicted_proba`` is the ``(n_rows, n_classes)`` matrix of averaged
        probabilities and ``sv_predicted`` is the index of the highest
        averaged probability in each row.
    """
    # np.mean returns a new array, so the caller's input is not modified below.
    sv_predicted_proba = np.mean(predicted_probas, axis=0)

    no_classes = sv_predicted_proba.shape[1]

    # Replace the averaged last column with the remainder of the other
    # columns so each row sums to one despite floating-point rounding.
    # Only this function's own locals are used here, never notebook globals.
    sv_predicted_proba[:, no_classes - 1] = 0
    sv_predicted_proba[:, no_classes - 1] = 1 - np.sum(sv_predicted_proba, axis=1)

    sv_predicted = sv_predicted_proba.argmax(axis=1)

    return sv_predicted_proba, sv_predicted

In [7]:
# Stack the per-classifier result lists into single ndarrays so the voting helpers can index them by shape.
predicted_probas = np.array(predicted_probas)
predictions = np.array(predictions)

# Run both hand-written voting schemes over the out-of-fold results.
sv_predicted_proba, sv_predictions = soft_voting(predicted_probas)
hv_predictions = hard_voting(predictions)

print(f"Accuracy of Soft Voting: {accuracy_score(actual, sv_predictions)}")
print(f"Accuracy of Hard Voting: {accuracy_score(actual, hv_predictions)}")

Accuracy of Soft Voting: 0.8868
Accuracy of Hard Voting: 0.881


In [49]:
sv_predicted_proba

array([[0.18537103, 0.01361911, 0.80100986],
       [0.69101244, 0.07062499, 0.23836257],
       [0.50513057, 0.09451146, 0.40035797],
       ...,
       [0.57736211, 0.05994137, 0.36269652],
       [0.09022216, 0.01052167, 0.89925617],
       [0.02026342, 0.96622669, 0.01350989]])

In [69]:
GH = np.mean(predicted_probas, axis=0)

In [73]:
cols = GH.shape[1]

In [83]:
GH[:, cols-1] = 0

In [84]:
GH[:, cols-1] = 1- np.sum(GH, axis=1)

In [85]:
GH

array([[0.18537103, 0.01361911, 0.80100986],
       [0.69101244, 0.07062499, 0.23836257],
       [0.50513057, 0.09451146, 0.40035797],
       ...,
       [0.57736211, 0.05994137, 0.36269652],
       [0.09022216, 0.01052167, 0.89925617],
       [0.02026342, 0.96622669, 0.01350989]])

In [87]:
np.sum(GH[1])

1.0

In [68]:
GH[:, 2]

array([0.80100984, 0.23836258, 0.40035798, ..., 0.36269651, 0.89925616,
       0.01350989])

In [28]:
alg_0_class_0 = predicted_probas[0][:,0]
alg_1_class_0 = predicted_probas[1][:,0]
alg_2_class_0 = predicted_probas[2][:,0]

In [35]:
all_algs_class_0 = np.array([alg_0_class_0, alg_1_class_0, alg_2_class_0]).T

In [41]:
alg_2_class_0

array([0.33, 0.54, 0.51, ..., 0.52, 0.1 , 0.05])

In [43]:
np.set_printoptions(suppress=True)

In [44]:
all_algs_class_0

array([[0.17      , 0.05611309, 0.33      ],
       [0.58      , 0.95303732, 0.54      ],
       [0.54      , 0.4653917 , 0.51      ],
       ...,
       [0.46      , 0.75208634, 0.52      ],
       [0.15      , 0.02066649, 0.1       ],
       [0.01      , 0.00079027, 0.05      ]])

In [48]:
np.mean(all_algs_class_0[1])

0.6910124405225119

In [45]:
np.mean(all_algs_class_0, axis=1)

array([0.18537103, 0.69101244, 0.50513057, ..., 0.57736211, 0.09022216,
       0.02026342])

Firstly we need to understand the ``predicted_probas`` array by taking a look at its shape ...

In [8]:
predicted_probas.shape

(3, 10000, 3)

It is a 3 dimensional array. The first dimension is the number of classification algorithms, the second the number of rows in the data that predictions are held for and the third is the number of classes. Remember this is not a binary classification, we set the number of classes to 3 when the data was generated.

This sets things up to set the 3 variables for the number of voters, rows and columns which is not strictly speaking necessary but it does make the subsequent code more readable ...

In [9]:
# Unpack the three axes of predicted_probas: voters, data rows, classes.
no_voters = predicted_probas.shape[0]
no_rows = predicted_probas.shape[1]
no_cols = predicted_probas.shape[2]

print(no_voters, no_rows, no_cols)

3 10000 3


We also need to remind ourselves what the output of the ``predict_proba`` method looks like for a scikit-learn classifier.

The ``lr_predicted_proba`` array collected from the logistic regression's ``predict_proba`` output has 10,000 rows and 3 columns. It has one row for each data point and one column for each class, remembering that this is not a binary classification; our target can take 3 values ...

In [10]:
df_data[TARGET_NAME].value_counts()

0    3353
1    3330
2    3317
Name: Target, dtype: int64

In [11]:
lr_predicted_proba

array([[0.44910829, 0.24542133, 0.30547038],
       [0.3697233 , 0.34403066, 0.28624605],
       [0.26560338, 0.43007386, 0.30432276],
       ...,
       [0.38909524, 0.26281518, 0.34808958],
       [0.15035778, 0.03440465, 0.81523757],
       [0.02246867, 0.84043645, 0.13709489]])

So, for the first row of data the logistic regression has predicted a 44.9% probability of the row belonging to class 0, a 24.5% probability of class 1 and a 30.5% probability of class 2. This means that the data in the first row is predicted as belonging to class 0 ...

The next thing the code in the helper function does is to create an empty array that is going to hold the result of the soft voting. This needs to have 10,000 rows (one for every data point) and three columns (one for the probability of the first class, one for the second and one for the third) ...

In [12]:
# Allocate the (rows x classes) result matrix, then zero it because np.empty leaves the contents uninitialised.
sv_predicted_proba = np.empty(shape=(no_rows, no_cols))
sv_predicted_proba.fill(0)

print(sv_predicted_proba.shape)
sv_predicted_proba

(10000, 3)


array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       ...,
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

Next for the main iteration. It will iterate over all the columns except the last one. This seems a bit strange as we need a probability for all of the classes, but hold on for a bit and we will come back to that ...

In [13]:
print(*range(0, no_cols - 1)) # see https://stackoverflow.com/questions/18424899/print-range-of-numbers-on-same-line 

0 1


Let's take the whole iteration ...

In [14]:
# i walks every class column except the last; j walks the voters.
for i in range(0, no_cols - 1):
    for j in range(0, no_voters):
        # Accumulate voter j's probability for class i across all rows at once.
        sv_predicted_proba[:, i] += predicted_probas[j][:, i]
    # Turn the per-class sum into the mean across voters.
    sv_predicted_proba[:, i] /= no_voters

We are iterating over every column / class except the last one. For each column we are adding the predicted probability of each of our algorithms together and dividing by the number of algorithms. When the iteration completes, the result looks like this ...

In [15]:
sv_predicted_proba

array([[0.18537103, 0.01361911, 0.        ],
       [0.69101244, 0.07062499, 0.        ],
       [0.50513057, 0.09451146, 0.        ],
       ...,
       [0.57736211, 0.05994137, 0.        ],
       [0.09022216, 0.01052167, 0.        ],
       [0.02026342, 0.96622669, 0.        ]])

The final column / class is still zero, whereas if we had iterated over all of the columns / classes it would have been populated. The problem is that had we done so, the sum of each row would not always be exactly one due to rounding errors.

Therefore, rather than using the iteration to set all three columns / classes, an additional line of code sets the third one to be 1 minus the sum of the others ...

In [16]:
# The last column is still zero here, so the row sum covers only the averaged columns.
sv_predicted_proba[:,-1] = 1 - sv_predicted_proba.sum(axis=1)
sv_predicted_proba

array([[0.18537103, 0.01361911, 0.80100986],
       [0.69101244, 0.07062499, 0.23836257],
       [0.50513057, 0.09451146, 0.40035797],
       ...,
       [0.57736211, 0.05994137, 0.36269652],
       [0.09022216, 0.01052167, 0.89925617],
       [0.02026342, 0.96622669, 0.01350989]])

And there we have it! The ``sv_predicted_proba`` array contains a soft-voted version of the probabilities for each class, based on the average of the ``predicted_probas`` entries from each of the contributing algorithms.

However, we still need to know the class prediction (i.e. for each row did the soft vote predict class 0, 1 or 2). Fortunately the wonderful ``argmax`` function in the ``numpy`` library enables us to do this in a single line of code ...

In [17]:
# For each row, take the column index (class) holding the highest averaged probability.
sv_predicted = sv_predicted_proba.argmax(axis=1)
sv_predicted

array([2, 0, 0, ..., 0, 2, 1], dtype=int64)

The ``argmax`` function simply picks the index of the highest value in an array along the axis specified in the ``axis`` parameter, so it picks 2 for the first row, 0 for the second, zero for the third etc.

At this point we have hand crafted the predicted probabilities and the predicted classes using the soft voting algorithm and in the process of writing the code from scratch we have attained a full understanding of exactly how soft voting works.

Hard voting is subtly different. Whereas soft voting averages the probability, hard voting picks the class that the majority of the algorithms voted for, for example ...

- Class 2, Class 2, Class 0 = Class 2
- Class 1, Class 1, Class 2 = Class 1
- Class 0, Class 0, Class 0 = Class 0
- Class 0, Class 1, Class 2 = Class 0 (actually, it would not matter which one is picked here as they have one vote each)

The hard voting algorithm can be implemented in a single line of code using a Python list comprehension, ``statistics.mode`` and the ``numpy`` array transpose ...

In [18]:
hv_predicted = [mode(v) for v in predictions.T] # Single line of code to implement the hard voting algorithm ...

The ``predictions.T`` syntax just transposes the array of arrays so that instead of 3 rows and 10,000 columns it is 10,000 rows and 3 columns ...

In [19]:
print(predictions.shape)
predictions

(3, 10000)


array([[2, 0, 0, ..., 0, 2, 1],
       [2, 0, 2, ..., 0, 2, 1],
       [2, 0, 0, ..., 0, 2, 1]], dtype=int64)

In [20]:
print(predictions.T.shape)
predictions.T

(10000, 3)


array([[2, 2, 2],
       [0, 0, 0],
       [0, 2, 0],
       ...,
       [0, 0, 0],
       [2, 2, 2],
       [1, 1, 1]], dtype=int64)

The list comprehension then effectively takes each element (row) and applies ``statistics.mode`` to it, thereby selecting the classification that received the most votes from the algorithms ...

In [21]:
np.array(hv_predicted) # Shows the result for the 1st 3 and last 3 rows as displayed in the previous code cell

array([2, 0, 0, ..., 0, 2, 1], dtype=int64)

Now that we have a full understanding of the helper functions and a deep understanding of how soft and hard voting work, those helper functions can be re-used to generate the results.

Note that through trial-and-error I found that the logistic regression has low accuracy for this dataset, hence I have excluded it in the final run to help illustrate the point that soft and hard voting genuinely do improve algorithm accuracy ...

In [22]:
# Recap each individual classifier's accuracy; predictions[i] follows the insertion order of classifiers.
for i, (name, classifier) in enumerate(classifiers.items()):
    print(f"Accuracy of {name}: {accuracy_score(actual, predictions[i])}")

# Compare against the hand-built soft and hard voting results from the walkthrough cells.
print(f"Accuracy of Soft Voting: {accuracy_score(actual, sv_predicted)}")
print(f"Accuracy of Hard Voting: {accuracy_score(actual, hv_predicted)}")  

Accuracy of Random Forrest: 0.8742
Accuracy of XG Boost: 0.8838
Accuracy of Extra Random Trees: 0.8754
Accuracy of Soft Voting: 0.8868
Accuracy of Hard Voting: 0.881


Armed with that understanding we can revert to using the implementations found in the scikit-learn library safe with the knowledge that we fully understand what they are doing and how they work ...

In [23]:
# Turn the classifiers dict into a list of (name, estimator) pairs for VotingClassifier.
estimators = list(classifiers.items())

vc_sv = VotingClassifier(estimators=estimators, voting="soft")
vc_hv = VotingClassifier(estimators=estimators, voting="hard")

# %time is an IPython magic, so these lines only run inside Jupyter/IPython.
# The probability output of the hard-voting run is discarded via `_`.
%time actual, vc_sv_predicted, vc_sv_predicted_proba = cross_val_predict(vc_sv, kfold, X, y)
%time actual, vc_hv_predicted, _ = cross_val_predict(vc_hv, kfold, X, y)

print(f"Accuracy of SciKit-Learn Soft Voting: {accuracy_score(actual, vc_sv_predicted)}")
print(f"Accuracy of SciKit-Learn Hard Voting: {accuracy_score(actual, vc_hv_predicted)}")

Wall time: 35.3 s
Wall time: 40.4 s
Accuracy of SciKit-Learn Soft Voting: 0.8868
Accuracy of SciKit-Learn Hard Voting: 0.881


https://www.kaggle.com/saurabhshahane/voting-classifier