In [1]:
import pandas as pd
import numpy as np
import copy as cp

from sklearn.datasets import make_classification

from sklearn.model_selection import KFold, train_test_split
from typing import Tuple

from sklearn.linear_model import LogisticRegression
#from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier


from statistics import mode
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import accuracy_score

from sklearn.metrics import roc_curve, precision_recall_curve

In [2]:
RANDOM_STATE : int = 42
N_SAMPLES : int = 10000
N_FEATURES : int = 25
N_CLASSES : int = 3
N_CLUSTERS_PER_CLASS : int = 2
    
FEATURE_NAME_PREFIX : str = "Feature"
TARGET_NAME : str = "Target"
    
N_SPLITS : int = 5

In [3]:
#X, y = make_classification(n_samples=N_SAMPLES, n_features=N_FEATURES, n_classes=N_CLASSES, n_informative=(N_CLASSES * N_CLUSTERS_PER_CLASS), random_state=RANDOM_STATE)

In [4]:
def make_classification_dataframe(n_samples : int = 10000, n_features : int = 25, n_classes : int = 2, n_clusters_per_class : int = 2, feature_name_prefix : str = "Feature", target_name : str = "Target", random_state : int = 42) -> pd.DataFrame:
    X, y = make_classification(n_samples=n_samples, n_features=n_features, n_classes=n_classes, n_informative = n_classes * n_clusters_per_class, random_state=RANDOM_STATE)

    feature_names = [feature_name_prefix + " " + str(v) for v in np.arange(1, N_FEATURES+1)]
    return pd.concat([pd.DataFrame(X, columns=feature_names), pd.DataFrame(y, columns=[target_name])], axis=1)

In [5]:
df_data = make_classification_dataframe(n_samples=N_SAMPLES, n_features=N_FEATURES, n_classes=N_CLASSES, n_clusters_per_class=N_CLUSTERS_PER_CLASS, feature_name_prefix=FEATURE_NAME_PREFIX, target_name=TARGET_NAME, random_state=RANDOM_STATE)
df_data.head()

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,...,Feature 17,Feature 18,Feature 19,Feature 20,Feature 21,Feature 22,Feature 23,Feature 24,Feature 25,Target
0,-0.131637,2.281512,0.46881,0.707735,1.628051,0.622273,-0.434003,-0.992722,0.053795,-1.764985,...,-1.673779,0.276305,-1.685462,-0.801336,0.806151,0.369108,-0.843748,0.966868,-0.547149,2
1,-1.231544,-1.58088,0.684543,-0.343771,0.498176,-0.008396,-0.859592,-0.666477,-0.832989,-0.287655,...,0.341136,1.116596,1.134896,1.232907,1.295312,-0.253926,-0.528711,0.502124,0.896065,1
2,-1.301585,-1.922563,-0.623878,-0.740534,-0.723667,1.484827,1.227018,-0.050878,0.164059,0.301672,...,-0.90029,0.682905,0.680959,-0.02355,0.932216,0.109495,0.500366,0.956182,-2.268742,0
3,-0.899385,0.991619,0.494529,-0.672954,0.421605,-0.271674,1.245351,0.146567,0.389313,1.479558,...,-0.285753,-1.446158,-0.062296,0.583408,1.588965,0.412651,-1.891714,-0.575163,0.786847,0
4,-3.026721,0.745777,0.18845,-0.794256,1.40257,1.057481,0.454773,-0.174391,0.951417,-0.403872,...,0.959229,-1.964891,-0.296422,-0.755737,-0.489769,0.516726,-4.807225,1.215506,0.799321,1


In [6]:
def cross_val_predict_old(model, kfold : KFold, X : np.array, y : np.array, target_type : type = int) -> Tuple[np.array, np.array, np.array]:

    model_ = cp.deepcopy(model)
    
    no_rows = X.shape[0]
    no_classifications = len(np.unique(y))
    
    actual_classes = np.array([])
    predicted_classes = np.array([])
    predicted_proba = np.array([])

    splits = kfold.split(X)
    
    for train_ndx, test_ndx in splits:

        train_X, train_y, test_X, test_y = X[train_ndx], y[train_ndx], X[test_ndx], y[test_ndx]

        actual_classes = np.append(actual_classes, test_y)

        model_.fit(train_X, train_y)
        predicted_classes = np.append(predicted_classes, model_.predict(test_X))
        try:
            predicted_proba = np.append(predicted_proba, model_.predict_proba(test_X))
            print(predicted_proba.shape)
            print(model_.predict_proba(test_X).shape)
        except AttributeError:
            # When the model is a VotingClassifier with voting="hard" the following error is generated - AttributeError: predict_proba is not available when voting='hard'. In this instance the predicted_probas array is filled with float zeroes with the same number of rows as the source data set and a number of columns equal to the number of classes in the classification
            predicted_proba = np.zeros((no_rows, no_classifications), dtype=float)

    return actual_classes.astype(target_type), predicted_classes.astype(target_type), predicted_proba.reshape(no_rows, no_classifications) # Reshape to the number of rows in the source features and the number of unique classes that appear in the target. For example 10,000 data points with y = 0 or 1 will have produced an array (20000,) in shape that needs to be reshaped to (10000,2)

In [66]:
def cross_val_predict(model, kfold : KFold, X : np.array, y : np.array, target_type : type = int) -> Tuple[np.array, np.array, np.array]:

    model_ = cp.deepcopy(model)
    
    #no_rows = X.shape[0]
    no_classifications = len(np.unique(y))
    
    actual_classes = np.array([])
    predicted_classes = np.array([])
    #predicted_proba = np.array([])
    predicted_proba = np.empty([0, no_classifications])

    splits = kfold.split(X)
    
    for train_ndx, test_ndx in splits:

        train_X, train_y, test_X, test_y = X[train_ndx], y[train_ndx], X[test_ndx], y[test_ndx]

        actual_classes = np.append(actual_classes, test_y)

        model_.fit(train_X, train_y)
        predicted_classes = np.append(predicted_classes, model_.predict(test_X))
        try:
            predicted_proba = np.append(predicted_proba, model_.predict_proba(test_X), axis=0)
            print(predicted_proba.shape)
            print(model_.predict_proba(test_X).shape)
        except AttributeError:
            # When the model is a VotingClassifier with voting="hard" the following error is generated - AttributeError: predict_proba is not available when voting='hard'. In this instance the predicted_probas array is filled with float zeroes with the same number of rows as the source data set and a number of columns equal to the number of classes in the classification
            #predicted_proba = np.zeros((no_rows, no_classifications), dtype=float)
            predicted_proba = np.append(predicted_proba, np.zeros_like(test_X, dtype=float))

    #return actual_classes.astype(target_type), predicted_classes.astype(target_type), predicted_proba.reshape(no_rows, no_classifications) # Reshape to the number of rows in the source features and the number of unique classes that appear in the target. For example 10,000 data points with y = 0 or 1 will have produced an array (20000,) in shape that needs to be reshaped to (10000,2)
    return actual_classes.astype(target_type), predicted_classes.astype(target_type), predicted_proba 

In [None]:
rf_predicted_proba.shape
GH_ = np.zeros()

In [44]:
rf_predicted_proba.shape

(10000, 3)

In [51]:
test1 = rf_predicted_proba[0:2000]
test2 = rf_predicted_proba[2000:4000]
test3 = rf_predicted_proba[4000:6000]

test1.shape, test2.shape, test3.shape

((2000, 3), (2000, 3), (2000, 3))

In [65]:
GH = np.empty([0, 3])
GH

array([], shape=(0, 3), dtype=float64)

In [62]:
GH = np.append(GH, test1, axis=0)
GH.shape

(2000, 3)

In [63]:
GH = np.append(GH, test2, axis=0)
GH.shape

(4000, 3)

In [9]:
#GH = np.empty([0,2])
#GH.shape

In [10]:
#predicted_proba = np.array([0,2])
#predicted_proba.shape

In [11]:
kfold = KFold(n_splits=N_SPLITS, random_state=RANDOM_STATE, shuffle=True)

In [12]:
lr = LogisticRegression(random_state=RANDOM_STATE)
rf = RandomForestClassifier(random_state=RANDOM_STATE)
xg = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE)
#cb = CatBoostClassifier(silent=True, random_state=RANDOM_STATE)
xt = ExtraTreesClassifier(random_state=RANDOM_STATE)

In [13]:
X = df_data.drop([TARGET_NAME], axis=1)
y = df_data[TARGET_NAME]

In [67]:
%%time
actual, lr_predicted, lr_predicted_proba = cross_val_predict(lr, kfold, X.to_numpy(), y.to_numpy())

(2000, 3)
(2000, 3)
(4000, 3)
(2000, 3)
(6000, 3)
(2000, 3)
(8000, 3)
(2000, 3)
(10000, 3)
(2000, 3)
Wall time: 865 ms


In [68]:
lr_predicted_proba

array([[0.44910829, 0.24542133, 0.30547038],
       [0.3697233 , 0.34403066, 0.28624605],
       [0.26560338, 0.43007386, 0.30432276],
       ...,
       [0.38909524, 0.26281518, 0.34808958],
       [0.15035778, 0.03440465, 0.81523757],
       [0.02246867, 0.84043645, 0.13709489]])

In [15]:
lr_predicted_proba

array([[0.44910829, 0.24542133, 0.30547038],
       [0.3697233 , 0.34403066, 0.28624605],
       [0.26560338, 0.43007386, 0.30432276],
       ...,
       [0.38909524, 0.26281518, 0.34808958],
       [0.15035778, 0.03440465, 0.81523757],
       [0.02246867, 0.84043645, 0.13709489]])

In [16]:
lr_predicted_proba.shape

(10000, 3)

In [69]:
%%time
actual, rf_predicted, rf_predicted_proba = cross_val_predict(rf, kfold, X.to_numpy(), y.to_numpy())

(2000, 3)
(2000, 3)
(4000, 3)
(2000, 3)
(6000, 3)
(2000, 3)
(8000, 3)
(2000, 3)
(10000, 3)
(2000, 3)
Wall time: 1min 2s


In [18]:
%%time
actual, xg_predicted, xg_predicted_proba = cross_val_predict(xg, kfold, X.to_numpy(), y.to_numpy())

(6000,)
(2000, 3)
(12000,)
(2000, 3)
(18000,)
(2000, 3)
(24000,)
(2000, 3)
(30000,)
(2000, 3)
Wall time: 1min 4s


In [19]:
%%time
actual, xt_predicted, xt_predicted_proba = cross_val_predict(xt, kfold, X.to_numpy(), y.to_numpy())

(6000,)
(2000, 3)
(12000,)
(2000, 3)
(18000,)
(2000, 3)
(24000,)
(2000, 3)
(30000,)
(2000, 3)
Wall time: 25.2 s


In [20]:
def soft_voting(predicted_probas : np.array) -> np.array:

    no_voters = predicted_probas.shape[0]
    no_rows = predicted_probas.shape[1]
    no_cols = predicted_probas.shape[2]
    
    sv_predicted_proba = np.empty(shape=(no_rows, no_cols))
    sv_predicted_proba.fill(0)
    
    for i in range(0, no_cols - 1):
        for j in range(0, no_voters):
            sv_predicted_proba[:, i] += predicted_probas[j][:, i]
        sv_predicted_proba[:, i] /= no_voters
    
    sv_predicted_proba[:,-1] = 1 - sv_predicted_proba.sum(axis=1)
    sv_predicted = sv_predicted_proba.argmax(axis=1)

    
    return sv_predicted_proba, sv_predicted

In [21]:
def hard_voting(predictions : np.array) -> np.array:
    return [mode(v) for v in predictions.T]

Firstly we need to understand the ``predicted_probas`` array by taking a look at its shape ...

In [22]:
predicted_probas = np.array([lr_predicted_proba, rf_predicted_proba, xg_predicted_proba, xt_predicted_proba])
predicted_probas.shape

(4, 10000, 3)

It is a 3 dimensional array. The first dimension is the number of classification algorithms, the second the number of rows in the data that predictions are held for and the third is the number of classes. Remember this is not a binary classification, we set the number of classes to 3 when the data was generated.

This sets things up to set the 3 variables for the number of voters, rows and columns which is not strictly speaking necessary but it does make the subsequent code more readable ...

In [23]:
no_voters = predicted_probas.shape[0]
no_rows = predicted_probas.shape[1]
no_cols = predicted_probas.shape[2]

print(no_voters, no_rows, no_cols)

4 10000 3


We also need to remind ourselves what the ``predicted_proba`` property looks like for a scikit-learn classifier ...

In [24]:
rf_predicted_proba

array([[0.17, 0.02, 0.81],
       [0.58, 0.07, 0.35],
       [0.54, 0.1 , 0.36],
       ...,
       [0.46, 0.08, 0.46],
       [0.15, 0.  , 0.85],
       [0.01, 0.97, 0.02]])

The Random Forest ``predicted_proba`` property contains an array with 10,000 rows and 3 columns. It has one row for each data point and one column for each of the classifications, remembering that this is not a binary classification, our classification can have 3 values ...

In [25]:
df_data[TARGET_NAME].value_counts()

0    3353
1    3330
2    3317
Name: Target, dtype: int64

So, for the first row of data the Random Forest has predicted a 17% probability of the row belonging to class 0, a 2% probability of class 1 and an 81% probability of class 2. This means that the data in the first row is predicted as belonging to class 2 ...

In [26]:
rf_predicted[0]

2

The next thing the code in the helpder function does is to create an empty array that is going to hold the result of the soft voting. This needs to have 10,000 rows (one for every data point) and three columns (one for the probability of the first class, one for the second and one for the third) ...

In [27]:
sv_predicted_proba = np.empty(shape=(no_rows, no_cols))
sv_predicted_proba.fill(0)

print(sv_predicted_proba.shape)
sv_predicted_proba

(10000, 3)


array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       ...,
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

Next for the main iteration. It will iterate over all the columns except the last one. This seems a bit strange as we need a probability for all of the classifiers but hold on for a bit and we will come back to that ...

In [28]:
print(*range(0, no_cols - 1)) # see https://stackoverflow.com/questions/18424899/print-range-of-numbers-on-same-line 

0 1


Let's take the whole iteration ...

In [29]:
for i in range(0, no_cols - 1):
    for j in range(0, no_voters):
        sv_predicted_proba[:, i] += predicted_probas[j][:, i]
    sv_predicted_proba[:, i] /= no_voters

We are iterating over every column / classifier except the last one. For each column we are adding the predicted probability of each of our algorithms together and dividing by the number of algorithms. When the iteration completes, the result looks like this ...

In [30]:
sv_predicted_proba

array([[0.25130534, 0.07156967, 0.        ],
       [0.61069015, 0.13897641, 0.        ],
       [0.44524877, 0.17840206, 0.        ],
       ...,
       [0.53029539, 0.11065982, 0.        ],
       [0.10525607, 0.01649241, 0.        ],
       [0.02081474, 0.93477913, 0.        ]])

The final column / classification is blank where as if we had iterated over all of the columns / classifiers it would have been populated. The problem is that had we done so the sum of each row would not always be exactly one due to rounding errors.

Therefore, rather than using the iteration to set all three columns / classifiers an additional line of code sets the third one to be 1 minus the sum of the others ...

In [31]:
sv_predicted_proba[:,-1] = 1 - sv_predicted_proba.sum(axis=1)

In [32]:
sv_predicted_proba

array([[0.25130534, 0.07156967, 0.67712499],
       [0.61069015, 0.13897641, 0.25033344],
       [0.44524877, 0.17840206, 0.37634917],
       ...,
       [0.53029539, 0.11065982, 0.35904478],
       [0.10525607, 0.01649241, 0.87825152],
       [0.02081474, 0.93477913, 0.04440614]])

And there we have it! The ``sv_predicted_proba`` array contains a soft-voted version of the probabilities for each class based on the average of the ``predicted_probas`` property from each of the contributing algorithms,

However, we still need to know the class prediction (i.e. for each row did the soft vote predict class 0, 1 or 2). Fortunately the wonderful ``argmax`` function in the ``numpy`` library enables to do this in a single line of code ...

In [33]:
sv_predicted = sv_predicted_proba.argmax(axis=1)
sv_predicted

array([2, 0, 0, ..., 0, 2, 1], dtype=int64)

The ``argmax`` function simply picks the index of the highest value in an array along the axis specified in the ``axis`` parameter, so it picks 2 for the first row, 0 for the second, zero for the third etc.

At this point we have hand crafted the predicted probabilities and the predicted classes using the soft voting algorithm and in the process of writing the code from scratch we have attained a full understanding of exactly how soft voting works.

Hard voting is subtly different. Whereas soft voting averages the probability, hard voting picks the class that the majority of the algorithms voted for, for example ...

- Class 2, Class 2, Class 0 = Class 2
- Class 1, Class 1, Class 2 = Class 1
- Class 0, Class 0, Class 0 = Class 0
- Class 0, Class 1, Class 2 = Class 0 (actually, it would not matter which one is picked here as they have one vote each)

The hard voting algorithm can be implemented in a single line of code using Python list comprehension and ``numpy`` array functions ...

In [34]:
predictions = np.array([rf_predicted, xg_predicted, xt_predicted])

hv_predicted = [mode(v) for v in predictions.T] # Single line of code to implement the hard voting algorithm ...

The ``predictions.T`` syntax just transposes the array of arrays so that instead of 3 rows and 10,000 columns it is 10,000 rowns and 3 columns ...

In [35]:
print(predictions.shape)
predictions

(3, 10000)


array([[2, 0, 0, ..., 0, 2, 1],
       [2, 0, 2, ..., 0, 2, 1],
       [2, 0, 0, ..., 0, 2, 1]])

In [36]:
print(predictions.T.shape)
predictions.T

(10000, 3)


array([[2, 2, 2],
       [0, 0, 0],
       [0, 2, 0],
       ...,
       [0, 0, 0],
       [2, 2, 2],
       [1, 1, 1]])

The list comprehension then effectively takes each element (row) and applies ``statistics.mode`` to it, thereby selecting the classification that received the most votes from the algorithms ...

In [37]:
np.array(hv_predicted) # Shows the result for the 1st 3 and last 3 rows as displayed in the previous code cell

array([2, 0, 0, ..., 0, 2, 1])

Now that we have a full understanding of the helpder functions and a deep understanding of how soft and hard voting works those helpder functions can be re-used to generate the results.

Note that I included the Logistic Regression in the example code above to purposely create an array with shape 4, 10000, 3 to avoid confusion in the subsequent explanation.

Through trial-and-error I found that the logistic regression has low accuracy for this dataset, hence I have excluded it in the final run to help illustrate the point that soft and hard voting genuinely does improve algorithm accuracy ...

In [38]:
sv_predicted_proba, sv_predicted = soft_voting(np.array([rf_predicted_proba, xg_predicted_proba, xt_predicted_proba]))
hv_predicted = hard_voting(np.array([rf_predicted, xg_predicted, xt_predicted]))

In [39]:
print(f"Accuracy of Logistic Regression: {accuracy_score(actual, lr_predicted)}")
print(f"Accuracy of Random Forest: {accuracy_score(actual, rf_predicted)}")
print(f"Accuracy of XG Boost: {accuracy_score(actual, xg_predicted)}")
print(f"Accuracy of Extra Random Trees: {accuracy_score(actual, xt_predicted)}")
print(f"Accuracy of Soft Voting: {accuracy_score(actual, sv_predicted)}")
print(f"Accuracy of Hard Voting: {accuracy_score(actual, hv_predicted)}")

Accuracy of Logistic Regression: 0.6821
Accuracy of Random Forest: 0.8742
Accuracy of XG Boost: 0.8838
Accuracy of Extra Random Trees: 0.8754
Accuracy of Soft Voting: 0.8868
Accuracy of Hard Voting: 0.881


Armed with that understanding we can revert to using the implementations found in the scikit-learn library safe with the knowledge that we fully understand what they are doing and how they work ...

In [40]:
from sklearn.ensemble import VotingClassifier

estimators=[('rf', rf), ('xg', xg), ('xt', xt)]

vc_sv = VotingClassifier(estimators=estimators, voting="soft")
vc_hv = VotingClassifier(estimators=estimators, voting="hard")

In [41]:
%%time
actual, vc_sv_predicted, vc_sv_predicted_proba = cross_val_predict(vc_sv, kfold, X.to_numpy(), y.to_numpy())

(6000,)
(2000, 3)
(12000,)
(2000, 3)
(18000,)
(2000, 3)
(24000,)
(2000, 3)
(30000,)
(2000, 3)
Wall time: 2min 45s


In [42]:
%%time
actual, vc_hv_predicted, _ = cross_val_predict(vc_hv, kfold, X.to_numpy(), y.to_numpy())

Wall time: 2min 28s


https://www.kaggle.com/saurabhshahane/voting-classifier

In [43]:
print(f"Accuracy of SciKit-Learn Soft Voting: {accuracy_score(actual, vc_sv_predicted)}")
print(f"Accuracy of SciKit-Learn Hard Voting: {accuracy_score(actual, vc_hv_predicted)}")

Accuracy of SciKit-Learn Soft Voting: 0.8868
Accuracy of SciKit-Learn Hard Voting: 0.881
