https://www.kaggle.com/saurabhshahane/voting-classifier/notebook

In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier

In [2]:
RANDOM_STATE : int = 42
N_SAMPLES : int = 10000
N_FEATURES : int = 25
N_CLASSES : int = 3
N_CLUSTERS_PER_CLASS : int = 3
    
FEATURE_NAME_PREFIX : str = "Feature"
TARGET_NAME : str = "Target"

In [3]:
def make_classification_dataframe(n_samples : int = 10000, n_features : int = 25, n_classes : int = 2, n_clusters_per_class : int = 2, feature_name_prefix : str = "Feature", target_name : str = "Target", random_state : int = 42) -> pd.DataFrame:
    X, y = make_classification(n_samples=n_samples, n_features=n_features, n_classes=n_classes, n_informative = n_classes * n_clusters_per_class, random_state=RANDOM_STATE)

    feature_names = [feature_name_prefix + " " + str(v) for v in np.arange(1, N_FEATURES+1)]
    return pd.concat([pd.DataFrame(X, columns=feature_names), pd.DataFrame(y, columns=[target_name])], axis=1)

In [4]:
df_data = make_classification_dataframe(n_samples=N_SAMPLES, n_features=N_FEATURES, n_classes=N_CLASSES, n_clusters_per_class=N_CLUSTERS_PER_CLASS, feature_name_prefix=FEATURE_NAME_PREFIX, target_name=TARGET_NAME, random_state=RANDOM_STATE)
df_data.head()

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,...,Feature 17,Feature 18,Feature 19,Feature 20,Feature 21,Feature 22,Feature 23,Feature 24,Feature 25,Target
0,2.60207,-0.074587,1.002719,-1.001478,1.44714,0.212351,-0.20057,0.646371,2.696937,-0.470317,...,0.961013,1.288696,3.254023,-1.654725,-4.036566,-0.664342,0.672532,1.011852,1.195234,2
1,-3.904959,1.227717,-2.337397,-2.334397,0.862484,1.415096,1.214655,-0.462356,1.762339,1.465091,...,-0.494576,-0.175983,-2.093386,1.014129,-0.014273,1.469496,0.566356,2.050482,-0.038749,0
2,3.246662,0.446997,0.242862,1.033087,-0.385963,0.262805,2.084988,-2.308492,-1.545419,-1.367411,...,0.743195,-0.215827,-1.580021,3.148203,-1.088466,0.411244,-0.252259,-0.656534,-0.663363,2
3,-8.104921,-0.047367,-4.663056,-1.609434,2.703973,-1.841072,-2.726065,6.055944,-0.176138,-1.320823,...,1.146603,-1.079501,-0.155818,-3.836376,2.212267,-0.780577,0.186297,0.018554,-0.302452,1
4,1.669653,0.352665,-0.380804,0.808218,-1.521313,1.236502,0.237053,-1.589812,0.339948,0.364597,...,-0.019907,0.63828,0.641545,2.059628,3.622518,-0.581747,-0.504654,-0.902726,-1.089139,1


In [5]:
X = np.array(df_data.drop([TARGET_NAME], axis=1))
y = np.array(df_data[TARGET_NAME], dtype=int)

In [6]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [14]:
classifiers = [['Neural Network :', MLPClassifier(max_iter = 1000, random_state=RANDOM_STATE)],
               ['LogisticRegression :', LogisticRegression(max_iter = 1000, random_state=RANDOM_STATE)],
               ['ExtraTreesClassifier :', ExtraTreesClassifier(random_state=RANDOM_STATE)],
               ['DecisionTree :',DecisionTreeClassifier(random_state=RANDOM_STATE)],
               ['RandomForest :',RandomForestClassifier(random_state=RANDOM_STATE)], 
               ['Naive Bayes :', GaussianNB()],
               ['KNeighbours :', KNeighborsClassifier()],
               ['SVM :', SVC(probability=True, random_state=RANDOM_STATE)],
               ['AdaBoostClassifier :', AdaBoostClassifier(random_state=RANDOM_STATE)],
               ['GradientBoostingClassifier: ', GradientBoostingClassifier(random_state=RANDOM_STATE)],
               ['XGB :', XGBClassifier(eval_metric='mlogloss', use_label_encoder=False, random_state=RANDOM_STATE)],
#               ['CatBoost :', CatBoostClassifier(logging_level='Silent, random_state=RANDOM_STATE')]
              ]

In [16]:
for name,classifier in classifiers:
    classifier = classifier
    classifier.fit(X_train, y_train.ravel())
    predictions = classifier.predict(X_test)
    print(name, accuracy_score(y_test, predictions))

Neural Network : 0.8993333333333333
LogisticRegression : 0.7326666666666667
ExtraTreesClassifier : 0.8933333333333333
DecisionTree : 0.746
RandomForest : 0.879
Naive Bayes : 0.6966666666666667
KNeighbours : 0.8773333333333333
SVM : 0.9053333333333333
AdaBoostClassifier : 0.689
GradientBoostingClassifier:  0.8383333333333334
XGB : 0.8886666666666667


In [17]:
eclf1 = VotingClassifier(estimators=classifiers, voting='soft')
eclf1.fit(X_train, y_train)
predictions = eclf1.predict(X_test)
print("soft", accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

soft 0.9033333333333333
              precision    recall  f1-score   support

           0       0.89      0.92      0.91       991
           1       0.91      0.88      0.89       980
           2       0.91      0.91      0.91      1029

    accuracy                           0.90      3000
   macro avg       0.90      0.90      0.90      3000
weighted avg       0.90      0.90      0.90      3000



In [18]:
eclf1 = VotingClassifier(estimators=classifiers, voting='hard')
eclf1.fit(X_train, y_train)
predictions = eclf1.predict(X_test)
print("hard", accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

hard 0.89
              precision    recall  f1-score   support

           0       0.87      0.92      0.89       991
           1       0.89      0.86      0.88       980
           2       0.91      0.89      0.90      1029

    accuracy                           0.89      3000
   macro avg       0.89      0.89      0.89      3000
weighted avg       0.89      0.89      0.89      3000



In [12]:
classifiers = [['RandomForest :',RandomForestClassifier(random_state=RANDOM_STATE)], 
               ['XGB :', XGBClassifier(eval_metric='mlogloss', use_label_encoder=False, random_state=RANDOM_STATE)],
               ['CatBoost :', CatBoostClassifier(logging_level='Silent', random_state=RANDOM_STATE)]]

In [13]:
eclf1 = VotingClassifier(estimators=classifiers, voting='hard')
eclf1.fit(X_train, y_train)
predictions = eclf1.predict(X_test)
print("hard", accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

  return array(a, dtype, copy=False, order=order)


ValueError: could not broadcast input array from shape (3000,1) into shape (3000)

The ``CatBoost`` algorithm cannot be used in a hard voting classifier when there are more than 2 classes. If this is tried it throws an error - ``ValueError: could not broadcast input array from shape (3000,1) into shape (3000)``