In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, log_loss, accuracy_score
from sklearn.preprocessing import  OneHotEncoder, LabelEncoder
from sklearn.compose import make_column_transformer, make_column_selector

In [2]:
# Load the HR dataset; the `left` column is the target, everything else is a feature.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
hr = pd.read_csv("C:/Python/Cases/human-resources-analytics/HR_comma_sep.csv")
y = hr['left']
X = hr.drop(columns='left')

# Hold out 30% of the rows for testing, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=25,
)

In [3]:
# One-hot encode every object-dtype column (dropping the first level of each)
# and pass all remaining columns through unchanged; outputs stay pandas DataFrames.
ohe = OneHotEncoder(drop='first', sparse_output=False)
ohe = ohe.set_output(transform='pandas')
categorical_cols = make_column_selector(dtype_include=object)
trns = make_column_transformer(
    (ohe, categorical_cols),
    remainder='passthrough',
    verbose_feature_names_out=False,
).set_output(transform='pandas')

# Fit the transformer on the training split only, then apply it to the test split.
X_train_trns = trns.fit_transform(X_train)
X_test_trns = trns.transform(X_test)

In [4]:
# Ensemble of three different learners: a decision tree, k-NN and Gaussian naive Bayes.
# `voting` is not passed, so the VotingClassifier default mode applies
# (majority / hard voting per the scikit-learn documentation).
dtc = DecisionTreeClassifier(random_state=25)
knn = KNeighborsClassifier()
nb = GaussianNB()
base_models = [('TREE', dtc), ('KNN', knn), ('NB', nb)]

voting = VotingClassifier(estimators=base_models)
voting.fit(X_train_trns, y_train)

# Score the ensemble on the held-out split.
y_pred = voting.predict(X_test_trns)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.96      0.97      3429
           1       0.87      0.96      0.91      1070

    accuracy                           0.96      4499
   macro avg       0.93      0.96      0.94      4499
weighted avg       0.96      0.96      0.96      4499



In [5]:
# Five-member ensemble: two trees (unrestricted and depth-3), two k-NN models
# (k=5 and k=3) and Gaussian naive Bayes.
dtc1 = DecisionTreeClassifier(random_state=25)
dtc2 = DecisionTreeClassifier(max_depth=3, random_state=25)
knn1 = KNeighborsClassifier(n_neighbors=5)
knn2 = KNeighborsClassifier(n_neighbors=3)
nb = GaussianNB()

members = [
    ('TREE1', dtc1),
    ('TREE2', dtc2),
    ('KNN1', knn1),
    ('KNN2', knn2),
    ('NB', nb),
]
voting = VotingClassifier(estimators=members)
voting.fit(X_train_trns, y_train)

# Score the ensemble on the held-out split.
y_pred = voting.predict(X_test_trns)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      3429
           1       0.90      0.95      0.92      1070

    accuracy                           0.96      4499
   macro avg       0.94      0.96      0.95      4499
weighted avg       0.96      0.96      0.96      4499



Evaluating the individual estimators

In [7]:
# Report test-set accuracy for each fitted member of the current ensemble.
for fitted in voting.estimators_:
    print("Estimator: ", fitted)
    member_accuracy = accuracy_score(y_test, fitted.predict(X_test_trns))
    print("Accuracy Score = ", member_accuracy)

Estimator:  DecisionTreeClassifier(random_state=25)
Accuracy Score =  0.9813291842631696
Estimator:  DecisionTreeClassifier(max_depth=3, random_state=25)
Accuracy Score =  0.9559902200488998
Estimator:  KNeighborsClassifier()
Accuracy Score =  0.9373194043120694
Estimator:  KNeighborsClassifier(n_neighbors=3)
Accuracy Score =  0.937541675927984
Estimator:  GaussianNB()
Accuracy Score =  0.7141587019337631


`soft` voting — average the estimators' predicted class probabilities instead of counting majority votes

In [8]:
# Same five members as before, now combined with voting='soft'; this also makes
# predict_proba available so log loss can be reported.
soft_members = [
    ('TREE1', dtc1),
    ('TREE2', dtc2),
    ('KNN1', knn1),
    ('KNN2', knn2),
    ('NB', nb),
]
voting = VotingClassifier(estimators=soft_members, voting='soft')
voting.fit(X_train_trns, y_train)

# Class-label metrics on the held-out split.
y_pred = voting.predict(X_test_trns)
print(classification_report(y_test, y_pred))

# Probability-based metric on the same split.
y_pred_prob = voting.predict_proba(X_test_trns)
print(log_loss(y_test, y_pred_prob))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98      3429
           1       0.93      0.95      0.94      1070

    accuracy                           0.97      4499
   macro avg       0.96      0.97      0.96      4499
weighted avg       0.97      0.97      0.97      4499

0.1403693382202133


Specifying weights — give each estimator its own weight in the soft vote

In [11]:
# Soft voting with one weight per member, given in the same order as the
# estimator list; naive Bayes gets weight 0.
# NOTE(review): if these weights were chosen by looking at test-set scores,
# the metrics below are optimistic — confirm how they were selected.
weighted_members = [
    ('TREE1', dtc1),
    ('TREE2', dtc2),
    ('KNN1', knn1),
    ('KNN2', knn2),
    ('NB', nb),
]
member_weights = [4, 2.5, 1, 1, 0]
voting = VotingClassifier(
    estimators=weighted_members,
    voting='soft',
    weights=member_weights,
)
voting.fit(X_train_trns, y_train)

# Class-label metrics on the held-out split.
y_pred = voting.predict(X_test_trns)
print(classification_report(y_test, y_pred))

# Probability-based metric on the same split.
y_pred_prob = voting.predict_proba(X_test_trns)
print(log_loss(y_test, y_pred_prob))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3429
           1       0.98      0.97      0.97      1070

    accuracy                           0.99      4499
   macro avg       0.98      0.98      0.98      4499
weighted avg       0.99      0.99      0.99      4499

0.07071178612705878
