In [21]:
import pandas as pd
import numpy as np
import copy as cp

from sklearn.datasets import make_classification

from sklearn.model_selection import KFold, train_test_split
from typing import Tuple

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from statistics import mode
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import accuracy_score

from sklearn.metrics import roc_curve, precision_recall_curve

In [2]:
RANDOM_STATE : int = 42
N_SAMPLES : int = 10000
N_FEATURES : int = 25
N_CLASSES : int = 3
N_CLUSTERS_PER_CLASS : int = 2
    
FEATURE_NAME_PREFIX : str = "Feature"
TARGET_NAME : str = "Target"
    
N_SPLITS : int = 5

In [3]:
X, y = make_classification(n_samples=N_SAMPLES, n_features=N_FEATURES, n_classes=N_CLASSES, n_informative=(N_CLASSES * N_CLUSTERS_PER_CLASS), random_state=RANDOM_STATE)

In [4]:
def make_classification_dataframe(n_samples : int = 10000, n_features : int = 25, n_classes : int = 2, n_clusters_per_class : int = 2, feature_name_prefix : str = "Feature", target_name : str = "Target", random_state : int = 42) -> pd.DataFrame:
    X, y = make_classification(n_samples=n_samples, n_features=n_features, n_classes=n_classes, n_informative = n_classes * n_clusters_per_class, random_state=RANDOM_STATE)

    feature_names = [feature_name_prefix + " " + str(v) for v in np.arange(1, N_FEATURES+1)]
    return pd.concat([pd.DataFrame(X, columns=feature_names), pd.DataFrame(y, columns=[target_name])], axis=1)

In [5]:
df_data = make_classification_dataframe(n_samples=N_SAMPLES, n_features=N_FEATURES, n_classes=N_CLASSES, n_clusters_per_class=N_CLUSTERS_PER_CLASS, feature_name_prefix=FEATURE_NAME_PREFIX, target_name=TARGET_NAME, random_state=RANDOM_STATE)
df_data.head()
#df_data_train, df_data_val = train_test_split(df_data, test_size=0.2, random_state=RANDOM_STATE)
#df_data_train.head()

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,...,Feature 17,Feature 18,Feature 19,Feature 20,Feature 21,Feature 22,Feature 23,Feature 24,Feature 25,Target
0,-0.131637,2.281512,0.46881,0.707735,1.628051,0.622273,-0.434003,-0.992722,0.053795,-1.764985,...,-1.673779,0.276305,-1.685462,-0.801336,0.806151,0.369108,-0.843748,0.966868,-0.547149,2
1,-1.231544,-1.58088,0.684543,-0.343771,0.498176,-0.008396,-0.859592,-0.666477,-0.832989,-0.287655,...,0.341136,1.116596,1.134896,1.232907,1.295312,-0.253926,-0.528711,0.502124,0.896065,1
2,-1.301585,-1.922563,-0.623878,-0.740534,-0.723667,1.484827,1.227018,-0.050878,0.164059,0.301672,...,-0.90029,0.682905,0.680959,-0.02355,0.932216,0.109495,0.500366,0.956182,-2.268742,0
3,-0.899385,0.991619,0.494529,-0.672954,0.421605,-0.271674,1.245351,0.146567,0.389313,1.479558,...,-0.285753,-1.446158,-0.062296,0.583408,1.588965,0.412651,-1.891714,-0.575163,0.786847,0
4,-3.026721,0.745777,0.18845,-0.794256,1.40257,1.057481,0.454773,-0.174391,0.951417,-0.403872,...,0.959229,-1.964891,-0.296422,-0.755737,-0.489769,0.516726,-4.807225,1.215506,0.799321,1


In [6]:
def cross_val_predict(model, kfold : KFold, X : np.array, y : np.array, target_type : type = int) -> Tuple[np.array, np.array, np.array]:

    model_ = cp.deepcopy(model)
    
    actual_classes = np.array([])
    predicted_classes = np.array([])
    predicted_proba = np.array([])

    splits = kfold.split(X)
    
    for train_ndx, test_ndx in splits:

        train_X, train_y, test_X, test_y = X[train_ndx], y[train_ndx], X[test_ndx], y[test_ndx]

        actual_classes = np.append(actual_classes, test_y)

        model_.fit(train_X, train_y)
        predicted_classes = np.append(predicted_classes, model_.predict(test_X))
        predicted_proba = np.append(predicted_proba, model_.predict_proba(test_X))

    return actual_classes.astype(target_type), predicted_classes.astype(target_type), predicted_proba.reshape(X.shape[0], len(np.unique(y))) # Reshape to the number of rows in the source features and the number of unique classes that appear in the target. For example 10,000 data points with y = 0 or 1 will have produced an array (20000,) in shape that needs to be reshaped to (10000,2)

In [7]:
kfold = KFold(n_splits=N_SPLITS, random_state=RANDOM_STATE, shuffle=True)

In [8]:
lr = LogisticRegression(random_state=RANDOM_STATE)
rf = RandomForestClassifier(random_state=RANDOM_STATE)
xg = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE)
cb = CatBoostClassifier(silent=True, random_state=RANDOM_STATE)

In [9]:
X = df_data.drop([TARGET_NAME], axis=1)
y = df_data[TARGET_NAME]

In [10]:
%%time
actual, lr_predicted, lr_predicted_proba = cross_val_predict(lr, kfold, X.to_numpy(), y.to_numpy())

Wall time: 248 ms


In [11]:
%%time
actual, rf_predicted, rf_predicted_proba = cross_val_predict(rf, kfold, X.to_numpy(), y.to_numpy())

Wall time: 13.4 s


In [12]:
%%time
actual, xg_predicted, xg_predicted_proba = cross_val_predict(xg, kfold, X.to_numpy(), y.to_numpy())

Wall time: 16.1 s


In [13]:
%%time
actual, cb_predicted, cb_predicted_proba = cross_val_predict(cb, kfold, X.to_numpy(), y.to_numpy())

Wall time: 56.2 s


In [14]:
def soft_voting(predicted_probas : np.array) -> np.array:
    """[summary]

        Args:
            predicted_probas (np.array): [description]

        Returns:
            np.array: [description]
        """
    
    no_voters = predicted_probas.shape[0]
    no_rows = predicted_probas.shape[1]
    no_cols = predicted_probas.shape[2]
    
    sv_predicted_proba = np.empty(shape=(no_rows, no_cols))
    sv_predicted_proba.fill(0)
    
    for i in range(0, no_cols - 1):
        for j in range(0, no_voters):
            sv_predicted_proba[:, i] += predicted_probas[j][:, i]
        sv_predicted_proba[:, i] /= no_voters
    
    sv_predicted_proba[:,-1] = 1 - sv_predicted_proba.sum(axis=1)
    sv_predicted = sv_predicted_proba.argmax(axis=1)

    
    return sv_predicted_proba, sv_predicted

In [15]:
def hard_voting(predictions : np.array) -> np.array:
    return [mode(v) for v in np.vstack(predictions).T]

In [16]:
sv_predicted_proba, sv_predicted = soft_voting(np.array([rf_predicted_proba, xg_predicted_proba, cb_predicted_proba]))
hv_predicted = hard_voting(np.array([rf_predicted, xg_predicted, cb_predicted]))

In [17]:
print(f"Accuracy of Logistic Regression: {accuracy_score(actual, lr_predicted)}")
print(f"Accuracy of Random Forest: {accuracy_score(actual, rf_predicted)}")
print(f"Accuracy of XG Boost: {accuracy_score(actual, xg_predicted)}")
print(f"Accuracy of Cat Boost: {accuracy_score(actual, cb_predicted)}")
print(f"Accuracy of Soft Voting: {accuracy_score(actual, sv_predicted)}")
print(f"Accuracy of Hard Voting: {accuracy_score(actual, hv_predicted)}")

Accuracy of Logistic Regression: 0.6821
Accuracy of Random Forest: 0.8742
Accuracy of XG Boost: 0.8838
Accuracy of Cat Boost: 0.8864
Accuracy of Soft Voting: 0.8889
Accuracy of Hard Voting: 0.8876


https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html?highlight=roc_curve#sklearn.metrics.roc_curve

In [20]:
fpr, tpr, thresholds = roc_curve(actual, lr_predicted_proba)

ValueError: multiclass format is not supported

In [22]:
precisions, recalls, thresholds = precision_recall_curve(actual, lr_predicted_proba)

ValueError: multiclass format is not supported