In [1]:
import pandas as pd
import numpy as np
import copy as cp

from sklearn.datasets import make_classification

from sklearn.model_selection import KFold, train_test_split
from typing import Tuple

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
# Seed shared by data generation, splitting and every model.
RANDOM_STATE: int = 42

# Shape of the synthetic classification problem.
N_SAMPLES: int = 10000
N_FEATURES: int = 25
N_CLASSES: int = 3
N_CLUSTERS_PER_CLASS: int = 2

# Column naming used when the data is wrapped in a DataFrame.
FEATURE_NAME_PREFIX: str = "Feature"
TARGET_NAME: str = "Target"

# Number of folds used by the K-fold splitter.
N_SPLITS: int = 5

In [3]:
# Number of informative features: one per (class, cluster) combination.
n_informative = N_CLASSES * N_CLUSTERS_PER_CLASS

# n_clusters_per_class is passed explicitly so the generated data stays in
# step with N_CLUSTERS_PER_CLASS, the constant n_informative is derived from.
X, y = make_classification(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_informative=n_informative,
    n_classes=N_CLASSES,
    n_clusters_per_class=N_CLUSTERS_PER_CLASS,
    random_state=RANDOM_STATE,
)

In [4]:
def make_classification_dataframe(
    n_samples : int = 10000,
    n_features : int = 25,
    n_classes : int = 2,
    n_clusters_per_class : int = 2,
    feature_name_prefix : str = "Feature",
    target_name : str = "Target",
    random_state : int = 42,
) -> pd.DataFrame:
    """Generate a synthetic classification dataset as a single DataFrame.

    The data comes from ``make_classification``; the number of informative
    features is set to ``n_classes * n_clusters_per_class``.

    Args:
        n_samples (int): Number of rows to generate.
        n_features (int): Number of feature columns to generate.
        n_classes (int): Number of distinct target classes.
        n_clusters_per_class (int): Number of clusters per class.
        feature_name_prefix (str): Prefix of each feature column name; columns
            are named "<prefix> 1" ... "<prefix> n_features".
        target_name (str): Name of the target column.
        random_state (int): Seed forwarded to ``make_classification``.

    Returns:
        pd.DataFrame: The feature columns followed by the target column.
    """
    # Use the function's own arguments throughout -- not the notebook-level
    # constants -- so a call with non-default values behaves as requested.
    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_classes * n_clusters_per_class,
        n_classes=n_classes,
        n_clusters_per_class=n_clusters_per_class,
        random_state=random_state,
    )

    feature_names = [f"{feature_name_prefix} {i}" for i in range(1, n_features + 1)]
    return pd.concat(
        [pd.DataFrame(X, columns=feature_names), pd.DataFrame(y, columns=[target_name])],
        axis=1,
    )

In [5]:
# Build the synthetic dataset from the notebook-level configuration.
df_data = make_classification_dataframe(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_classes=N_CLASSES,
    n_clusters_per_class=N_CLUSTERS_PER_CLASS,
    feature_name_prefix=FEATURE_NAME_PREFIX,
    target_name=TARGET_NAME,
    random_state=RANDOM_STATE,
)

# Hold out 20% of the rows for validation; the rest is used for training.
df_data_train, df_data_val = train_test_split(
    df_data,
    test_size=0.2,
    random_state=RANDOM_STATE,
)
df_data_train.head()

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,...,Feature 17,Feature 18,Feature 19,Feature 20,Feature 21,Feature 22,Feature 23,Feature 24,Feature 25,Target
9254,-2.025205,-0.089634,-2.833473,0.315723,-0.254786,-1.873841,-1.082022,0.375549,-1.766212,-0.635775,...,0.450156,0.127255,0.964195,-0.57025,-1.121593,-0.859178,-0.390989,-1.91687,-2.367061,0
1561,-2.989839,-1.155186,-0.239581,0.043799,0.410022,-0.34829,-0.758383,1.274005,0.306502,0.080855,...,-1.91351,0.232358,0.684569,-0.683173,0.240665,1.259787,-1.251941,-0.059789,-0.655588,0
1670,-1.947663,0.520725,0.106356,0.019951,1.670833,-0.674143,-0.678134,0.382928,-1.743136,0.115776,...,-1.886197,0.989632,0.165237,1.709442,-1.82747,2.403309,-0.809622,-1.238595,-0.869119,0
6087,-0.460435,1.280978,0.722993,0.344352,0.32657,-0.939769,0.13007,0.324532,-0.052836,0.087012,...,-0.155966,-0.299105,0.262876,0.506887,0.535087,-0.920843,0.187716,0.51918,-0.095456,0
6669,-0.053909,1.356961,-1.431071,0.039278,2.191362,-0.511725,0.822338,-0.284092,-0.188173,0.436858,...,0.506829,-0.175423,0.582515,0.03094,-0.239184,0.015029,0.86423,-2.424158,0.160253,0
5933,-1.575353,1.720075,2.444045,-1.141698,-1.363971,0.411163,-2.059946,-0.468634,-1.369811,0.964523,...,0.05188,1.075895,-0.197272,0.786871,0.329109,-0.296101,-0.708101,0.32849,-0.901907,2
8829,-0.834904,-0.187435,-0.687576,0.633883,-1.973791,-0.560825,-1.535265,-0.376026,1.748435,2.627702,...,0.33047,-0.571012,-0.993603,-0.033413,-0.468113,-1.066455,3.255038,0.450539,0.964673,1
7945,-1.791395,-0.823484,2.062622,-2.14228,1.404236,-0.418201,-0.492328,-2.349094,-0.046197,-0.284203,...,0.492133,-0.662076,0.452899,0.86335,0.26921,0.020738,-0.822121,-0.710431,-0.187985,0
3508,-0.667945,-0.325658,1.083363,0.573356,-0.646951,0.219695,-0.782316,-2.298399,0.856737,0.587982,...,0.581827,-0.158002,0.493709,0.590627,0.100678,1.107653,2.093519,1.080767,1.413545,0
2002,-2.311731,1.759782,-0.157885,-0.964555,0.781729,0.370209,-2.711403,-1.182277,-0.785427,2.392231,...,0.352044,-0.624004,-0.850969,1.036668,-1.668777,0.943739,-1.338617,1.536833,-0.556903,0


In [6]:
def cross_val_predict(
    model,
    kfold : KFold,
    X : np.ndarray,
    y : np.ndarray,
    target_type : type = int,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Collect out-of-fold labels, predictions and class probabilities.

    A deep copy of ``model`` is fitted on the training part of every split
    produced by ``kfold.split(X)`` and used to predict the test part. The
    per-fold results are joined in the order the folds are produced, so the
    three returned arrays are row-aligned with each other (not with ``X``).

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
            The object passed in is never fitted; only a copy is.
        kfold (KFold): Splitter whose ``split(X)`` yields
            ``(train_indices, test_indices)`` pairs.
        X (np.ndarray): Feature matrix, indexed by row.
        y (np.ndarray): Target labels, indexed like the rows of ``X``.
        target_type (type): dtype the label arrays are cast to.

    Returns:
        Tuple[np.ndarray, np.ndarray, np.ndarray]: the actual labels, the
        predicted labels (both cast to ``target_type``) and the predicted
        probabilities as a float64 array with one row per tested sample and
        one column per column returned by ``predict_proba``.

    Raises:
        ValueError: If ``kfold.split(X)`` yields no folds.
    """
    # Work on a copy so the caller's estimator is left untouched.
    model_ = cp.deepcopy(model)

    # Gather per-fold pieces in lists and join once at the end: this avoids
    # re-copying the growing result on every fold and keeps each piece's own
    # dtype (labels are not pushed through a float array before the cast).
    actual_parts = []
    predicted_parts = []
    proba_parts = []

    for train_ndx, test_ndx in kfold.split(X):
        test_X = X[test_ndx]

        model_.fit(X[train_ndx], y[train_ndx])

        actual_parts.append(np.ravel(y[test_ndx]))
        predicted_parts.append(np.ravel(model_.predict(test_X)))
        proba_parts.append(np.asarray(model_.predict_proba(test_X)))

    if not proba_parts:
        raise ValueError("kfold.split(X) produced no folds")

    actual_classes = np.concatenate(actual_parts).astype(target_type)
    predicted_classes = np.concatenate(predicted_parts).astype(target_type)

    # Stacking the 2-D per-fold blocks keeps the (rows, classes) layout without
    # having to infer the shape from X and y afterwards.
    predicted_proba = np.vstack(proba_parts).astype(np.float64)

    return actual_classes, predicted_classes, predicted_proba

In [7]:
# Shuffled K-fold splitter, seeded so fold membership is repeatable.
kfold = KFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=RANDOM_STATE,
)

In [8]:
# The three base classifiers, all seeded with RANDOM_STATE.
lr = LogisticRegression(random_state=RANDOM_STATE)
rf = RandomForestClassifier(random_state=RANDOM_STATE)

# The target has more than two classes, so the multiclass log-loss metric
# ("mlogloss") is used instead of the binary "logloss".
# NOTE(review): use_label_encoder is only meaningful on older XGBoost releases;
# confirm against the installed version before removing it.
xg = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=RANDOM_STATE)

In [9]:
# Split the training frame into its label column and the remaining feature columns.
y = df_data_train[TARGET_NAME]
X = df_data_train.drop(columns=[TARGET_NAME])

In [10]:
%%time
lr_actual, lr_predicted, lr_predicted_proba = cross_val_predict(lr, kfold, X.to_numpy(), y.to_numpy())

Wall time: 245 ms


In [11]:
%%time
rf_actual, rf_predicted, rf_predicted_proba = cross_val_predict(rf, kfold, X.to_numpy(), y.to_numpy())

Wall time: 10.7 s


In [12]:
%%time
xg_actual, xg_predicted, xg_predicted_proba = cross_val_predict(xg, kfold, X.to_numpy(), y.to_numpy())

Wall time: 12.5 s


In [13]:
# Fit the logistic regression itself on the full training features and labels.
lr.fit(X, y)

LogisticRegression(random_state=42)

In [14]:
# Display the fitted logistic regression's decision-function output for the training features.
lr.decision_function(X)

array([[ 1.18407243, -0.56677271, -0.61729971],
       [ 0.81343434, -0.16398338, -0.64945096],
       [ 1.61010708, -1.12451957, -0.48558751],
       ...,
       [ 1.24665246, -1.44689942,  0.20024696],
       [-0.24804334,  0.23050748,  0.01753586],
       [-0.10821081,  1.00913904, -0.90092823]])

In [15]:
def soft_voting(predicted_probas : np.array) -> np.array:
    """Blend several voters' class probabilities into one probability table.

    Args:
        predicted_probas (np.array): Array indexed as (voter, row, class)
            holding each voter's predicted class probabilities.

    Returns:
        np.array: Array of shape (rows, classes). Every column except the last
        is the mean of that column across voters; the last column is one minus
        the sum of the other columns, so every row totals 1.
    """
    dims = predicted_probas.shape
    n_voters, n_rows, n_classes = dims[0], dims[1], dims[2]

    blended = np.zeros((n_rows, n_classes))

    # Accumulate voter by voter over all but the last class column, then
    # turn the running totals into means.
    for voter_probas in predicted_probas:
        blended[:, :-1] += voter_probas[:, :-1]
    blended[:, :-1] /= n_voters

    # The last column is still zero here, so the row sum covers only the
    # averaged columns; the remainder goes to the final class.
    blended[:, -1] = 1 - blended.sum(axis=1)

    return blended

In [16]:
# Put the three models' out-of-fold probabilities along a leading "voter" axis, then blend them.
predicted_probas = np.array(
    [lr_predicted_proba, rf_predicted_proba, xg_predicted_proba]
)
soft_voting_probas = soft_voting(predicted_probas)

In [17]:
# Display the blended (soft-voting) class probabilities.
soft_voting_probas

array([[0.06155353, 0.92809167, 0.01035481],
       [0.75616003, 0.16764789, 0.07619208],
       [0.66218234, 0.05924803, 0.27856963],
       ...,
       [0.32121056, 0.56244027, 0.11634917],
       [0.18353267, 0.03270084, 0.7837665 ],
       [0.58621584, 0.0869719 , 0.32681226]])

In [18]:
# Display the logistic regression's out-of-fold class probabilities for comparison.
lr_predicted_proba

array([[0.12441844, 0.84454805, 0.03103352],
       [0.56246366, 0.27943497, 0.15810137],
       [0.46456839, 0.15673804, 0.37869357],
       ...,
       [0.40873642, 0.38146858, 0.209795  ],
       [0.23654082, 0.08590648, 0.67755271],
       [0.42188846, 0.22544336, 0.35266818]])

In [19]:
# Display the random forest's out-of-fold class probabilities for comparison.
rf_predicted_proba

array([[0.06, 0.94, 0.  ],
       [0.71, 0.22, 0.07],
       [0.76, 0.02, 0.22],
       ...,
       [0.42, 0.49, 0.09],
       [0.19, 0.01, 0.8 ],
       [0.65, 0.03, 0.32]])

In [20]:
# Display the XGBoost classifier's out-of-fold class probabilities for comparison.
xg_predicted_proba

array([[2.42143142e-04, 9.99726951e-01, 3.08621820e-05],
       [9.96016443e-01, 3.50869191e-03, 4.74897592e-04],
       [7.61978626e-01, 1.00605446e-03, 2.37015381e-01],
       ...,
       [1.34895250e-01, 8.15852225e-01, 4.92525101e-02],
       [1.24057181e-01, 2.19603023e-03, 8.73746812e-01],
       [6.86759055e-01, 5.47233177e-03, 3.07768643e-01]])

In [21]:
# Manual check: the mean of the three models' first-row, first-class probabilities
# should equal soft_voting_probas[0][0].
(lr_predicted_proba[0][0] + rf_predicted_proba[0][0] + xg_predicted_proba[0][0]) / 3

0.061553526266699236

0.061553526266699236

In [22]:
# Sanity check: the blended probabilities of the first row should add up to 1.
tot = sum(soft_voting_probas[0])
print(tot)

1.0


In [23]:
# Same check written out term by term for the three classes of the first row.
soft_voting_probas[0, 0] + soft_voting_probas[0, 1] + soft_voting_probas[0, 2]

1.0