In [117]:
import pandas as pd
import numpy as np
import copy as cp

from sklearn.datasets import make_classification

from sklearn.model_selection import KFold, train_test_split
from typing import Tuple

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from statistics import mode
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import accuracy_score

In [2]:
RANDOM_STATE : int = 42
N_SAMPLES : int = 10000
N_FEATURES : int = 25
N_CLASSES : int = 3
N_CLUSTERS_PER_CLASS : int = 2
    
FEATURE_NAME_PREFIX : str = "Feature"
TARGET_NAME : str = "Target"
    
N_SPLITS : int = 5

In [3]:
n_informative = N_CLASSES * N_CLUSTERS_PER_CLASS
X, y = make_classification(n_samples=N_SAMPLES, n_features=N_FEATURES, n_classes=N_CLASSES, n_informative=n_informative, random_state=RANDOM_STATE)

In [4]:
def make_classification_dataframe(n_samples : int = 10000, n_features : int = 25, n_classes : int = 2, n_clusters_per_class : int = 2, feature_name_prefix : str = "Feature", target_name : str = "Target", random_state : int = 42) -> pd.DataFrame:
    X, y = make_classification(n_samples=n_samples, n_features=n_features, n_classes=n_classes, n_informative = n_classes * n_clusters_per_class, random_state=RANDOM_STATE)

    feature_names = [feature_name_prefix + " " + str(v) for v in np.arange(1, N_FEATURES+1)]
    return pd.concat([pd.DataFrame(X, columns=feature_names), pd.DataFrame(y, columns=[target_name])], axis=1)

In [5]:
df_data = make_classification_dataframe(n_samples=N_SAMPLES, n_features=N_FEATURES, n_classes=N_CLASSES, n_clusters_per_class=N_CLUSTERS_PER_CLASS, feature_name_prefix=FEATURE_NAME_PREFIX, target_name=TARGET_NAME, random_state=RANDOM_STATE)

df_data_train, df_data_val = train_test_split(df_data, test_size=0.2, random_state=RANDOM_STATE)
df_data_train.head()

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,...,Feature 17,Feature 18,Feature 19,Feature 20,Feature 21,Feature 22,Feature 23,Feature 24,Feature 25,Target
9254,-2.025205,-0.089634,-2.833473,0.315723,-0.254786,-1.873841,-1.082022,0.375549,-1.766212,-0.635775,...,0.450156,0.127255,0.964195,-0.57025,-1.121593,-0.859178,-0.390989,-1.91687,-2.367061,0
1561,-2.989839,-1.155186,-0.239581,0.043799,0.410022,-0.34829,-0.758383,1.274005,0.306502,0.080855,...,-1.91351,0.232358,0.684569,-0.683173,0.240665,1.259787,-1.251941,-0.059789,-0.655588,0
1670,-1.947663,0.520725,0.106356,0.019951,1.670833,-0.674143,-0.678134,0.382928,-1.743136,0.115776,...,-1.886197,0.989632,0.165237,1.709442,-1.82747,2.403309,-0.809622,-1.238595,-0.869119,0
6087,-0.460435,1.280978,0.722993,0.344352,0.32657,-0.939769,0.13007,0.324532,-0.052836,0.087012,...,-0.155966,-0.299105,0.262876,0.506887,0.535087,-0.920843,0.187716,0.51918,-0.095456,0
6669,-0.053909,1.356961,-1.431071,0.039278,2.191362,-0.511725,0.822338,-0.284092,-0.188173,0.436858,...,0.506829,-0.175423,0.582515,0.03094,-0.239184,0.015029,0.86423,-2.424158,0.160253,0


In [143]:
def cross_val_predict(model, kfold : KFold, X : np.array, y : np.array, target_type : type = int) -> Tuple[np.array, np.array, np.array]:

    model_ = cp.deepcopy(model)
    
    actual_classes = np.array([])
    predicted_classes = np.array([])
    predicted_proba = np.array([])

    splits = kfold.split(X)
    
    for train_ndx, test_ndx in splits:

        train_X, train_y, test_X, test_y = X[train_ndx], y[train_ndx], X[test_ndx], y[test_ndx]

        actual_classes = np.append(actual_classes, test_y)

        model_.fit(train_X, train_y)
        predicted_classes = np.append(predicted_classes, model_.predict(test_X))
        try:
            predicted_proba = np.append(predicted_proba, model_.predict_proba(test_X))
        except AttributeError:
            pass

#    try:
    predicted_proba.reshape(X.shape[0], len(np.unique(y)))
#    except:
#        pass
        
    return actual_classes.astype(target_type), predicted_classes.astype(target_type), predicted_proba # Reshape to the number of rows in the source features and the number of unique classes that appear in the target. For example 10,000 data points with y = 0 or 1 will have produced an array (20000,) in shape that needs to be reshaped to (10000,2)

In [149]:
def cross_val_predict(model, kfold : KFold, X : np.array, y : np.array, target_type : type = int) -> Tuple[np.array, np.array, np.array]:

    model_ = cp.deepcopy(model)
    
    actual_classes = np.array([])
    predicted_classes = np.array([])
    predicted_proba = np.array([])

    splits = kfold.split(X)
    
    for train_ndx, test_ndx in splits:

        train_X, train_y, test_X, test_y = X[train_ndx], y[train_ndx], X[test_ndx], y[test_ndx]

        actual_classes = np.append(actual_classes, test_y)

        model_.fit(train_X, train_y)
        predicted_classes = np.append(predicted_classes, model_.predict(test_X))
        predicted_proba = np.append(predicted_proba, model_.predict_proba(test_X))

    return actual_classes.astype(target_type), predicted_classes.astype(target_type), predicted_proba.reshape(X.shape[0], len(np.unique(y))) # Reshape to the number of rows in the source features and the number of unique classes that appear in the target. For example 10,000 data points with y = 0 or 1 will have produced an array (20000,) in shape that needs to be reshaped to (10000,2)

In [152]:
def cross_val_predict(model, kfold : KFold, X : np.array, y : np.array, target_type : type = int) -> Tuple[np.array, np.array, np.array]:

    model_ = cp.deepcopy(model)
    
    actual_classes = np.array([])
    predicted_classes = np.array([])
    predicted_proba = np.array([])

    splits = kfold.split(X)
    
    for train_ndx, test_ndx in splits:

        train_X, train_y, test_X, test_y = X[train_ndx], y[train_ndx], X[test_ndx], y[test_ndx]

        actual_classes = np.append(actual_classes, test_y)

        model_.fit(train_X, train_y)
        predicted_classes = np.append(predicted_classes, model_.predict(test_X))
        predicted_proba = np.append(predicted_proba, model_.predict_proba(test_X))

    return actual_classes.astype(target_type), predicted_classes.astype(target_type), predicted_proba.reshape(X.shape[0], len(np.unique(y))) # Reshape to the numbe

In [139]:
predicted_proba = np.array([])
predicted_proba.shape

(0,)

In [132]:
kfold = KFold(n_splits=N_SPLITS, random_state=RANDOM_STATE, shuffle=True)

In [133]:
lr = LogisticRegression(random_state=RANDOM_STATE)
rf = RandomForestClassifier(random_state=RANDOM_STATE)
xg = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE)

In [134]:
X = df_data_train.drop([TARGET_NAME], axis=1)
y = df_data_train[TARGET_NAME]

In [153]:
%%time
lr_actual, lr_predicted, lr_predicted_proba = cross_val_predict_old(lr, kfold, X.to_numpy(), y.to_numpy())

Wall time: 265 ms


In [157]:
lr_predicted_proba.shape

(24000,)

In [155]:
%%time
rf_actual, rf_predicted, rf_predicted_proba = cross_val_predict(rf, kfold, X.to_numpy(), y.to_numpy())

Wall time: 11 s


In [158]:
rf_predicted_proba.shape

(8000, 3)

In [12]:
%%time
xg_actual, xg_predicted, xg_predicted_proba = cross_val_predict(xg, kfold, X.to_numpy(), y.to_numpy())

Wall time: 14 s


In [40]:
def soft_voting(predicted_probas : np.array) -> np.array:
    """[summary]

        Args:
            predicted_probas (np.array): [description]

        Returns:
            np.array: [description]
        """
    
    no_voters = predicted_probas.shape[0]
    no_rows = predicted_probas.shape[1]
    no_cols = predicted_probas.shape[2]
    
    sv_predicted_proba = np.empty(shape=(no_rows, no_cols))
    sv_predicted_proba.fill(0)
    
    for i in range(0, no_cols - 1):
        for j in range(0, no_voters):
            sv_predicted_proba[:, i] += predicted_probas[j][:, i]
        sv_predicted_proba[:, i] /= no_voters
    
    sv_predicted_proba[:,-1] = 1 - sv_predicted_proba.sum(axis=1)
    sv_predicted = sv_predicted_proba.argmax(axis=1)
    
    return sv_predicted_proba, sv_predicted

In [112]:
def hard_voting(predictions : np.array) -> np.array:
    return [mode(v) for v in np.vstack(predictions).T]

In [113]:
predicted_probas = np.array([lr_predicted_proba, rf_predicted_proba, xg_predicted_proba])
predictions = np.array([lr_predicted, rf_predicted, xg_predicted])

actual = lr_actual
sv_predicted_proba, sv_predicted = soft_voting(np.array([lr_predicted_proba, rf_predicted_proba, xg_predicted_proba]))
hv_predicted = hard_voting(np.array([lr_predicted, rf_predicted, xg_predicted]))

In [114]:
print(f"Accuracy of Logistic Regression: {accuracy_score(actual, lr_predicted)}")
print(f"Accuracy of Random Forest: {accuracy_score(actual, rf_predicted)}")
print(f"Accuracy of XG Boost: {accuracy_score(actual, xg_predicted)}")
print(f"Accuracy of Soft Voting: {accuracy_score(actual, sv_predicted)}")
print(f"Accuracy of Hard Voting: {accuracy_score(actual, hv_predicted)}")

Accuracy of Logistic Regression: 0.681125
Accuracy of Random Forest: 0.864625
Accuracy of XG Boost: 0.8745
Accuracy of Soft Voting: 0.8655
Accuracy of Hard Voting: 0.862


In [115]:
sv_predicted_proba, sv_predicted = soft_voting(np.array([rf_predicted_proba, xg_predicted_proba]))
hv_predicted = hard_voting(np.array([rf_predicted, xg_predicted]))

In [116]:
print(f"Accuracy of Soft Voting: {accuracy_score(actual, sv_predicted)}")
print(f"Accuracy of Hard Voting: {accuracy_score(actual, hv_predicted)}")

Accuracy of Soft Voting: 0.875
Accuracy of Hard Voting: 0.864625


In [119]:
hard = VotingClassifier(estimators=[('lr', lr), ('rf', rf), ('xg', xg)], voting='hard')

In [124]:
hard_actual, hard_predicted, _ = cross_val_predict(hard, kfold, X.to_numpy(), y.to_numpy())

ValueError: cannot reshape array of size 0 into shape (8000,3)

In [21]:
(lr_predicted_proba[0][0] + rf_predicted_proba[0][0] + xg_predicted_proba[0][0]) / 3

0.061553526266699236

0.061553526266699236

In [22]:
tot = 0
for i in range(0, soft_voting_probas.shape[1]):
    tot += soft_voting_probas[0][i]
print(tot)

1.0


In [23]:
soft_voting_probas[0][0] + soft_voting_probas[0][1] + + soft_voting_probas[0][2]

1.0

In [28]:
soft_voting_probas.argmax(axis=1).shape

(8000,)

In [99]:
predictions = np.array([lr_predicted, rf_predicted, xg_predicted])
predictions.shape

(3, 8000)

In [100]:
np.vstack(predicteds).T

array([[1, 1, 1],
       [0, 0, 0],
       [0, 0, 0],
       ...,
       [0, 1, 1],
       [2, 2, 2],
       [0, 0, 0]])

In [74]:
pred_
all_predictions = np.vstack((lr_predicted, rf_predicted, xg_predicted)).T
all_predictions

array([[1, 1, 1],
       [0, 0, 0],
       [0, 0, 0],
       ...,
       [0, 1, 1],
       [2, 2, 2],
       [0, 0, 0]])

In [101]:
from statistics import mode

In [82]:
all_predictions[0]

array([1, 1, 1])

In [86]:
hv_predictions = [st.mode(v) for v in all_predictions]

In [92]:
hv_predictions[2]

0

In [85]:
st.mode(all_predictions[7998])


2

In [70]:
from scipy.stats import mode

In [71]:
mode(all_predictions, axis=1)

ModeResult(mode=array([[1],
       [0],
       [0],
       ...,
       [1],
       [2],
       [0]]), count=array([[3],
       [3],
       [3],
       ...,
       [2],
       [3],
       [3]]))

In [66]:
stats.mode(all_predictions)

ModeResult(mode=array([[0, 1, 1]]), count=array([[2707, 2822, 2818]]))

In [65]:
import numpy as np
from scipy import stats

a = np.array([[1, 3, 4, 2, 2, 7],
              [5, 2, 2, 1, 4, 1],
              [3, 3, 2, 2, 1, 1]])

m = stats.mode(a)
print(m)

ModeResult(mode=array([[1, 3, 2, 2, 1, 1]]), count=array([[1, 2, 2, 2, 1, 2]]))
