In [54]:
import numpy as np
import pandas as pd
import time

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.linear_model import LogisticRegression

### 1. BALANCED ACTUAL AND PREDICTION

In [55]:
t0 = time.time()

bankloan = pd.read_csv('bankloan.csv')
features = MinMaxScaler().fit_transform(bankloan.values[:,:8])
target = bankloan.values[:,8].reshape(-1,1)
values = np.concatenate([features, target], axis = 1)

for i in range(11):
    values = np.append(values, values, axis = 0)
np.random.shuffle(values)
np.random.shuffle(values)
display(values[0])
print('Shape:', values.shape)

print('\nTime execution:', time.time() - t0)

array([0.36111111, 0.25      , 0.38709677, 0.23529412, 0.10185185,
       0.4400978 , 0.14951677, 0.27946508, 0.        ])

Shape: (1433600, 9)

Time execution: 4.443114995956421


In [56]:
# Splitting the values into X and y 
X = values[:,:8] 
y = values[:,8].astype('int16')

In [57]:
# Here is the 0s and 1s from y:

display(y[:10])
print('Shape:', y.shape)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int16)

Shape: (1433600,)


In [58]:
# Taking a glance of y info:

print(np.unique(y, return_counts=True))
print('Weight of 1s:', 374784/(1058816 + 374784))

(array([0, 1], dtype=int16), array([1058816,  374784], dtype=int64))
Weight of 1s: 0.26142857142857145


In [59]:
t0 = time.time() # measuring time of execution

list_test_index = []
result_kfold = []
actual = []
for train_index, test_index in KFold(n_splits=5).split(y):
    logreg = LogisticRegression()
    X_train = X[train_index]
    y_train = y[train_index]
    X_test = X[test_index]
    # y_test = not needed. we are not doing any performance test. instead, only need y_pred later from model.predict(X_test)
    
    logreg.fit(X_train, y_train)
    result_kfold += list(logreg.predict(X_test))
    list_test_index.append(test_index)
    
result_kfold = np.array(result_kfold) # transforming list() into NumPy array, we need it for indexing
print(result_kfold.shape) # must be the same as shape of y, since  we are predicting all the 5 folds
print(len(list_test_index)) # must be 5, which is filled by 5 groups of indexing values

print('\nTime execution:', time.time() - t0)

(1433600,)
5

Time execution: 34.272103786468506


In [61]:
# ACTUAL
print('Labels:', np.unique(y))
for i in list_test_index:
    print(np.unique(y[i], return_counts=True)[1])

Labels: [0 1]
[211702  75018]
[211769  74951]
[211937  74783]
[211522  75198]
[211886  74834]


In [62]:
# PREDICTION BY THE BUILT IN SKLEARN CROSS VALIDATION (cross_val_predic())

t0 = time.time() # measuring time of execution

logreg = LogisticRegression()
result = cross_val_predict(logreg, X, y, cv = 5) # cv is the same as n_splits = 5 

print('Labels:', np.unique(result))
for i in list_test_index:
    print(np.unique(result[i], return_counts=True)[1])
    
print('\nTime execution:', time.time() - t0)

Labels: [0 1]
[230439  56281]
[230621  56099]
[230699  56021]
[230131  56589]
[230299  56421]

Time execution: 31.87156343460083


In [63]:
# PREDICTION WITH SEQUENTIAL INDEX (by the built-in sklearn KFold())

print('Labels:', np.unique(result_kfold))
for i in list_test_index:
    print(np.unique(np.array(result_kfold)[i], return_counts=True)[1])

Labels: [0 1]
[230439  56281]
[230621  56099]
[230284  56436]
[230548  56172]
[230299  56421]


In [64]:
# CALCULATING TP, FN, FP, RECALL, PRECISION, F1-SCORE

t0 = time.time()

list_precision, list_recall, list_f1score = [], [], []
sum_tp, sum_fn, sum_fp = 0, 0, 0
TP, FN, FP, TN = [], [], [], []

for count,i in enumerate(list_test_index):
    print('Fold',count+1,'\n')
    tp, fn, fp, tn  = 0, 0, 0, 0
    for a,b in zip(y[i], result_kfold[i]): # (actual, prediction)
        if a == b:
            if (a == 1) and (b == 1):
                tp += 1
            elif (a == 0) and (b == 0):
                tn += 1
        elif a != b:
            if (a == 1) and (b == 0):
                fn += 1
            elif (a == 0) and (b == 1):
                fp += 1
    sum_tp += tp
    sum_fn += fn
    sum_fp += fp
    
    TP.append(tp); FN.append(fn); FP.append(fp); TN.append(tn)
     
    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    f1_score = 2 / (1/precision + 1/recall)
    specificity = tn/(tn+fp)


    list_precision.append(precision); list_recall.append(recall); list_f1score.append(f1_score)
        
    print('  TP:', tp)
    print('  FN:', fn)
    print('  FP:', fp)
    print('  TN:', tn)
    print()
    print('  Precision:', precision)
    print('  Recall:', recall)
    print('  F1-Score:', f1_score)
    # *optional addition, since they don't appear in Confusion Matrix (used for plotting ROC curve instead)
    print(' *Sensitivity:', precision)  # sensitivity and precision are the same
    print(' *Specificity:', specificity) 
    print('='*100)
    
print('\nTime execution:', time.time() - t0)

Fold 1 

  TP: 38240
  FN: 36778
  FP: 18041
  TN: 193661

  Precision: 0.679447771006201
  Recall: 0.50974432802794
  F1-Score: 0.5824872999794363
 *Sensitivity: 0.679447771006201
 *Specificity: 0.9147811546419023
Fold 2 

  TP: 37870
  FN: 37081
  FP: 18229
  TN: 193540

  Precision: 0.675056596374267
  Recall: 0.5052634387800029
  F1-Score: 0.5779473483403281
 *Sensitivity: 0.675056596374267
 *Specificity: 0.9139203566149908
Fold 3 

  TP: 37978
  FN: 36805
  FP: 18458
  TN: 193479

  Precision: 0.6729392586292438
  Recall: 0.5078426915208002
  F1-Score: 0.5788490995968573
 *Sensitivity: 0.6729392586292438
 *Specificity: 0.9129080811750662
Fold 4 

  TP: 38238
  FN: 36960
  FP: 17934
  TN: 193588

  Precision: 0.6807306131168553
  Recall: 0.508497566424639
  F1-Score: 0.5821420415620004
 *Sensitivity: 0.6807306131168553
 *Specificity: 0.915214493055096
Fold 5 

  TP: 38138
  FN: 36696
  FP: 18283
  TN: 193603

  Precision: 0.6759539887630492
  Recall: 0.5096346580431355
  F1-Score: 

In [65]:
# RECALL, PRECISION, F1-SCORE IF BEING AVERAGED DIRECTLY (np.sum(list))

print('Precision:', np.mean(list_precision))
print('Recall:', np.mean(list_recall))
print('F1-Score:', np.mean(list_f1score))

Precision: 0.6768256455779232
Recall: 0.5081965365593035
F1-Score: 0.58051082548934


In [66]:
# RECALL, PRECISION, F1-SCORE IF NOT BEING AVERAGED DIRECTLY
# example: Precision = sum_tp / (sum_tp + sum_fp)
# so the overall result of each Fold, will be summed first

precision_ = sum_tp/(sum_tp + sum_fp)
recall_ = sum_tp/(sum_tp + sum_fn)

print('Precision:', precision_)
print('Recall:', recall_)
print('F1-Score:', 2 / (1/precision_ + 1/recall_))

Precision: 0.676822702898628
Recall: 0.5081967213114754
F1-Score: 0.5805121359112334


# CONCLUSION:
As we can see before when separating the values of the Actual and Predicted 1s, they don't differ that much, <br>just around 133 - 146 (yes i had their standard deviation) out of 50000 - 70000 of 1s in each fold<br>
Hence, by directly averaging their metrics (Precision, Recall, F1-Score) almost precisely* the same as summing their properties first and do the math. <br>
*(Precisely the same, because their results are the same for the first 5-6 digits) 

What about on the imbalance 0s and 1s? We're about to figure out.

### 2. IMBALANCED ACTUAL AND PREDICTION

In [89]:
t0 = time.time()

bankloan = pd.read_csv('bankloan.csv')
features = MinMaxScaler().fit_transform(bankloan.values[:,:8])
target = bankloan.values[:,8].reshape(-1,1)
values = np.concatenate([features, target], axis = 1)

for i in range(11):
    values = np.append(values, values, axis = 0)
np.random.shuffle(values)
np.random.shuffle(values)
display(values[0])
print('Shape:', values.shape)

print('\nTime execution:', time.time() - t0)

array([0.41666667, 0.25      , 0.4516129 , 0.08823529, 0.15740741,
       0.00977995, 0.02222368, 0.00526278, 0.        ])

Shape: (1433600, 9)

Time execution: 4.440523862838745


In [112]:
# sort the last column for 0 to 1
values = values[values[:, 8].argsort()]
print(values[:5, -1]) # first 5 rows
print(values[-5:, -1]) # last 5 rows

[0. 0. 0. 0. 0.]
[1. 1. 1. 1. 1.]


In [113]:
# Splitting the values into X and y 
X = values[:,:8] 
y = values[:,8].astype('int16')

In [114]:
# Here is the 0s and 1s from y:

display(y[:10])
print('Shape:', y.shape)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int16)

Shape: (1433600,)


In [115]:
# Taking a glance of y info:

print(np.unique(y, return_counts=True))
print('Weight of 1s:', 374784/(1058816 + 374784))

(array([0, 1], dtype=int16), array([1058816,  374784], dtype=int64))
Weight of 1s: 0.26142857142857145


In [116]:
t0 = time.time() # measuring time of execution

list_test_index = []
result_kfold = []
actual = []
for train_index, test_index in KFold(n_splits=5).split(y):
    logreg = LogisticRegression()
    X_train = X[train_index]
    y_train = y[train_index]
    X_test = X[test_index]
    # y_test = not needed. we are not doing any performance test. instead, only need y_pred later from model.predict(X_test)
    
    logreg.fit(X_train, y_train)
    result_kfold += list(logreg.predict(X_test))
    list_test_index.append(test_index)
    
result_kfold = np.array(result_kfold) # transforming list() into NumPy array, we need it for indexing
print(result_kfold.shape) # must be the same as shape of y, since  we are predicting all the 5 folds
print(len(list_test_index)) # must be 5, which is filled by 5 groups of indexing values

print('\nTime execution:', time.time() - t0)

(1433600,)
5

Time execution: 32.54437518119812


In [118]:
# ACTUAL
# NOTE: The result is significantly weird, since all of the 1s are on the last rows
# So just ignore the 'Labels:', and let it be known that the 1st - 3rd rows are all 0s, then the 5th rows is all 1s

print('Labels:', np.unique(y))
for i in list_test_index:
    print(np.unique(y[i], return_counts=True)[1])

Labels: [0 1]
[286720]
[286720]
[286720]
[198656  88064]
[286720]


In [119]:
# PREDICTION BY THE BUILT IN SKLEARN CROSS VALIDATION (cross_val_predic())

t0 = time.time() # measuring time of execution

logreg = LogisticRegression()
result = cross_val_predict(logreg, X, y, cv = 5) # cv is the same as n_splits = 5 

print('Labels:', np.unique(result))
for i in list_test_index:
    print(np.unique(result[i], return_counts=True)[1])
    
print('\nTime execution:', time.time() - t0)

Labels: [0 1]
[261677  25043]
[261638  25082]
[262235  24485]
[224643  62077]
[141192 145528]

Time execution: 36.78186011314392


In [120]:
# PREDICTION WITH SEQUENTIAL INDEX (by the built-in sklearn KFold())

print('Labels:', np.unique(result_kfold))
for i in list_test_index:
    print(np.unique(np.array(result_kfold)[i], return_counts=True)[1])

Labels: [0 1]
[255514  31206]
[255487  31233]
[255833  30887]
[228861  57859]
[243061  43659]


#### Note: The prediction value counts  above are neat again. And can it be seen that the 1s are not uniform anymore. IMBALANCE

In [125]:
# CALCULATING TP, FN, FP, RECALL, PRECISION, F1-SCORE

t0 = time.time()

list_precision, list_recall, list_f1score = [], [], []
sum_tp, sum_fn, sum_fp = 0, 0, 0
TP, FN, FP, TN = [], [], [], []

for count,i in enumerate(list_test_index):
    print('Fold',count+1,'\n')
    tp, fn, fp, tn  = 0.01, 0.01, 0.01, 0.01 # initialized by 0.01 to prevent division by zero. since our goal is just to see the effect of averaging metrics properties
    for a,b in zip(y[i], result_kfold[i]): # (actual, prediction)
        if a == b:
            if (a == 1) and (b == 1):
                tp += 1
            elif (a == 0) and (b == 0):
                tn += 1
        elif a != b:
            if (a == 1) and (b == 0):
                fn += 1
            elif (a == 0) and (b == 1):
                fp += 1
    sum_tp += tp
    sum_fn += fn
    sum_fp += fp
    
    TP.append(tp); FN.append(fn); FP.append(fp); TN.append(tn)
     
    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    f1_score = 2 / (1/precision + 1/recall)
    specificity = tn/(tn+fp)


    list_precision.append(precision); list_recall.append(recall); list_f1score.append(f1_score)
        
    print('  TP:', tp)
    print('  FN:', fn)
    print('  FP:', fp)
    print('  TN:', tn)
    print()
    print('  Precision:', precision)
    print('  Recall:', recall)
    print('  F1-Score:', f1_score)
    # *optional addition, since they don't appear in Confusion Matrix (used for plotting ROC curve instead)
    print(' *Sensitivity:', precision)  # sensitivity and precision are the same
    print(' *Specificity:', specificity) 
    print('='*100)
    
print('\nTime execution:', time.time() - t0)

Fold 1 

  TP: 0.01
  FN: 0.01
  FP: 31206.010000000002
  TN: 255514.01

  Precision: 3.2045098990515294e-07
  Recall: 0.5
  F1-Score: 6.409015690552213e-07
 *Sensitivity: 3.2045098990515294e-07
 *Specificity: 0.8911620820896985
Fold 2 

  TP: 0.01
  FN: 0.01
  FP: 31233.010000000002
  TN: 255487.01

  Precision: 3.201739697281915e-07
  Recall: 0.5
  F1-Score: 6.40347529411162e-07
 *Sensitivity: 3.201739697281915e-07
 *Specificity: 0.8910679135694815
Fold 3 

  TP: 0.01
  FN: 0.01
  FP: 30887.010000000002
  TN: 255833.01

  Precision: 3.237605958749015e-07
  Recall: 0.5
  F1-Score: 6.475207724663807e-07
 *Sensitivity: 3.237605958749015e-07
 *Specificity: 0.892274665717448
Fold 4 

  TP: 43024.01
  FN: 45040.01
  FP: 14835.01
  TN: 183821.01

  Precision: 0.7436007384846822
  Recall: 0.48855378166929014
  F1-Score: 0.5896808345001584
 *Sensitivity: 0.7436007384846822
 *Specificity: 0.9253231288938538
Fold 5 

  TP: 43659.01
  FN: 243061.01
  FP: 0.01
  TN: 0.01

  Precision: 0.999999770

In [126]:
# RECALL, PRECISION, F1-SCORE IF BEING AVERAGED DIRECTLY (np.sum(list))

print('Precision:', np.mean(list_precision))
print('Recall:', np.mean(list_recall))
print('F1-Score:', np.mean(list_f1score))

Precision: 0.3487202947644989
Recall: 0.42816486274749455
F1-Score: 0.17079585026562188


In [127]:
# RECALL, PRECISION, F1-SCORE IF NOT BEING AVERAGED DIRECTLY
# example: Precision = sum_tp / (sum_tp + sum_fp)
# so the overall result of each Fold, will be summed first

precision_ = sum_tp/(sum_tp + sum_fp)
recall_ = sum_tp/(sum_tp + sum_fn)

print('Precision:', precision_)
print('Recall:', recall_)
print('F1-Score:', 2 / (1/precision_ + 1/recall_))

Precision: 0.4448841407053127
Recall: 0.23128796018827907
F1-Score: 0.30434957398527673


# FINAL CONCLUSION
Now it's been proven that averaging metrics is not trivial. So do not take the mean of cross validation scores unless you check that the amount of Actual 1s and Prediction 1s are correspondingly uniform. Or the easiest way is just by looking at the standard deviation of the scores. If it's low, it's okay to take the average of it. Else, go sum each of its TP, FP, FN in order to find the true average.