In [116]:
import numpy as np
import pandas as pd
from scipy.stats import rankdata
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import recall_score

from multi_imbalance.ensemble.ovo import OVO
from multi_imbalance.datasets import load_datasets

In [117]:
minority = dict()
minority['1czysty-cut'] = [1, 2]
minority['2delikatne-cut'] = [1, 2]
minority['3mocniej-cut'] = [1, 2]
minority['4delikatne-bezover-cut'] = [1, 2]
minority['balance-scale'] = [0]
minority['cleveland'] = [4, 3, 2]
minority['cleveland_v2'] = [3, 2, 1]
minority['cmc'] = [1]
minority['dermatology'] = [5, 3, 4, 1]
minority['glass'] = [4, 2, 5, 3]
minority['hayes-roth'] = [0]
minority['new_ecoli'] = [3, 2, 4]
minority['new_led7digit'] = [1, 4]
minority['new_vehicle'] = [2, 0]
minority['new_winequality-red'] = [3, 2]
minority['new_yeast'] = [2, 3, 4, 5, 6]
minority['thyroid-newthyroid'] = [2, 1]

In [118]:
datasets = load_datasets()
datasets_names = [dsn for dsn, _ in datasets.items()]

binary_classifiers = ['CART', 'KNN', 'NB']
binary_oversamplings = [None, 'globalCS', 'SMOTE']
oversample_between_strategies = ['all', 'maj-min']

results_g_mean = dict()
results_acc = dict()
results_avg_tpr = dict()

methods=[(bc,bo,obs) for bc in binary_classifiers for bo in binary_oversamplings for obs in oversample_between_strategies]

for res in (results_g_mean, results_acc, results_avg_tpr):
    res['dataset'] = datasets_names
    for method in methods:
        res[method] = list()

In [119]:
for dataset_name, dataset_values in datasets.items():
    X, y = dataset_values.data, dataset_values.target

    for binary_classifier in binary_classifiers:
        for oversample_binary in binary_oversamplings:
            for oversample_between_strategy in oversample_between_strategies:

                print(dataset_name, oversample_binary, binary_classifier, oversample_between_strategy)
                acc, g_mean, avg_tpr = list(), list(), list()

                for i in range(10):
                    fold_no = 0
                    skf = StratifiedKFold(n_splits=5, random_state=i)

                    for train_index, test_index in skf.split(X, y):
                        X_train, X_test = X[train_index], X[test_index]
                        y_train, y_test = y[train_index], y[test_index]

                        nn = min(np.unique(y_train, return_counts=True)[1]) - 1
                        nn = min(nn, 3)
                        if nn == 2: nn = 1

                        ovo = OVO(binary_classifier=binary_classifier, oversample_binary=oversample_binary, n_neighbors=nn,
                                  oversample_between=oversample_between_strategy)
                        ovo.fit(X_train, y_train, minority_classes=minority[dataset_name])
                        y_pred = ovo.predict(X_test)

                        g_mean.append(geometric_mean_score(y_test, y_pred, correction=0.001))
                        acc.append(accuracy_score(y_test, y_pred))
                        avg_tpr.append(recall_score(y_test, y_pred, average='macro'))

                        fold_no += 1


                results_g_mean[(binary_classifier, oversample_binary, oversample_between_strategy)].append(round(np.mean(g_mean), 3))
                results_acc[(binary_classifier, oversample_binary, oversample_between_strategy)].append(round(np.mean(acc), 3))
                results_avg_tpr[(binary_classifier, oversample_binary, oversample_between_strategy)].append(round(np.mean(avg_tpr), 3))


1czysty-cut None CART all
1czysty-cut None CART maj-min
1czysty-cut globalCS CART all
1czysty-cut globalCS CART maj-min
1czysty-cut SMOTE CART all
1czysty-cut SMOTE CART maj-min
1czysty-cut None KNN all
1czysty-cut None KNN maj-min
1czysty-cut globalCS KNN all
1czysty-cut globalCS KNN maj-min
1czysty-cut SMOTE KNN all
1czysty-cut SMOTE KNN maj-min
1czysty-cut None NB all
1czysty-cut None NB maj-min
1czysty-cut globalCS NB all
1czysty-cut globalCS NB maj-min
1czysty-cut SMOTE NB all
1czysty-cut SMOTE NB maj-min
2delikatne-cut None CART all
2delikatne-cut None CART maj-min
2delikatne-cut globalCS CART all
2delikatne-cut globalCS CART maj-min
2delikatne-cut SMOTE CART all
2delikatne-cut SMOTE CART maj-min
2delikatne-cut None KNN all
2delikatne-cut None KNN maj-min
2delikatne-cut globalCS KNN all
2delikatne-cut globalCS KNN maj-min
2delikatne-cut SMOTE KNN all
2delikatne-cut SMOTE KNN maj-min
2delikatne-cut None NB all
2delikatne-cut None NB maj-min
2delikatne-cut globalCS NB all
2delikatn

new_yeast SMOTE KNN maj-min
new_yeast None NB all
new_yeast None NB maj-min
new_yeast globalCS NB all
new_yeast globalCS NB maj-min
new_yeast SMOTE NB all
new_yeast SMOTE NB maj-min
thyroid-newthyroid None CART all
thyroid-newthyroid None CART maj-min
thyroid-newthyroid globalCS CART all
thyroid-newthyroid globalCS CART maj-min
thyroid-newthyroid SMOTE CART all
thyroid-newthyroid SMOTE CART maj-min
thyroid-newthyroid None KNN all
thyroid-newthyroid None KNN maj-min
thyroid-newthyroid globalCS KNN all
thyroid-newthyroid globalCS KNN maj-min
thyroid-newthyroid SMOTE KNN all
thyroid-newthyroid SMOTE KNN maj-min
thyroid-newthyroid None NB all
thyroid-newthyroid None NB maj-min
thyroid-newthyroid globalCS NB all
thyroid-newthyroid globalCS NB maj-min
thyroid-newthyroid SMOTE NB all
thyroid-newthyroid SMOTE NB maj-min


In [120]:
df_gmean = pd.DataFrame(results_g_mean)
df_acc = pd.DataFrame(results_acc)
df_avg_tpr = pd.DataFrame(results_avg_tpr)

for df in (df_gmean, df_acc, df_avg_tpr):
    df.set_index('dataset')
    df.columns = pd.MultiIndex.from_tuples(
        [('dataset', '')] + [(str(bc), str(bo), str(obs)) for bc in binary_classifiers for bo in binary_oversamplings for obs in oversample_between_strategies])
    
import os
import datetime
    
current_date = str(datetime.datetime.today())[:-7].replace(' ', '_').replace(':','_')

directory='OVO-'+current_date
os.makedirs(directory)

df_gmean.to_csv(f'./{directory}/ovo_kfold_gmean.csv', index=False)
df_acc.to_csv(f'./{directory}/ovo_kfold_acc.csv', index=False)
df_avg_tpr.to_csv(f'./{directory}/ovo_kfold_avg_tpr.csv', index=False)

In [121]:
df_gmean

Unnamed: 0_level_0,dataset,CART,CART,CART,CART,CART,CART,KNN,KNN,KNN,KNN,KNN,KNN,NB,NB,NB,NB,NB,NB
Unnamed: 0_level_1,Unnamed: 1_level_1,None,None,globalCS,globalCS,SMOTE,SMOTE,None,None,globalCS,globalCS,SMOTE,SMOTE,None,None,globalCS,globalCS,SMOTE,SMOTE
Unnamed: 0_level_2,NaN,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min
0,1czysty-cut,0.94,0.94,0.945,0.947,0.954,0.953,0.965,0.965,0.977,0.977,0.98,0.978,0.721,0.721,0.935,0.935,0.938,0.934
1,2delikatne-cut,0.663,0.674,0.69,0.678,0.704,0.694,0.666,0.666,0.726,0.705,0.699,0.702,0.43,0.43,0.803,0.805,0.81,0.806
2,3mocniej-cut,0.43,0.428,0.378,0.407,0.435,0.467,0.384,0.384,0.444,0.446,0.457,0.463,0.018,0.018,0.61,0.583,0.616,0.583
3,4delikatne-bezover-cut,0.729,0.739,0.75,0.749,0.779,0.782,0.757,0.757,0.803,0.803,0.804,0.817,0.653,0.653,0.877,0.877,0.878,0.879
4,balance-scale,0.242,0.247,0.13,0.129,0.281,0.265,0.278,0.278,0.265,0.274,0.347,0.352,0.08,0.08,0.419,0.419,0.436,0.442
5,cleveland,0.08,0.055,0.073,0.073,0.084,0.067,0.04,0.04,0.068,0.078,0.139,0.087,0.096,0.096,0.145,0.099,0.227,0.094
6,cleveland_v2,0.05,0.047,0.043,0.066,0.084,0.05,0.026,0.026,0.196,0.109,0.159,0.132,0.079,0.079,0.217,0.084,0.283,0.083
7,cmc,0.431,0.429,0.419,0.425,0.442,0.422,0.46,0.46,0.459,0.46,0.469,0.468,0.484,0.484,0.467,0.477,0.467,0.468
8,dermatology,0.937,0.913,0.919,0.921,0.913,0.908,0.859,0.859,0.852,0.852,0.856,0.857,0.761,0.761,0.777,0.761,0.779,0.761
9,glass,0.521,0.586,0.573,0.535,0.611,0.524,0.49,0.49,0.525,0.504,0.518,0.498,0.207,0.207,0.199,0.2,0.256,0.298


In [122]:
df_acc

Unnamed: 0_level_0,dataset,CART,CART,CART,CART,CART,CART,KNN,KNN,KNN,KNN,KNN,KNN,NB,NB,NB,NB,NB,NB
Unnamed: 0_level_1,Unnamed: 1_level_1,None,None,globalCS,globalCS,SMOTE,SMOTE,None,None,globalCS,globalCS,SMOTE,SMOTE,None,None,globalCS,globalCS,SMOTE,SMOTE
Unnamed: 0_level_2,NaN,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min
0,1czysty-cut,0.959,0.959,0.96,0.961,0.963,0.964,0.973,0.973,0.963,0.963,0.97,0.967,0.903,0.903,0.878,0.878,0.883,0.881
1,2delikatne-cut,0.828,0.834,0.843,0.843,0.828,0.825,0.846,0.846,0.811,0.82,0.81,0.815,0.819,0.819,0.784,0.791,0.8,0.8
2,3mocniej-cut,0.7,0.703,0.72,0.71,0.64,0.668,0.731,0.731,0.636,0.659,0.624,0.635,0.705,0.705,0.572,0.592,0.582,0.596
3,4delikatne-bezover-cut,0.836,0.842,0.85,0.848,0.842,0.843,0.864,0.864,0.838,0.838,0.838,0.842,0.87,0.87,0.809,0.809,0.821,0.82
4,balance-scale,0.54,0.543,0.526,0.519,0.497,0.529,0.728,0.728,0.629,0.641,0.636,0.637,0.732,0.732,0.613,0.613,0.61,0.623
5,cleveland,0.568,0.545,0.561,0.567,0.525,0.551,0.478,0.478,0.337,0.419,0.33,0.386,0.554,0.554,0.518,0.544,0.539,0.549
6,cleveland_v2,0.7,0.693,0.683,0.701,0.69,0.677,0.683,0.683,0.535,0.538,0.476,0.497,0.673,0.673,0.624,0.62,0.648,0.65
7,cmc,0.48,0.476,0.468,0.474,0.49,0.473,0.49,0.49,0.457,0.461,0.471,0.477,0.471,0.471,0.453,0.464,0.453,0.457
8,dermatology,0.947,0.937,0.935,0.936,0.939,0.935,0.88,0.88,0.866,0.866,0.872,0.876,0.872,0.872,0.874,0.872,0.876,0.872
9,glass,0.716,0.74,0.733,0.701,0.697,0.716,0.654,0.654,0.641,0.636,0.634,0.629,0.432,0.432,0.353,0.358,0.371,0.393


In [123]:
df_avg_tpr

Unnamed: 0_level_0,dataset,CART,CART,CART,CART,CART,CART,KNN,KNN,KNN,KNN,KNN,KNN,NB,NB,NB,NB,NB,NB
Unnamed: 0_level_1,Unnamed: 1_level_1,None,None,globalCS,globalCS,SMOTE,SMOTE,None,None,globalCS,globalCS,SMOTE,SMOTE,None,None,globalCS,globalCS,SMOTE,SMOTE
Unnamed: 0_level_2,NaN,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min
0,1czysty-cut,0.941,0.941,0.946,0.948,0.955,0.954,0.966,0.966,0.977,0.977,0.98,0.978,0.757,0.757,0.939,0.939,0.941,0.937
1,2delikatne-cut,0.708,0.718,0.73,0.727,0.736,0.728,0.715,0.715,0.747,0.738,0.727,0.732,0.567,0.567,0.811,0.813,0.817,0.814
2,3mocniej-cut,0.51,0.511,0.509,0.507,0.493,0.525,0.502,0.502,0.492,0.514,0.499,0.514,0.342,0.342,0.636,0.616,0.64,0.617
3,4delikatne-bezover-cut,0.769,0.775,0.785,0.784,0.801,0.804,0.797,0.797,0.825,0.825,0.825,0.832,0.735,0.735,0.885,0.885,0.885,0.886
4,balance-scale,0.449,0.451,0.412,0.407,0.415,0.444,0.568,0.568,0.511,0.523,0.52,0.521,0.53,0.53,0.592,0.592,0.585,0.603
5,cleveland,0.336,0.299,0.325,0.316,0.31,0.307,0.227,0.227,0.233,0.244,0.243,0.245,0.329,0.329,0.332,0.329,0.346,0.324
6,cleveland_v2,0.347,0.351,0.298,0.333,0.373,0.337,0.272,0.272,0.314,0.288,0.296,0.3,0.354,0.354,0.38,0.352,0.396,0.357
7,cmc,0.451,0.448,0.44,0.446,0.461,0.443,0.47,0.47,0.463,0.464,0.47,0.473,0.495,0.495,0.491,0.498,0.49,0.491
8,dermatology,0.94,0.921,0.925,0.925,0.924,0.921,0.874,0.874,0.866,0.866,0.869,0.871,0.866,0.866,0.869,0.866,0.87,0.866
9,glass,0.683,0.707,0.695,0.671,0.706,0.684,0.638,0.638,0.663,0.649,0.659,0.644,0.501,0.501,0.489,0.495,0.502,0.519


In [124]:
ranks_gmean = dict()
ranks_acc = dict()
ranks_avg_tpr = dict()

for results, ranks in zip((results_g_mean, results_acc, results_avg_tpr), (ranks_gmean, ranks_acc, ranks_avg_tpr)):
    for ds_idx, ds in enumerate(datasets_names):
        ranks[ds] = dict()
        for bc in binary_classifiers:
            ranks[ds][bc] = rankdata(
                [-results[method][ds_idx] for method in methods if method[0]==bc])


In [145]:
avg_ranks_gmean=dict()
avg_ranks_acc=dict()
avg_ranks_avg_tpr=dict()

for avg_ranks, ranks in zip((avg_ranks_gmean, avg_ranks_acc, avg_ranks_avg_tpr), (ranks_gmean, ranks_acc, ranks_avg_tpr)):
    for bc in binary_classifiers:
        avg_ranks[bc]=dict()
        for method_idx, method in enumerate([m for m in methods if m[0]==bc]):
            avg_ranks[bc][method] = round(np.mean([ranks[ds][bc][method_idx] for ds in datasets_names]),3)
            
for avg_ranks in (avg_ranks_gmean, avg_ranks_acc, avg_ranks_avg_tpr):
    for bc in binary_classifiers:
        for method in methods:
            if method[0]==bc:
                avg_ranks[bc][str(method[1:])] = avg_ranks[bc][method]
                avg_ranks[bc].pop(method)

df1 = pd.DataFrame(avg_ranks_gmean).T
df2 = pd.DataFrame(avg_ranks_acc).T
df3 = pd.DataFrame(avg_ranks_avg_tpr).T


In [146]:
avg_ranks_gmean

{'CART': {"(None, 'all')": 3.971,
  "(None, 'maj-min')": 4.059,
  "('globalCS', 'all')": 4.059,
  "('globalCS', 'maj-min')": 3.647,
  "('SMOTE', 'all')": 2.147,
  "('SMOTE', 'maj-min')": 3.118},
 'KNN': {"(None, 'all')": 4.941,
  "(None, 'maj-min')": 4.941,
  "('globalCS', 'all')": 3.0,
  "('globalCS', 'maj-min')": 3.176,
  "('SMOTE', 'all')": 1.941,
  "('SMOTE', 'maj-min')": 3.0},
 'NB': {"(None, 'all')": 4.735,
  "(None, 'maj-min')": 4.735,
  "('globalCS', 'all')": 3.059,
  "('globalCS', 'maj-min')": 3.088,
  "('SMOTE', 'all')": 2.559,
  "('SMOTE', 'maj-min')": 2.824}}

In [147]:
avg_ranks_avg_tpr

{'CART': {"(None, 'all')": 3.765,
  "(None, 'maj-min')": 3.676,
  "('globalCS', 'all')": 3.5,
  "('globalCS', 'maj-min')": 4.029,
  "('SMOTE', 'all')": 2.853,
  "('SMOTE', 'maj-min')": 3.176},
 'KNN': {"(None, 'all')": 4.529,
  "(None, 'maj-min')": 4.529,
  "('globalCS', 'all')": 3.059,
  "('globalCS', 'maj-min')": 2.853,
  "('SMOTE', 'all')": 2.941,
  "('SMOTE', 'maj-min')": 3.088},
 'NB': {"(None, 'all')": 4.559,
  "(None, 'maj-min')": 4.559,
  "('globalCS', 'all')": 3.029,
  "('globalCS', 'maj-min')": 3.235,
  "('SMOTE', 'all')": 2.706,
  "('SMOTE', 'maj-min')": 2.912}}

## Mean ranks in g-mean:

In [149]:
df1

Unnamed: 0,"(None, 'all')","(None, 'maj-min')","('globalCS', 'all')","('globalCS', 'maj-min')","('SMOTE', 'all')","('SMOTE', 'maj-min')"
CART,3.971,4.059,4.059,3.647,2.147,3.118
KNN,4.941,4.941,3.0,3.176,1.941,3.0
NB,4.735,4.735,3.059,3.088,2.559,2.824


## Mean ranks in accuracy:

In [150]:
df2

Unnamed: 0,"(None, 'all')","(None, 'maj-min')","('globalCS', 'all')","('globalCS', 'maj-min')","('SMOTE', 'all')","('SMOTE', 'maj-min')"
CART,3.353,3.382,3.353,2.912,3.882,4.118
KNN,1.794,1.794,4.618,3.971,4.588,4.235
NB,2.353,2.353,4.647,4.324,3.971,3.353


## Mean ranks in average accuracy (average recall):

In [151]:
df3

Unnamed: 0,"(None, 'all')","(None, 'maj-min')","('globalCS', 'all')","('globalCS', 'maj-min')","('SMOTE', 'all')","('SMOTE', 'maj-min')"
CART,3.765,3.676,3.5,4.029,2.853,3.176
KNN,4.529,4.529,3.059,2.853,2.941,3.088
NB,4.559,4.559,3.029,3.235,2.706,2.912
