In [1]:
import datetime
import os

import numpy as np
import pandas as pd
from imblearn.metrics import geometric_mean_score
from scipy.stats import rankdata
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.model_selection import StratifiedKFold

from multi_imbalance.ensemble.ovo import OVO
from multi_imbalance.utils import data

In [2]:
# Minority class labels for each dataset, keyed by the dataset's short name
# (the name without its directory prefix). These labels are handed to
# OVO.fit(..., minority_classes=...) and used as the `labels` subset when the
# minority-only G-mean is computed.
minority = {
    '1czysty-cut': [1, 2],
    '2delikatne-cut': [1, 2],
    '3mocniej-cut': [1, 2],
    '4delikatne-bezover-cut': [1, 2],
    'balance-scale': [0],
    'car': [3, 1],
    'cleveland': [4, 3, 2, 1],
    'cleveland_v2': [3, 2, 1],
    'cmc': [1],
    'dermatology': [5],
    'flare': [4, 1],
    'glass': [4, 2, 5],
    'hayes-roth': [0],
    'new_ecoli': [3, 2, 4],
    'new_led7digit': [1, 4],
    'new_vehicle': [2, 0],
    'new_winequality-red': [3, 2],
    'new_yeast': [2, 3, 4, 5, 6],
    'thyroid-newthyroid': [2, 1],
}

In [3]:
# Load every ARFF dataset and remember the keys in load order; the same list
# becomes the 'dataset' column of both result tables.
datasets = data.load_arff_datasets()
datasets_names = list(datasets.keys())

# Experiment grid: base classifier x preprocessing x between-class strategy.
binary_classifiers = ['tree', 'KNN', 'NB']
preprocessings = [None, 'globalCS', 'SMOTE', 'SOUP']
preprocessing_between_strategies = ['all', 'maj-min']

# One (classifier, preprocessing, between-strategy) tuple per configuration,
# classifier varying slowest.
methods = [
    (bc, pre, pre_btwn)
    for bc in binary_classifiers
    for pre in preprocessings
    for pre_btwn in preprocessing_between_strategies
]

# Result containers: a 'dataset' column plus one (initially empty) score list
# per configuration, filled with one value per dataset by the evaluation cell.
results_g_mean = {'dataset': datasets_names}
results_gmean_min = {'dataset': datasets_names}
for res in (results_g_mean, results_gmean_min):
    res.update((method, list()) for method in methods)

In [4]:
# Repeated stratified k-fold evaluation of every OVO configuration on every
# dataset. For each (dataset, configuration) pair the G-mean over all classes
# and the G-mean restricted to the minority classes are averaged over all
# folds of all repetitions and appended to results_g_mean / results_gmean_min.
N_REPEATS = 10           # number of differently-seeded k-fold repetitions
N_SPLITS = 5             # folds per repetition
GMEAN_CORRECTION = 0.001  # value passed as `correction` to geometric_mean_score

# Start from empty score lists so that re-running this cell does not append a
# second set of values (which would make the lists longer than the 'dataset'
# column and break the DataFrame construction further down).
for method in methods:
    results_g_mean[method] = list()
    results_gmean_min[method] = list()

for dataset_name, dataset_values in datasets.items():
    # Work on a floating-point copy: the loaded dataset is left untouched and
    # the standardized values cannot be truncated by an integer array.
    X = np.array(dataset_values.data)
    if not np.issubdtype(X.dtype, np.floating):
        X = X.astype(float)
    y = dataset_values.target

    # Dataset keys carry a directory prefix (e.g. 'arff\1czysty-cut'); keep
    # only the last path component, whichever separator was used.
    dataset_name = dataset_name.replace('\\', '/').rsplit('/', 1)[-1]
    minority_classes = minority[dataset_name]

    # Standardize every column with more than two distinct values; columns
    # with at most two distinct values are left as they are.
    # NOTE(review): mean/std are computed on the whole dataset, i.e. before the
    # train/test split below, so test-fold statistics influence the scaling.
    for col_idx, col in enumerate(X.T):
        if len(set(col)) > 2:
            X[:, col_idx] = (col - np.mean(col)) / np.std(col)

    for method in methods:
        binary_classifier, preprocessing, preprocessing_between_strategy = method
        print(dataset_name, binary_classifier, preprocessing, preprocessing_between_strategy)

        g_mean, g_mean_min = list(), list()

        for i in range(N_REPEATS):
            # The repetition index doubles as the shuffle seed, so the splits
            # are reproducible and differ between repetitions.
            skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=i)

            for train_index, test_index in skf.split(X, y):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                ovo = OVO(binary_classifier=binary_classifier, preprocessing=preprocessing, n_neighbors=3,
                          preprocessing_between=preprocessing_between_strategy)
                ovo.fit(X_train, y_train, minority_classes=minority_classes)
                y_pred = ovo.predict(X_test)

                g_mean.append(geometric_mean_score(y_test, y_pred, correction=GMEAN_CORRECTION))
                g_mean_min.append(
                    geometric_mean_score(y_test, y_pred, correction=GMEAN_CORRECTION, labels=minority_classes))

        # One averaged score per (dataset, configuration), rounded to 3 decimals.
        results_g_mean[method].append(round(np.mean(g_mean), 3))
        results_gmean_min[method].append(round(np.mean(g_mean_min), 3))

        print('g-mean:', results_g_mean[method][-1])
        print('gmean-min:', results_gmean_min[method][-1])

1czysty-cut tree None all
g-mean: 0.94
gmean-min: 0.929
1czysty-cut tree None maj-min
g-mean: 0.94
gmean-min: 0.929
1czysty-cut tree globalCS all
g-mean: 0.942
gmean-min: 0.931
1czysty-cut tree globalCS maj-min
g-mean: 0.942
gmean-min: 0.931
1czysty-cut tree SMOTE all
g-mean: 0.951
gmean-min: 0.945
1czysty-cut tree SMOTE maj-min
g-mean: 0.951
gmean-min: 0.945
1czysty-cut tree SOUP all
g-mean: 0.956
gmean-min: 0.975
1czysty-cut tree SOUP maj-min
g-mean: 0.956
gmean-min: 0.975
1czysty-cut KNN None all
g-mean: 0.959
gmean-min: 0.952
1czysty-cut KNN None maj-min
g-mean: 0.959
gmean-min: 0.952
1czysty-cut KNN globalCS all
g-mean: 0.965
gmean-min: 0.975
1czysty-cut KNN globalCS maj-min
g-mean: 0.965
gmean-min: 0.975
1czysty-cut KNN SMOTE all
g-mean: 0.966
gmean-min: 0.973
1czysty-cut KNN SMOTE maj-min
g-mean: 0.966
gmean-min: 0.973
1czysty-cut KNN SOUP all
g-mean: 0.955
gmean-min: 1.0
1czysty-cut KNN SOUP maj-min
g-mean: 0.955
gmean-min: 1.0
1czysty-cut NB None all
g-mean: 0.716
gmean-min: 0

g-mean: 0.701
gmean-min: 0.831
car KNN SOUP maj-min
g-mean: 0.701
gmean-min: 0.884
car NB None all
g-mean: 0.856
gmean-min: 0.93
car NB None maj-min
g-mean: 0.856
gmean-min: 0.93
car NB globalCS all
g-mean: 0.856
gmean-min: 0.93
car NB globalCS maj-min
g-mean: 0.856
gmean-min: 0.93
car NB SMOTE all
g-mean: 0.015
gmean-min: 0.001
car NB SMOTE maj-min
g-mean: 0.03
gmean-min: 0.001
car NB SOUP all
g-mean: 0.856
gmean-min: 0.93
car NB SOUP maj-min
g-mean: 0.856
gmean-min: 0.93
cleveland tree None all
g-mean: 0.141
gmean-min: 0.095
cleveland tree None maj-min
g-mean: 0.141
gmean-min: 0.095
cleveland tree globalCS all
g-mean: 0.068
gmean-min: 0.038
cleveland tree globalCS maj-min
g-mean: 0.183
gmean-min: 0.133
cleveland tree SMOTE all
g-mean: 0.078
gmean-min: 0.045
cleveland tree SMOTE maj-min
g-mean: 0.173
gmean-min: 0.122
cleveland tree SOUP all
g-mean: 0.157
gmean-min: 0.114
cleveland tree SOUP maj-min
g-mean: 0.169
gmean-min: 0.128
cleveland KNN None all
g-mean: 0.119
gmean-min: 0.084
cl

g-mean: 0.351
gmean-min: 0.396
glass NB SMOTE maj-min
g-mean: 0.348
gmean-min: 0.396
glass NB SOUP all
g-mean: 0.263
gmean-min: 0.389
glass NB SOUP maj-min
g-mean: 0.235
gmean-min: 0.389
hayes-roth tree None all
g-mean: 0.848
gmean-min: 0.862
hayes-roth tree None maj-min
g-mean: 0.848
gmean-min: 0.862
hayes-roth tree globalCS all
g-mean: 0.848
gmean-min: 0.862
hayes-roth tree globalCS maj-min
g-mean: 0.848
gmean-min: 0.862
hayes-roth tree SMOTE all
g-mean: 0.848
gmean-min: 0.862
hayes-roth tree SMOTE maj-min
g-mean: 0.848
gmean-min: 0.862
hayes-roth tree SOUP all
g-mean: 0.792
gmean-min: 0.862
hayes-roth tree SOUP maj-min
g-mean: 0.806
gmean-min: 0.862
hayes-roth KNN None all
g-mean: 0.55
gmean-min: 0.6
hayes-roth KNN None maj-min
g-mean: 0.55
gmean-min: 0.6
hayes-roth KNN globalCS all
g-mean: 0.645
gmean-min: 0.631
hayes-roth KNN globalCS maj-min
g-mean: 0.597
gmean-min: 0.677
hayes-roth KNN SMOTE all
g-mean: 0.625
gmean-min: 0.615
hayes-roth KNN SMOTE maj-min
g-mean: 0.584
gmean-min:

g-mean: 0.413
gmean-min: 0.378
new_yeast KNN SOUP all
g-mean: 0.452
gmean-min: 0.445
new_yeast KNN SOUP maj-min
g-mean: 0.447
gmean-min: 0.456
new_yeast NB None all
g-mean: 0.091
gmean-min: 0.237
new_yeast NB None maj-min
g-mean: 0.091
gmean-min: 0.237
new_yeast NB globalCS all
g-mean: 0.082
gmean-min: 0.243
new_yeast NB globalCS maj-min
g-mean: 0.082
gmean-min: 0.237
new_yeast NB SMOTE all
g-mean: 0.095
gmean-min: 0.241
new_yeast NB SMOTE maj-min
g-mean: 0.093
gmean-min: 0.236
new_yeast NB SOUP all
g-mean: 0.111
gmean-min: 0.246
new_yeast NB SOUP maj-min
g-mean: 0.097
gmean-min: 0.242
thyroid-newthyroid tree None all
g-mean: 0.892
gmean-min: 0.854
thyroid-newthyroid tree None maj-min
g-mean: 0.892
gmean-min: 0.854
thyroid-newthyroid tree globalCS all
g-mean: 0.916
gmean-min: 0.891
thyroid-newthyroid tree globalCS maj-min
g-mean: 0.916
gmean-min: 0.891
thyroid-newthyroid tree SMOTE all
g-mean: 0.923
gmean-min: 0.912
thyroid-newthyroid tree SMOTE maj-min
g-mean: 0.923
gmean-min: 0.912
t

In [17]:
# Turn the collected scores into DataFrames (one row per dataset, one column
# per configuration) and write both tables to a fresh timestamped directory.
df_gmean = pd.DataFrame(results_g_mean)
df_gmean_min = pd.DataFrame(results_gmean_min)

# Three-level column header. The first tuple sits above the dataset-name
# column and serves as the label for the three header rows; the remaining
# tuples are the stringified (classifier, preprocessing, between-strategy)
# configurations. The 'dataset' column deliberately stays a regular column:
# the header below has one entry for it, and the CSVs are written without
# the index.
header = pd.MultiIndex.from_tuples(
    [('classifier', 'preprocessing', 'preprocessing between')]
    + [(str(m[0]), str(m[1]), str(m[2])) for m in methods])

for df in (df_gmean, df_gmean_min):
    df.columns = header

# Second-resolution timestamp, e.g. '2020-01-31_12_30_05'. strftime is used
# instead of slicing str(datetime), whose fractional part is absent when the
# microsecond field happens to be 0.
current_date = datetime.datetime.today().strftime('%Y-%m-%d_%H_%M_%S')

directory = 'OVO-' + current_date
# exist_ok: a second export within the same second reuses the directory
# instead of raising.
os.makedirs(directory, exist_ok=True)

df_gmean.to_csv(os.path.join(directory, 'ovo_kfold_gmean.csv'), index=False)
df_gmean_min.to_csv(os.path.join(directory, 'ovo_kfold_gmean_min.csv'), index=False)

## G-mean

In [18]:
# Raise the display limits so the wide results table is rendered without
# truncation, then show the all-class G-mean table.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 4000)
df_gmean

Unnamed: 0_level_0,classifier,tree,tree,tree,tree,tree,tree,tree,tree,KNN,KNN,KNN,KNN,KNN,KNN,KNN,KNN,NB,NB,NB,NB,NB,NB,NB,NB
Unnamed: 0_level_1,preprocessing,None,None,globalCS,globalCS,SMOTE,SMOTE,SOUP,SOUP,None,None,globalCS,globalCS,SMOTE,SMOTE,SOUP,SOUP,None,None,globalCS,globalCS,SMOTE,SMOTE,SOUP,SOUP
Unnamed: 0_level_2,preprocessing between,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min
0,arff\1czysty-cut,0.943,0.943,0.944,0.944,0.953,0.953,0.954,0.954,0.961,0.961,0.968,0.968,0.969,0.969,0.954,0.954,0.714,0.714,0.935,0.935,0.936,0.936,0.931,0.931
1,arff\2delikatne-cut,0.703,0.703,0.707,0.706,0.738,0.734,0.802,0.799,0.684,0.684,0.734,0.726,0.734,0.729,0.805,0.786,0.476,0.476,0.82,0.815,0.825,0.821,0.813,0.807
2,arff\3mocniej-cut,0.486,0.486,0.485,0.484,0.504,0.506,0.613,0.594,0.44,0.44,0.51,0.499,0.496,0.499,0.585,0.558,0.01,0.01,0.63,0.614,0.638,0.622,0.609,0.596
3,arff\4delikatne-bezover-cut,0.772,0.772,0.77,0.77,0.798,0.798,0.89,0.89,0.786,0.786,0.819,0.819,0.829,0.829,0.89,0.888,0.717,0.717,0.891,0.891,0.896,0.896,0.881,0.88
4,arff\balance-scale,0.225,0.225,0.183,0.183,0.336,0.336,0.645,0.645,0.12,0.12,0.13,0.121,0.276,0.276,0.699,0.705,0.099,0.099,0.744,0.744,0.713,0.713,0.65,0.65
5,arff\car,0.933,0.933,0.958,0.954,0.959,0.955,0.922,0.932,0.404,0.404,0.761,0.746,0.326,0.33,0.706,0.704,0.857,0.857,0.857,0.857,0.015,0.03,0.857,0.857
6,arff\cleveland,0.096,0.096,0.066,0.085,0.083,0.09,0.114,0.102,0.112,0.112,0.178,0.116,0.177,0.118,0.146,0.121,0.068,0.068,0.059,0.065,0.083,0.071,0.096,0.073
7,arff\cleveland_v2,0.073,0.073,0.058,0.06,0.093,0.085,0.142,0.121,0.053,0.053,0.216,0.146,0.178,0.155,0.24,0.17,0.062,0.062,0.069,0.059,0.145,0.069,0.08,0.065
8,arff\cmc,0.431,0.431,0.429,0.432,0.43,0.428,0.48,0.467,0.463,0.463,0.445,0.445,0.461,0.462,0.48,0.477,0.443,0.443,0.424,0.431,0.423,0.424,0.435,0.434
9,arff\dermatology,0.932,0.932,0.932,0.932,0.933,0.932,0.937,0.941,0.955,0.955,0.948,0.955,0.946,0.955,0.945,0.953,0.908,0.908,0.908,0.908,0.908,0.908,0.918,0.912


## G-mean for minority

In [19]:
# Show the per-dataset G-mean computed only over each dataset's minority classes.
df_gmean_min

Unnamed: 0_level_0,classifier,tree,tree,tree,tree,tree,tree,tree,tree,KNN,KNN,KNN,KNN,KNN,KNN,KNN,KNN,NB,NB,NB,NB,NB,NB,NB,NB
Unnamed: 0_level_1,preprocessing,None,None,globalCS,globalCS,SMOTE,SMOTE,SOUP,SOUP,None,None,globalCS,globalCS,SMOTE,SMOTE,SOUP,SOUP,None,None,globalCS,globalCS,SMOTE,SMOTE,SOUP,SOUP
Unnamed: 0_level_2,preprocessing between,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min,all,maj-min
0,arff\1czysty-cut,0.93,0.93,0.932,0.932,0.946,0.946,0.97,0.97,0.956,0.956,0.979,0.979,0.978,0.978,1.0,1.0,0.607,0.607,0.993,0.993,0.991,0.991,0.995,0.995
1,arff\2delikatne-cut,0.62,0.62,0.623,0.622,0.673,0.669,0.794,0.79,0.59,0.59,0.683,0.673,0.68,0.674,0.831,0.802,0.33,0.33,0.852,0.843,0.847,0.839,0.855,0.845
2,arff\3mocniej-cut,0.371,0.371,0.369,0.368,0.42,0.422,0.59,0.56,0.312,0.312,0.429,0.415,0.42,0.423,0.608,0.577,0.001,0.001,0.703,0.668,0.698,0.665,0.702,0.669
3,arff\4delikatne-bezover-cut,0.725,0.725,0.719,0.719,0.769,0.769,0.954,0.954,0.734,0.734,0.821,0.821,0.833,0.833,0.988,0.989,0.62,0.62,0.969,0.969,0.965,0.965,0.975,0.975
4,arff\balance-scale,0.048,0.048,0.036,0.036,0.103,0.103,0.568,0.568,0.009,0.009,0.018,0.013,0.078,0.078,0.717,0.732,0.001,0.001,0.796,0.796,0.698,0.698,0.796,0.796
5,arff\car,0.899,0.899,0.936,0.937,0.939,0.939,0.939,0.946,0.239,0.239,0.83,0.844,0.165,0.17,0.846,0.868,0.931,0.931,0.931,0.931,0.001,0.001,0.931,0.931
6,arff\cleveland,0.061,0.061,0.038,0.052,0.052,0.055,0.08,0.068,0.075,0.075,0.136,0.082,0.136,0.084,0.111,0.088,0.053,0.053,0.046,0.055,0.061,0.05,0.073,0.054
7,arff\cleveland_v2,0.038,0.038,0.026,0.027,0.052,0.046,0.094,0.073,0.025,0.025,0.157,0.097,0.125,0.108,0.185,0.123,0.041,0.041,0.05,0.041,0.108,0.041,0.051,0.042
8,arff\cmc,0.329,0.329,0.332,0.332,0.325,0.325,0.494,0.494,0.398,0.398,0.514,0.514,0.489,0.489,0.538,0.537,0.683,0.683,0.704,0.704,0.718,0.718,0.714,0.714
9,arff\dermatology,0.86,0.86,0.86,0.86,0.86,0.86,0.905,0.91,0.995,0.995,1.0,1.0,1.0,1.0,0.995,0.995,0.985,0.985,0.985,0.985,0.985,0.985,1.0,1.0


In [20]:
# Configurations grouped by base classifier, in the order they appear in
# `methods`; ranking is always done among the variants of one classifier.
methods_by_classifier = {
    clf: [m for m in methods if m[0] == clf] for clf in binary_classifiers
}

# Per-dataset ranks: ranks_*[dataset][classifier] is an array with one rank per
# variant of that classifier. Scores are negated before ranking so that the
# highest score receives rank 1.
ranks_gmean = dict()
ranks_gmean_min = dict()

for scores, rank_table in ((results_g_mean, ranks_gmean), (results_gmean_min, ranks_gmean_min)):
    for row, name in enumerate(datasets_names):
        rank_table[name] = {
            clf: rankdata([-scores[m][row] for m in clf_methods])
            for clf, clf_methods in methods_by_classifier.items()
        }

# Average rank of every variant across all datasets, rounded to 3 decimals.
# Inner keys are the stringified (preprocessing, between-strategy) pair, so
# the same row labels are shared by all classifiers.
avg_ranks_gmean = dict()
avg_ranks_gmean_min = dict()

for rank_table, averaged in ((ranks_gmean, avg_ranks_gmean), (ranks_gmean_min, avg_ranks_gmean_min)):
    for clf, clf_methods in methods_by_classifier.items():
        averaged[clf] = {
            str(m[1:]): round(np.mean([rank_table[name][clf][pos] for name in datasets_names]), 3)
            for pos, m in enumerate(clf_methods)
        }

# Rows: classifiers; columns: variants.
df_gmean_avg_ranks = pd.DataFrame(avg_ranks_gmean).T
df_gmean_min_avg_ranks = pd.DataFrame(avg_ranks_gmean_min).T

## Mean ranks in G-mean

In [21]:
# Average G-mean rank per variant (rows) and base classifier (columns),
# ordered by the 'tree' column; lower rank means better, since scores were
# negated before ranking.
df_gmean_avg_ranks.T.sort_values('tree')

Unnamed: 0,tree,KNN,NB
"('SOUP', 'all')",2.605,2.895,3.342
"('SOUP', 'maj-min')",2.816,3.474,4.553
"('SMOTE', 'all')",4.184,4.105,3.447
"('SMOTE', 'maj-min')",4.342,4.237,3.605
"(None, 'all')",5.421,6.5,6.184
"(None, 'maj-min')",5.421,6.5,6.184
"('globalCS', 'maj-min')",5.447,4.737,4.342
"('globalCS', 'all')",5.763,3.553,4.342


## Mean ranks in G-mean minority

In [22]:
# Average minority-class G-mean rank per variant (rows) and base classifier
# (columns), ordered by the 'tree' column; lower rank means better, since
# scores were negated before ranking.
df_gmean_min_avg_ranks.T.sort_values('tree')

Unnamed: 0,tree,KNN,NB
"('SOUP', 'maj-min')",1.605,2.132,3.395
"('SOUP', 'all')",1.921,1.921,2.868
"('SMOTE', 'maj-min')",4.211,4.842,5.079
"('SMOTE', 'all')",4.289,4.658,4.132
"(None, 'all')",5.842,7.132,6.237
"(None, 'maj-min')",5.842,7.132,6.237
"('globalCS', 'maj-min')",6.0,4.395,4.0
"('globalCS', 'all')",6.289,3.789,4.053


## Mean results

In [23]:
# Mean score of every configuration across all datasets, rounded to 2
# decimals: [all-class G-mean, minority-class G-mean], keyed by the
# stringified configuration tuple.
mean_results = {
    str(m): [
        round(np.mean(results_g_mean[m]), 2),
        round(np.mean(results_gmean_min[m]), 2),
    ]
    for m in methods
}

# One row per configuration, best all-class G-mean first.
df_mean = pd.DataFrame(mean_results).T
df_mean.columns = ['gmean', 'gmean_min']

df_mean.sort_values(['gmean'], ascending=[False])

Unnamed: 0,gmean,gmean_min
"('KNN', 'SOUP', 'all')",0.66,0.68
"('tree', 'SOUP', 'all')",0.66,0.65
"('tree', 'SOUP', 'maj-min')",0.66,0.65
"('KNN', 'SOUP', 'maj-min')",0.64,0.67
"('KNN', 'globalCS', 'all')",0.62,0.59
"('tree', 'SMOTE', 'all')",0.61,0.54
"('tree', 'SMOTE', 'maj-min')",0.61,0.54
"('KNN', 'SMOTE', 'all')",0.6,0.55
"('KNN', 'globalCS', 'maj-min')",0.6,0.58
"('KNN', 'SMOTE', 'maj-min')",0.59,0.55
