In [43]:
import datetime
import os

import numpy as np
import pandas as pd
from scipy.stats import rankdata
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import recall_score

from multi_imbalance.ensemble.ecoc import ECOC
from multi_imbalance.datasets import load_datasets

In [44]:
# Per-dataset lists of class labels, keyed by dataset name.
# NOTE(review): presumably these are the labels regarded as minority classes
# in each benchmark dataset -- confirm against the dataset descriptions.
minority = {
    '1czysty-cut': [1, 2],
    '2delikatne-cut': [1, 2],
    '3mocniej-cut': [1, 2],
    '4delikatne-bezover-cut': [1, 2],
    'balance-scale': [0],
    'cleveland': [4, 3, 2],
    'cleveland_v2': [3, 2, 1],
    'cmc': [1],
    'dermatology': [5, 3, 4, 1],
    'glass': [4, 2, 5, 3],
    'hayes-roth': [0],
    'new_ecoli': [3, 2, 4],
    'new_led7digit': [1, 4],
    'new_vehicle': [2, 0],
    'new_winequality-red': [3, 2],
    'new_yeast': [2, 3, 4, 5, 6],
    'thyroid-newthyroid': [2, 1],
}

In [45]:
# Load the benchmark datasets; the experiment loop reads `.data` and `.target`
# from each value of this mapping.
datasets = load_datasets()
datasets_names = list(datasets)

# Experiment grid: every binary classifier x ECOC encoding x binary-level
# oversampling strategy (None means no oversampling argument is given).
binary_classifiers = ['CART', 'KNN', 'NB']
binary_oversamplings = [None, 'globalCS', 'SMOTE']
encodings = ['complete', 'dense', 'sparse', 'OVO', 'OVA']

# One (classifier, encoding, oversampling) tuple per evaluated configuration,
# with the classifier varying slowest and the oversampling fastest.
methods = [
    (bc, encoding, bo)
    for bc in binary_classifiers
    for encoding in encodings
    for bo in binary_oversamplings
]

# One accumulator per metric: a 'dataset' entry holding the dataset names,
# followed by an initially empty score list for every method tuple.
results_g_mean = {'dataset': datasets_names, **{method: list() for method in methods}}
results_acc = {'dataset': datasets_names, **{method: list() for method in methods}}
results_avg_tpr = {'dataset': datasets_names, **{method: list() for method in methods}}

In [None]:
# Evaluate every ECOC configuration on every dataset using 10 repetitions of
# stratified 5-fold cross-validation, then store the rounded mean of each
# metric (g-mean, accuracy, macro-averaged recall) per dataset and method.

# Start from empty accumulators so that re-running this cell does not append a
# second set of scores and misalign the lists with `datasets_names`.
for res in (results_g_mean, results_acc, results_avg_tpr):
    for method in methods:
        res[method] = list()

for dataset_name, dataset_values in datasets.items():
    X, y = dataset_values.data, dataset_values.target

    for binary_classifier, encoding, oversample_binary in methods:
        print(dataset_name, oversample_binary, binary_classifier, encoding)
        acc, g_mean, avg_tpr = list(), list(), list()

        for i in range(10):
            # shuffle=True is required for random_state to have any effect:
            # without it every repetition yields the same folds (older
            # scikit-learn) or StratifiedKFold raises ValueError (>= 0.24).
            skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

            for train_index, test_index in skf.split(X, y):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                # Neighbour count: one less than the smallest training class,
                # capped at 3; a value of 2 is lowered to 1.
                # NOTE(review): presumably 2 is avoided to keep the count odd
                # -- confirm the intent.
                nn = min(np.unique(y_train, return_counts=True)[1]) - 1
                nn = min(nn, 3)
                if nn == 2:
                    nn = 1

                ecoc = ECOC(binary_classifier=binary_classifier,
                            oversample_binary=oversample_binary,
                            n_neighbors=nn,
                            encoding=encoding)
                ecoc.fit(X_train, y_train)
                y_pred = ecoc.predict(X_test)

                g_mean.append(geometric_mean_score(y_test, y_pred, correction=0.001))
                acc.append(accuracy_score(y_test, y_pred))
                avg_tpr.append(recall_score(y_test, y_pred, average='macro'))

        # One value per dataset and method: the mean over all 50 folds.
        method = (binary_classifier, encoding, oversample_binary)
        results_g_mean[method].append(round(np.mean(g_mean), 3))
        results_acc[method].append(round(np.mean(acc), 3))
        results_avg_tpr[method].append(round(np.mean(avg_tpr), 3))


1czysty-cut None CART complete
1czysty-cut globalCS CART complete
1czysty-cut SMOTE CART complete
1czysty-cut None CART dense
1czysty-cut globalCS CART dense


In [None]:
# Turn the per-metric accumulators into DataFrames (one row per dataset) and
# write them as CSV files into a fresh timestamped directory.

# Three-level column labels: ('dataset', '', '') for the name column, then
# (classifier, encoding, oversampling) as strings for every method, in the
# same order as `methods` (so None becomes the string 'None').
column_labels = pd.MultiIndex.from_tuples(
    [('dataset', '', '')] + [(str(bc), str(encoding), str(bo)) for bc, encoding, bo in methods])

df_gmean = pd.DataFrame(results_g_mean)
df_acc = pd.DataFrame(results_acc)
df_avg_tpr = pd.DataFrame(results_avg_tpr)

# 'dataset' deliberately stays a regular column: the labels above include it
# and the CSVs are written with index=False. (The former
# `df.set_index('dataset')` call discarded its result and had no effect.)
for df in (df_gmean, df_acc, df_avg_tpr):
    df.columns = column_labels

# strftime gives a stable 'YYYY-MM-DD_HH_MM_SS' stamp; slicing str(datetime)
# would cut into the time whenever the microsecond field happens to be 0.
current_date = datetime.datetime.today().strftime('%Y-%m-%d_%H_%M_%S')

directory = 'ECOC-' + current_date
# exist_ok avoids a crash when the cell is re-run within the same second.
os.makedirs(directory, exist_ok=True)

df_gmean.to_csv(f'./{directory}/ecoc_kfold_gmean.csv', index=False)
df_acc.to_csv(f'./{directory}/ecoc_kfold_acc.csv', index=False)
df_avg_tpr.to_csv(f'./{directory}/ecoc_kfold_avg_tpr.csv', index=False)

In [None]:
# Mean g-mean per dataset (rows) for every classifier/encoding/oversampling combination (columns).
df_gmean

In [None]:
# Mean accuracy per dataset (rows) for every classifier/encoding/oversampling combination (columns).
df_acc

In [None]:
# Mean macro-averaged recall per dataset (rows) for every classifier/encoding/oversampling combination (columns).
df_avg_tpr

In [None]:
# Rank the methods against each other, separately for every metric, dataset
# and binary classifier. Scores are negated before ranking so the highest
# score receives rank 1; rankdata is called with its default tie handling.
ranks_gmean, ranks_acc, ranks_avg_tpr = dict(), dict(), dict()

metric_pairs = (
    (results_g_mean, ranks_gmean),
    (results_acc, ranks_acc),
    (results_avg_tpr, ranks_avg_tpr),
)

for results, ranks in metric_pairs:
    for ds_idx, ds in enumerate(datasets_names):
        # For each classifier: an array of ranks ordered like the subset of
        # `methods` that uses this classifier.
        ranks[ds] = {
            bc: rankdata([-results[method][ds_idx] for method in methods if method[0] == bc])
            for bc in binary_classifiers
        }


In [None]:
# Average each method's rank over all datasets, per metric and per binary
# classifier. Inner keys are the string form of the (encoding, oversampling)
# part of the method tuple, so the three classifiers share the same labels.
avg_ranks_gmean = dict()
avg_ranks_acc = dict()
avg_ranks_avg_tpr = dict()

for avg_ranks, ranks in zip((avg_ranks_gmean, avg_ranks_acc, avg_ranks_avg_tpr),
                            (ranks_gmean, ranks_acc, ranks_avg_tpr)):
    for bc in binary_classifiers:
        # Positions in the rank arrays follow the order of this classifier's
        # methods within `methods`.
        bc_methods = [m for m in methods if m[0] == bc]
        avg_ranks[bc] = {
            str(method[1:]): round(
                np.mean([ranks[ds][bc][method_idx] for ds in datasets_names]), 3)
            for method_idx, method in enumerate(bc_methods)
        }

# Transposed so that rows are binary classifiers and columns are the
# (encoding, oversampling) labels.
df1 = pd.DataFrame(avg_ranks_gmean).T
df2 = pd.DataFrame(avg_ranks_acc).T
df3 = pd.DataFrame(avg_ranks_avg_tpr).T


In [None]:
# Average g-mean rank over all datasets: {classifier: {"(encoding, oversampling)": mean rank}}; rank 1 is the highest score.
avg_ranks_gmean

In [None]:
# Average macro-recall rank over all datasets: {classifier: {"(encoding, oversampling)": mean rank}}; rank 1 is the highest score.
avg_ranks_avg_tpr

## Mean ranks in g-mean:

In [None]:
# Rows: binary classifier; columns: (encoding, oversampling) pair; values: mean g-mean rank over datasets.
df1

## Mean ranks in accuracy:

In [None]:
# Rows: binary classifier; columns: (encoding, oversampling) pair; values: mean accuracy rank over datasets.
df2

## Mean ranks in average accuracy (average recall):

In [None]:
# Rows: binary classifier; columns: (encoding, oversampling) pair; values: mean macro-recall rank over datasets.
df3