In [4]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from multi_imbalance.resampling.spider import SPIDER3
from sklearn.model_selection import train_test_split
from multi_imbalance.datasets import load_datasets
from multi_imbalance.resampling.mdo import MDO
from multi_imbalance.resampling.soup import SOUP
from collections import Counter
from imblearn.metrics import geometric_mean_score
from IPython.core.display import display
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from multi_imbalance.resampling.global_cs import GlobalCS
from imblearn.over_sampling import SMOTE
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from multi_imbalance.utils.array_util import union
import xgboost as xgb
from multi_imbalance.utils.data import load_arff_datasets
from sklearn.ensemble import RandomForestClassifier

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
def read_train_and_test_data(overlap, imbalance_ratio, i):
    with open(f"data/3class-{imbalance_ratio}-overlap-{overlap}-learn-{i}.arff") as f:
        content = f.readlines()
    content = [x.strip().split(",") for x in content][5:]
    data = np.array(content)
    X_train, y_train = data[:, :-1].astype(float), data[:, -1].astype(object)

    with open(f"data/3class-{imbalance_ratio}-overlap-{overlap}-test-{i}.arff") as f:
        content = f.readlines()
    content = [x.strip().split(",") for x in content][5:]
    data = np.array(content)
    X_test, y_test = data[:, :-1].astype(float), data[:, -1].astype(object)

    return X_train, y_train, X_test, y_test


def train_and_test():
    neigh = KNeighborsClassifier(n_neighbors=1)
    neigh.fit(X_train, y_train)
    y_pred = neigh.predict(X_test)
    labels = ['MIN', 'INT', 'MAJ']
    return [confusion_matrix(y_test, y_pred, labels=labels)[i, i] / confusion_matrix(y_test, y_pred, labels=labels)[i,:].sum() for i,label in enumerate(labels)]


In [None]:
# cost = np.reshape(np.array([0, 2, 3, 3, 0, 2, 7, 5, 0]), (3, 3))
# cost = np.reshape(np.array([0, 3, 7, 2, 0, 5, 3, 2, 0]), (3, 3))
# cost = np.reshape(np.array([0, 1, 1, 3, 0, 1, 7, 5, 0]), (3, 3)) # odkopana
# cost = np.reshape(np.array([0, 1, 1, 2, 0, 1, 6, 3, 0]), (3, 3)) # try 1
# cost = np.reshape(np.array([0, 2, 6, 1, 0, 3, 1, 1, 0]), (3, 3)) # try 1
# cost = np.reshape(np.array([0, 1, 1, 1, 0, 1, 1, 1, 0]), (3, 3))


In [5]:
datasets = load_arff_datasets()

In [8]:
cost_matrix = np.array([[0,1,1],[3,0,1],[7,5,0]])

result = []


for imbalance_ratio in ["70-30-0-0", "40-50-10-0", "30-40-15-15"]:
    print(f"Imbalance ratio: {imbalance_ratio}")
    for overlap in range(0, 3):
        print(f"Overlap: {overlap}")
        for setting in ['base', 'default', 'article']:
            min_tpr = []
            int_tpr = []
            maj_tpr = []
            for i in range(1,11):
                X_train, y_train, X_test, y_test = read_train_and_test_data(overlap, imbalance_ratio, i)
                if setting == 'base':
                    pass
                elif setting == 'default':
                    cost = np.ones((3, 3))
                    for i in range(3):
                        cost[i][i] = 0

                    clf = SPIDER3(k=5, majority_classes=['MAJ'],
                                  intermediate_classes=['INT'], minority_classes=['MIN'], cost=cost)
                    X_train, y_train = clf.fit_transform(X_train.astype(np.float64), y_train)
                elif setting == 'article':
                    clf = SPIDER3(k=5, majority_classes=['MAJ'],
                                  intermediate_classes=['INT'], minority_classes=['MIN'], cost=cost_matrix)
                    X_train, y_train = clf.fit_transform(X_train.astype(np.float64), y_train)
                min_t, int_t, maj_t = train_and_test()
                min_tpr.append(min_t)
                int_tpr.append(int_t)
                maj_tpr.append(maj_t)
            print(setting, [np.array(min_tpr).mean(), np.array(int_tpr).mean(), np.array(maj_tpr).mean()])
            result.append([np.array(min_tpr).mean(), np.array(int_tpr).mean(), np.array(maj_tpr).mean()])
np.savetxt("article_results.csv", np.asarray(result), delimiter=",")

Imbalance ratio: 70-30-0-0
Overlap: 0
base [0.772, 0.8379999999999999, 0.9133333333333333]
default [0.922, 0.9640000000000001, 0.882]
article [0.9719999999999999, 0.992, 0.8556666666666667]
Overlap: 1
base [0.638, 0.792, 0.9286666666666665]
default [0.758, 0.906, 0.9013333333333335]
article [0.788, 0.922, 0.8803333333333334]
Overlap: 2
base [0.434, 0.6809999999999999, 0.9329999999999998]
default [0.532, 0.766, 0.9129999999999999]


KeyboardInterrupt: 

In [None]:
pd.read_csv("article_results.csv", header=None)

In [9]:
cost_matrix = np.array([[0,1,1],[3,0,1],[7,5,0]])

result = []


for imbalance_ratio in ["70-30-0-0", "40-50-10-0", "30-40-15-15"]:
    print(f"Imbalance ratio: {imbalance_ratio}")
    for overlap in range(0, 3):
        print(f"Overlap: {overlap}")
        for setting in ['article', 'custom']:
            min_tpr = []
            int_tpr = []
            maj_tpr = []
            for i in range(1,11):
                X_train, y_train, X_test, y_test = read_train_and_test_data(overlap, imbalance_ratio, i)
                if setting == 'base':
                    pass
                elif setting == 'default':
                    cost = np.ones((3, 3))
                    for i in range(3):
                        cost[i][i] = 0

                    clf = SPIDER3(k=5, majority_classes=['MAJ'],
                                  intermediate_classes=['INT'], minority_classes=['MIN'], cost=cost)
                    X_train, y_train = clf.fit_transform(X_train.astype(np.float64), y_train)
                elif setting == 'article':
                    clf = SPIDER3(k=5, majority_classes=['MAJ'],
                                  intermediate_classes=['INT'], minority_classes=['MIN'], cost=cost_matrix)
                    X_train, y_train = clf.fit_transform(X_train.astype(np.float64), y_train)
                elif setting == 'custom':
                    clf = SPIDER3(k=5, majority_classes=['MAJ'],
                                  intermediate_classes=['INT'], minority_classes=['MIN'])
                    X_train, y_train = clf.fit_transform(X_train.astype(np.float64), y_train)
                min_t, int_t, maj_t = train_and_test()
                min_tpr.append(min_t)
                int_tpr.append(int_t)
                maj_tpr.append(maj_t)
            print(setting, [np.array(min_tpr).mean(), np.array(int_tpr).mean(), np.array(maj_tpr).mean()])
            result.append([np.array(min_tpr).mean(), np.array(int_tpr).mean(), np.array(maj_tpr).mean()])
np.savetxt("custom_results.csv", np.asarray(result), delimiter=",")

Imbalance ratio: 70-30-0-0
Overlap: 0
article [0.9719999999999999, 0.992, 0.8556666666666667]
custom [0.97, 0.9710000000000001, 0.8666666666666666]
Overlap: 1
article [0.788, 0.922, 0.8803333333333334]
custom [0.788, 0.9030000000000001, 0.8936666666666667]
Overlap: 2
article [0.5520000000000002, 0.779, 0.898]
custom [0.55, 0.7699999999999999, 0.9056666666666666]
Imbalance ratio: 40-50-10-0
Overlap: 0
article [0.9319999999999998, 0.969, 0.748]
custom [0.9319999999999998, 0.9450000000000001, 0.7576666666666666]
Overlap: 1
article [0.766, 0.907, 0.7776666666666667]
custom [0.768, 0.897, 0.7863333333333333]
Overlap: 2
article [0.6639999999999999, 0.843, 0.8020000000000002]
custom [0.6639999999999999, 0.836, 0.807]
Imbalance ratio: 30-40-15-15
Overlap: 0
article [0.806, 0.907, 0.5229999999999999]
custom [0.796, 0.885, 0.5466666666666666]
Overlap: 1
article [0.738, 0.8710000000000001, 0.5166666666666667]
custom [0.73, 0.861, 0.5336666666666666]
Overlap: 2
article [0.6679999999999999, 0.825, 

In [142]:
result = []


for imbalance_ratio in ["70-30-0-0", "40-50-10-0", "30-40-15-15"]:
    print(f"Imbalance ratio: {imbalance_ratio}")
    for overlap in range(0, 3):
        print(f"Overlap: {overlap}")
        for setting in ['base', 'default', 'custom', 'xgboost']:
            min_tpr = []
            int_tpr = []
            maj_tpr = []
            for i in range(1,11):
                X_train, y_train, X_test, y_test = read_train_and_test_data(overlap, imbalance_ratio, i)
                if setting == 'base':
                    pass
                elif setting == 'default':
                    cost = np.ones((3, 3))
                    for i in range(3):
                        cost[i][i] = 0

                    clf = SPIDER3(k=5, majority_classes=['MAJ'],
                                  intermediate_classes=['INT'], minority_classes=['MIN'], cost=cost)
                    X_train, y_train = clf.fit_transform(X_train.astype(np.float64), y_train)
                elif setting == 'custom':
                    clf = SPIDER3(k=5, majority_classes=['MAJ'],
                                  intermediate_classes=['INT'], minority_classes=['MIN'])
                    X_train, y_train = clf.fit_transform(X_train.astype(np.float64), y_train)
                elif setting == 'xgboost':
                    model = xgb.XGBClassifier(random_state=1, learning_rate=0.01)
                    model.fit(X_train, y_train)
                    cm = confusion_matrix(y_test, model.predict(X_test))
                    cost = cm.T
                    cost[cost == 0] = 1
                    np.fill_diagonal(cost, 0)
                    clf = SPIDER3(k=5, majority_classes=['MAJ'],
                                  intermediate_classes=['INT'], minority_classes=['MIN'], cost=cost)
                    X_train, y_train = clf.fit_transform(X_train.astype(np.float64), y_train)
                min_t, int_t, maj_t = train_and_test()
                min_tpr.append(min_t)
                int_tpr.append(int_t)
                maj_tpr.append(maj_t)
            print(setting, [np.array(min_tpr).mean(), np.array(int_tpr).mean(), np.array(maj_tpr).mean()])
            result.append([np.array(min_tpr).mean(), np.array(int_tpr).mean(), np.array(maj_tpr).mean()])
np.savetxt("spider-results.csv", np.asarray(result), delimiter=",")

Imbalance ratio: 70-30-0-0
Overlap: 0
base [0.772, 0.8379999999999999, 0.9133333333333333]
default [0.922, 0.9640000000000001, 0.882]
custom [0.97, 0.9710000000000001, 0.8666666666666666]
xgboost [0.9259999999999999, 0.97, 0.8786666666666667]
Overlap: 1
base [0.638, 0.792, 0.9286666666666665]
default [0.758, 0.906, 0.9013333333333335]
custom [0.788, 0.9030000000000001, 0.8936666666666667]
xgboost [0.788, 0.908, 0.8879999999999999]
Overlap: 2
base [0.434, 0.6809999999999999, 0.9329999999999998]
default [0.532, 0.766, 0.9129999999999999]
custom [0.55, 0.7699999999999999, 0.9056666666666666]
xgboost [0.56, 0.769, 0.8986666666666666]
Imbalance ratio: 40-50-10-0
Overlap: 0
base [0.6739999999999999, 0.7689999999999999, 0.8633333333333333]
default [0.89, 0.961, 0.76]
custom [0.9319999999999998, 0.9450000000000001, 0.7576666666666666]
xgboost [0.9139999999999999, 0.9550000000000001, 0.7673333333333334]
Overlap: 1
base [0.568, 0.7299999999999999, 0.8780000000000001]
default [0.74, 0.898, 0.7903

In [143]:
df = pd.read_csv("spider-results.csv", header=None)

### Experiments on synthetic dataset

In [144]:
df

Unnamed: 0,0,1,2
0,0.772,0.838,0.913333
1,0.922,0.964,0.882
2,0.97,0.971,0.866667
3,0.926,0.97,0.878667
4,0.638,0.792,0.928667
5,0.758,0.906,0.901333
6,0.788,0.903,0.893667
7,0.788,0.908,0.888
8,0.434,0.681,0.933
9,0.532,0.766,0.913


In [23]:
#datasets = load_datasets()
datasets = load_arff_datasets()

In [24]:
datasets.keys()

odict_keys(['1czysty-cut', '2delikatne-cut', '3mocniej-cut', '4delikatne-bezover-cut', 'balance-scale', 'car', 'cleveland', 'cleveland_v2', 'cmc', 'dermatology', 'flare', 'glass', 'hayes-roth', 'new_ecoli', 'new_led7digit', 'new_vehicle', 'new_winequality-red', 'new_yeast', 'thyroid-newthyroid'])

In [26]:
len(list(datasets.keys()))

19

In [27]:
for k in datasets.keys():
    print(k, Counter(datasets[k].target))

1czysty-cut Counter({0: 840, 2: 240, 1: 120})
2delikatne-cut Counter({0: 840, 2: 240, 1: 120})
3mocniej-cut Counter({0: 840, 2: 240, 1: 120})
4delikatne-bezover-cut Counter({0: 840, 2: 240, 1: 120})
balance-scale Counter({2: 288, 1: 288, 0: 49})
car Counter({2: 1210, 0: 384, 1: 69, 3: 65})
cleveland Counter({0: 164, 1: 55, 2: 36, 3: 35, 4: 13})
cleveland_v2 Counter({0: 219, 1: 36, 2: 35, 3: 13})
cmc Counter({0: 629, 2: 511, 1: 333})
dermatology Counter({0: 112, 2: 72, 1: 61, 4: 52, 3: 49, 5: 20})
flare Counter({5: 396, 2: 327, 1: 287, 0: 212, 3: 116, 4: 51})
glass Counter({1: 76, 0: 70, 3: 29, 5: 17, 2: 13, 4: 9})
hayes-roth Counter({0: 65, 1: 64, 2: 31})
new_ecoli Counter({0: 145, 1: 77, 4: 52, 2: 37, 3: 25})
new_led7digit Counter({3: 108, 5: 99, 0: 98, 2: 94, 4: 52, 1: 49})
new_vehicle Counter({1: 429, 0: 218, 2: 199})
new_winequality-red Counter({0: 681, 1: 638, 2: 199, 3: 81})
new_yeast Counter({0: 463, 1: 429, 8: 244, 7: 168, 6: 51, 5: 44, 4: 35, 3: 30, 2: 20})
thyroid-newthyroid 

In [21]:
maj_int_min = {
    "1czysty-cut": {'maj': [0], 'int': [2], 'min': [1]},
    "2delikatne-cut": {'maj': [0], 'int': [2], 'min': [1]},
    "3mocniej-cut": {'maj': [0], 'int': [2], 'min': [1]},
    "4delikatne-bezover-cut": {'maj': [0], 'int': [2], 'min': [1]},
    "balance-scale": {'maj': [2, 1], 'int': [], 'min': [0]},
    "cleveland": {'maj': [0], 'int': [], 'min': [1, 2, 3, 4]},
    "cleveland_v2": {'maj': [0], 'int': [], 'min': [1, 2, 3]},
    "car": {'maj': [2, 0], 'int': [], 'min': [1,3]},
    "cmc": {'maj': [0, 2], 'int': [], 'min': [1]},
    "dermatology": {'maj': [0, 2, 1, 4, 3], 'int': [], 'min': [5]},
    "flare": {'maj': [1, 2, 3, 6], 'int': [], 'min': [4, 5]},
    "glass": {'maj': [1, 0, 3], 'int': [], 'min': [5, 2, 4]},
    "hayes-roth": {'maj': [0, 1], 'int': [], 'min': [2]},
    "new_ecoli": {'maj': [0, 1], 'int': [], 'min': [4, 2, 3]},
    "new_led7digit": {'maj': [3, 5, 0, 2], 'int': [], 'min': [4, 1]},
    "new_vehicle": {'maj': [1], 'int': [], 'min': [0, 2]},
    "new_winequality-red": {'maj': [0, 1], 'int': [], 'min': [2, 3]},
    "new_yeast": {'maj': [0, 1, 8, 7], 'int': [], 'min': [6, 5, 4, 3, 2]},
    "thyroid-newthyroid": {'maj': [0], 'int': [], 'min': [1, 2]}
}

In [136]:
def test(cost_strategy):
    np.random.seed(0)

    datasets = load_datasets()
    results_g_mean = dict()
    results_acc = dict()

    for dataset_name, dataset_values in datasets.items():
        #if dataset_name == 'dermatology' or dataset_name == 'new_ecoli':
        #    continue
        print(dataset_name)

        X, y = dataset_values.data, dataset_values.target

        #if len(X)>1000:
        #    continue

        results_g_mean[dataset_name]=dict()
        results_acc[dataset_name]=dict()

        for resample in ['base','global', 'soup', 'mdo', 'spider']:

            skf = StratifiedKFold(n_splits=5, random_state=0)
            acc, g_mean = list(),list()
            for train_index, test_index in skf.split(X, y):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                error_flag = False
                clf_tree = DecisionTreeClassifier(random_state=0)

                if resample == 'base':
                    X_train_resampled, y_train_resampled = X_train, y_train
                elif resample=='soup':
                    soup = SOUP()
                    X_train_resampled, y_train_resampled = soup.fit_transform(np.copy(X_train), np.copy(y_train))
                elif resample=='global':
                    global_cs = GlobalCS()
                    X_train_resampled, y_train_resampled = global_cs.fit_transform(np.copy(X_train), np.copy(y_train))
                elif resample=='smote':
                    try:
                        smote = SMOTE()
                        X_train_resampled, y_train_resampled = smote.fit_sample(np.copy(X_train), np.copy(y_train))
                    except Exception as e:
                        error_flag = True
                        print(resample, dataset_name, e)
                        X_train_resampled, y_train_resampled = X_train, y_train
                elif resample=='mdo':
                    mdo = MDO(k=9, k1_frac=0, seed=0)
                    X_train_resampled, y_train_resampled = mdo.fit_transform(np.copy(X_train), np.copy(y_train))
                elif resample=='spider':
                    cost = calc_cost_matrix(dataset_name, cost_strategy)
                    clf = SPIDER3(k=5, majority_classes=maj_int_min[dataset_name]['maj'], intermediate_classes=maj_int_min[dataset_name]['int'], minority_classes=maj_int_min[dataset_name]['min'], cost=cost)
                    X_train_resampled, y_train_resampled = clf.fit_transform(X_train.astype(np.float64), y_train)

                clf_tree.fit(X_train_resampled, y_train_resampled)
                y_pred = clf_tree.predict(X_test)
                g_mean.append(geometric_mean_score(y_test, y_pred, correction=0.001))
                acc.append(accuracy_score(y_test, y_pred))

            result_g_mean = None if error_flag else round(np.mean(g_mean),3)
            result_acc = None if error_flag else round(np.mean(acc),3)

            results_g_mean[dataset_name][resample]=result_g_mean
            results_acc[dataset_name][resample]=result_acc

    display("G-MEAN")
    df = pd.DataFrame(results_g_mean).T
    display(df)

    display("MEAN G-MEAN")
    df.fillna(df.median(), inplace=True)
    display(df.mean())

In [137]:
test(cost_strategy='default')

1czysty-cut


KeyboardInterrupt: 

In [None]:
test(cost_strategy='custom_1')

In [14]:
def highlight_max(data, color='yellow'):
    '''
    highlight the maximum in a Series or DataFrame
    '''
    attr = 'font-weight: bold'
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        is_max = data == data.max()
        return [attr if v else '' for v in is_max]
    else:  # from .apply(axis=None)
        is_max = data == data.max().max()
        return pd.DataFrame(np.where(is_max, attr, ''),
                            index=data.index, columns=data.columns)

In [23]:
def calc_cost_matrix(dataset_name, cost_strategy):
    if cost_strategy == 'default':
        #default
        no_classes = np.unique(datasets[dataset_name].target).size
        cost = np.ones((no_classes, no_classes))
        np.fill_diagonal(cost, 0)
        return cost
    elif cost_strategy == 'custom_1' or cost_strategy == 'custom': # IR pod diagonala, wyzej jedynki
        return SPIDER3._estimate_cost_matrix(datasets[dataset_name].target)
    elif cost_strategy == 'custom_2': # odwrotnie, tj IR nad diagonala, nizej jedynki
        y=datasets[dataset_name].target
        class_cardinality = Counter(y)
        classes = list(class_cardinality.keys())
        cost = np.ones([len(classes), len(classes)])
        for i, (c1, card1) in enumerate(class_cardinality.items()):
            for j, (c2, card2) in enumerate(class_cardinality.items()):
                if j < i:
                    cost[i, j] = 1
                else:
                    cost[i, j] = card1 / card2
        np.fill_diagonal(cost, 0)
    elif cost_strategy == 'custom_3': # IR wszedzie
        y=datasets[dataset_name].target
        class_cardinality = Counter(y)
        classes = list(class_cardinality.keys())
        cost = np.ones([len(classes), len(classes)])
        for i, (c1, card1) in enumerate(class_cardinality.items()):
            for j, (c2, card2) in enumerate(class_cardinality.items()):
                cost[i, j] = card1 / card2
        np.fill_diagonal(cost, 0)
    elif cost_strategy == 'custom_4':
        no_classes = np.unique(datasets[dataset_name].target).size
        cost = np.ones((no_classes, no_classes))
        X, y = datasets[dataset_name].data, datasets[dataset_name].target
        element_count = Counter(y)
        cardinality_pairs = list(element_count.items())
        for c1,_ in cardinality_pairs:
            for c2,_ in cardinality_pairs:
                f2_overlap_volume = 1
                for i in range(X.shape[1]):
                    f2_overlap_volume = f2_overlap_volume * (min(X[y == c1][:,i].max(), X[y == c2][:,i].max()) - max(X[y == c1][:,i].min(), X[y == c2][:,i].min())) / (max(X[y == c1][:,i].max(), X[y == c2][:,i].max()) - min(X[y == c1][:,i].min(), X[y == c2][:,i].min()) + 0.000001)
                cost[c2, c1] = f2_overlap_volume
        np.fill_diagonal(cost, 0)
        max_overlap = cost.max() + 1e-6
        cost = cost / max_overlap + np.ones((no_classes, no_classes))
        np.fill_diagonal(cost,0)
        return cost
    elif cost_strategy == 'custom_5':
        no_classes = np.unique(datasets[dataset_name].target).size
        cost = np.ones((no_classes, no_classes))
        X, y = datasets[dataset_name].data, datasets[dataset_name].target
        element_count = Counter(y)
        cardinality_pairs = list(element_count.items())
        for c1,_ in cardinality_pairs:
            for c2,_ in cardinality_pairs:
                f2_overlap_volume = 1
                for i in range(X.shape[1]):
                    f2_overlap_volume = f2_overlap_volume * (min(X[y == c1][:,i].max(), X[y == c2][:,i].max()) - max(X[y == c1][:,i].min(), X[y == c2][:,i].min())) / (max(X[y == c1][:,i].max(), X[y == c2][:,i].max()) - min(X[y == c1][:,i].min(), X[y == c2][:,i].min()) + 0.000001)
                cost[c2, c1] = f2_overlap_volume
        np.fill_diagonal(cost, 0)
        max_overlap = cost.max() + 1e-6
        cost = cost / max_overlap + np.ones((no_classes, no_classes))
        np.fill_diagonal(cost,0)
        for i in range(no_classes):
            for j in range(no_classes):
                if j > i:
                    cost[i, j] = 1
        return cost
    return cost

In [17]:
def fill_up_diag_ones(cost_matrix):
    for i in range(cost_matrix.shape[0]):
        for j in range(cost_matrix.shape[0]):
            if j > i:
                cost_matrix[i, j] = 1
    return cost_matrix

In [18]:
def calc_mean_ranks(another):
    ranks = {}
    for col in another.columns:
        ranks[col] = []
    for row in another.iterrows():
        pairs= list(zip(row[1].index.tolist(), row[1].values.tolist()))
        sorted_pairs = sorted(pairs, key=lambda x: -x[1])
        for i, (name, score) in enumerate(sorted_pairs):
            ranks[name].append(i + 1)

    mean_ranks = {}
    for name, lst in ranks.items():
        mean_ranks[name] = np.mean(lst)
    return mean_ranks

In [50]:
def test_spiders(options):
    np.random.seed(0)

    datasets = load_arff_datasets()
    results_g_mean = dict()
    results_acc = dict()
    results_min_g_mean = dict()

    for dataset_name, dataset_values in datasets.items():
        if dataset_name == "flare" or dataset_name == "car":
            continue
        #if dataset_name == 'dermatology' or dataset_name == 'new_ecoli':
        #    continue
        print(dataset_name)

        X, y = dataset_values.data, dataset_values.target
        labels = sorted(list(Counter(dataset_values.target).keys()), reverse=True)

        #if len(X)>1000:
        #    continue

        results_g_mean[dataset_name]=dict()
        results_acc[dataset_name]=dict()
        results_min_g_mean[dataset_name]=dict()

        for resample in options:

            acc, g_mean, acc2, g_mean2 = list(),list(), [], []
            min_g_mean = []
            for fold in range(10):
                skf = StratifiedKFold(n_splits=5, random_state=fold)
                for train_index, test_index in skf.split(X, y):
                    X_train, X_test = X[train_index], X[test_index]
                    y_train, y_test = y[train_index], y[test_index]
                    error_flag = False
                    clf_tree = DecisionTreeClassifier(random_state=0)

                    if resample == 'base':
                        X_train_resampled, y_train_resampled = X_train, y_train
                    elif resample == 'xgboost':
                        model = xgb.XGBClassifier(random_state=1, learning_rate=0.01)
                        model.fit(X_train, y_train)
                        cm = confusion_matrix(y_test, model.predict(X_test))
                        cost = cm.T
                        cost[cost == 0] = 1
                        np.fill_diagonal(cost, 0)
                        #cost = fill_up_diag_ones(cost)
                        #cost = (cost - np.mean(cost.ravel())) / np.std(cost.ravel()) + 1
                        #np.fill_diagonal(cost, 0)
                        #print(cost)
                        clf = SPIDER3(k=5, majority_classes=maj_int_min[dataset_name]['maj'], intermediate_classes=maj_int_min[dataset_name]['int'], minority_classes=maj_int_min[dataset_name]['min'], cost=cost)
                        X_train_resampled, y_train_resampled = clf.fit_transform(X_train.astype(np.float64), y_train)
                    elif resample == "rforest":
                        model = RandomForestClassifier(n_estimators=100)
                        model.fit(X_train, y_train)
                        cm = confusion_matrix(y_test, model.predict(X_test))
                        cost = cm.T
                        cost[cost == 0] = 1
                        np.fill_diagonal(cost, 0)
                        clf = SPIDER3(k=5, majority_classes=maj_int_min[dataset_name]['maj'], intermediate_classes=maj_int_min[dataset_name]['int'], minority_classes=maj_int_min[dataset_name]['min'], cost=cost)
                        X_train_resampled, y_train_resampled = clf.fit_transform(X_train.astype(np.float64), y_train)
                    elif resample == "knn":
                        model = KNeighborsClassifier(n_neighbors=5)
                        model.fit(X_train, y_train)
                        cm = confusion_matrix(y_test, model.predict(X_test))
                        cost = cm.T
                        cost[cost == 0] = 1
                        np.fill_diagonal(cost, 0)
                        clf = SPIDER3(k=5, majority_classes=maj_int_min[dataset_name]['maj'], intermediate_classes=maj_int_min[dataset_name]['int'], minority_classes=maj_int_min[dataset_name]['min'], cost=cost)
                        X_train_resampled, y_train_resampled = clf.fit_transform(X_train.astype(np.float64), y_train)
                    else:
                        cost = calc_cost_matrix(dataset_name, resample)
                        clf = SPIDER3(k=5, majority_classes=maj_int_min[dataset_name]['maj'], intermediate_classes=maj_int_min[dataset_name]['int'], minority_classes=maj_int_min[dataset_name]['min'], cost=cost)
                        X_train_resampled, y_train_resampled = clf.fit_transform(X_train.astype(np.float64), y_train)

                    clf_tree.fit(X_train_resampled, y_train_resampled)
                    y_pred = clf_tree.predict(X_test)
                    g_mean.append(geometric_mean_score(y_test, y_pred, correction=0.001))
                    acc.append(accuracy_score(y_test, y_pred))

                    neigh = KNeighborsClassifier(n_neighbors=1)
                    for i in range(0, 2):
                        std = union(X_train,X_test)[:,i].std()
                        if std == 0:
                            std = 1e-6
                        mean = union(X_train,X_test)[:,i].mean()
                        X_train_resampled[:, i] = (X_train_resampled[:, i] - mean) / (4 * std)
                        X_test[:, i] = (X_test[:, i] - mean) / (4 * std)
                    neigh.fit(X_train_resampled, y_train_resampled)
                    y_pred = neigh.predict(X_test)
                    g_mean2.append(geometric_mean_score(y_test, y_pred, correction=0.001))
                    acc2.append(accuracy_score(y_test, y_pred))
                    cm = confusion_matrix(y_test, y_pred, labels=labels)
                    tprs = [cm[i, i] / cm[i,:].sum() for i,label in enumerate(labels)]

                    gm=1
                    for tpr, label in zip(tprs, labels):
                        if label in maj_int_min[dataset_name]['min']:
                            if tpr == 0:
                                gm*=0.001
                            else:
                                gm *= tpr
                    min_g_mean.append(gm ** (1/len(maj_int_min[dataset_name]['min'])))
                
                

            result_g_mean = None if error_flag else round(np.mean(g_mean),3)
            result_acc = None if error_flag else round(np.mean(acc),3)
            result_g_mean2 = None if error_flag else round(np.mean(g_mean2),3)
            result_acc2 = None if error_flag else round(np.mean(acc2),3)
            result_min_g_mean = None if error_flag else round(np.mean(min_g_mean),3)

            #results_g_mean[dataset_name][resample + " tree"]=result_g_mean
            #results_acc[dataset_name][resample + " tree"]=result_acc
            results_g_mean[dataset_name][resample + " knn"]=result_g_mean2
            results_acc[dataset_name][resample + " knn"]=result_acc2
            results_min_g_mean[dataset_name][resample + " knn"] = result_min_g_mean

    display("G-MEAN")
    df = pd.DataFrame(results_g_mean).T
    display(df.style.apply(highlight_max, axis=1))

    display("MEAN G-MEAN")
    df.fillna(df.median(), inplace=True)
    display(df.mean())
    
    mean_ranks = calc_mean_ranks(df)
    print("MIN G-MEAN")
    df=pd.DataFrame(results_min_g_mean).T
    display(df.style.apply(highlight_max, axis=1))
    
    display("MEAN MIN G-MEAN")
    df.fillna(df.median(), inplace=True)
    display(df.mean())
    
    print(sorted(list(mean_ranks.items()), key=lambda x: x[1]))
    return df

In [None]:
spdrs = test_spiders()

In [34]:
def visualize_dataset(dataset):
    pca = PCA(n_components=2)
    n = len(Counter(dataset.target).keys())
    p = sns.color_palette("husl", n)
    pca.fit(dataset.data)
    X = pca.transform(dataset.data)
    y = dataset.target
    df = construct_flat_2pc_df(X,y)
    sns.scatterplot(x='x1', y='x2', hue='y', style='y', data=df, alpha=0.7, legend='full', palette=p)

In [None]:
# take XGBoost and see where it makes mistakes

import xgboost as xgb
from multi_imbalance.utils.data import construct_flat_2pc_df
hayes = datasets['hayes-roth']
visualize_dataset(hayes)
print(Counter(hayes.target))
X_train, X_test, y_train, y_test = train_test_split(hayes.data, hayes.target, test_size=0.33, random_state=42)

In [None]:
model = xgb.XGBClassifier(random_state=1, learning_rate=0.01)
model.fit(X_train, y_train)
cm = confusion_matrix(y_test, model.predict(X_test))
costs = cm.T
np.fill_diagonal(costs, 0)
costs

In [None]:
Counter(y_test)

In [None]:
costs[costs == 0] = 1

In [None]:
np.fill_diagonal(costs, 0)
costs

In [None]:
another = test_spiders(['base', 'xgboost'])

In [36]:
another = test_spiders(['base', 'default', 'custom_1', 'custom_2', 'custom_3', 'xgboost'])

1czysty-cut




2delikatne-cut




3mocniej-cut




4delikatne-bezover-cut




balance-scale




cleveland




cleveland_v2




cmc




dermatology




glass




hayes-roth




new_ecoli




new_led7digit




new_vehicle




new_winequality-red




new_yeast




thyroid-newthyroid




'G-MEAN'

Unnamed: 0,base tree,base knn,default tree,default knn,custom_1 tree,custom_1 knn,custom_2 tree,custom_2 knn,custom_3 tree,custom_3 knn,xgboost tree,xgboost knn
1czysty-cut,0.937,0.963,0.954,0.968,0.951,0.971,0.951,0.971,0.947,0.97,0.952,0.973
2delikatne-cut,0.651,0.658,0.727,0.737,0.758,0.751,0.758,0.751,0.752,0.748,0.76,0.768
3mocniej-cut,0.404,0.401,0.5,0.465,0.518,0.474,0.519,0.474,0.511,0.473,0.497,0.494
4delikatne-bezover-cut,0.723,0.73,0.845,0.837,0.853,0.832,0.853,0.832,0.842,0.838,0.867,0.866
balance-scale,0.234,0.263,0.425,0.511,0.251,0.479,0.314,0.47,0.254,0.484,0.225,0.439
cleveland,0.098,0.116,0.089,0.159,0.068,0.148,0.085,0.148,0.067,0.148,0.197,0.151
cleveland_v2,0.054,0.112,0.164,0.184,0.216,0.188,0.249,0.13,0.137,0.188,0.161,0.242
cmc,0.453,0.433,0.425,0.421,0.432,0.411,0.444,0.413,0.444,0.405,0.43,0.4
dermatology,0.903,0.857,0.909,0.872,0.912,0.868,0.912,0.868,0.9,0.858,0.909,0.872
glass,0.59,0.537,0.536,0.559,0.534,0.624,0.538,0.595,0.517,0.583,0.463,0.542


'MEAN G-MEAN'

base tree        0.568471
base knn         0.547471
default tree     0.580000
default knn      0.579000
custom_1 tree    0.572706
custom_1 knn     0.584882
custom_2 tree    0.577176
custom_2 knn     0.581294
custom_3 tree    0.556176
custom_3 knn     0.574824
xgboost tree     0.565059
xgboost knn      0.582588
dtype: float64

In [113]:
another = test_spiders(['base', 'default', 'custom_1', 'custom_2', 'custom_3', 'xgboost', 'rforest', 'knn'])

1czysty-cut
2delikatne-cut
3mocniej-cut
4delikatne-bezover-cut
balance-scale
car
cleveland
cleveland_v2
cmc
dermatology
flare
glass
hayes-roth
new_ecoli
new_led7digit
new_vehicle
new_winequality-red
new_yeast
thyroid-newthyroid


'G-MEAN'

Unnamed: 0,base tree,base knn,default tree,default knn,custom_1 tree,custom_1 knn,custom_2 tree,custom_2 knn,custom_3 tree,custom_3 knn,xgboost tree,xgboost knn,rforest tree,rforest knn,knn tree,knn knn
1czysty-cut,0.937,0.963,0.954,0.968,0.951,0.971,0.951,0.971,0.947,0.97,0.952,0.973,0.946,0.97,0.948,0.966
2delikatne-cut,0.651,0.658,0.727,0.737,0.758,0.751,0.758,0.751,0.752,0.748,0.76,0.768,0.763,0.775,0.77,0.777
3mocniej-cut,0.404,0.401,0.5,0.465,0.518,0.474,0.519,0.474,0.511,0.473,0.497,0.494,0.493,0.487,0.525,0.485
4delikatne-bezover-cut,0.723,0.73,0.845,0.837,0.853,0.832,0.853,0.832,0.842,0.838,0.867,0.866,0.864,0.855,0.863,0.864
balance-scale,0.234,0.263,0.425,0.511,0.251,0.479,0.314,0.47,0.254,0.484,0.225,0.439,0.322,0.342,0.387,0.426
car,0.241,0.321,0.256,0.305,0.086,0.177,0.24,0.223,0.112,0.177,0.244,0.227,0.264,0.142,0.288,0.177
cleveland,0.098,0.081,0.072,0.089,0.081,0.08,0.118,0.08,0.057,0.08,0.08,0.129,0.168,0.126,0.092,0.115
cleveland_v2,0.143,0.068,0.192,0.119,0.172,0.118,0.2,0.119,0.172,0.118,0.182,0.117,0.17,0.118,0.153,0.121
cmc,0.458,0.42,0.415,0.374,0.427,0.362,0.441,0.37,0.419,0.353,0.413,0.376,0.383,0.369,0.377,0.376
dermatology,0.917,0.909,0.917,0.913,0.903,0.923,0.915,0.919,0.908,0.927,0.907,0.913,0.917,0.913,0.917,0.918


'MEAN G-MEAN'

base tree        0.546053
base knn         0.528053
default tree     0.551211
default knn      0.547789
custom_1 tree    0.530632
custom_1 knn     0.542947
custom_2 tree    0.541895
custom_2 knn     0.544000
custom_3 tree    0.516316
custom_3 knn     0.532737
xgboost tree     0.527737
xgboost knn      0.540105
rforest tree     0.542632
rforest knn      0.531789
knn tree         0.548263
knn knn          0.531316
dtype: float64

[('default tree', 6.842105263157895), ('knn tree', 7.0), ('custom_2 tree', 7.421052631578948), ('rforest tree', 7.7894736842105265), ('xgboost knn', 7.947368421052632), ('custom_1 knn', 8.0), ('custom_2 knn', 8.31578947368421), ('custom_1 tree', 8.368421052631579), ('base tree', 8.473684210526315), ('default knn', 8.68421052631579), ('knn knn', 8.894736842105264), ('rforest knn', 9.210526315789474), ('xgboost tree', 9.263157894736842), ('custom_3 knn', 9.578947368421053), ('base knn', 9.842105263157896), ('custom_3 tree', 10.368421052631579)]


In [114]:
another = test_spiders(['base', 'default', 'custom_1'])

1czysty-cut
2delikatne-cut
3mocniej-cut
4delikatne-bezover-cut
balance-scale
car
cleveland
cleveland_v2
cmc
dermatology
flare
glass
hayes-roth
new_ecoli
new_led7digit
new_vehicle
new_winequality-red
new_yeast
thyroid-newthyroid


'G-MEAN'

Unnamed: 0,base tree,base knn,default tree,default knn,custom_1 tree,custom_1 knn
1czysty-cut,0.937,0.963,0.954,0.968,0.951,0.971
2delikatne-cut,0.651,0.658,0.727,0.737,0.758,0.751
3mocniej-cut,0.404,0.401,0.5,0.465,0.518,0.474
4delikatne-bezover-cut,0.723,0.73,0.845,0.837,0.853,0.832
balance-scale,0.234,0.263,0.425,0.511,0.251,0.479
car,0.241,0.321,0.256,0.305,0.086,0.177
cleveland,0.098,0.081,0.072,0.089,0.081,0.08
cleveland_v2,0.143,0.068,0.192,0.119,0.172,0.118
cmc,0.458,0.42,0.415,0.374,0.427,0.362
dermatology,0.917,0.909,0.917,0.913,0.903,0.923


'MEAN G-MEAN'

base tree        0.546053
base knn         0.528053
default tree     0.551211
default knn      0.547789
custom_1 tree    0.530632
custom_1 knn     0.542947
dtype: float64

[('default tree', 3.263157894736842), ('default knn', 3.4210526315789473), ('custom_1 knn', 3.4210526315789473), ('base tree', 3.473684210526316), ('custom_1 tree', 3.5789473684210527), ('base knn', 3.8421052631578947)]


In [51]:
another = test_spiders(['base', 'default', 'custom'])

1czysty-cut
2delikatne-cut
3mocniej-cut
4delikatne-bezover-cut
balance-scale
cleveland
cleveland_v2
cmc
dermatology
glass
hayes-roth
new_ecoli
new_led7digit
new_vehicle
new_winequality-red
new_yeast
thyroid-newthyroid


'G-MEAN'

Unnamed: 0,base knn,default knn,custom knn
1czysty-cut,0.963,0.968,0.971
2delikatne-cut,0.658,0.737,0.751
3mocniej-cut,0.401,0.465,0.474
4delikatne-bezover-cut,0.73,0.837,0.832
balance-scale,0.263,0.511,0.479
cleveland,0.081,0.089,0.08
cleveland_v2,0.068,0.119,0.118
cmc,0.42,0.374,0.362
dermatology,0.909,0.913,0.923
glass,0.537,0.559,0.624


'MEAN G-MEAN'

base knn       0.545118
default knn    0.570706
custom knn     0.577118
dtype: float64

MIN G-MEAN


Unnamed: 0,base knn,default knn,custom knn
1czysty-cut,0.95,0.958,0.967
2delikatne-cut,0.55,0.633,0.658
3mocniej-cut,0.242,0.308,0.333
4delikatne-bezover-cut,0.667,0.808,0.808
balance-scale,0.085,0.204,0.184
cleveland,0.048,0.06,0.052
cleveland_v2,0.031,0.077,0.075
cmc,0.349,0.715,0.772
dermatology,0.95,0.95,0.95
glass,0.444,0.623,0.685


'MEAN MIN G-MEAN'

base knn       0.479824
default knn    0.597059
custom knn     0.610353
dtype: float64

[('custom knn', 1.7647058823529411), ('default knn', 1.8235294117647058), ('base knn', 2.411764705882353)]


In [148]:
another = test_spiders(['base', 'default', 'xgboost'])

1czysty-cut
2delikatne-cut
3mocniej-cut
4delikatne-bezover-cut
balance-scale
cleveland
cleveland_v2
cmc
dermatology
glass
hayes-roth
new_ecoli
new_led7digit
new_vehicle
new_winequality-red
new_yeast
thyroid-newthyroid


'G-MEAN'

Unnamed: 0,base knn,default knn,xgboost knn
1czysty-cut,0.963,0.968,0.973
2delikatne-cut,0.658,0.737,0.768
3mocniej-cut,0.401,0.465,0.494
4delikatne-bezover-cut,0.73,0.837,0.866
balance-scale,0.263,0.511,0.439
cleveland,0.081,0.089,0.129
cleveland_v2,0.068,0.119,0.117
cmc,0.42,0.374,0.376
dermatology,0.909,0.913,0.913
glass,0.537,0.559,0.542


'MEAN G-MEAN'

base knn       0.545118
default knn    0.570706
xgboost knn    0.574941
dtype: float64

[('xgboost knn', 1.7058823529411764), ('default knn', 1.8235294117647058), ('base knn', 2.4705882352941178)]


In [161]:
another = test_spiders(['base', 'default', 'custom_4'])

1czysty-cut
2delikatne-cut
3mocniej-cut
4delikatne-bezover-cut
balance-scale
cleveland
cleveland_v2
cmc
dermatology
glass
hayes-roth
new_ecoli
new_led7digit
new_vehicle
new_winequality-red
new_yeast
thyroid-newthyroid


'G-MEAN'

Unnamed: 0,base knn,default knn,custom_4 knn
1czysty-cut,0.963,0.968,0.968
2delikatne-cut,0.658,0.737,0.738
3mocniej-cut,0.401,0.465,0.464
4delikatne-bezover-cut,0.73,0.837,0.837
balance-scale,0.263,0.511,0.511
cleveland,0.081,0.089,0.089
cleveland_v2,0.068,0.119,0.116
cmc,0.42,0.374,0.362
dermatology,0.909,0.913,0.913
glass,0.537,0.559,0.554


'MEAN G-MEAN'

base knn        0.545118
default knn     0.570706
custom_4 knn    0.570765
dtype: float64

[('default knn', 1.411764705882353), ('custom_4 knn', 2.176470588235294), ('base knn', 2.411764705882353)]


In [163]:
another = test_spiders(['base', 'default', 'custom_5'])

1czysty-cut
2delikatne-cut
3mocniej-cut
4delikatne-bezover-cut
balance-scale
cleveland
cleveland_v2
cmc
dermatology
glass
hayes-roth
new_ecoli
new_led7digit
new_vehicle
new_winequality-red
new_yeast
thyroid-newthyroid


'G-MEAN'

Unnamed: 0,base knn,default knn,custom_5 knn
1czysty-cut,0.963,0.968,0.969
2delikatne-cut,0.658,0.737,0.752
3mocniej-cut,0.401,0.465,0.47
4delikatne-bezover-cut,0.73,0.837,0.833
balance-scale,0.263,0.511,0.415
cleveland,0.081,0.089,0.09
cleveland_v2,0.068,0.119,0.119
cmc,0.42,0.374,0.356
dermatology,0.909,0.913,0.913
glass,0.537,0.559,0.553


'MEAN G-MEAN'

base knn        0.545118
default knn     0.570706
custom_5 knn    0.567529
dtype: float64

[('default knn', 1.7647058823529411), ('custom_5 knn', 1.8235294117647058), ('base knn', 2.411764705882353)]


In [133]:
another = test_spiders(['base', 'default', 'xgboost'])

1czysty-cut
[[0.         0.92044427 0.68177709]
 [0.92044427 0.         0.68177709]
 [3.78445049 0.68177709 0.        ]]
[[0.         0.89216723 0.89216723]
 [1.86266219 0.         0.56866891]
 [3.48015378 0.56866891 0.        ]]
[[0.         2.22627868 0.57080246]
 [0.84671517 0.         0.57080246]
 [3.32992949 0.57080246 0.        ]]
[[0.         1.98425098 0.55549956]
 [1.12700013 0.         0.55549956]
 [3.41300241 0.55549956 0.        ]]
[[0.         1.26037782 2.13915297]
 [1.55330287 0.         0.38160267]
 [3.01792812 0.38160267 0.        ]]
2delikatne-cut
[[0.         1.69247036 3.45630994]
 [0.6341666  0.         1.10452383]
 [0.5165773  1.10452383 0.        ]]
[[0.         1.59591412 3.31214678]
 [0.95232687 0.         0.30873962]
 [0.95232687 1.59591412 0.        ]]
[[0.         0.54165075 0.54165075]
 [1.77919372 0.         0.54165075]
 [3.01673669 2.19170805 0.        ]]
[[0.         1.02891575 0.50843231]
 [0.76867403 0.         0.50843231]
 [3.37109122 2.06988262 0.   

[[0.         1.33186166 1.33186166 1.33186166 1.33186166 1.33186166]
 [1.33186166 0.         1.33186166 3.72126558 1.33186166 1.33186166]
 [1.33186166 1.33186166 0.         1.33186166 1.33186166 1.33186166]
 [1.33186166 1.33186166 1.33186166 0.         1.33186166 1.33186166]
 [1.33186166 1.33186166 1.33186166 1.33186166 0.         1.33186166]
 [1.33186166 1.33186166 1.33186166 1.33186166 1.33186166 0.        ]]
flare
[[0.         0.69125473 0.69125473 0.69125473 0.69125473 0.69125473]
 [0.69125473 0.         5.38766159 1.31744232 0.84780163 0.69125473]
 [0.69125473 3.66564574 0.         3.35255195 1.63053611 0.69125473]
 [0.69125473 0.69125473 0.69125473 0.         0.69125473 0.69125473]
 [0.69125473 0.69125473 0.69125473 0.69125473 0.         0.69125473]
 [0.69125473 0.69125473 0.69125473 0.69125473 0.69125473 0.        ]]
[[0.         0.64951364 0.64951364 0.64951364 0.64951364 0.64951364]
 [0.64951364 0.         5.57822812 2.02955369 0.64951364 0.64951364]
 [0.64951364 2.81814801 0.

[[0.         0.89517152 0.89517152 3.41105512 0.89517152 2.15311332]
 [2.15311332 0.         0.89517152 4.66899693 0.89517152 0.89517152]
 [0.89517152 0.89517152 0.         0.89517152 0.89517152 0.89517152]
 [2.15311332 0.89517152 2.15311332 0.         0.89517152 0.89517152]
 [0.89517152 0.89517152 0.89517152 0.89517152 0.         0.89517152]
 [0.89517152 0.89517152 0.89517152 0.89517152 0.89517152 0.        ]]
new_vehicle
[[0.         1.70710678 0.43431458]
 [1.70710678 0.         2.97989899]
 [0.43431458 1.70710678 0.        ]]
[[0.         3.59299756 0.60803525]
 [0.60803525 0.         0.87939546]
 [0.60803525 1.69347609 0.        ]]
[[0.         1.41380294 0.66895764]
 [1.41380294 0.         2.90349354]
 [0.66895764 2.15864824 0.        ]]
[[0.         1.48112522 0.32642469]
 [1.76980036 0.         3.21317603]
 [0.90377496 1.19245009 0.        ]]
[[0.         1.8804711  0.66135727]
 [0.66135727 0.         3.09958493]
 [0.66135727 1.8804711  0.        ]]
new_winequality-red
[[0.    

'G-MEAN'

Unnamed: 0,base knn,default knn,xgboost knn
1czysty-cut,0.963,0.968,0.973
2delikatne-cut,0.658,0.737,0.77
3mocniej-cut,0.401,0.465,0.487
4delikatne-bezover-cut,0.73,0.837,0.866
balance-scale,0.263,0.511,0.355
car,0.321,0.305,0.265
cleveland,0.081,0.089,0.09
cleveland_v2,0.068,0.119,0.117
cmc,0.42,0.374,0.372
dermatology,0.909,0.913,0.913


'MEAN G-MEAN'

base knn       0.528053
default knn    0.547789
xgboost knn    0.541053
dtype: float64

[('default knn', 1.7894736842105263), ('xgboost knn', 1.894736842105263), ('base knn', 2.3157894736842106)]


In [44]:
def test_spiders_tpr(options, dataset_name):
    np.random.seed(0)

    datasets = load_arff_datasets()
    results_g_mean = dict()
    results_acc = dict()
    results_tpr = {}

    dataset_values = datasets[dataset_name]
    #if dataset_name == 'dermatology' or dataset_name == 'new_ecoli':
    #    continue
    #print(dataset_name)

    X, y = dataset_values.data, dataset_values.target
    labels = sorted(list(Counter(dataset_values.target).keys()), reverse=True)

    #if len(X)>1000:
    #    continue

    results_g_mean[dataset_name]=dict()
    results_acc[dataset_name]=dict()

    for resample in options:
        results_tpr[dataset_name +", "+ resample] = {}
        skf = StratifiedKFold(n_splits=5, random_state=0)
        acc, g_mean, acc2, g_mean2 = list(),list(), [], []
        conf_matrices = []
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            error_flag = False
            clf_tree = DecisionTreeClassifier(random_state=0)

            if resample == 'base':
                X_train_resampled, y_train_resampled = X_train, y_train
            elif resample == 'xgboost':
                model = xgb.XGBClassifier(random_state=1, learning_rate=0.01)
                model.fit(X_train, y_train)
                cm = confusion_matrix(y_test, model.predict(X_test))
                cost = cm.T
                cost[cost == 0] = 1
                np.fill_diagonal(cost, 0)print(sorted(list(mean_ranks.items()), key=lambda x: x[1]))
                clf = SPIDER3(k=5, majority_classes=maj_int_min[dataset_name]['maj'], intermediate_classes=maj_int_min[dataset_name]['int'], minority_classes=maj_int_min[dataset_name]['min'], cost=cost)
                X_train_resampled, y_train_resampled = clf.fit_transform(X_train.astype(np.float64), y_train)
            else:
                cost = calc_cost_matrix(dataset_name, resample)
                clf = SPIDER3(k=5, majority_classes=maj_int_min[dataset_name]['maj'], intermediate_classes=maj_int_min[dataset_name]['int'], minority_classes=maj_int_min[dataset_name]['min'], cost=cost)
                X_train_resampled, y_train_resampled = clf.fit_transform(X_train.astype(np.float64), y_train)

            clf_tree.fit(X_train_resampled, y_train_resampled)
            y_pred = clf_tree.predict(X_test)
            g_mean.append(geometric_mean_score(y_test, y_pred, correction=0.001))
            acc.append(accuracy_score(y_test, y_pred))

            neigh = KNeighborsClassifier(n_neighbors=1)
            for i in range(0, 2):
                std = union(X_train,X_test)[:,i].std()
                if std == 0:
                    std = 1e-6
                mean = union(X_train,X_test)[:,i].mean()
                X_train_resampled[:, i] = (X_train_resampled[:, i] - mean) / (4 * std)
                X_test[:, i] = (X_test[:, i] - mean) / (4 * std)
            neigh.fit(X_train_resampled, y_train_resampled)
            y_pred = neigh.predict(X_test)
            g_mean2.append(geometric_mean_score(y_test, y_pred, correction=0.001))
            acc2.append(accuracy_score(y_test, y_pred))
            conf_matrices.append(confusion_matrix(y_test, y_pred, labels=labels))

        result_g_mean = None if error_flag else round(np.mean(g_mean),3)
        result_acc = None if error_flag else round(np.mean(acc),3)
        result_g_mean2 = None if error_flag else round(np.mean(g_mean2),3)
        result_acc2 = None if error_flag else round(np.mean(acc2),3)

        #calculate average confusion matrix
        no_classes = np.unique(datasets[dataset_name].target).size
        average_conf_matrix = np.zeros((no_classes, no_classes))
        for cm in conf_matrices:
            average_conf_matrix += cm
        average_conf_matrix /= len(conf_matrices)
        tprs = [average_conf_matrix[i, i] / average_conf_matrix[i,:].sum() for i,label in enumerate(labels)]

        for tpr, label in zip(tprs, labels):
            results_tpr[dataset_name +", "+ resample][str(label) + " tpr, "] = tpr

        results_g_mean[dataset_name][resample + " tree"]=result_g_mean
        results_acc[dataset_name][resample + " tree"]=result_acc
        results_g_mean[dataset_name][resample + " knn"]=result_g_mean2
        results_acc[dataset_name][resample + " knn"]=result_acc2

    #display("G-MEAN")
    #df = pd.DataFrame(results_g_mean).T
    #display(df.style.apply(highlight_max, axis=1))

    #display("MEAN G-MEAN")
    #df.fillna(df.median(), inplace=True)
    #display(df.mean())
    
    #display("TPRs")
    df = pd.DataFrame(results_tpr).T
    display(df.style.apply(highlight_max, axis=0))
    return df

In [41]:
datasets.keys()

odict_keys(['1czysty-cut', '2delikatne-cut', '3mocniej-cut', '4delikatne-bezover-cut', 'balance-scale', 'car', 'cleveland', 'cleveland_v2', 'cmc', 'dermatology', 'flare', 'glass', 'hayes-roth', 'new_ecoli', 'new_led7digit', 'new_vehicle', 'new_winequality-red', 'new_yeast', 'thyroid-newthyroid'])

In [45]:
another = test_spiders_tpr(['default', 'custom_1'], list(datasets.keys())[0])

Unnamed: 0,"2 tpr,","1 tpr,","0 tpr,"
"1czysty-cut, default",0.979167,0.958333,0.966667
"1czysty-cut, custom_1",0.995833,0.966667,0.952381


In [None]:
for dataset_name in list(datasets.keys()):
    test_spiders_tpr(['base', 'default', 'custom_1', 'custom_2', 'custom_3', 'xgboost'], dataset_name)

In [47]:
for dataset_name in list(datasets.keys()):
    test_spiders_tpr(['base', 'default', 'custom_1'], dataset_name)

Unnamed: 0,"2 tpr,","1 tpr,","0 tpr,"
"1czysty-cut, base",0.9625,0.95,0.977381
"1czysty-cut, default",0.979167,0.958333,0.966667
"1czysty-cut, custom_1",0.995833,0.966667,0.952381


Unnamed: 0,"2 tpr,","1 tpr,","0 tpr,"
"2delikatne-cut, base",0.654167,0.55,0.911905
"2delikatne-cut, default",0.783333,0.633333,0.85119
"2delikatne-cut, custom_1",0.804167,0.658333,0.838095


Unnamed: 0,"2 tpr,","1 tpr,","0 tpr,"
"3mocniej-cut, base",0.379167,0.241667,0.834524
"3mocniej-cut, default",0.558333,0.308333,0.658333
"3mocniej-cut, custom_1",0.529167,0.333333,0.661905


Unnamed: 0,"2 tpr,","1 tpr,","0 tpr,"
"4delikatne-bezover-cut, base",0.775,0.666667,0.885714
"4delikatne-bezover-cut, default",0.916667,0.808333,0.827381
"4delikatne-bezover-cut, custom_1",0.916667,0.808333,0.82381


Unnamed: 0,"2 tpr,","1 tpr,","0 tpr,"
"balance-scale, base",0.815972,0.881944,0.0816327
"balance-scale, default",0.788194,0.829861,0.204082
"balance-scale, custom_1",0.760417,0.8125,0.183673


Unnamed: 0,"3 tpr,","2 tpr,","1 tpr,","0 tpr,"
"car, base",0.4,0.875207,0.318841,0.515625
"car, default",0.4,0.91405,0.449275,0.260417
"car, custom_1",0.8,0.928926,0.724638,0.00520833


Unnamed: 0,"4 tpr,","3 tpr,","2 tpr,","1 tpr,","0 tpr,"
"cleveland, base",0.0769231,0.0857143,0.25,0.236364,0.689024
"cleveland, default",0.0769231,0.114286,0.25,0.309091,0.439024
"cleveland, custom_1",0.0769231,0.0857143,0.25,0.309091,0.457317


Unnamed: 0,"3 tpr,","2 tpr,","1 tpr,","0 tpr,"
"cleveland_v2, base",0.0769231,0.0857143,0.25,0.826484
"cleveland_v2, default",0.0769231,0.142857,0.305556,0.707763
"cleveland_v2, custom_1",0.0769231,0.114286,0.305556,0.716895


Unnamed: 0,"2 tpr,","1 tpr,","0 tpr,"
"cmc, base",0.41683,0.348348,0.515103
"cmc, default",0.21135,0.714715,0.348172
"cmc, custom_1",0.332681,0.771772,0.18601


Unnamed: 0,"5 tpr,","4 tpr,","3 tpr,","2 tpr,","1 tpr,","0 tpr,"
"dermatology, base",0.95,0.942308,0.918367,1,0.737705,0.946429
"dermatology, default",0.95,0.942308,0.897959,1,0.786885,0.946429
"dermatology, custom_1",0.95,0.961538,0.918367,1,0.803279,0.9375


Unnamed: 0,"5 tpr,","4 tpr,","3 tpr,","2 tpr,","1 tpr,","0 tpr,"
"flare, base",0.984848,0.137255,0.301724,0.470948,0.477352,0.976415
"flare, default",0.987374,0.45098,0.0862069,0.321101,0.66899,0.976415
"flare, custom_1",0.994949,0.607843,0.155172,0.0366972,0.829268,0.976415


Unnamed: 0,"5 tpr,","4 tpr,","3 tpr,","2 tpr,","1 tpr,","0 tpr,"
"glass, base",0.470588,0.555556,0.896552,0.769231,0.697368,0.771429
"glass, default",0.647059,0.555556,0.758621,0.923077,0.526316,0.485714
"glass, custom_1",0.705882,0.666667,0.793103,0.923077,0.592105,0.571429


Unnamed: 0,"2 tpr,","1 tpr,","0 tpr,"
"hayes-roth, base",0.677419,0.734375,0.723077
"hayes-roth, default",0.83871,0.296875,0.476923
"hayes-roth, custom_1",0.806452,0.296875,0.676923


Unnamed: 0,"4 tpr,","3 tpr,","2 tpr,","1 tpr,","0 tpr,"
"new_ecoli, base",0.846154,0.72,0.540541,0.727273,0.903448
"new_ecoli, default",0.846154,0.72,0.756757,0.506494,0.896552
"new_ecoli, custom_1",0.846154,0.72,0.756757,0.558442,0.875862


Unnamed: 0,"5 tpr,","4 tpr,","3 tpr,","2 tpr,","1 tpr,","0 tpr,"
"new_led7digit, base",0.818182,0.807692,0.694444,0.851064,0.44898,0.642857
"new_led7digit, default",0.525253,0.884615,0.388889,0.808511,0.795918,0.244898
"new_led7digit, custom_1",0.474747,0.923077,0.537037,0.787234,0.795918,0.255102


Unnamed: 0,"2 tpr,","1 tpr,","0 tpr,"
"new_vehicle, base",0.879397,0.878788,0.87156
"new_vehicle, default",0.924623,0.818182,0.876147
"new_vehicle, custom_1",0.929648,0.801865,0.876147


Unnamed: 0,"3 tpr,","2 tpr,","1 tpr,","0 tpr,"
"new_winequality-red, base",0.0864198,0.266332,0.438871,0.518355
"new_winequality-red, default",0.197531,0.482412,0.233542,0.512482
"new_winequality-red, custom_1",0.185185,0.613065,0.0815047,0.653451


Unnamed: 0,"8 tpr,","7 tpr,","6 tpr,","5 tpr,","4 tpr,","3 tpr,","2 tpr,","1 tpr,","0 tpr,"
"new_yeast, base",0.463115,0.619048,0.313725,0.5,0.485714,0.0666667,0.3,0.48951,0.429806
"new_yeast, default",0.516393,0.60119,0.45098,0.5,0.514286,0.166667,0.6,0.459207,0.431965
"new_yeast, custom_1",0.5,0.464286,0.529412,0.5,0.514286,0.333333,0.65,0.508159,0.179266


Unnamed: 0,"2 tpr,","1 tpr,","0 tpr,"
"thyroid-newthyroid, base",0.766667,0.885714,0.96
"thyroid-newthyroid, default",0.833333,0.914286,0.92
"thyroid-newthyroid, custom_1",0.766667,0.942857,0.94


In [57]:
another = test_spiders(['base', 'default', 'custom_1', 'rforest', 'knn', 'xgboost']) #'default', 'custom_1',

1czysty-cut
2delikatne-cut
3mocniej-cut
4delikatne-bezover-cut
balance-scale
car
cleveland
cleveland_v2
cmc
dermatology
flare
glass
hayes-roth
new_ecoli
new_led7digit
new_vehicle
new_winequality-red
new_yeast
thyroid-newthyroid


'G-MEAN'

Unnamed: 0,base tree,base knn,default tree,default knn,custom_1 tree,custom_1 knn,rforest tree,rforest knn,knn tree,knn knn,xgboost tree,xgboost knn
1czysty-cut,0.937,0.963,0.954,0.968,0.951,0.971,0.946,0.97,0.948,0.966,0.952,0.973
2delikatne-cut,0.651,0.658,0.727,0.737,0.758,0.751,0.763,0.775,0.77,0.777,0.76,0.768
3mocniej-cut,0.404,0.401,0.5,0.465,0.518,0.474,0.493,0.487,0.525,0.485,0.497,0.494
4delikatne-bezover-cut,0.723,0.73,0.845,0.837,0.853,0.832,0.864,0.855,0.863,0.864,0.867,0.866
balance-scale,0.234,0.263,0.425,0.511,0.251,0.479,0.322,0.342,0.387,0.426,0.225,0.439
car,0.241,0.321,0.256,0.305,0.086,0.177,0.264,0.142,0.288,0.177,0.244,0.227
cleveland,0.098,0.081,0.072,0.089,0.081,0.08,0.168,0.126,0.092,0.115,0.08,0.129
cleveland_v2,0.143,0.068,0.192,0.119,0.172,0.118,0.17,0.118,0.153,0.121,0.182,0.117
cmc,0.458,0.42,0.415,0.374,0.427,0.362,0.383,0.369,0.377,0.376,0.413,0.376
dermatology,0.917,0.909,0.917,0.913,0.903,0.923,0.917,0.913,0.917,0.918,0.907,0.913


'MEAN G-MEAN'

base tree        0.546053
base knn         0.528053
default tree     0.551211
default knn      0.547789
custom_1 tree    0.530632
custom_1 knn     0.542947
rforest tree     0.542632
rforest knn      0.531789
knn tree         0.548263
knn knn          0.531316
xgboost tree     0.527737
xgboost knn      0.540105
dtype: float64

In [68]:
another = test_spiders(['base', 'xgboost']) #'default', 'custom_1'another = test_spiders(['base', 'default', 'xgboost']),

1czysty-cut
2delikatne-cut
3mocniej-cut
4delikatne-bezover-cut
balance-scale
car
cleveland
cleveland_v2
cmc
dermatology
flare
glass
hayes-roth
new_ecoli
new_led7digit
new_vehicle
new_winequality-red
new_yeast
thyroid-newthyroid


'G-MEAN'

Unnamed: 0,base tree,base knn,xgboost tree,xgboost knn
1czysty-cut,0.937,0.963,0.952,0.973
2delikatne-cut,0.651,0.658,0.76,0.768
3mocniej-cut,0.404,0.401,0.497,0.494
4delikatne-bezover-cut,0.723,0.73,0.867,0.866
balance-scale,0.234,0.263,0.225,0.439
car,0.241,0.321,0.244,0.227
cleveland,0.098,0.081,0.08,0.129
cleveland_v2,0.143,0.068,0.182,0.117
cmc,0.458,0.42,0.413,0.376
dermatology,0.917,0.909,0.907,0.913


'MEAN G-MEAN'

base tree       0.546053
base knn        0.528053
xgboost tree    0.527737
xgboost knn     0.540105
dtype: float64

In [69]:
another

Unnamed: 0,base tree,base knn,xgboost tree,xgboost knn
1czysty-cut,0.937,0.963,0.952,0.973
2delikatne-cut,0.651,0.658,0.76,0.768
3mocniej-cut,0.404,0.401,0.497,0.494
4delikatne-bezover-cut,0.723,0.73,0.867,0.866
balance-scale,0.234,0.263,0.225,0.439
car,0.241,0.321,0.244,0.227
cleveland,0.098,0.081,0.08,0.129
cleveland_v2,0.143,0.068,0.182,0.117
cmc,0.458,0.42,0.413,0.376
dermatology,0.917,0.909,0.907,0.913


In [106]:
mean_ranks = calc_mean_ranks(another)    
print(sorted(list(mean_ranks.items()), key=lambda x: x[1]))

[('base tree', 2.3684210526315788), ('xgboost knn', 2.4210526315789473), ('xgboost tree', 2.526315789473684), ('base knn', 2.6842105263157894)]


In [164]:
for dataset_name in list(datasets.keys()):
    if dataset_name == "flare" or dataset_name == "car":
            continue
    test_spiders_tpr(['base', 'default', 'custom_1'], dataset_name)

Unnamed: 0,"2 tpr,","1 tpr,","0 tpr,"
"1czysty-cut, base",0.9625,0.95,0.977381
"1czysty-cut, default",0.979167,0.958333,0.966667
"1czysty-cut, custom_1",0.995833,0.966667,0.952381


Unnamed: 0,"2 tpr,","1 tpr,","0 tpr,"
"2delikatne-cut, base",0.654167,0.55,0.911905
"2delikatne-cut, default",0.783333,0.633333,0.85119
"2delikatne-cut, custom_1",0.804167,0.658333,0.838095


Unnamed: 0,"2 tpr,","1 tpr,","0 tpr,"
"3mocniej-cut, base",0.379167,0.241667,0.834524
"3mocniej-cut, default",0.558333,0.308333,0.658333
"3mocniej-cut, custom_1",0.529167,0.333333,0.661905


Unnamed: 0,"2 tpr,","1 tpr,","0 tpr,"
"4delikatne-bezover-cut, base",0.775,0.666667,0.885714
"4delikatne-bezover-cut, default",0.916667,0.808333,0.827381
"4delikatne-bezover-cut, custom_1",0.916667,0.808333,0.82381


Unnamed: 0,"2 tpr,","1 tpr,","0 tpr,"
"balance-scale, base",0.815972,0.881944,0.0816327
"balance-scale, default",0.788194,0.829861,0.204082
"balance-scale, custom_1",0.760417,0.8125,0.183673


Unnamed: 0,"4 tpr,","3 tpr,","2 tpr,","1 tpr,","0 tpr,"
"cleveland, base",0.0769231,0.0857143,0.25,0.236364,0.689024
"cleveland, default",0.0769231,0.114286,0.25,0.309091,0.439024
"cleveland, custom_1",0.0769231,0.0857143,0.25,0.309091,0.457317


Unnamed: 0,"3 tpr,","2 tpr,","1 tpr,","0 tpr,"
"cleveland_v2, base",0.0769231,0.0857143,0.25,0.826484
"cleveland_v2, default",0.0769231,0.142857,0.305556,0.707763
"cleveland_v2, custom_1",0.0769231,0.114286,0.305556,0.716895


Unnamed: 0,"2 tpr,","1 tpr,","0 tpr,"
"cmc, base",0.41683,0.348348,0.515103
"cmc, default",0.21135,0.714715,0.348172
"cmc, custom_1",0.332681,0.771772,0.18601


Unnamed: 0,"5 tpr,","4 tpr,","3 tpr,","2 tpr,","1 tpr,","0 tpr,"
"dermatology, base",0.95,0.942308,0.918367,1,0.737705,0.946429
"dermatology, default",0.95,0.942308,0.897959,1,0.786885,0.946429
"dermatology, custom_1",0.95,0.961538,0.918367,1,0.803279,0.9375


Unnamed: 0,"5 tpr,","4 tpr,","3 tpr,","2 tpr,","1 tpr,","0 tpr,"
"glass, base",0.470588,0.555556,0.896552,0.769231,0.697368,0.771429
"glass, default",0.647059,0.555556,0.758621,0.923077,0.526316,0.485714
"glass, custom_1",0.705882,0.666667,0.793103,0.923077,0.592105,0.571429


Unnamed: 0,"2 tpr,","1 tpr,","0 tpr,"
"hayes-roth, base",0.677419,0.734375,0.723077
"hayes-roth, default",0.83871,0.296875,0.476923
"hayes-roth, custom_1",0.806452,0.296875,0.676923


Unnamed: 0,"4 tpr,","3 tpr,","2 tpr,","1 tpr,","0 tpr,"
"new_ecoli, base",0.846154,0.72,0.540541,0.727273,0.903448
"new_ecoli, default",0.846154,0.72,0.756757,0.506494,0.896552
"new_ecoli, custom_1",0.846154,0.72,0.756757,0.558442,0.875862


Unnamed: 0,"5 tpr,","4 tpr,","3 tpr,","2 tpr,","1 tpr,","0 tpr,"
"new_led7digit, base",0.818182,0.807692,0.694444,0.851064,0.44898,0.642857
"new_led7digit, default",0.525253,0.884615,0.388889,0.808511,0.795918,0.244898
"new_led7digit, custom_1",0.474747,0.923077,0.537037,0.787234,0.795918,0.255102


Unnamed: 0,"2 tpr,","1 tpr,","0 tpr,"
"new_vehicle, base",0.879397,0.878788,0.87156
"new_vehicle, default",0.924623,0.818182,0.876147
"new_vehicle, custom_1",0.929648,0.801865,0.876147


Unnamed: 0,"3 tpr,","2 tpr,","1 tpr,","0 tpr,"
"new_winequality-red, base",0.0864198,0.266332,0.438871,0.518355
"new_winequality-red, default",0.197531,0.482412,0.233542,0.512482
"new_winequality-red, custom_1",0.185185,0.613065,0.0815047,0.653451


Unnamed: 0,"8 tpr,","7 tpr,","6 tpr,","5 tpr,","4 tpr,","3 tpr,","2 tpr,","1 tpr,","0 tpr,"
"new_yeast, base",0.463115,0.619048,0.313725,0.5,0.485714,0.0666667,0.3,0.48951,0.429806
"new_yeast, default",0.516393,0.60119,0.45098,0.5,0.514286,0.166667,0.6,0.459207,0.431965
"new_yeast, custom_1",0.5,0.464286,0.529412,0.5,0.514286,0.333333,0.65,0.508159,0.179266


Unnamed: 0,"2 tpr,","1 tpr,","0 tpr,"
"thyroid-newthyroid, base",0.766667,0.885714,0.96
"thyroid-newthyroid, default",0.833333,0.914286,0.92
"thyroid-newthyroid, custom_1",0.766667,0.942857,0.94
