In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from multi_imbalance.resampling.spider import SPIDER3
from collections import Counter
from imblearn.metrics import geometric_mean_score
from IPython.core.display import display
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from multi_imbalance.utils.data import load_arff_datasets
from sklearn.ensemble import RandomForestClassifier



In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
def read_train_and_test_data(overlap, imbalance_ratio, i):
    with open(f"data/3class-{imbalance_ratio}-overlap-{overlap}-learn-{i}.arff") as f:
        content = f.readlines()
    content = [x.strip().split(",") for x in content][5:]
    data = np.array(content)
    X_train, y_train = data[:, :-1].astype(float), data[:, -1].astype(object)

    with open(f"data/3class-{imbalance_ratio}-overlap-{overlap}-test-{i}.arff") as f:
        content = f.readlines()
    content = [x.strip().split(",") for x in content][5:]
    data = np.array(content)
    X_test, y_test = data[:, :-1].astype(float), data[:, -1].astype(object)

    return X_train, y_train, X_test, y_test


def train_and_test():
    neigh = KNeighborsClassifier(n_neighbors=1)
    neigh.fit(X_train, y_train)
    y_pred = neigh.predict(X_test)
    labels = ['MIN', 'INT', 'MAJ']
    return [confusion_matrix(y_test, y_pred, labels=labels)[i, i] / confusion_matrix(y_test, y_pred, labels=labels)[i,:].sum() for i,label in enumerate(labels)]


In [4]:
datasets = load_arff_datasets()

In [5]:
maj_int_min = {
    "1czysty-cut": {'maj': [0], 'int': [], 'min': [2, 1]},
    "2delikatne-cut": {'maj': [0], 'int': [], 'min': [2, 1]},
    "3mocniej-cut": {'maj': [0], 'int': [], 'min': [2, 1]},
    "4delikatne-bezover-cut": {'maj': [0], 'int': [], 'min': [2, 1]},
    "balance-scale": {'maj': [2, 1], 'int': [], 'min': [0]},
    "cleveland": {'maj': [0], 'int': [], 'min': [1, 2, 3, 4]},
    "cleveland_v2": {'maj': [0], 'int': [], 'min': [1, 2, 3]},
    "car": {'maj': [2, 0], 'int': [], 'min': [1,3]},
    "cmc": {'maj': [0, 2], 'int': [], 'min': [1]},
    "dermatology": {'maj': [0, 2, 1, 4, 3], 'int': [], 'min': [5]},
    "flare": {'maj': [1, 2, 3, 6], 'int': [], 'min': [4, 5]},
    "glass": {'maj': [1, 0, 3], 'int': [], 'min': [5, 2, 4]},
    "hayes-roth": {'maj': [0, 1], 'int': [], 'min': [2]},
    "new_ecoli": {'maj': [0, 1], 'int': [], 'min': [4, 2, 3]},
    "new_led7digit": {'maj': [3, 5, 0, 2], 'int': [], 'min': [4, 1]},
    "new_vehicle": {'maj': [1], 'int': [], 'min': [0, 2]},
    "new_winequality-red": {'maj': [0, 1], 'int': [], 'min': [2, 3]},
    "new_yeast": {'maj': [0, 1, 8, 7], 'int': [], 'min': [6, 5, 4, 3, 2]},
    "thyroid-newthyroid": {'maj': [0], 'int': [], 'min': [1, 2]}
}

In [6]:
def highlight_max(data, color='yellow'):
    '''
    highlight the maximum in a Series or DataFrame
    '''
    attr = 'font-weight: bold'
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        is_max = data == data.max()
        return [attr if v else '' for v in is_max]
    else:  # from .apply(axis=None)
        is_max = data == data.max().max()
        return pd.DataFrame(np.where(is_max, attr, ''),
                            index=data.index, columns=data.columns)

In [7]:
def calc_cost_matrix(dataset_name, cost_strategy):
    if cost_strategy == 'default':
        #default
        no_classes = np.unique(datasets[dataset_name].target).size
        cost = np.ones((no_classes, no_classes))
        np.fill_diagonal(cost, 0)
        return cost
    elif cost_strategy == 'custom_1' or cost_strategy == 'custom': # IR pod diagonala, wyzej jedynki
        return SPIDER3._estimate_cost_matrix(datasets[dataset_name].target)
    elif cost_strategy == 'custom_2': # odwrotnie, tj IR nad diagonala, nizej jedynki
        y=datasets[dataset_name].target
        class_cardinality = Counter(y)
        classes = list(class_cardinality.keys())
        cost = np.ones([len(classes), len(classes)])
        for i, (c1, card1) in enumerate(class_cardinality.items()):
            for j, (c2, card2) in enumerate(class_cardinality.items()):
                if j < i:
                    cost[i, j] = 1
                else:
                    cost[i, j] = card1 / card2
        np.fill_diagonal(cost, 0)
    elif cost_strategy == 'custom_3': # IR wszedzie
        y=datasets[dataset_name].target
        class_cardinality = Counter(y)
        classes = list(class_cardinality.keys())
        cost = np.ones([len(classes), len(classes)])
        for i, (c1, card1) in enumerate(class_cardinality.items()):
            for j, (c2, card2) in enumerate(class_cardinality.items()):
                cost[i, j] = card1 / card2
        np.fill_diagonal(cost, 0)
    elif cost_strategy == 'custom_4':
        no_classes = np.unique(datasets[dataset_name].target).size
        cost = np.ones((no_classes, no_classes))
        X, y = datasets[dataset_name].data, datasets[dataset_name].target
        element_count = Counter(y)
        cardinality_pairs = list(element_count.items())
        for c1,_ in cardinality_pairs:
            for c2,_ in cardinality_pairs:
                f2_overlap_volume = 1
                for i in range(X.shape[1]):
                    f2_overlap_volume = f2_overlap_volume * (min(X[y == c1][:,i].max(), X[y == c2][:,i].max()) - max(X[y == c1][:,i].min(), X[y == c2][:,i].min())) / (max(X[y == c1][:,i].max(), X[y == c2][:,i].max()) - min(X[y == c1][:,i].min(), X[y == c2][:,i].min()) + 0.000001)
                cost[c2, c1] = f2_overlap_volume
        np.fill_diagonal(cost, 0)
        max_overlap = cost.max() + 1e-6
        cost = cost / max_overlap + np.ones((no_classes, no_classes))
        np.fill_diagonal(cost,0)
        return cost
    elif cost_strategy == 'custom_5':
        no_classes = np.unique(datasets[dataset_name].target).size
        cost = np.ones((no_classes, no_classes))
        X, y = datasets[dataset_name].data, datasets[dataset_name].target
        element_count = Counter(y)
        cardinality_pairs = list(element_count.items())
        for c1,_ in cardinality_pairs:
            for c2,_ in cardinality_pairs:
                f2_overlap_volume = 1
                for i in range(X.shape[1]):
                    f2_overlap_volume = f2_overlap_volume * (min(X[y == c1][:,i].max(), X[y == c2][:,i].max()) - max(X[y == c1][:,i].min(), X[y == c2][:,i].min())) / (max(X[y == c1][:,i].max(), X[y == c2][:,i].max()) - min(X[y == c1][:,i].min(), X[y == c2][:,i].min()) + 0.000001)
                cost[c2, c1] = f2_overlap_volume
        np.fill_diagonal(cost, 0)
        max_overlap = cost.max() + 1e-6
        cost = cost / max_overlap + np.ones((no_classes, no_classes))
        np.fill_diagonal(cost,0)
        for i in range(no_classes):
            for j in range(no_classes):
                if j > i:
                    cost[i, j] = 1
        return cost
    return cost

In [8]:
def fill_up_diag_ones(cost_matrix):
    for i in range(cost_matrix.shape[0]):
        for j in range(cost_matrix.shape[0]):
            if j > i:
                cost_matrix[i, j] = 1
    return cost_matrix

In [9]:
def calc_mean_ranks(another):
    ranks = {}
    for col in another.columns:
        ranks[col] = []
    for row in another.iterrows():
        pairs= list(zip(row[1].index.tolist(), row[1].values.tolist()))
        sorted_pairs = sorted(pairs, key=lambda x: -x[1])
        for i, (name, score) in enumerate(sorted_pairs):
            ranks[name].append(i + 1)

    mean_ranks = {}
    for name, lst in ranks.items():
        mean_ranks[name] = np.mean(lst)
    return mean_ranks

In [15]:
def test_spiders(options):
    np.random.seed(0)

    datasets = load_arff_datasets()
    results_g_mean = dict()
    results_acc = dict()
    results_min_g_mean = dict()

    for dataset_name, dataset_values in datasets.items():
        #if dataset_name == 'dermatology' or dataset_name == 'new_ecoli':
        #    continue
        print(dataset_name)

        X, y = dataset_values.data, dataset_values.target
        labels = sorted(list(Counter(dataset_values.target).keys()), reverse=True)

        #if len(X)>1000:
        #    continue

        results_g_mean[dataset_name]=dict()
        results_acc[dataset_name]=dict()
        results_min_g_mean[dataset_name]=dict()

        for resample in options:

            acc, g_mean, acc2, g_mean2 = list(),list(), [], []
            min_g_mean = []
            for fold in range(10):
                skf = StratifiedKFold(n_splits=5, random_state=fold)
                for train_index, test_index in skf.split(X, y):
                    X_train, X_test = X[train_index], X[test_index]
                    y_train, y_test = y[train_index], y[test_index]
                    error_flag = False
                    clf_tree = DecisionTreeClassifier(random_state=0)

                    if resample == 'base':
                        X_train_resampled, y_train_resampled = X_train, y_train
                    elif resample == 'xgboost':
                        model = xgb.XGBClassifier(random_state=1, learning_rate=0.01)
                        model.fit(X_train, y_train)
                        cm = confusion_matrix(y_test, model.predict(X_test))
                        cost = cm.T
                        cost[cost == 0] = 1
                        np.fill_diagonal(cost, 0)
                        #cost = fill_up_diag_ones(cost)
                        #cost = (cost - np.mean(cost.ravel())) / np.std(cost.ravel()) + 1
                        #np.fill_diagonal(cost, 0)
                        #print(cost)
                        clf = SPIDER3(k=5, majority_classes=maj_int_min[dataset_name]['maj'], intermediate_classes=maj_int_min[dataset_name]['int'], minority_classes=maj_int_min[dataset_name]['min'], cost=cost)
                        X_train_resampled, y_train_resampled = clf.fit_transform(X_train.astype(np.float64), y_train)
                    elif resample == "rforest":
                        model = RandomForestClassifier(n_estimators=100)
                        model.fit(X_train, y_train)
                        cm = confusion_matrix(y_test, model.predict(X_test))
                        cost = cm.T
                        cost[cost == 0] = 1
                        np.fill_diagonal(cost, 0)
                        clf = SPIDER3(k=5, majority_classes=maj_int_min[dataset_name]['maj'], intermediate_classes=maj_int_min[dataset_name]['int'], minority_classes=maj_int_min[dataset_name]['min'], cost=cost)
                        X_train_resampled, y_train_resampled = clf.fit_transform(X_train.astype(np.float64), y_train)
                    elif resample == "knn":
                        model = KNeighborsClassifier(n_neighbors=5)
                        model.fit(X_train, y_train)
                        cm = confusion_matrix(y_test, model.predict(X_test))
                        cost = cm.T
                        cost[cost == 0] = 1
                        np.fill_diagonal(cost, 0)
                        clf = SPIDER3(k=5, majority_classes=maj_int_min[dataset_name]['maj'], intermediate_classes=maj_int_min[dataset_name]['int'], minority_classes=maj_int_min[dataset_name]['min'], cost=cost)
                        X_train_resampled, y_train_resampled = clf.fit_transform(X_train.astype(np.float64), y_train)
                    else:
                        cost = calc_cost_matrix(dataset_name, resample)
                        clf = SPIDER3(k=5, majority_classes=maj_int_min[dataset_name]['maj'], intermediate_classes=maj_int_min[dataset_name]['int'], minority_classes=maj_int_min[dataset_name]['min'], cost=cost)
                        X_train_resampled, y_train_resampled = clf.fit_transform(X_train.astype(np.float64), y_train)

                    clf_tree.fit(X_train_resampled, y_train_resampled)
                    y_pred = clf_tree.predict(X_test)
                    g_mean.append(geometric_mean_score(y_test, y_pred, correction=0.001))
                    acc.append(accuracy_score(y_test, y_pred))

#                     neigh = KNeighborsClassifier(n_neighbors=1)
#                     for i in range(0, 2):
#                         std = union(X_train,X_test)[:,i].std()
#                         if std == 0:
#                             std = 1e-6
#                         mean = union(X_train,X_test)[:,i].mean()
#                         X_train_resampled[:, i] = (X_train_resampled[:, i] - mean) / (4 * std)
#                         X_test[:, i] = (X_test[:, i] - mean) / (4 * std)
#                     neigh.fit(X_train_resampled, y_train_resampled)
#                     y_pred = neigh.predict(X_test)
#                     g_mean2.append(geometric_mean_score(y_test, y_pred, correction=0.001))
#                     acc2.append(accuracy_score(y_test, y_pred))
                    cm = confusion_matrix(y_test, y_pred, labels=labels)
                    tprs = [cm[i, i] / cm[i,:].sum() for i,label in enumerate(labels)]

                    gm=1
                    for tpr, label in zip(tprs, labels):
                        if label in maj_int_min[dataset_name]['min']:
                            if tpr == 0:
                                gm*=0.001
                            else:
                                gm *= tpr
                    min_g_mean.append(gm ** (1/len(maj_int_min[dataset_name]['min'])))
                
                

            result_g_mean = None if error_flag else round(np.mean(g_mean),3)
            result_acc = None if error_flag else round(np.mean(acc),3)
            result_g_mean2 = None if error_flag else round(np.mean(g_mean2),3)
            result_acc2 = None if error_flag else round(np.mean(acc2),3)
            result_min_g_mean = None if error_flag else round(np.mean(min_g_mean),3)

            results_g_mean[dataset_name][resample + " tree"]=result_g_mean
            results_acc[dataset_name][resample + " tree"]=result_acc
            results_min_g_mean[dataset_name][resample + " tree"] = result_min_g_mean
#             results_g_mean[dataset_name][resample + " knn"]=result_g_mean2
#             results_acc[dataset_name][resample + " knn"]=result_acc2
#             results_min_g_mean[dataset_name][resample + " knn"] = result_min_g_mean

    display("G-MEAN")
    df = pd.DataFrame(results_g_mean).T
    display(df.style.apply(highlight_max, axis=1))

    display("MEAN G-MEAN")
    df.fillna(df.median(), inplace=True)
    display(df.mean())
    
    mean_ranks = calc_mean_ranks(df)
    print("MIN G-MEAN")
    df=pd.DataFrame(results_min_g_mean).T
    display(df.style.apply(highlight_max, axis=1))
    
    display("MEAN MIN G-MEAN")
    df.fillna(df.median(), inplace=True)
    display(df.mean())
    
    print(sorted(list(mean_ranks.items()), key=lambda x: x[1]))
    return df

In [16]:
another = test_spiders(['base', 'default', 'custom'])

1czysty-cut
2delikatne-cut
3mocniej-cut
4delikatne-bezover-cut
balance-scale
car
cleveland
cleveland_v2
cmc
dermatology
flare
glass
hayes-roth
new_ecoli
new_led7digit
new_vehicle
new_winequality-red
new_yeast
thyroid-newthyroid


'G-MEAN'

Unnamed: 0,base tree,default tree,custom tree
1czysty-cut,0.937,0.954,0.949
2delikatne-cut,0.651,0.727,0.755
3mocniej-cut,0.404,0.5,0.525
4delikatne-bezover-cut,0.723,0.845,0.834
balance-scale,0.234,0.425,0.251
car,0.241,0.256,0.086
cleveland,0.098,0.072,0.081
cleveland_v2,0.143,0.192,0.172
cmc,0.458,0.415,0.427
dermatology,0.917,0.917,0.903


'MEAN G-MEAN'

base tree       0.546053
default tree    0.551211
custom tree     0.529737
dtype: float64

MIN G-MEAN


Unnamed: 0,base tree,default tree,custom tree
1czysty-cut,0.92,0.947,0.94
2delikatne-cut,0.577,0.687,0.718
3mocniej-cut,0.308,0.462,0.488
4delikatne-bezover-cut,0.703,0.861,0.854
balance-scale,0.252,0.411,0.212
car,0.22,0.223,0.224
cleveland,0.06,0.046,0.054
cleveland_v2,0.096,0.135,0.135
cmc,0.39,0.709,0.763
dermatology,0.9,0.9,0.85


'MEAN MIN G-MEAN'

base tree       0.506579
default tree    0.609579
custom tree     0.607684
dtype: float64

[('default tree', 1.736842105263158), ('base tree', 2.1052631578947367), ('custom tree', 2.1578947368421053)]
