In [1]:
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.metrics import recall_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer
from IPython.core.display import display
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm_notebook

from multi_imbalance.ensemble.soup_bagging import SOUPBagging
from multi_imbalance.resampling.soup import SOUP
from multi_imbalance.resampling.mdo import MDO
from multi_imbalance.resampling.global_cs import GlobalCS

from imblearn.metrics import geometric_mean_score
from imblearn.over_sampling import SMOTE
from multi_imbalance.resampling.spider import SPIDER3

from sklearn.neighbors import KNeighborsClassifier
import warnings
import logging
from multi_imbalance.utils.data import load_arff_datasets
from multi_imbalance.utils.min_int_maj import maj_int_min
# Raise the root logger's threshold to CRITICAL so lower-severity log records are dropped.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
# NOTE(review): this hides every Python warning for the rest of the session,
# including deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

from IPython.display import clear_output
# Clear the output this cell has produced so far (e.g. import-time messages).
clear_output(wait=True)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [17]:
# def green_valid_backgroud(s):
#     correct = ['1czysty-cut', '2delikatne-cut', '3mocniej-cut','4delikatne-bezover-cut', 'cmc', 'dermatology', 'new_ecoli','new_vehicle','thyroid-newthyroid']
#     return ['background-color: green' if v in correct else '' for v in list(s.index)]
# 


def bold_max(s):
    """Return one CSS declaration per element of ``s``, bolding the maximum.

    Every element equal to ``s.max()`` is mapped to ``'font-weight: bold'``
    (ties are all bolded); every other element is mapped to an empty string.
    ``print_scores`` passes this function to ``DataFrame.style.apply`` with
    ``axis=1``.
    """
    peak = s.max()
    css_for = {True: 'font-weight: bold', False: ''}
    return [css_for[bool(hit)] for hit in (s == peak)]
    
def _export_table(table, stem):
    """Write ``table`` to ``<stem>.tex`` (LaTeX) and ``<stem>.csv``."""
    with open(f'{stem}.tex', 'w') as tex_file:
        tex_file.write(table.to_latex())
    table.to_csv(f'{stem}.csv')


def print_scores(scores, name, only_read_dt = False, columns=None, base=None):
    """Display a score table with its per-column mean and mean rank, and export all three.

    Parameters
    ----------
    scores : mapping
        Nested mapping ``{row_label: {column_label: score}}``; it is turned into
        a DataFrame and transposed so the outer keys become the rows.
    name : str
        Prefix of the exported files: ``{name}_main``, ``{name}_median`` and
        ``{name}_meanrank``, each written as ``.tex`` and ``.csv``.
    only_read_dt : bool, optional
        When true, the first four rows are dropped.
        NOTE(review): presumably meant to skip four leading datasets -- confirm.
    columns : list, optional
        Columns to keep, in the given order. ``None`` keeps all columns.
    base : pandas.DataFrame, optional
        Frame merged in on the index, to the left of the score columns.

    Returns
    -------
    None
        The three tables are shown with ``display`` and written to disk.
    """
    table = pd.DataFrame(scores).T
    if only_read_dt:
        table = table.iloc[4:]
    if columns is not None:
        table = table[columns]
    if base is not None:
        table = pd.merge(base, table, left_index=True, right_index=True)

    # The main table is shown and exported with its missing values untouched.
    display(table.style.apply(bold_max, axis=1))
    _export_table(table, f'{name}_main')

    # Missing scores are replaced by the column median before aggregating.
    # A new frame is built instead of filling in place, because `table` may be
    # a column selection of another frame.
    filled = table.fillna(table.median())

    mean_scores = pd.DataFrame(filled.mean().sort_values(ascending=False), columns=['Mean G-mean'])
    display(mean_scores)
    # Rank 1 is the best (highest) score within each row.
    mean_ranks = pd.DataFrame(filled.rank(axis=1, ascending=False).mean().sort_values(), columns=['Mean rank'])
    display(mean_ranks)

    # The '_median' file suffix is kept for compatibility with existing outputs,
    # although the table it holds contains column means.
    _export_table(mean_scores, f'{name}_median')
    _export_table(mean_ranks, f'{name}_meanrank')
# print_scores(scores_knn)

In [21]:
def _trailing_int(name):
    """Return the integer formed by the run of digits that ends ``name``.

    Raises
    ------
    ValueError
        If ``name`` does not end with a digit.
    """
    digits = name[len(name.rstrip('0123456789')):]
    if not digits:
        raise ValueError(f'Bad type: {name!r} has no trailing number')
    return int(digits)


def resample_data(resample, seed, X_train, y_train, no_classes, dataset_name):
    """Resample one training fold according to the configuration name ``resample``.

    Recognised names:

    * ``'base'`` -- no resampling.
    * names containing ``'soupbg'`` or ``'mrbbag'`` -- no resampling here; the
      data is returned untouched.
    * other names containing ``'soup'`` (e.g. ``'soup5'``) -- SOUP with ``k``
      taken from the trailing number.
    * ``'global'`` -- GlobalCS with ``shuffle=False``.
    * ``'smote'`` -- SMOTE seeded with ``seed``.
    * names containing ``'mdo'`` (e.g. ``'3mdo5'``) -- MDO with
      ``k1_frac = first digit / 10`` and ``k`` taken from the trailing number.
    * ``'spider'`` -- SPIDER3 with ``k=5`` and a 0/1 cost matrix.

    Parameters
    ----------
    resample : str
        Configuration name, see above.
    seed : int
        Seed forwarded to SMOTE and MDO.
    X_train, y_train : array-like
        Training features and labels. The SOUP, GlobalCS, SMOTE and MDO
        branches work on copies of them.
    no_classes : int
        Number of classes; sizes the SPIDER3 cost matrix.
    dataset_name : str
        Key into the module-level ``maj_int_min`` mapping.

    Returns
    -------
    tuple
        ``(X_train_resampled, y_train_resampled)``.

    Raises
    ------
    ValueError
        If ``resample`` is not recognised, or a SOUP/MDO name lacks its
        trailing number.
    """
    # The pass-through check has to come before the generic 'soup' test:
    # 'soupbg...' names contain 'soup' and would otherwise be resampled by
    # SOUP before the bagging ensemble ever sees the data.
    if resample == 'base' or 'soupbg' in resample or 'mrbbag' in resample:
        X_train_resampled, y_train_resampled = X_train, y_train
    elif 'soup' in resample:
        k = _trailing_int(resample)
        soup = SOUP(k=k)
        X_train_resampled, y_train_resampled = soup.fit_transform(
            np.copy(X_train), np.copy(y_train), maj_int_min=maj_int_min[dataset_name])
    elif resample == 'global':
        global_cs = GlobalCS()
        X_train_resampled, y_train_resampled = global_cs.fit_transform(
            np.copy(X_train), np.copy(y_train), shuffle=False)
    elif resample == 'smote':
        smote = SMOTE(random_state=seed)
        # fit_resample replaces the removed fit_sample alias.
        X_train_resampled, y_train_resampled = smote.fit_resample(np.copy(X_train), np.copy(y_train))
    elif 'mdo' in resample:
        k = _trailing_int(resample)
        frac = float(resample[0]) / 10
        mdo = MDO(k=k, k1_frac=frac, seed=seed)
        X_train_resampled, y_train_resampled = mdo.fit_transform(
            np.copy(X_train), np.copy(y_train), maj_int_min=maj_int_min[dataset_name])
    elif resample == 'spider':
        # Uniform misclassification cost: 1 off the diagonal, 0 on it.
        cost = np.ones((no_classes, no_classes))
        np.fill_diagonal(cost, 0)
        class_groups = maj_int_min[dataset_name]
        clf = SPIDER3(k=5, cost=cost, majority_classes=class_groups['maj'],
                      intermediate_classes=class_groups['int'], minority_classes=class_groups['min'])
        X_train_resampled, y_train_resampled = clf.fit_transform(X_train.astype(np.float64), y_train)
    else:
        raise ValueError(f'Bad type: {resample!r}')
    return X_train_resampled, y_train_resampled



def _soup_bagging_size(res):
    """Return the ensemble size written right after 'soupbg' in ``res`` (e.g. 'soupbg015' -> 15).

    Raises
    ------
    ValueError
        If no digits follow 'soupbg'.
    """
    tail = res.partition('soupbg')[2]
    digits = tail[:len(tail) - len(tail.lstrip('0123456789'))]
    if not digits:
        raise ValueError(f'No ensemble size after "soupbg" in {res!r}')
    return int(digits)


def test_resampling(res, dataset_values, dataset_name, n_repeats=10, n_splits=5):
    """Evaluate one resampling configuration on one dataset with repeated stratified CV.

    For each repetition ``i`` the data is split with a shuffled
    ``StratifiedKFold`` seeded with ``i``. On every fold the first
    ``cat_length`` columns are standardised (scaler fitted on the training part
    only), the training part is resampled with ``resample_data``, and a kNN and
    a decision tree are trained and scored with G-mean.

    When ``res`` contains ``'soupbg'`` each classifier is wrapped in
    ``SOUPBagging`` (ensemble size parsed from the name) and scored once per
    voting strategy.

    Parameters
    ----------
    res : str
        Configuration name understood by ``resample_data``.
    dataset_values : object
        Must expose ``data``, ``target`` and ``cat_length``.
    dataset_name : str
        Key into the module-level ``maj_int_min`` mapping.
    n_repeats : int, optional
        Number of cross-validation repetitions (default 10).
    n_splits : int, optional
        Number of folds per repetition (default 5).

    Returns
    -------
    collections.defaultdict
        ``{metric_name: score}`` where each score is the mean over repetitions
        of the mean over folds. Metric names are ``g_mean_<clf>`` and
        ``g_mean_<clf>_minority``, with a ``_<strategy>`` suffix for SOUP
        Bagging configurations.
    """
    X, y, scale_index = dataset_values.data, dataset_values.target, dataset_values.cat_length

    no_classes = np.unique(y).size
    class_groups = maj_int_min[dataset_name]
    minority_class = class_groups['min']

    uses_soup_bagging = 'soupbg' in res
    if uses_soup_bagging:
        n_classifiers = _soup_bagging_size(res)
        # One (metric suffix, predict kwargs) pair per voting strategy.
        variants = [
            (f'_{strategy}', {'strategy': strategy, 'maj_int_min': class_groups})
            for strategy in ['average', 'optimistic', 'pessimistic', 'mixed', 'global']
        ]
    else:
        n_classifiers = None
        variants = [('', {})]

    run_data = defaultdict(lambda: defaultdict(list))  # {metric: {run_number: [fold scores]}}
    for i in range(n_repeats):
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=i)
        for train_index, test_index in skf.split(X, y):
            # NOTE(review): assumes X and y are NumPy arrays, so indexing with
            # an index array yields copies and the in-place scaling below does
            # not leak into X -- confirm.
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            if scale_index > 0:
                normalizer = StandardScaler().fit(X_train[:, :scale_index])
                X_train[:, :scale_index] = normalizer.transform(X_train[:, :scale_index])
                X_test[:, :scale_index] = normalizer.transform(X_test[:, :scale_index])
            X_train_resampled, y_train_resampled = resample_data(
                res, i, X_train, y_train, no_classes, dataset_name)

            for clf_name in ['knn', 'tree']:
                if clf_name == 'knn':
                    clf = KNeighborsClassifier(n_neighbors=5)
                else:
                    clf = DecisionTreeClassifier(random_state=i)
                if uses_soup_bagging:
                    clf = SOUPBagging(clf, n_classifiers=n_classifiers)

                clf.fit(X_train_resampled, y_train_resampled)
                for suffix, predict_kwargs in variants:
                    y_pred = clf.predict(X_test, **predict_kwargs)
                    gmean = geometric_mean_score(y_test, y_pred, correction=0.001)
                    minority_gmean = geometric_mean_score(
                        y_test, y_pred, labels=minority_class, correction=0.001)
                    run_data[f'g_mean_{clf_name}{suffix}'][str(i)].append(gmean)
                    run_data[f'g_mean_{clf_name}_minority{suffix}'][str(i)].append(minority_gmean)

    # Average the folds inside each repetition first, then the repetitions.
    result_data = defaultdict(int)
    for metric_name, runs in run_data.items():
        result_data[metric_name] = np.mean([np.mean(fold_scores) for fold_scores in runs.values()])
    return result_data


def provide_test_and_get_scores(datasets, clf_res_names):
    """Run ``test_resampling`` for every (dataset, configuration) pair.

    Parameters
    ----------
    datasets : mapping
        ``{dataset_name: dataset_values}``; iterated under a notebook progress bar.
    clf_res_names : iterable of str
        Configuration names passed one by one to ``test_resampling``.

    Returns
    -------
    collections.defaultdict
        ``scores[metric][dataset_name][configuration]`` holding each metric
        value rounded to four decimal places.
    """
    scores = defaultdict(lambda: defaultdict(dict))
    progress = tqdm_notebook(datasets.items(), total=len(datasets), desc='1st loop')
    for dataset_name, dataset_values in progress:
        for resample in clf_res_names:
            per_metric = test_resampling(resample, dataset_values, dataset_name)
            for metric, value in per_metric.items():
                scores[metric][dataset_name][resample] = round(value, 4)
    return scores

# clf_res_names =['base','soup','soupbg005','soupbg015','soupbg030','soupbg050','soupbg100']
# Configurations compared in this run; the names are decoded by resample_data():
#   'soupK' -> SOUP with k=K
#   'FmdoK' -> MDO with k1_frac=F/10 and k=K
clf_res_names =['soup3','soup5','soup7','3mdo3','3mdo5','3mdo7','5mdo3','5mdo5','5mdo7','7mdo3','7mdo5','7mdo7']
# NOTE(review): return_cat_length=True presumably provides the `cat_length`
# attribute that test_resampling() reads -- confirm against load_arff_datasets.
datasets = load_arff_datasets(return_cat_length=True)
# Runs the full experiment: every dataset x every configuration, 10 repetitions of 5-fold CV each.
# Result layout: scores[metric][dataset_name][configuration].
scores = provide_test_and_get_scores(datasets, clf_res_names)

HBox(children=(IntProgress(value=0, description='1st loop', max=19, style=ProgressStyle(description_width='ini…




#### G-mean, kNN classifier (SOUP variants)

In [35]:
# Overall G-mean of the kNN classifier, restricted to the SOUP configurations.
# Tables are displayed and exported under the 'g_mean_knn_soup' file prefix.
columns = [i for i in clf_res_names if 'soup' in i]
print_scores(scores['g_mean_knn'], 'g_mean_knn_soup',columns=columns)

Unnamed: 0,soup3,soup5,soup7
1czysty-cut,0.9607,0.9559,0.9514
2delikatne-cut,0.7851,0.7896,0.7894
3mocniej-cut,0.5489,0.5467,0.547
4delikatne-bezover-cut,0.8855,0.8892,0.8918
balance-scale,0.6317,0.6673,0.6786
car,0.5315,0.5666,0.6069
cleveland,0.2038,0.199,0.1979
cleveland_v2,0.2479,0.2475,0.248
cmc,0.4801,0.4788,0.4774
dermatology,0.9485,0.9441,0.944


Unnamed: 0,Mean G-mean
soup7,0.636284
soup5,0.634995
soup3,0.634726


Unnamed: 0,Mean rank
soup3,1.842105
soup5,2.052632
soup7,2.105263


#### Minority-class G-mean, kNN classifier (SOUP variants)

In [36]:
# G-mean computed on the minority-class labels only, kNN classifier, SOUP configurations.
# Tables are displayed and exported under the 'g_mean_knn_soup_minority' file prefix.
columns = [i for i in clf_res_names if 'soup' in i]
print_scores(scores['g_mean_knn_minority'], 'g_mean_knn_soup_minority',columns=columns)

Unnamed: 0,soup3,soup5,soup7
1czysty-cut,1.0,1.0,1.0
2delikatne-cut,0.8025,0.8175,0.8225
3mocniej-cut,0.5942,0.6158,0.6108
4delikatne-bezover-cut,0.9558,0.9725,0.9842
balance-scale,0.6478,0.7527,0.7727
car,0.4923,0.5638,0.6156
cleveland,0.162,0.1578,0.1564
cleveland_v2,0.2019,0.2019,0.2019
cmc,0.5387,0.5357,0.5511
dermatology,0.995,0.995,0.995


Unnamed: 0,Mean G-mean
soup7,0.669421
soup5,0.665237
soup3,0.657032


Unnamed: 0,Mean rank
soup7,1.736842
soup5,2.052632
soup3,2.210526


#### G-mean, decision tree (SOUP variants)


In [37]:
# Overall G-mean of the decision tree, restricted to the SOUP configurations.
# Tables are displayed and exported under the 'g_mean_tree_soup' file prefix.
columns = [i for i in clf_res_names if 'soup' in i]
print_scores(scores['g_mean_tree'], 'g_mean_tree_soup',columns=columns)

Unnamed: 0,soup3,soup5,soup7
1czysty-cut,0.9574,0.9557,0.9578
2delikatne-cut,0.775,0.786,0.7919
3mocniej-cut,0.5664,0.5653,0.574
4delikatne-bezover-cut,0.8749,0.8894,0.8933
balance-scale,0.5575,0.6091,0.6098
car,0.8795,0.8723,0.8911
cleveland,0.1028,0.1171,0.1199
cleveland_v2,0.1422,0.1469,0.1564
cmc,0.4755,0.4827,0.4783
dermatology,0.9458,0.9368,0.9319


Unnamed: 0,Mean G-mean
soup7,0.656916
soup5,0.654374
soup3,0.649932


Unnamed: 0,Mean rank
soup7,1.631579
soup5,2.157895
soup3,2.210526


#### Minority-class G-mean, decision tree (SOUP variants)

In [38]:
# G-mean computed on the minority-class labels only, decision tree, SOUP configurations.
# Tables are displayed and exported under the 'g_mean_tree_soup_minority' file prefix.
columns = [i for i in clf_res_names if 'soup' in i]
print_scores(scores['g_mean_tree_minority'], 'g_mean_tree_soup_minority',columns=columns)

Unnamed: 0,soup3,soup5,soup7
1czysty-cut,0.9542,0.9567,0.9642
2delikatne-cut,0.67,0.7008,0.7183
3mocniej-cut,0.4292,0.4458,0.4483
4delikatne-bezover-cut,0.8833,0.9217,0.9425
balance-scale,0.3684,0.4509,0.4451
car,0.9344,0.9399,0.9639
cleveland,0.0701,0.0841,0.0842
cleveland_v2,0.0959,0.1017,0.1095
cmc,0.5067,0.5096,0.5321
dermatology,0.98,0.95,0.955


Unnamed: 0,Mean G-mean
soup7,0.6565
soup5,0.649011
soup3,0.636074


Unnamed: 0,Mean rank
soup7,1.578947
soup5,1.815789
soup3,2.605263


#### G-mean, kNN classifier (MDO variants)

In [39]:
# Overall G-mean of the kNN classifier, restricted to the MDO configurations.
# Tables are displayed and exported under the 'gmean_mdo_knn' file prefix.
columns = [i for i in clf_res_names if 'mdo' in i]
print_scores(scores['g_mean_knn'], 'gmean_mdo_knn',columns=columns)

Unnamed: 0,3mdo3,3mdo5,3mdo7,5mdo3,5mdo5,5mdo7,7mdo3,7mdo5,7mdo7
1czysty-cut,0.9781,0.9771,0.9777,0.9781,0.9777,0.9772,0.9794,0.9794,0.9795
2delikatne-cut,0.7937,0.7978,0.7835,0.7905,0.7762,0.7749,0.7562,0.7542,0.7551
3mocniej-cut,0.5799,0.5922,0.6064,0.5779,0.5748,0.567,0.5524,0.5443,0.5431
4delikatne-bezover-cut,0.8644,0.8703,0.8715,0.862,0.8616,0.8625,0.8407,0.8444,0.8478
balance-scale,0.6788,0.3931,0.3301,0.1485,0.1486,0.1572,0.1486,0.1486,0.1486
car,0.7812,0.7954,0.7835,0.7578,0.7547,0.7553,0.5218,0.5728,0.5724
cleveland,0.0754,0.115,0.1274,0.158,0.1059,0.0801,0.0694,0.0665,0.0665
cleveland_v2,0.0614,0.1223,0.1348,0.1381,0.0833,0.0689,0.0605,0.0568,0.0553
cmc,0.4566,0.4564,0.4556,0.4559,0.4561,0.4581,0.4586,0.4572,0.4575
dermatology,0.9578,0.9582,0.9582,0.9589,0.9582,0.9585,0.9579,0.9585,0.9572


Unnamed: 0,Mean G-mean
3mdo3,0.607847
3mdo5,0.598521
3mdo7,0.590832
5mdo3,0.584337
5mdo5,0.567026
5mdo7,0.560484
7mdo5,0.541416
7mdo3,0.540905
7mdo7,0.540732


Unnamed: 0,Mean rank
3mdo5,2.921053
5mdo3,3.315789
3mdo7,3.710526
3mdo3,3.947368
5mdo5,5.263158
5mdo7,5.526316
7mdo3,6.473684
7mdo5,6.842105
7mdo7,7.0


#### Minority-class G-mean, kNN classifier (MDO variants)

In [40]:
# G-mean computed on the minority-class labels only, kNN classifier, MDO configurations.
# Tables are displayed and exported under the 'gmean_mdo_knn_minority' file prefix.
columns = [i for i in clf_res_names if 'mdo' in i]
print_scores(scores['g_mean_knn_minority'], 'gmean_mdo_knn_minority',columns=columns)

Unnamed: 0,3mdo3,3mdo5,3mdo7,5mdo3,5mdo5,5mdo7,7mdo3,7mdo5,7mdo7
1czysty-cut,0.9992,0.9992,1.0,0.9992,0.9992,0.9983,0.995,0.9992,0.9983
2delikatne-cut,0.7167,0.725,0.6825,0.7033,0.6667,0.6533,0.6167,0.6025,0.6108
3mocniej-cut,0.4925,0.5025,0.4833,0.4917,0.41,0.405,0.3667,0.3508,0.3408
4delikatne-bezover-cut,0.8167,0.8458,0.8433,0.8142,0.8133,0.81,0.775,0.775,0.7817
balance-scale,0.566,0.1768,0.1267,0.0173,0.0173,0.0213,0.0173,0.0173,0.0173
car,0.7534,0.7825,0.7678,0.7275,0.7157,0.7219,0.3575,0.4265,0.4417
cleveland,0.0454,0.0757,0.0855,0.1111,0.0661,0.0459,0.0388,0.0367,0.0367
cleveland_v2,0.0314,0.0766,0.088,0.089,0.0392,0.0322,0.0296,0.0277,0.0269
cmc,0.4144,0.4175,0.4201,0.4192,0.4204,0.4265,0.4298,0.4277,0.4309
dermatology,0.98,0.98,0.98,0.98,0.98,0.98,0.98,0.98,0.98


Unnamed: 0,Mean G-mean
3mdo3,0.562653
3mdo5,0.549821
3mdo7,0.535463
5mdo3,0.534684
5mdo5,0.508411
5mdo7,0.499621
7mdo7,0.469953
7mdo5,0.469553
7mdo3,0.4686


Unnamed: 0,Mean rank
3mdo5,2.578947
3mdo7,3.184211
5mdo3,3.368421
3mdo3,3.947368
5mdo5,4.631579
5mdo7,5.789474
7mdo3,7.0
7mdo7,7.184211
7mdo5,7.315789


#### G-mean, decision tree (MDO variants)


In [41]:
# Overall G-mean of the decision tree, restricted to the MDO configurations.
# Tables are displayed and exported under the 'gmean_tree_mdo' file prefix.
columns = [i for i in clf_res_names if 'mdo' in i]
print_scores(scores['g_mean_tree'], 'gmean_tree_mdo',columns=columns)

Unnamed: 0,3mdo3,3mdo5,3mdo7,5mdo3,5mdo5,5mdo7,7mdo3,7mdo5,7mdo7
1czysty-cut,0.9646,0.9634,0.9616,0.9639,0.9626,0.9614,0.961,0.9603,0.9599
2delikatne-cut,0.7668,0.7597,0.7613,0.7729,0.7544,0.758,0.7423,0.74,0.7416
3mocniej-cut,0.5581,0.5661,0.594,0.557,0.5616,0.5586,0.5419,0.5415,0.535
4delikatne-bezover-cut,0.8186,0.8241,0.8207,0.8112,0.8144,0.8154,0.7999,0.8,0.8037
balance-scale,0.1704,0.1654,0.1588,0.1561,0.1562,0.1649,0.1562,0.1562,0.1562
car,0.941,0.9419,0.9357,0.9077,0.9076,0.9104,0.8966,0.9174,0.91
cleveland,0.0958,0.1159,0.1106,0.1452,0.1035,0.1028,0.158,0.1209,0.1246
cleveland_v2,0.111,0.1477,0.1186,0.146,0.1308,0.1236,0.1393,0.1069,0.1075
cmc,0.4441,0.4451,0.4472,0.4408,0.4447,0.4436,0.4458,0.4423,0.4408
dermatology,0.9362,0.9351,0.9369,0.939,0.9349,0.9363,0.9348,0.9376,0.9309


Unnamed: 0,Mean G-mean
3mdo5,0.622653
3mdo3,0.620721
3mdo7,0.618837
5mdo3,0.616595
5mdo5,0.610495
7mdo3,0.610016
5mdo7,0.609016
7mdo7,0.607337
7mdo5,0.604747


Unnamed: 0,Mean rank
3mdo5,2.894737
3mdo7,3.263158
3mdo3,3.473684
5mdo3,4.789474
5mdo7,5.210526
5mdo5,5.578947
7mdo3,6.342105
7mdo7,6.631579
7mdo5,6.815789


#### Minority-class G-mean, decision tree (MDO variants)

In [42]:
# G-mean computed on the minority-class labels only, decision tree, MDO configurations.
# Tables are displayed and exported under the 'gmean_tree_mdo_minority' file prefix.
columns = [i for i in clf_res_names if 'mdo' in i]
print_scores(scores['g_mean_tree_minority'], 'gmean_tree_mdo_minority',columns=columns)







Unnamed: 0,3mdo3,3mdo5,3mdo7,5mdo3,5mdo5,5mdo7,7mdo3,7mdo5,7mdo7
1czysty-cut,0.9633,0.9608,0.955,0.9633,0.9592,0.955,0.9533,0.9508,0.9508
2delikatne-cut,0.68,0.6717,0.6717,0.695,0.655,0.6442,0.61,0.5958,0.6067
3mocniej-cut,0.4958,0.485,0.4808,0.4883,0.4192,0.4058,0.3667,0.3667,0.3442
4delikatne-bezover-cut,0.7692,0.7858,0.775,0.7508,0.755,0.7608,0.7392,0.7258,0.7383
balance-scale,0.029,0.0272,0.0255,0.0232,0.0232,0.0272,0.0232,0.0232,0.0232
car,0.9132,0.9162,0.9076,0.864,0.8668,0.8672,0.8471,0.8831,0.8696
cleveland,0.0628,0.0808,0.0743,0.1063,0.0678,0.0701,0.1179,0.0839,0.087
cleveland_v2,0.0685,0.0988,0.0751,0.0973,0.0857,0.0807,0.0945,0.0646,0.0629
cmc,0.3568,0.3603,0.3754,0.3604,0.3711,0.3682,0.3619,0.3661,0.3603
dermatology,0.965,0.945,0.955,0.96,0.94,0.955,0.945,0.96,0.92


Unnamed: 0,Mean G-mean
3mdo5,0.580642
3mdo3,0.580374
3mdo7,0.573653
5mdo3,0.573374
5mdo5,0.559616
5mdo7,0.556589
7mdo3,0.553384
7mdo7,0.549784
7mdo5,0.548905


Unnamed: 0,Mean rank
3mdo5,3.052632
3mdo3,3.526316
3mdo7,3.947368
5mdo3,4.0
5mdo7,5.526316
5mdo5,5.868421
7mdo3,6.105263
7mdo7,6.394737
7mdo5,6.578947
