In [20]:
from collections import defaultdict
import numpy as np
import pandas as pd
from tqdm import tqdm
from IPython.core.display import display
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

from multi_imbalance.datasets import load_datasets
from multi_imbalance.resampling.SOUP import SOUP
from multi_imbalance.resampling.MDO import MDO
from multi_imbalance.resampling.GlobalCS import GlobalCS

from imblearn.metrics import geometric_mean_score
from imblearn.over_sampling import SMOTE
from multi_imbalance.resampling.SOUPBagging import SOUPBagging
from multi_imbalance.resampling.spider import SPIDER3

from sklearn.neighbors import KNeighborsClassifier
# Per-dataset grouping of class labels into majority ('maj'), intermediate
# ('int') and minority ('min') classes. resample_data() hands these lists to
# SPIDER3 when the 'spider' strategy is requested.
# NOTE(review): keys here use underscores (e.g. 'balance_scale') whereas the
# result tables in this notebook list names such as 'balance-scale' -- confirm
# the keys match what load_datasets() returns before enabling 'spider'.
maj_int_min = dict(
    balance_scale=dict(maj=[2, 1], int=[], min=[0]),
    cleveland=dict(maj=[0], int=[1], min=[2, 3, 4]),
    cmc=dict(maj=[0], int=[2], min=[1]),
    dermatology=dict(maj=[0], int=[2, 1, 4, 3], min=[5]),
    ecoli=dict(maj=[0, 1], int=[7, 4, 5], min=[6, 3, 2]),
    glass=dict(maj=[1, 0], int=[5], min=[2, 3, 4]),
    hayes_roth=dict(maj=[0, 1], int=[], min=[2]),
    new_thyroid=dict(maj=[0], int=[], min=[1, 2]),
    winequailty_red=dict(maj=[2, 3], int=[4], min=[1, 5, 0]),
    yeast=dict(maj=[0, 7], int=[6, 5], min=[4, 3, 2, 9, 8, 1]),
)
from IPython.display import clear_output
# Clear this cell's output area; presumably this hides messages emitted while
# importing the libraries above -- TODO confirm.
# NOTE(review): wait=True is expected to defer the clear until new output
# arrives -- confirm against the IPython documentation.
clear_output(wait=True)

In [21]:
def resample_data(resample, seed, X_train, y_train, no_classes, dataset_name):
    """Apply the requested resampling strategy to one training fold.

    Parameters
    ----------
    resample : str
        Strategy name: 'base' (no resampling), 'soup', 'global', 'smote',
        'mdo', 'spider', or any name containing 'soupbg'. For 'soupbg*' the
        data is returned untouched because SOUPBagging resamples by itself.
    seed : int
        Random state forwarded to SMOTE and MDO.
    X_train, y_train : array-like
        Training features and labels of the current fold.
    no_classes : int
        Number of classes; only used to size the SPIDER3 cost matrix.
    dataset_name : str
        Key into ``maj_int_min``; only used by the 'spider' strategy.

    Returns
    -------
    tuple
        ``(X_train_resampled, y_train_resampled)``.

    Raises
    ------
    ValueError
        If ``resample`` is not a known strategy (previously this surfaced as
        an UnboundLocalError at the return statement).
    KeyError
        If 'spider' is requested for a dataset missing from ``maj_int_min``.
    """
    if resample == 'base' or 'soupbg' in resample:
        # 'base' is the untouched baseline; SOUP Bagging resamples internally.
        return X_train, y_train

    if resample == 'soup':
        return SOUP().fit_transform(np.copy(X_train), np.copy(y_train))

    if resample == 'global':
        return GlobalCS().fit_transform(np.copy(X_train), np.copy(y_train), shuffle=False)

    if resample == 'smote':
        # fit_resample replaces the deprecated fit_sample alias.
        return SMOTE(random_state=seed).fit_resample(np.copy(X_train), np.copy(y_train))

    if resample == 'mdo':
        mdo = MDO(k=9, k1_frac=0.1, seed=seed)
        return mdo.fit_transform(np.copy(X_train), np.copy(y_train))

    if resample == 'spider':
        try:
            groups = maj_int_min[dataset_name]
        except KeyError:
            raise KeyError(
                "No majority/intermediate/minority class grouping defined for "
                "dataset {!r}".format(dataset_name)
            ) from None
        # Uniform misclassification cost: 1 off the diagonal, 0 on it.
        cost = np.ones((no_classes, no_classes))
        np.fill_diagonal(cost, 0)
        spider = SPIDER3(
            k=5,
            cost=cost,
            majority_classes=groups['maj'],
            intermediate_classes=groups['int'],
            minority_classes=groups['min'],
        )
        return spider.fit_transform(X_train.astype(np.float64), y_train)

    raise ValueError("Unknown resampling strategy: {!r}".format(resample))


def test_resampling(classifier, res, dataset_values, dataset_name):
    X, y = dataset_values.data, dataset_values.target
    no_classes = np.unique(y).size
    result_data = defaultdict(int)
    run_data = defaultdict(lambda: defaultdict(list)) # {metric: {run_number: [scores]}}
    for i in range(10):
        skf = StratifiedKFold(n_splits=5, shuffle=True,random_state=i)
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            
            X_train_resampled, y_train_resampled = resample_data(res, i, X_train, y_train, no_classes, dataset_name)
            
            if classifier == 'knn':
                clf = KNeighborsClassifier(n_neighbors=5)
            elif classifier == 'tree':
                clf = DecisionTreeClassifier(random_state=i)
                
            # DONT JUDGE ME
            if res == 'soupbg005':
                vote_classifier = SOUPBagging(clf, n_classifiers=5)
                clf = vote_classifier
            elif res == 'soupbg015':
                vote_classifier = SOUPBagging(clf, n_classifiers=15)
                clf = vote_classifier
            elif res == 'soupbg030':
                vote_classifier = SOUPBagging(clf, n_classifiers=30)
                clf = vote_classifier
            elif res == 'soupbg050':
                vote_classifier = SOUPBagging(clf, n_classifiers=50)
                clf = vote_classifier
            elif res == 'soupbg100':
                vote_classifier = SOUPBagging(clf, n_classifiers=100)
                clf = vote_classifier
            
            clf.fit(X_train_resampled, y_train_resampled)
            y_pred = clf.predict(X_test)
            gmean = geometric_mean_score(y_test, y_pred, correction=0.001)
            run_data['g_mean'][str(i)].append(gmean)
    
    def get_score_from_metric(run_data, metric):
        runs = run_data[metric]
        runs_scores_list = list(runs.values()) #[[one run k-foledscores],[..]]
        result = np.mean(list(map(np.mean, runs_scores_list)))
        return result
            
    result_data['g_mean'] = get_score_from_metric(run_data, 'g_mean')
    return result_data


In [22]:
def provide_test_and_get_scores(datasets, clf, resample_names=None):
    """Evaluate every resampling strategy on every dataset.

    Parameters
    ----------
    datasets : mapping
        ``{dataset_name: dataset_values}``; each value is passed on to
        ``test_resampling``.
    clf : str
        Classifier name forwarded to ``test_resampling`` ('knn' or 'tree').
    resample_names : iterable of str, optional
        Strategies to evaluate. Defaults to the ten strategies compared in
        this notebook.

    Returns
    -------
    defaultdict
        ``{dataset_name: {resample_name: g_mean rounded to 3 decimals}}``.
    """
    if resample_names is None:
        resample_names = ['base', 'global', 'smote', 'mdo', 'soup',
                          'soupbg005', 'soupbg015', 'soupbg030', 'soupbg050', 'soupbg100']
    scores = defaultdict(dict)
    for dataset_name, dataset_values in tqdm(datasets.items(), total=len(datasets)):
        for resample in resample_names:
            result_data = test_resampling(clf, resample, dataset_values, dataset_name)
            scores[dataset_name][resample] = round(result_data['g_mean'], 3)
    return scores

In [23]:
def print_scores(scores, only_read_dt=False, n_artificial=4):
    """Display the G-mean table plus per-strategy mean score and mean rank.

    Parameters
    ----------
    scores : mapping
        ``{dataset_name: {resample_name: g_mean}}``; datasets become rows and
        resampling strategies become columns.
    only_read_dt : bool, default False
        When True, drop the leading ``n_artificial`` rows so that only the
        remaining datasets are summarised.
    n_artificial : int, default 4
        Number of leading rows to skip when ``only_read_dt`` is True. The
        default matches the four '*-cut' datasets listed first in this
        notebook's tables.
    """
    display("G-MEAN")
    df = pd.DataFrame(scores).T
    if only_read_dt:
        # Relies on the artificial datasets coming first in `scores`.
        df = df.iloc[n_artificial:]
    display(df)

    mean_gmean = df.mean().sort_values(ascending=False)
    display(pd.DataFrame(mean_gmean, columns=['Mean G-mean']))

    # Rank strategies within each dataset (1 = best G-mean), then average.
    mean_rank = df.rank(axis=1, ascending=False).mean().sort_values()
    display(pd.DataFrame(mean_rank, columns=['Mean rank']))

In [24]:
# Load the benchmark data through multi_imbalance's load_datasets(); the result
# is iterated as a name -> dataset mapping by provide_test_and_get_scores().
datasets = load_datasets()


Testy dla drzewa decyzyjnego,
wszystkie zbiory danych:

In [25]:
# Evaluate every resampling strategy with the decision-tree classifier on all
# datasets, then show the result tables. The progress log below shows this
# cell running for more than three hours.
score = provide_test_and_get_scores(datasets, 'tree')
print_scores(score)




  0%|          | 0/17 [00:00<?, ?it/s][A[A

  6%|▌         | 1/17 [17:00<4:32:02, 1020.14s/it][A[A

 12%|█▏        | 2/17 [37:21<4:30:08, 1080.57s/it][A[A

 18%|█▊        | 3/17 [54:59<4:10:31, 1073.70s/it][A[A

 24%|██▎       | 4/17 [1:12:50<3:52:27, 1072.91s/it][A[A

 29%|██▉       | 5/17 [1:22:05<3:03:31, 917.64s/it] [A[A

 35%|███▌      | 6/17 [1:26:45<2:13:08, 726.18s/it][A[A

 41%|████      | 7/17 [1:31:11<1:38:03, 588.31s/it][A[A

 47%|████▋     | 8/17 [1:51:52<1:57:35, 783.93s/it][A[A

 53%|█████▎    | 9/17 [1:57:41<1:27:07, 653.41s/it][A[A

 59%|█████▉    | 10/17 [2:01:09<1:00:39, 519.92s/it][A[A

 65%|██████▍   | 11/17 [2:03:37<40:49, 408.25s/it]  [A[A

 71%|███████   | 12/17 [2:08:24<30:59, 371.85s/it][A[A

 76%|███████▋  | 13/17 [2:15:10<25:28, 382.21s/it][A[A

 82%|████████▏ | 14/17 [2:26:44<23:47, 475.74s/it][A[A

 88%|████████▊ | 15/17 [2:48:06<23:55, 717.55s/it][A[A

 94%|█████████▍| 16/17 [3:08:30<14:29, 869.59s/it][A

'G-MEAN'

Unnamed: 0,base,global,smote,mdo,soup,soupbg005,soupbg015,soupbg030,soupbg050,soupbg100
1czysty-cut,0.939,0.946,0.955,0.965,0.957,0.944,0.955,0.956,0.958,0.958
2delikatne-cut,0.698,0.699,0.744,0.772,0.795,0.787,0.793,0.8,0.798,0.802
3mocniej-cut,0.492,0.482,0.496,0.585,0.578,0.59,0.603,0.611,0.609,0.61
4delikatne-bezover-cut,0.771,0.768,0.815,0.83,0.894,0.883,0.887,0.889,0.892,0.891
balance-scale,0.154,0.123,0.168,0.162,0.621,0.559,0.598,0.612,0.642,0.648
cleveland,0.127,0.096,0.142,0.098,0.139,0.128,0.113,0.129,0.123,0.133
cleveland_v2,0.113,0.11,0.129,0.09,0.162,0.15,0.183,0.191,0.208,0.186
cmc,0.44,0.451,0.444,0.439,0.466,0.477,0.489,0.503,0.504,0.508
dermatology,0.925,0.94,0.946,0.948,0.933,0.91,0.935,0.951,0.959,0.962
glass,0.463,0.486,0.554,0.598,0.606,0.59,0.631,0.646,0.645,0.654


Unnamed: 0,Mean G-mean
soupbg100,0.666118
soupbg050,0.664059
soupbg030,0.658941
soupbg015,0.650235
soup,0.646353
soupbg005,0.629706
mdo,0.613706
smote,0.606824
base,0.582353
global,0.579412


Unnamed: 0,Mean rank
soupbg100,2.294118
soupbg050,2.852941
soupbg030,3.764706
soupbg015,5.294118
soup,5.352941
smote,5.558824
mdo,5.647059
soupbg005,7.705882
global,8.176471
base,8.352941


Drzewo, tylko rzeczywiste zbiory danych:

In [26]:
# Same decision-tree scores, but skipping the first four rows of the table
# (the '*-cut' datasets) so only the remaining datasets are summarised.
print_scores(score,only_read_dt=True)


'G-MEAN'

Unnamed: 0,base,global,smote,mdo,soup,soupbg005,soupbg015,soupbg030,soupbg050,soupbg100
balance-scale,0.154,0.123,0.168,0.162,0.621,0.559,0.598,0.612,0.642,0.648
cleveland,0.127,0.096,0.142,0.098,0.139,0.128,0.113,0.129,0.123,0.133
cleveland_v2,0.113,0.11,0.129,0.09,0.162,0.15,0.183,0.191,0.208,0.186
cmc,0.44,0.451,0.444,0.439,0.466,0.477,0.489,0.503,0.504,0.508
dermatology,0.925,0.94,0.946,0.948,0.933,0.91,0.935,0.951,0.959,0.962
glass,0.463,0.486,0.554,0.598,0.606,0.59,0.631,0.646,0.645,0.654
hayes-roth,0.837,0.843,0.841,0.842,0.835,0.771,0.815,0.838,0.838,0.836
new_ecoli,0.708,0.707,0.723,0.758,0.714,0.701,0.728,0.725,0.734,0.735
new_led7digit,0.754,0.757,0.762,0.753,0.76,0.75,0.767,0.767,0.768,0.768
new_vehicle,0.9,0.894,0.89,0.899,0.886,0.879,0.897,0.906,0.91,0.912


Unnamed: 0,Mean G-mean
soupbg100,0.620231
soupbg050,0.617846
soupbg030,0.611231
soupbg015,0.601231
soup,0.597231
soupbg005,0.577
smote,0.562
mdo,0.560077
base,0.538462
global,0.535


Unnamed: 0,Mean rank
soupbg100,2.346154
soupbg050,2.923077
soupbg030,4.0
smote,4.923077
soupbg015,5.346154
mdo,5.769231
soup,5.769231
global,7.846154
base,8.0
soupbg005,8.076923


Testy dla kNN (k=5),
wszystkie zbiory danych:

In [27]:
# Repeat the full evaluation with the k-nearest-neighbours classifier.
# Note: this rebinds `score`, replacing the decision-tree results computed above.
score = provide_test_and_get_scores(datasets, 'knn')
print_scores(score)




  0%|          | 0/17 [00:00<?, ?it/s][A[A

  6%|▌         | 1/17 [15:39<4:10:27, 939.24s/it][A[A

 12%|█▏        | 2/17 [31:15<3:54:33, 938.20s/it][A[A

 18%|█▊        | 3/17 [46:53<3:38:57, 938.37s/it][A[A

 24%|██▎       | 4/17 [1:02:30<3:23:11, 937.79s/it][A[A

 29%|██▉       | 5/17 [1:10:53<2:41:29, 807.49s/it][A[A

 35%|███▌      | 6/17 [1:15:20<1:58:17, 645.18s/it][A[A

 41%|████      | 7/17 [1:19:45<1:28:32, 531.22s/it][A[A

 47%|████▋     | 8/17 [1:39:21<1:48:41, 724.60s/it][A[A

 53%|█████▎    | 9/17 [1:44:46<1:20:38, 604.86s/it][A[A

 59%|█████▉    | 10/17 [1:48:02<56:14, 482.07s/it] [A[A

 65%|██████▍   | 11/17 [1:50:25<38:02, 380.43s/it][A[A

 71%|███████   | 12/17 [1:55:11<29:19, 351.99s/it][A[A

 76%|███████▋  | 13/17 [2:02:04<24:41, 370.28s/it][A[A

 82%|████████▏ | 14/17 [2:13:42<23:25, 468.62s/it][A[A

 88%|████████▊ | 15/17 [2:34:58<23:41, 710.94s/it][A[A

 94%|█████████▍| 16/17 [2:56:10<14:39, 879.21s/it][A[A

10

'G-MEAN'

Unnamed: 0,base,global,smote,mdo,soup,soupbg005,soupbg015,soupbg030,soupbg050,soupbg100
1czysty-cut,0.971,0.975,0.978,0.977,0.947,0.953,0.952,0.953,0.953,0.953
2delikatne-cut,0.704,0.76,0.761,0.801,0.789,0.792,0.794,0.792,0.791,0.789
3mocniej-cut,0.466,0.523,0.498,0.599,0.556,0.56,0.556,0.554,0.558,0.545
4delikatne-bezover-cut,0.812,0.852,0.861,0.875,0.889,0.891,0.891,0.891,0.89,0.89
balance-scale,0.193,0.267,0.42,0.684,0.687,0.628,0.672,0.691,0.706,0.707
cleveland,0.02,0.134,0.129,0.066,0.107,0.129,0.103,0.113,0.104,0.112
cleveland_v2,0.009,0.183,0.233,0.056,0.191,0.166,0.172,0.189,0.205,0.186
cmc,0.482,0.476,0.481,0.479,0.506,0.488,0.513,0.515,0.519,0.519
dermatology,0.843,0.849,0.849,0.854,0.817,0.789,0.809,0.821,0.821,0.824
glass,0.201,0.625,0.621,0.499,0.609,0.553,0.603,0.599,0.613,0.616


Unnamed: 0,Mean G-mean
soupbg050,0.639882
soupbg100,0.639235
soupbg030,0.637353
soup,0.633471
soupbg015,0.632824
smote,0.621118
soupbg005,0.616824
mdo,0.608765
global,0.590176
base,0.521412


Unnamed: 0,Mean rank
soupbg050,4.294118
soupbg100,4.323529
soupbg030,4.352941
smote,4.794118
mdo,5.294118
soupbg015,5.529412
global,5.676471
soup,5.882353
soupbg005,6.676471
base,8.176471


knn (k=5), tylko rzeczywiste zbiory danych:

In [28]:
# Same kNN scores, but skipping the first four rows of the table
# (the '*-cut' datasets) so only the remaining datasets are summarised.
print_scores(score,only_read_dt=True)

'G-MEAN'

Unnamed: 0,base,global,smote,mdo,soup,soupbg005,soupbg015,soupbg030,soupbg050,soupbg100
balance-scale,0.193,0.267,0.42,0.684,0.687,0.628,0.672,0.691,0.706,0.707
cleveland,0.02,0.134,0.129,0.066,0.107,0.129,0.103,0.113,0.104,0.112
cleveland_v2,0.009,0.183,0.233,0.056,0.191,0.166,0.172,0.189,0.205,0.186
cmc,0.482,0.476,0.481,0.479,0.506,0.488,0.513,0.515,0.519,0.519
dermatology,0.843,0.849,0.849,0.854,0.817,0.789,0.809,0.821,0.821,0.824
glass,0.201,0.625,0.621,0.499,0.609,0.553,0.603,0.599,0.613,0.616
hayes-roth,0.559,0.614,0.627,0.611,0.601,0.562,0.618,0.627,0.639,0.641
new_ecoli,0.814,0.775,0.807,0.824,0.817,0.801,0.81,0.82,0.82,0.828
new_led7digit,0.757,0.441,0.727,0.774,0.746,0.753,0.758,0.758,0.756,0.756
new_vehicle,0.849,0.863,0.859,0.852,0.822,0.81,0.82,0.824,0.824,0.825


Unnamed: 0,Mean G-mean
soupbg100,0.591538
soupbg050,0.591231
soupbg030,0.588077
soup,0.583692
soupbg015,0.581923
smote,0.573923
soupbg005,0.560769
mdo,0.545923
global,0.532538
base,0.454692


Unnamed: 0,Mean rank
soupbg100,3.769231
soupbg050,4.153846
smote,4.269231
soupbg030,4.307692
global,5.192308
soup,5.615385
soupbg015,5.884615
mdo,6.076923
soupbg005,7.653846
base,8.076923
