In [18]:
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import recall_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer
from IPython.core.display import display
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm_notebook

from multi_imbalance.ensemble.SOUPBagging import SOUPBagging
from multi_imbalance.ensemble.mrbbagging import MRBBagging
from multi_imbalance.resampling.SOUP import SOUP
from multi_imbalance.resampling.MDO import MDO
from multi_imbalance.resampling.GlobalCS import GlobalCS

from imblearn.metrics import geometric_mean_score
from imblearn.over_sampling import SMOTE
from multi_imbalance.resampling.spider import SPIDER3

from sklearn.neighbors import KNeighborsClassifier
import warnings
import logging
from multi_imbalance.utils.data import load_arff_datasets
from multi_imbalance.utils.min_int_maj import maj_int_min
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
warnings.filterwarnings('ignore')

from IPython.display import clear_output
clear_output(wait=True)

In [19]:
# def green_valid_backgroud(s):
#     correct = ['1czysty-cut', '2delikatne-cut', '3mocniej-cut','4delikatne-bezover-cut', 'cmc', 'dermatology', 'new_ecoli','new_vehicle','thyroid-newthyroid']
#     return ['background-color: green' if v in correct else '' for v in list(s.index)]
# 


def bold_max(s):
    '''
    highlight the maximum in a Series yellow.
    '''
    is_max = s == s.max()
    return ['font-weight: bold' if v else '' for v in is_max]
    
def print_scores(scores, name, only_read_dt = False, columns=None, base=None):
    df = pd.DataFrame(scores).T
    if only_read_dt:
        df = df.iloc[4:]
    if columns is not None:
        df = df[columns]
    if base is not None:
        df = pd.merge(base,df, left_index=True, right_index=True)
    df2 = df.style.apply(bold_max, axis=1)
    display(df2)
    
    with open(f'{name}_main.tex','w') as tf:
        tf.write(df.to_latex())
    df.to_csv(f'{name}_main.csv')
    
    df.fillna(df.median(), inplace=True)
    df_median = pd.DataFrame(df.mean().sort_values(ascending=False),columns=['Mean G-mean'])
    display(df_median)
    df_meanrank = pd.DataFrame(df.rank(axis=1,ascending=False).mean().sort_values(),columns=['Mean rank'])
    display(df_meanrank)
    
    with open(f'{name}_median.tex','w') as tf:
        tf.write(df_median.to_latex())
    df_median.to_csv(f'{name}_median.csv')
    with open(f'{name}_meanrank.tex','w') as tf:
        tf.write(df_meanrank.to_latex())
    df_meanrank.to_csv(f'{name}_meanrank.csv')
# print_scores(scores_knn)

In [20]:
def resample_data(resample, seed, X_train, y_train, no_classes, dataset_name):
    if resample == 'base' or resample=='bagging':
        X_train_resampled, y_train_resampled = X_train, y_train
    elif 'soup' in resample:
        soup = SOUP(k=7)
        X_train_resampled, y_train_resampled = soup.fit_transform(np.copy(X_train), np.copy(y_train), maj_int_min=maj_int_min[dataset_name])
    elif resample=='global':
        global_cs = GlobalCS()
        X_train_resampled, y_train_resampled = global_cs.fit_transform(np.copy(X_train), np.copy(y_train), shuffle=False)
    elif resample=='smote':
        smote = SMOTE(random_state=seed)
        X_train_resampled, y_train_resampled = smote.fit_sample(np.copy(X_train), np.copy(y_train))
    elif 'mdo' in resample:
        mdo = MDO(k=3, k1_frac=.4, seed=seed)
        X_train_resampled, y_train_resampled = mdo.fit_transform(np.copy(X_train), np.copy(y_train), maj_int_min=maj_int_min[dataset_name])
    elif resample=='spider':
        cost = np.ones((no_classes, no_classes))
        np.fill_diagonal(cost, 0)
        clf = SPIDER3(k=5, cost=cost, majority_classes=maj_int_min[dataset_name]['maj'], intermediate_classes=maj_int_min[dataset_name]['int'], minority_classes=maj_int_min[dataset_name]['min'])
        X_train_resampled, y_train_resampled = clf.fit_transform(X_train.astype(np.float64), y_train)
    elif 'soupbg' in resample or 'mrbbag' in resample:
        # SOUP Bagging does it by itself
        X_train_resampled, y_train_resampled = X_train, y_train
    else:
        raise ValueError(f'Bad type{resample}')
    return X_train_resampled, y_train_resampled



def test_resampling(res, dataset_values, dataset_name):
    X, y, scale_index = dataset_values.data, dataset_values.target, dataset_values.cat_length

    no_classes = np.unique(y).size
    minority_class = maj_int_min[dataset_name]['min']
    result_data = defaultdict(int)
    run_data = defaultdict(lambda: defaultdict(list)) # {metric: {run_number: [scores]}}
    for i in range(10):
        skf = StratifiedKFold(n_splits=5, shuffle=True,random_state=i)
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            if scale_index > 0:
                normalizer = StandardScaler().fit(X_train[:,:scale_index])
    
                X_train[:,:scale_index] = normalizer.transform(X_train[:,:scale_index])
                X_test[:,:scale_index] = normalizer.transform(X_test[:,:scale_index])
            X_train_resampled, y_train_resampled = resample_data(res, i, X_train, y_train, no_classes, dataset_name)

            # for clf_name in ['knn']:
            for clf_name in ['tree']:
                if clf_name == 'knn':
                    clf = KNeighborsClassifier(n_neighbors=5)
                elif clf_name == 'tree':
                    clf = DecisionTreeClassifier(random_state=i)
                    
                if  'soupbg005' in res:
                    vote_classifier = SOUPBagging(clf, maj_int_min[dataset_name], n_classifiers=5)
                    clf = vote_classifier
                elif  'soupbg015' in res:
                    vote_classifier = SOUPBagging(clf, maj_int_min[dataset_name], n_classifiers=15)
                    clf = vote_classifier
                elif  'soupbg030' in res:
                    vote_classifier = SOUPBagging(clf, maj_int_min[dataset_name], n_classifiers=30)
                    clf = vote_classifier
                elif  'soupbg050' in res:
                    vote_classifier = SOUPBagging(clf, maj_int_min[dataset_name], n_classifiers=50)
                    clf = vote_classifier
                elif  'soupbg100' in res:
                    vote_classifier = SOUPBagging(clf, maj_int_min[dataset_name], n_classifiers=100)
                    clf = vote_classifier
                elif res == 'bagging':
                    vote_classifier = BaggingClassifier(base_estimator=clf, n_estimators=50)
                    clf = vote_classifier
                # elif res == 'mrbbag005':
                    

                clf.fit(X_train_resampled, y_train_resampled)
                if 'soupbg' in res:
                    for strategy in ['average','optimistic','pessimistic','mixed', 'global']:
                        y_pred = clf.predict(X_test, strategy=strategy, maj_int_min=maj_int_min[dataset_name])
                        gmean = geometric_mean_score(y_test, y_pred, correction=0.001)
                        minority_gmean = geometric_mean_score(y_test, y_pred,labels=minority_class, correction=0.001)
                        avg_acc = np.mean(recall_score(y_test, y_pred, average=None))
                        run_data['g_mean_{}_{}'.format(clf_name, strategy)][str(i)].append(gmean)
                        run_data['g_mean_{}_minority_{}'.format(clf_name, strategy)][str(i)].append(minority_gmean)
                else:
                    y_pred = clf.predict(X_test)
                    gmean = geometric_mean_score(y_test, y_pred, correction=0.001)
                    minority_gmean = geometric_mean_score(y_test, y_pred,labels=minority_class, correction=0.001)
                    avg_acc = np.mean(recall_score(y_test, y_pred, average=None))
                    run_data['g_mean_{}'.format(clf_name)][str(i)].append(gmean)
                    run_data['g_mean_{}_minority'.format(clf_name)][str(i)].append(minority_gmean)
                # run_data['avg_acc_{}'.format(clf_name)][str(i)].append(avg_acc)
    
    def get_score_from_metric(run_data, metric):
        runs = run_data[metric]
        runs_scores_list = list(runs.values()) #[[one run k-foledscores],[..]]
        result = np.mean(list(map(np.mean, runs_scores_list)))
        return result
            
    for metric_name, metric_values in run_data.items():
        result_data[metric_name] = get_score_from_metric(run_data, metric_name)
        
    return result_data


def provide_test_and_get_scores(datasets, clf_res_names):
    scores = defaultdict(lambda: defaultdict(dict))
    for dataset_name, dataset_values in tqdm_notebook(datasets.items(),total=len(datasets), desc='1st loop'):
        for resample in clf_res_names:
            print(resample)
            result_data = test_resampling(resample, dataset_values, dataset_name)
            for key in result_data:
                scores[key][dataset_name][resample] = round(result_data[key],4)
    return scores

clf_res_names =['base','bagging','soupbg005','soupbg015','soupbg030','soupbg050','soupbg100']
datasets = load_arff_datasets(return_cat_length=True)
scores = provide_test_and_get_scores(datasets, clf_res_names)

HBox(children=(IntProgress(value=0, description='1st loop', max=19, style=ProgressStyle(description_width='ini…

base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
so

### Kfold - 5, powtórzone 10 razy
#### Drzewo, głosowanie: average, soup k = 7, miara Gmean

In [24]:
# scores
base = pd.DataFrame(scores['g_mean_tree']).T
columns = [i for i in clf_res_names if 'soupbg' in i]
print_scores(scores['g_mean_tree_average'], name='g_mean_tree_average_comparision', columns=columns, base=base)


Unnamed: 0,base,bagging,soupbg005,soupbg015,soupbg030,soupbg050,soupbg100
1czysty-cut,0.9391,0.9458,0.9589,0.9591,0.9591,0.96,0.9591
2delikatne-cut,0.6976,0.7206,0.7966,0.7978,0.7993,0.8001,0.7983
3mocniej-cut,0.492,0.4723,0.5867,0.5927,0.5911,0.5865,0.5865
4delikatne-bezover-cut,0.7711,0.7863,0.8932,0.8937,0.8936,0.8946,0.8935
balance-scale,0.1562,0.118,0.6564,0.6689,0.6646,0.6686,0.6723
car,0.9271,0.9488,0.8967,0.9044,0.9027,0.9054,0.9028
cleveland,0.1188,0.0678,0.11,0.1149,0.1287,0.1277,0.1292
cleveland_v2,0.133,0.0699,0.1756,0.1747,0.1679,0.1574,0.169
cmc,0.4424,0.4772,0.5023,0.515,0.5118,0.5183,0.5196
dermatology,0.9274,0.958,0.9395,0.9447,0.9436,0.9452,0.9469


Unnamed: 0,Mean G-mean
soupbg030,0.677879
soupbg100,0.677732
soupbg050,0.677326
soupbg015,0.675679
soupbg005,0.669074
bagging,0.595116
base,0.594337


Unnamed: 0,Mean rank
soupbg050,2.789474
soupbg100,2.789474
soupbg030,3.157895
soupbg015,3.526316
bagging,5.0
soupbg005,5.0
base,5.736842


#### Drzewo, głosowanie: average, soup k = 7, miara Gmean ale tylko dla klas mniejszościowych

In [26]:
base = pd.DataFrame(scores['g_mean_tree_minority']).T
columns = [i for i in clf_res_names if 'soupbg' in i]
print_scores(scores['g_mean_tree_minority_average'],name='g_mean_tree_minority_average_comparision', columns=columns, base=base)




Unnamed: 0,base,bagging,soupbg005,soupbg015,soupbg030,soupbg050,soupbg100
1czysty-cut,0.9142,0.905,0.9775,0.9767,0.975,0.9767,0.9742
2delikatne-cut,0.5367,0.5492,0.7342,0.73,0.7408,0.7383,0.7342
3mocniej-cut,0.3133,0.2633,0.4867,0.4833,0.4808,0.4733,0.4733
4delikatne-bezover-cut,0.6858,0.7008,0.9525,0.9483,0.9508,0.9492,0.9508
balance-scale,0.0232,0.0091,0.6096,0.6189,0.6144,0.624,0.6364
car,0.8989,0.9228,0.9817,0.99,0.9885,0.9908,0.9878
cleveland,0.0812,0.0378,0.0748,0.0802,0.0926,0.0905,0.0927
cleveland_v2,0.0891,0.0337,0.1261,0.1251,0.1178,0.1098,0.121
cmc,0.364,0.361,0.5372,0.5357,0.5354,0.5366,0.5395
dermatology,0.91,0.96,0.965,0.98,0.97,0.965,0.97


Unnamed: 0,Mean G-mean
soupbg030,0.6863
soupbg100,0.686063
soupbg050,0.684495
soupbg015,0.683821
soupbg005,0.675632
base,0.530068
bagging,0.525621


Unnamed: 0,Mean rank
soupbg100,2.684211
soupbg030,2.736842
soupbg050,3.026316
soupbg015,3.184211
soupbg005,3.657895
bagging,6.315789
base,6.394737


#### Porównanie różnych głosowań dla tree i SOUP Bagging - 100 klasyfikatorów, miara Gmean

In [27]:
# scores
base = pd.DataFrame(scores['g_mean_tree']).T
metrices = [k for k in scores if 'tree' in k and 'minority' not in k][1:]
# metrices
for metric in metrices:
    temp_df = pd.DataFrame(pd.DataFrame(scores[metric]).T['soupbg100'])
    temp_df.columns = [metric.split('_')[-1]]
    base = pd.merge(base, temp_df, left_index=True, right_index=True)
print_scores(base.T,name='g_mean_tree_voting',)

Unnamed: 0,base,bagging,average,optimistic,pessimistic,mixed,global
1czysty-cut,0.9391,0.9458,0.9591,0.9028,0.9028,0.9336,0.9588
2delikatne-cut,0.6976,0.7206,0.7983,0.7495,0.6419,0.7561,0.7988
3mocniej-cut,0.492,0.4723,0.5865,0.4878,0.2897,0.4582,0.5866
4delikatne-bezover-cut,0.7711,0.7863,0.8935,0.8701,0.8701,0.8726,0.8939
balance-scale,0.1562,0.118,0.6723,0.4345,0.4223,0.4223,0.663
car,0.9271,0.9488,0.9028,0.7498,0.7386,0.7419,0.9031
cleveland,0.1188,0.0678,0.1292,0.01,0.004,0.0093,0.1297
cleveland_v2,0.133,0.0699,0.169,0.018,0.0056,0.0194,0.1651
cmc,0.4424,0.4772,0.5196,0.0691,0.0506,0.054,0.5199
dermatology,0.9274,0.958,0.9469,0.7716,0.6204,0.6561,0.9467


Unnamed: 0,Mean G-mean
average,0.677732
global,0.673753
bagging,0.595116
base,0.594337
optimistic,0.453684
mixed,0.453221
pessimistic,0.387158


Unnamed: 0,Mean rank
average,1.894737
global,1.947368
bagging,3.315789
base,3.736842
optimistic,5.184211
mixed,5.236842
pessimistic,6.684211


#### Porównanie różnych głosowań dla tree i SOUP Bagging - 100 klasyfikatorów, Gmean dla klas mniejszościowych


In [28]:
base = pd.DataFrame(scores['g_mean_tree']).T
metrices = [k for k in scores if 'tree' in k and 'minority' in k][1:]
# metrices
for metric in metrices:
    temp_df = pd.DataFrame(pd.DataFrame(scores[metric]).T['soupbg100'])
    temp_df.columns = [metric.split('_')[-1]]
    base = pd.merge(base, temp_df, left_index=True, right_index=True)
print_scores(base.T,name='g_mean_tree_voting_minority',)

Unnamed: 0,base,bagging,average,optimistic,pessimistic,mixed,global
1czysty-cut,0.9391,0.9458,0.9742,0.8417,0.8417,1.0,0.9733
2delikatne-cut,0.6976,0.7206,0.7342,0.79,0.4933,0.9558,0.7283
3mocniej-cut,0.492,0.4723,0.4733,0.57,0.1351,0.9225,0.4517
4delikatne-bezover-cut,0.7711,0.7863,0.9508,0.86,0.86,0.995,0.9508
balance-scale,0.1562,0.118,0.6364,0.986,0.998,0.998,0.5527
car,0.9271,0.9488,0.9878,0.7298,0.7102,0.9274,0.9878
cleveland,0.1188,0.0678,0.0927,0.0032,0.001,0.0056,0.0927
cleveland_v2,0.133,0.0699,0.121,0.005,0.001,0.01,0.1162
cmc,0.4424,0.4772,0.5395,0.3132,0.1217,0.9883,0.5308
dermatology,0.9274,0.958,0.97,0.7,0.7,0.995,0.97


Unnamed: 0,Mean G-mean
mixed,0.751816
average,0.686063
global,0.672695
bagging,0.595116
base,0.594337
optimistic,0.463495
pessimistic,0.394937


Unnamed: 0,Mean rank
average,2.263158
mixed,2.447368
global,2.947368
base,4.210526
bagging,4.210526
optimistic,5.473684
pessimistic,6.447368


In [29]:
import json


json = json.dumps(scores)
f = open("scores_tree.json","w")
f.write(json)
f.close()
