In [30]:
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import recall_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer
from IPython.core.display import display
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm_notebook

from multi_imbalance.ensemble.soup_bagging import SOUPBagging
from multi_imbalance.resampling.soup import SOUP
from multi_imbalance.resampling.mdo import MDO
from multi_imbalance.resampling.global_cs import GlobalCS

from imblearn.metrics import geometric_mean_score
from imblearn.over_sampling import SMOTE
from multi_imbalance.resampling.spider import SPIDER3

from sklearn.neighbors import KNeighborsClassifier
import warnings
import logging
from multi_imbalance.utils.data import load_arff_datasets
from multi_imbalance.utils.min_int_maj import maj_int_min
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
warnings.filterwarnings('ignore')

from IPython.display import clear_output
clear_output(wait=True)

In [31]:
# def green_valid_backgroud(s):
#     correct = ['1czysty-cut', '2delikatne-cut', '3mocniej-cut','4delikatne-bezover-cut', 'cmc', 'dermatology', 'new_ecoli','new_vehicle','thyroid-newthyroid']
#     return ['background-color: green' if v in correct else '' for v in list(s.index)]
# 


def bold_max(s):
    '''
    highlight the maximum in a Series yellow.
    '''
    is_max = s == s.max()
    return ['font-weight: bold' if v else '' for v in is_max]
    
def print_scores(scores, name, only_read_dt = False, columns=None, base=None):
    df = pd.DataFrame(scores).T
    if only_read_dt:
        df = df.iloc[4:]
    if columns is not None:
        df = df[columns]
    if base is not None:
        df = pd.merge(base,df, left_index=True, right_index=True)
    df2 = df.style.apply(bold_max, axis=1)
    display(df2)
    
    with open(f'{name}_main.tex','w') as tf:
        tf.write(df.to_latex())
    df.to_csv(f'{name}_main.csv')
    
    df.fillna(df.median(), inplace=True)
    df_median = pd.DataFrame(df.mean().sort_values(ascending=False),columns=['Mean G-mean'])
    display(df_median)
    df_meanrank = pd.DataFrame(df.rank(axis=1,ascending=False).mean().sort_values(),columns=['Mean rank'])
    display(df_meanrank)
    
    with open(f'{name}_median.tex','w') as tf:
        tf.write(df_median.to_latex())
    df_median.to_csv(f'{name}_median.csv')
    with open(f'{name}_meanrank.tex','w') as tf:
        tf.write(df_meanrank.to_latex())
    df_meanrank.to_csv(f'{name}_meanrank.csv')
# print_scores(scores_knn)

In [32]:
def resample_data(resample, seed, X_train, y_train, no_classes, dataset_name):
    if resample == 'base' or resample=='bagging':
        X_train_resampled, y_train_resampled = X_train, y_train
    elif 'soup' in resample:
        soup = SOUP(k=7)
        X_train_resampled, y_train_resampled = soup.fit_transform(np.copy(X_train), np.copy(y_train), maj_int_min=maj_int_min[dataset_name])
    elif resample=='global':
        global_cs = GlobalCS()
        X_train_resampled, y_train_resampled = global_cs.fit_transform(np.copy(X_train), np.copy(y_train), shuffle=False)
    elif resample=='smote':
        smote = SMOTE(random_state=seed)
        X_train_resampled, y_train_resampled = smote.fit_sample(np.copy(X_train), np.copy(y_train))
    elif 'mdo' in resample:
        mdo = MDO(k=3, k1_frac=.4, seed=seed)
        X_train_resampled, y_train_resampled = mdo.fit_transform(np.copy(X_train), np.copy(y_train), maj_int_min=maj_int_min[dataset_name])
    elif resample=='spider':
        cost = np.ones((no_classes, no_classes))
        np.fill_diagonal(cost, 0)
        clf = SPIDER3(k=5, cost=cost, majority_classes=maj_int_min[dataset_name]['maj'], intermediate_classes=maj_int_min[dataset_name]['int'], minority_classes=maj_int_min[dataset_name]['min'])
        X_train_resampled, y_train_resampled = clf.fit_transform(X_train.astype(np.float64), y_train)
    elif 'soupbg' in resample or 'mrbbag' in resample:
        # SOUP Bagging does it by itself
        X_train_resampled, y_train_resampled = X_train, y_train
    else:
        raise ValueError(f'Bad type{resample}')
    return X_train_resampled, y_train_resampled



def test_resampling(res, dataset_values, dataset_name):
    X, y, scale_index = dataset_values.data, dataset_values.target, dataset_values.cat_length

    no_classes = np.unique(y).size
    minority_class = maj_int_min[dataset_name]['min']
    result_data = defaultdict(int)
    run_data = defaultdict(lambda: defaultdict(list)) # {metric: {run_number: [scores]}}
    for i in range(10):
        skf = StratifiedKFold(n_splits=5, shuffle=True,random_state=i)
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            if scale_index > 0:
                normalizer = StandardScaler().fit(X_train[:,:scale_index])
    
                X_train[:,:scale_index] = normalizer.transform(X_train[:,:scale_index])
                X_test[:,:scale_index] = normalizer.transform(X_test[:,:scale_index])
            X_train_resampled, y_train_resampled = resample_data(res, i, X_train, y_train, no_classes, dataset_name)

            # for clf_name in ['knn']:
            for clf_name in ['knn']:
                if clf_name == 'knn':
                    clf = KNeighborsClassifier(n_neighbors=5)
                elif clf_name == 'tree':
                    clf = DecisionTreeClassifier(random_state=i)
                    
                if  'soupbg005' in res:
                    vote_classifier = SOUPBagging(clf, maj_int_min[dataset_name], n_classifiers=5)
                    clf = vote_classifier
                elif  'soupbg015' in res:
                    vote_classifier = SOUPBagging(clf, maj_int_min[dataset_name], n_classifiers=15)
                    clf = vote_classifier
                elif  'soupbg030' in res:
                    vote_classifier = SOUPBagging(clf, maj_int_min[dataset_name], n_classifiers=30)
                    clf = vote_classifier
                elif  'soupbg050' in res:
                    vote_classifier = SOUPBagging(clf, maj_int_min[dataset_name], n_classifiers=50)
                    clf = vote_classifier
                elif  'soupbg100' in res:
                    vote_classifier = SOUPBagging(clf, maj_int_min[dataset_name], n_classifiers=100)
                    clf = vote_classifier
                elif res == 'bagging':
                    vote_classifier = BaggingClassifier(base_estimator=clf, n_estimators=50)
                    clf = vote_classifier
                # elif res == 'mrbbag005':
                    

                clf.fit(X_train_resampled, y_train_resampled)
                if 'soupbg' in res:
                    for strategy in ['average','optimistic','pessimistic','mixed', 'global']:
                        y_pred = clf.predict(X_test, strategy=strategy, maj_int_min=maj_int_min[dataset_name])
                        gmean = geometric_mean_score(y_test, y_pred, correction=0.001)
                        minority_gmean = geometric_mean_score(y_test, y_pred,labels=minority_class, correction=0.001)
                        avg_acc = np.mean(recall_score(y_test, y_pred, average=None))
                        run_data['g_mean_{}_{}'.format(clf_name, strategy)][str(i)].append(gmean)
                        run_data['g_mean_{}_minority_{}'.format(clf_name, strategy)][str(i)].append(minority_gmean)
                else:
                    y_pred = clf.predict(X_test)
                    gmean = geometric_mean_score(y_test, y_pred, correction=0.001)
                    minority_gmean = geometric_mean_score(y_test, y_pred,labels=minority_class, correction=0.001)
                    avg_acc = np.mean(recall_score(y_test, y_pred, average=None))
                    run_data['g_mean_{}'.format(clf_name)][str(i)].append(gmean)
                    run_data['g_mean_{}_minority'.format(clf_name)][str(i)].append(minority_gmean)
                # run_data['avg_acc_{}'.format(clf_name)][str(i)].append(avg_acc)
    
    def get_score_from_metric(run_data, metric):
        runs = run_data[metric]
        runs_scores_list = list(runs.values()) #[[one run k-foledscores],[..]]
        result = np.mean(list(map(np.mean, runs_scores_list)))
        return result
            
    for metric_name, metric_values in run_data.items():
        result_data[metric_name] = get_score_from_metric(run_data, metric_name)
        
    return result_data


def provide_test_and_get_scores(datasets, clf_res_names):
    scores = defaultdict(lambda: defaultdict(dict))
    for dataset_name, dataset_values in tqdm_notebook(datasets.items(),total=len(datasets), desc='1st loop'):
        for resample in clf_res_names:
            print(resample)
            result_data = test_resampling(resample, dataset_values, dataset_name)
            for key in result_data:
                scores[key][dataset_name][resample] = round(result_data[key],4)
    return scores

clf_res_names =['base','bagging','soupbg005','soupbg015','soupbg030','soupbg050','soupbg100']
datasets = load_arff_datasets(return_cat_length=True)
scores = provide_test_and_get_scores(datasets, clf_res_names)

HBox(children=(IntProgress(value=0, description='1st loop', max=19, style=ProgressStyle(description_width='ini…

base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
soupbg100
base
bagging
soupbg005
soupbg015
soupbg030
soupbg050
so

### Kfold - 5, powtórzone 10 razy
#### KNN, głosowanie: average, soup k = 7, miara Gmean

In [38]:
# scores
base = pd.DataFrame(scores['g_mean_knn']).T
columns = [i for i in clf_res_names if 'soupbg' in i]
print_scores(scores['g_mean_knn_average'], name='g_mean_knn_average_comparision', columns=columns, base=base)


Unnamed: 0,base,bagging,soupbg005,soupbg015,soupbg030,soupbg050,soupbg100
1czysty-cut,0.9709,0.9717,0.9503,0.9507,0.9514,0.9506,0.9508
2delikatne-cut,0.7028,0.7075,0.7881,0.7877,0.7883,0.7886,0.7891
3mocniej-cut,0.4668,0.4686,0.5539,0.5485,0.5515,0.5474,0.5476
4delikatne-bezover-cut,0.8103,0.8097,0.8896,0.8907,0.8903,0.8906,0.8909
balance-scale,0.1486,0.1248,0.6878,0.691,0.6858,0.6857,0.6871
car,0.43,0.7832,0.6608,0.6663,0.6718,0.673,0.6723
cleveland,0.0652,0.1022,0.213,0.2179,0.2133,0.2147,0.2145
cleveland_v2,0.0533,0.0689,0.251,0.2577,0.2525,0.2495,0.2503
cmc,0.4551,0.4757,0.4764,0.4829,0.4816,0.481,0.4811
dermatology,0.9588,0.9595,0.9354,0.9436,0.9418,0.944,0.9426


Unnamed: 0,Mean G-mean
soupbg030,0.6485
soupbg050,0.648463
soupbg100,0.648458
soupbg015,0.6478
soupbg005,0.644605
bagging,0.549663
base,0.518363


Unnamed: 0,Mean rank
soupbg100,2.894737
soupbg015,2.947368
soupbg030,2.947368
soupbg050,3.526316
soupbg005,4.578947
bagging,5.052632
base,6.052632


#### Knn, głosowanie: average, soup k = 7, miara Gmean ale tylko dla klas mniejszościowych

In [39]:
base = pd.DataFrame(scores['g_mean_knn_minority']).T
columns = [i for i in clf_res_names if 'soupbg' in i]
print_scores(scores['g_mean_knn_minority_average'],name='g_mean_knn_minority_average_comparision', columns=columns, base=base)




Unnamed: 0,base,bagging,soupbg005,soupbg015,soupbg030,soupbg050,soupbg100
1czysty-cut,0.9517,0.9508,1.0,1.0,1.0,1.0,1.0
2delikatne-cut,0.515,0.5092,0.785,0.7942,0.7967,0.8,0.8058
3mocniej-cut,0.26,0.2492,0.5625,0.5675,0.5808,0.5742,0.5875
4delikatne-bezover-cut,0.7067,0.7117,0.9808,0.9825,0.9825,0.9833,0.9842
balance-scale,0.0173,0.0091,0.7113,0.732,0.7253,0.7169,0.7251
car,0.2611,0.6695,0.9015,0.9133,0.9272,0.9259,0.9272
cleveland,0.0358,0.0644,0.1711,0.1758,0.1719,0.173,0.1723
cleveland_v2,0.0259,0.0371,0.2011,0.2089,0.2043,0.201,0.2019
cmc,0.4024,0.401,0.5481,0.5426,0.5463,0.5454,0.5454
dermatology,0.98,0.995,0.995,1.0,0.995,1.0,0.995


Unnamed: 0,Mean G-mean
soupbg100,0.697763
soupbg030,0.695574
soupbg050,0.694542
soupbg015,0.693942
soupbg005,0.687426
bagging,0.468489
base,0.432532


Unnamed: 0,Mean rank
soupbg100,2.289474
soupbg030,2.578947
soupbg050,3.0
soupbg015,3.078947
soupbg005,4.131579
bagging,6.236842
base,6.684211


#### Porównanie różnych głosowań dla knn i SOUP Bagging - 100 klasyfikatorów, miara Gmean

In [35]:
# scores
base = pd.DataFrame(scores['g_mean_knn']).T
metrices = [k for k in scores if 'knn' in k and 'minority' not in k][1:]
# metrices
for metric in metrices:
    temp_df = pd.DataFrame(pd.DataFrame(scores[metric]).T['soupbg100'])
    temp_df.columns = [metric.split('_')[-1]]
    base = pd.merge(base, temp_df, left_index=True, right_index=True)
print_scores(base.T,name='g_mean_knn_voting',)

Unnamed: 0,base,bagging,average,optimistic,pessimistic,mixed,global
1czysty-cut,0.9709,0.9717,0.9508,0.9644,0.9652,0.935,0.9516
2delikatne-cut,0.7028,0.7075,0.7891,0.792,0.7374,0.7802,0.789
3mocniej-cut,0.4668,0.4686,0.5476,0.5566,0.4764,0.5262,0.5489
4delikatne-bezover-cut,0.8103,0.8097,0.8909,0.8918,0.8929,0.8801,0.8913
balance-scale,0.1486,0.1248,0.6871,0.6461,0.6621,0.5644,0.6861
car,0.43,0.7832,0.6723,0.6586,0.5268,0.3515,0.7196
cleveland,0.0652,0.1022,0.2145,0.1494,0.0485,0.1345,0.213
cleveland_v2,0.0533,0.0689,0.2503,0.2207,0.1471,0.2051,0.2487
cmc,0.4551,0.4757,0.4811,0.4551,0.3723,0.1863,0.4819
dermatology,0.9588,0.9595,0.9426,0.9402,0.9008,0.9016,0.9457


Unnamed: 0,Mean G-mean
global,0.652279
average,0.648458
optimistic,0.621526
pessimistic,0.55
bagging,0.549663
mixed,0.539874
base,0.518363


Unnamed: 0,Mean rank
global,2.078947
average,2.289474
optimistic,3.578947
bagging,4.263158
base,5.236842
pessimistic,5.236842
mixed,5.315789


#### Porównanie różnych głosowań dla knn i SOUP Bagging - 100 klasyfikatorów, Gmean dla klas mniejszościowych


In [36]:
base = pd.DataFrame(scores['g_mean_knn']).T
metrices = [k for k in scores if 'knn' in k and 'minority' in k][1:]
# metrices
for metric in metrices:
    temp_df = pd.DataFrame(pd.DataFrame(scores[metric]).T['soupbg100'])
    temp_df.columns = [metric.split('_')[-1]]
    base = pd.merge(base, temp_df, left_index=True, right_index=True)
print_scores(base.T,name='g_mean_knn_voting_minority',)

Unnamed: 0,base,bagging,average,optimistic,pessimistic,mixed,global
1czysty-cut,0.9709,0.9717,1.0,1.0,1.0,1.0,1.0
2delikatne-cut,0.7028,0.7075,0.8058,0.8425,0.6725,0.96,0.7825
3mocniej-cut,0.4668,0.4686,0.5875,0.61,0.3608,0.8808,0.5533
4delikatne-bezover-cut,0.8103,0.8097,0.9842,0.955,0.9525,1.0,0.9833
balance-scale,0.1486,0.1248,0.7251,0.8136,0.9313,0.974,0.7249
car,0.43,0.7832,0.9272,0.8614,0.5714,0.8862,0.9156
cleveland,0.0652,0.1022,0.1723,0.1081,0.0261,0.1206,0.1722
cleveland_v2,0.0533,0.0689,0.2019,0.1644,0.0977,0.1725,0.2019
cmc,0.4551,0.4757,0.5454,0.602,0.3565,0.9468,0.5532
dermatology,0.9588,0.9595,0.995,0.975,0.97,1.0,0.995


Unnamed: 0,Mean G-mean
mixed,0.801526
global,0.697963
average,0.697763
optimistic,0.649358
pessimistic,0.557621
bagging,0.549663
base,0.518363


Unnamed: 0,Mean rank
mixed,1.526316
average,2.578947
global,2.684211
optimistic,3.868421
bagging,5.315789
pessimistic,5.763158
base,6.263158


In [37]:
import json


json = json.dumps(scores)
f = open("scores_knn.json","w")
f.write(json)
f.close()
