In [2]:
from collections import defaultdict
import numpy as np
import pandas as pd
import tqdm
from IPython.core.display import display
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

from multi_imbalance.datasets import load_datasets
from multi_imbalance.resampling.SOUP import SOUP
from multi_imbalance.resampling.MDO import MDO
from multi_imbalance.resampling.GlobalCS import GlobalCS

from imblearn.metrics import geometric_mean_score
from imblearn.over_sampling import SMOTE
from multi_imbalance.resampling.SOUPBagging import SOUPBagging
from multi_imbalance.resampling.spider import SPIDER3

from sklearn.neighbors import KNeighborsClassifier
# Per-dataset grouping of class labels into majority ('maj'), intermediate
# ('int') and minority ('min') classes. The three lists are passed to SPIDER3
# as majority_classes / intermediate_classes / minority_classes, looked up by
# dataset name.
# NOTE(review): the result tables in this notebook show hyphenated dataset
# names (e.g. 'balance-scale', 'hayes-roth') while the keys here use
# underscores, and 'winequailty_red' looks misspelled -- confirm the keys
# match the names returned by load_datasets() before using the 'spider' option.
maj_int_min = {
    'balance_scale' : {
        'maj': [2, 1],
        'int': [],
        'min': [0]
    }, 
    'cleveland': {
        'maj': [0],
        'int': [1],
        'min': [2,3,4]
    }, 
    'cmc': {
        'maj': [0],
        'int': [2],
        'min': [1]
    }, 
    'dermatology': {
        'maj': [0],
        'int': [2,1,4,3],
        'min': [5]
    }, 
    'ecoli': {
        'maj': [0,1],
        'int': [7,4,5],
        'min': [6,3,2]
    }, 
    'glass': {
        'maj': [1,0],
        'int': [5],
        'min': [2,3,4]
    }, 
    'hayes_roth': {
        'maj': [0,1],
        'int': [],
        'min': [2]
    }, 
    'new_thyroid': {
        'maj': [0],
        'int': [],
        'min': [1,2]
    }, 
    'winequailty_red': {
        'maj': [2,3],
        'int': [4],
        'min': [1,5,0]
    }, 
    'yeast': {
        'maj': [0,7],
        'int': [6, 5],
        'min': [4,3,2,9,8,1]
    }
}
# Clear whatever this cell has printed so far (e.g. import-time noise).
from IPython.display import clear_output
clear_output(wait=True)

ModuleNotFoundError: No module named 'multi_imbalance'

In [48]:
def resample_data(resample, seed, X_train, y_train, no_classes, dataset_name):
    """Resample one training fold with the strategy named by ``resample``.

    Parameters
    ----------
    resample : str
        One of 'base', 'soup', 'global', 'smote', 'mdo', 'spider' or
        'soupbagging'. 'base' and 'soupbagging' return the training data
        untouched (SOUPBagging resamples internally when it is fitted).
    seed : int
        Random seed forwarded to SMOTE and MDO; the other strategies ignore it.
    X_train, y_train : array-like
        Training features and labels. The 'spider' branch calls
        ``X_train.astype``, so it needs a NumPy array.
    no_classes : int
        Number of classes; only used to size the SPIDER3 cost matrix.
    dataset_name : str
        Key into ``maj_int_min``; only used by the 'spider' strategy.

    Returns
    -------
    tuple
        ``(X_train_resampled, y_train_resampled)``.

    Raises
    ------
    ValueError
        If ``resample`` is not one of the supported strategy names.
    """
    if resample in ('base', 'soupbagging'):
        # Nothing to do here: 'base' is the no-resampling baseline and
        # SOUP Bagging does the resampling by itself.
        return X_train, y_train

    # Resamplers get copies so the caller's fold is never modified in place.
    if resample == 'soup':
        return SOUP().fit_transform(np.copy(X_train), np.copy(y_train))

    if resample == 'global':
        return GlobalCS().fit_transform(np.copy(X_train), np.copy(y_train), shuffle=False)

    if resample == 'smote':
        smote = SMOTE(random_state=seed)
        # fit_resample is the supported name; fit_sample was a deprecated
        # alias that newer imbalanced-learn releases removed.
        return smote.fit_resample(np.copy(X_train), np.copy(y_train))

    if resample == 'mdo':
        mdo = MDO(k=9, k1_frac=0.1, seed=seed)
        return mdo.fit_transform(np.copy(X_train), np.copy(y_train))

    if resample == 'spider':
        # Uniform misclassification cost: 1 everywhere except 0 on the diagonal.
        cost = np.ones((no_classes, no_classes))
        np.fill_diagonal(cost, 0)
        class_groups = maj_int_min[dataset_name]
        spider = SPIDER3(
            k=5,
            cost=cost,
            majority_classes=class_groups['maj'],
            intermediate_classes=class_groups['int'],
            minority_classes=class_groups['min'],
        )
        return spider.fit_transform(X_train.astype(np.float64), y_train)

    # Previously an unknown name fell through to `return` and surfaced as an
    # UnboundLocalError; fail with an explicit message instead.
    raise ValueError(
        "Unknown resampling strategy %r; expected one of 'base', 'soup', "
        "'global', 'smote', 'mdo', 'spider', 'soupbagging'" % (resample,)
    )


def test_resampling(classifier, res, dataset_values, dataset_name):
    X, y = dataset_values.data, dataset_values.target
    no_classes = np.unique(y).size
    result_data = defaultdict(int)
    run_data = defaultdict(lambda: defaultdict(list)) # {metric: {run_number: [scores]}}
    for i in range(1):
        skf = StratifiedKFold(n_splits=3, shuffle=True,random_state=i)
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            
            X_train_resampled, y_train_resampled = resample_data(res, i, X_train, y_train, no_classes, dataset_name)
            
            if classifier == 'knn':
                clf = KNeighborsClassifier(n_neighbors=5)
            elif classifier == 'tree':
                clf = DecisionTreeClassifier(random_state=i)
                
            if res == 'soupbagging':
                vote_classifier = SOUPBagging(clf, n_classifiers=5, seed=i)
                clf = vote_classifier
            
            clf.fit(X_train_resampled, y_train_resampled)
            y_pred = clf.predict(X_test)
            gmean = geometric_mean_score(y_test, y_pred, correction=0.001)
            run_data['g_mean'][str(i)].append(gmean)
    
    def get_score_from_metric(run_data, metric):
        runs = run_data[metric]
        runs_scores_list = list(runs.values()) #[[one run k-foledscores],[..]]
        result = np.mean(list(map(np.mean, runs_scores_list)))
        return result
            
    result_data['g_mean'] = get_score_from_metric(run_data, 'g_mean')
    return result_data


In [49]:
def provide_test_and_get_scores(datasets, clf, resamplers=('soup', 'soupbagging')):
    """Evaluate every resampler on every dataset and collect rounded G-means.

    Parameters
    ----------
    datasets : dict
        Mapping of dataset name to dataset object, as consumed by
        ``test_resampling``.
    clf : str
        Classifier name forwarded to ``test_resampling`` ('knn' or 'tree').
    resamplers : iterable of str, default ('soup', 'soupbagging')
        Resampling strategy names to compare.

    Returns
    -------
    collections.defaultdict
        ``scores[dataset_name][resampler]`` is the G-mean rounded to three
        decimals.
    """
    # The notebook does `import tqdm`, which binds the *module*; calling it
    # raises "TypeError: 'module' object is not callable". Import the
    # progress-bar callable explicitly under a name that cannot clash.
    from tqdm.auto import tqdm as progress_bar

    scores = defaultdict(dict)
    for dataset_name, dataset_values in progress_bar(datasets.items(), total=len(datasets)):
        for resample in resamplers:
            result_data = test_resampling(clf, resample, dataset_values, dataset_name)
            scores[dataset_name][resample] = round(result_data['g_mean'], 3)
    return scores

In [50]:
def print_scores(scores, only_read_dt = False):
    """Display the G-mean table plus per-resampler mean score and mean rank.

    ``scores`` is a nested mapping ``{dataset_name: {resampler: g_mean}}``.
    It is shown with one row per dataset and one column per resampler.
    When ``only_read_dt`` is true the first four rows are dropped before
    anything is displayed or aggregated (this assumes the first four datasets
    are the artificial ones -- verify against the dataset ordering).
    """
    display("G-MEAN")

    table = pd.DataFrame(scores).T
    if only_read_dt:
        table = table.iloc[4:]
    display(table)

    # Column-wise mean G-mean, best resampler first.
    mean_gmean = table.mean().sort_values(ascending=False)
    display(pd.DataFrame(mean_gmean, columns=['Mean G-mean']))

    # Rank resamplers within each dataset (rank 1 = highest G-mean), then
    # average the ranks per resampler; the lowest mean rank comes first.
    per_dataset_ranks = table.rank(axis=1, ascending=False)
    mean_rank = per_dataset_ranks.mean().sort_values()
    display(pd.DataFrame(mean_rank, columns=['Mean rank']))

In [51]:
# Load the benchmark datasets. The result is iterated below with .items(),
# yielding (name, dataset) pairs whose .data / .target attributes are read.
datasets = load_datasets()


Testy dla drzewa decyzyjnego, wszystkie zbiory danych:

In [52]:
# Run the comparison with the decision-tree classifier on every dataset,
# then show the G-mean table, mean G-mean and mean rank.
score = provide_test_and_get_scores(datasets, 'tree')
print_scores(score)


HBox(children=(IntProgress(value=0, max=17), HTML(value='')))




'G-MEAN'

Unnamed: 0,soup,soupbagging
1czysty-cut,0.95,0.936
2delikatne-cut,0.788,0.801
3mocniej-cut,0.582,0.604
4delikatne-bezover-cut,0.886,0.867
balance-scale,0.505,0.518
cleveland,0.099,0.169
cleveland_v2,0.271,0.068
cmc,0.494,0.477
dermatology,0.91,0.916
glass,0.679,0.638


Unnamed: 0,Mean G-mean
soup,0.651882
soupbagging,0.627176


Unnamed: 0,Mean rank
soup,1.294118
soupbagging,1.705882


Drzewo, tylko rzeczywiste zbiory danych:

In [53]:
# Same decision-tree scores, with the first four rows of the score table
# dropped (only_read_dt=True).
print_scores(score,only_read_dt=True)


'G-MEAN'

Unnamed: 0,soup,soupbagging
balance-scale,0.505,0.518
cleveland,0.099,0.169
cleveland_v2,0.271,0.068
cmc,0.494,0.477
dermatology,0.91,0.916
glass,0.679,0.638
hayes-roth,0.835,0.784
new_ecoli,0.747,0.698
new_led7digit,0.754,0.733
new_vehicle,0.884,0.871


Unnamed: 0,Mean G-mean
soup,0.605846
soupbagging,0.573385


Unnamed: 0,Mean rank
soup,1.230769
soupbagging,1.769231


Testy dla kNN (k=5), wszystkie zbiory danych:

In [54]:
# Repeat the comparison with the k-nearest-neighbours classifier.
# NOTE: this overwrites `score` from the decision-tree run above.
score = provide_test_and_get_scores(datasets, 'knn')
print_scores(score)


HBox(children=(IntProgress(value=0, max=17), HTML(value='')))




'G-MEAN'

Unnamed: 0,soup,soupbagging
1czysty-cut,0.94,0.946
2delikatne-cut,0.792,0.785
3mocniej-cut,0.583,0.563
4delikatne-bezover-cut,0.886,0.878
balance-scale,0.674,0.628
cleveland,0.106,0.088
cleveland_v2,0.248,0.159
cmc,0.517,0.464
dermatology,0.799,0.759
glass,0.608,0.571


Unnamed: 0,Mean G-mean
soup,0.636529
soupbagging,0.606941


Unnamed: 0,Mean rank
soup,1.176471
soupbagging,1.823529


knn (k=5), tylko rzeczywiste zbiory danych:

In [55]:
# Same kNN scores, with the first four rows of the score table dropped
# (only_read_dt=True).
print_scores(score,only_read_dt=True)

'G-MEAN'

Unnamed: 0,soup,soupbagging
balance-scale,0.674,0.628
cleveland,0.106,0.088
cleveland_v2,0.248,0.159
cmc,0.517,0.464
dermatology,0.799,0.759
glass,0.608,0.571
hayes-roth,0.566,0.52
new_ecoli,0.814,0.775
new_led7digit,0.755,0.758
new_vehicle,0.818,0.796


Unnamed: 0,Mean G-mean
soup,0.586154
soupbagging,0.549692


Unnamed: 0,Mean rank
soup,1.153846
soupbagging,1.846154


In [1]:
# Scratch cell: renders a bare ipywidgets progress bar (value 10, max 100).
# IntProgress is not referenced by any other code cell in this notebook.
from ipywidgets import IntProgress
IntProgress(10,max=100)

IntProgress(value=10)