In [5]:

# Per-dataset grouping of class labels into majority ('maj'), intermediate
# ('int') and minority ('min') lists. The benchmark cell looks entries up by
# the dataset name yielded by load_datasets() and forwards the three lists to
# SPIDER3, so the keys (including the 'winequailty_red' spelling) must match
# those dataset names exactly.
maj_int_min = {
    name: dict(zip(('maj', 'int', 'min'), groups))
    for name, groups in (
        # dataset name        majority  intermediate  minority
        ('balance_scale',   ([2, 1],  [],           [0])),
        ('cleveland',       ([0],     [1],          [2, 3, 4])),
        ('cmc',             ([0],     [2],          [1])),
        ('dermatology',     ([0],     [2, 1, 4, 3], [5])),
        ('ecoli',           ([0, 1],  [7, 4, 5],    [6, 3, 2])),
        ('glass',           ([1, 0],  [5],          [2, 3, 4])),
        ('hayes_roth',      ([0, 1],  [],           [2])),
        ('new_thyroid',     ([0],     [],           [1, 2])),
        ('winequailty_red', ([2, 3],  [4],          [1, 5, 0])),
        ('yeast',           ([0, 7],  [6, 5],       [4, 3, 2, 9, 8, 1])),
    )
}

In [7]:
from collections import Counter
import numpy as np
import pandas as pd
from IPython.core.display import display
from sklearn.metrics import accuracy_score

from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

from multi_imbalance.datasets import load_datasets
from multi_imbalance.resampling.SOUP import SOUP
from multi_imbalance.resampling.MDO import MDO
from multi_imbalance.resampling.GlobalCS import GlobalCS

from imblearn.metrics import geometric_mean_score
from imblearn.over_sampling import SMOTE
from multi_imbalance.resampling.spider import SPIDER3

# Seed NumPy's global RNG before the benchmark starts.
np.random.seed(0)

datasets = load_datasets()
# Both result dicts are keyed by dataset name, then by resampling method.
results_g_mean = dict()
results_acc = dict()

for dataset_name, dataset_values in datasets.items():
    print(dataset_name)

    X, y = dataset_values.data, dataset_values.target

    # Datasets with more than 1000 samples are left out of the benchmark
    # (their name has already been printed above).
    if len(X) > 1000:
        continue

    results_g_mean[dataset_name] = dict()
    results_acc[dataset_name] = dict()

    for resample in ['base', 'global', 'smote', 'soup', 'mdo', 'spider']:

        # Folds are not shuffled, so the split is already deterministic.
        # random_state is omitted on purpose: it has no effect without
        # shuffle=True and recent scikit-learn raises ValueError for it.
        skf = StratifiedKFold(n_splits=4)
        acc, g_mean = list(), list()
        # Tracked per resampling method (not per fold): a single failed fold
        # means the averaged score would mix resampled and raw training data,
        # so the whole entry is reported as missing.
        error_flag = False
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf_tree = DecisionTreeClassifier(random_state=0)

            if resample == 'base':
                # Baseline: train on the untouched fold.
                X_train_resampled, y_train_resampled = X_train, y_train
            elif resample == 'soup':
                soup = SOUP()
                X_train_resampled, y_train_resampled = soup.fit_transform(np.copy(X_train), np.copy(y_train))
            elif resample == 'global':
                global_cs = GlobalCS()
                X_train_resampled, y_train_resampled = global_cs.fit_transform(np.copy(X_train), np.copy(y_train))
            elif resample == 'smote':
                try:
                    smote = SMOTE()
                    # fit_resample is the supported name; fit_sample was a
                    # deprecated alias removed from later imbalanced-learn.
                    X_train_resampled, y_train_resampled = smote.fit_resample(np.copy(X_train), np.copy(y_train))
                except Exception as e:
                    # Best effort: report the failure, keep the loop running on
                    # the raw fold, and invalidate this method's result below.
                    error_flag = True
                    print(resample, dataset_name, e)
                    X_train_resampled, y_train_resampled = X_train, y_train
            elif resample == 'mdo':
                mdo = MDO(k=9, k1_frac=0, seed=0)
                X_train_resampled, y_train_resampled = mdo.fit_transform(np.copy(X_train), np.copy(y_train))
            elif resample == 'spider':
                # Uniform misclassification cost: 1 off the diagonal, 0 on it.
                no_classes = np.unique(y).size
                cost = np.ones((no_classes, no_classes))
                np.fill_diagonal(cost, 0)
                class_roles = maj_int_min[dataset_name]
                clf = SPIDER3(k=5, cost=cost, majority_classes=class_roles['maj'], intermediate_classes=class_roles['int'], minority_classes=class_roles['min'])
                X_train_resampled, y_train_resampled = clf.fit_transform(X_train.astype(np.float64), y_train)

            clf_tree.fit(X_train_resampled, y_train_resampled)
            y_pred = clf_tree.predict(X_test)
            g_mean.append(geometric_mean_score(y_test, y_pred, correction=0.001))
            acc.append(accuracy_score(y_test, y_pred))

        # NaN (rather than None) keeps the result tables numeric, so the
        # median/mean aggregation below is well defined.
        result_g_mean = np.nan if error_flag else round(np.mean(g_mean), 3)
        result_acc = np.nan if error_flag else round(np.mean(acc), 3)

        results_g_mean[dataset_name][resample] = result_g_mean
        results_acc[dataset_name][resample] = result_acc

# Rows are datasets, columns are resampling methods.
display("G-MEAN")
df = pd.DataFrame(results_g_mean).T
display(df)

display("ACC")
df2 = pd.DataFrame(results_acc).T
display(df2)

# Missing scores are imputed with the column (method) median before averaging,
# so a method that failed on a dataset still gets a mean over all datasets.
display("MEAN G-MEAN")
df = df.fillna(df.median())
display(df.mean())

balance_scale
cleveland
cmc
dermatology
ecoli
smote ecoli Expected n_neighbors <= n_samples,  but n_samples = 1, n_neighbors = 6
smote ecoli Expected n_neighbors <= n_samples,  but n_samples = 1, n_neighbors = 6
smote ecoli Expected n_neighbors <= n_samples,  but n_samples = 2, n_neighbors = 6
smote ecoli Expected n_neighbors <= n_samples,  but n_samples = 2, n_neighbors = 6
glass
hayes_roth
new_thyroid
winequailty_red
yeast


  "recall")
  "recall")
  "recall")
  "recall")
  "recall")
  "recall")


'G-MEAN'

Unnamed: 0,base,global,smote,soup,mdo,spider
balance_scale,0.101,0.062,0.177,0.292,0.183,0.379
cleveland,0.204,0.133,0.116,0.069,0.167,0.089
dermatology,0.928,0.927,0.904,0.924,0.919,0.928
ecoli,0.185,0.353,,0.211,0.28,0.284
glass,0.285,0.259,0.37,0.479,0.337,0.593
hayes_roth,0.871,0.871,0.864,0.83,0.863,0.376
new_thyroid,0.893,0.863,0.894,0.933,0.947,0.927


'ACC'

Unnamed: 0,base,global,smote,soup,mdo,spider
balance_scale,0.605,0.589,0.619,0.575,0.592,0.581
cleveland,0.518,0.459,0.446,0.389,0.502,0.379
dermatology,0.937,0.932,0.918,0.919,0.918,0.932
ecoli,0.789,0.779,,0.709,0.759,0.744
glass,0.579,0.594,0.6,0.536,0.566,0.543
hayes_roth,0.857,0.857,0.85,0.818,0.849,0.388
new_thyroid,0.93,0.926,0.948,0.949,0.925,0.92


'MEAN G-MEAN'

base      0.495286
global    0.495429
smote     0.563143
soup      0.534000
mdo       0.528000
spider    0.510857
dtype: float64