In [5]:
import os
from collections import Counter

import numpy as np
import pandas as pd
from IPython.core.display import display
from imblearn.datasets import fetch_datasets
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# from multi_imbalance.datasets import load_datasets
from multi_imbalance.resampling.SOUP import SOUP
from multi_imbalance.resampling.MDO import MDO as MDOMI
from multi_imbalance.resampling.GlobalCS import GlobalCS

from imblearn.metrics import geometric_mean_score
from imblearn.over_sampling import SMOTE

import smote_variants as sv

np.random.seed(0)

datasets = fetch_datasets()
# datasets = load_datasets()
results_g_mean = dict()
results_acc = dict()

for dataset_name, dataset_values in datasets.items():
    if len(dataset_values.data) > 1000:
        continue
    print(dataset_name)
    X, y = dataset_values.data, dataset_values.target
    X_train, X_test, y_train, y_test = train_test_split(X, y,shuffle=True, stratify=y, test_size=0.25, random_state=0)
    print(Counter(y_train))
    print(Counter(y_test))
    results_g_mean[dataset_name]=dict()
    results_acc[dataset_name]=dict()
    for resample in ['base','global','smote','soup','mdo-sv','mdo']:
        error_flag = False
        clf_tree = DecisionTreeClassifier(random_state=0)
        
        if resample == 'base':
            X_train_resampled, y_train_resampled = X_train, y_train
        elif resample=='soup':
            soup = SOUP()
            X_train_resampled, y_train_resampled = soup.fit_transform(np.copy(X_train), np.copy(y_train))
        elif resample=='global':
            global_cs = GlobalCS()
            X_train_resampled, y_train_resampled = global_cs.fit_transform(np.copy(X_train), np.copy(y_train))
        elif resample=='smote':
            try:
                smote = SMOTE()
                X_train_resampled, y_train_resampled = smote.fit_sample(np.copy(X_train), np.copy(y_train))
            except Exception as e:
                error_flag = True
                print(resample, dataset_name, e)
                X_train_resampled, y_train_resampled = X_train, y_train
        elif resample=='mdo':
            try:
                mdo = MDOMI(k=9)
                X_train_resampled, y_train_resampled = mdo.fit_transform(np.copy(dataset_values.data), np.copy(dataset_values.target))
            except Exception as e:
                error_flag = True
                print(resample, dataset_name, e)
                X_train_resampled, y_train_resampled = dataset_values.data, dataset_values.target
        elif resample=='mdo-sv':
            clf=sv.MDO(K2=9, K1_frac= 0, random_state=0)
            X_train_resampled, y_train_resampled = clf.sample(np.copy(dataset_values.data), np.copy(dataset_values.target))


        clf_tree.fit(X_train_resampled, y_train_resampled)
        y_pred = clf_tree.predict(X_test)
        g_mean = str(round(geometric_mean_score(y_test, y_pred, correction=0.001),3))
        acc = str(round(accuracy_score(y_test, y_pred),3))
        result_g_mean = '-' if error_flag else g_mean
        result_acc = '-'  if error_flag else acc
        
        results_g_mean[dataset_name][resample]=result_g_mean
        results_acc[dataset_name][resample]=result_acc

display("G-MEAN")
df = pd.DataFrame(results_g_mean).T
display(df)

display("ACC")
df = pd.DataFrame(results_acc).T
display(df)



2019-11-02 21:32:42,141:INFO:MDO: Running sampling via ('MDO', "{'proportion': 1.0, 'K2': 9, 'K1_frac': 0, 'n_jobs': 1, 'random_state': 0}")
2019-11-02 21:32:42,469:INFO:MDO: Running sampling via ('MDO', "{'proportion': 1.0, 'K2': 9, 'K1_frac': 0, 'n_jobs': 1, 'random_state': 0}")
2019-11-02 21:32:43,200:INFO:MDO: Running sampling via ('MDO', "{'proportion': 1.0, 'K2': 9, 'K1_frac': 0, 'n_jobs': 1, 'random_state': 0}")
2019-11-02 21:32:43,777:INFO:MDO: Running sampling via ('MDO', "{'proportion': 1.0, 'K2': 9, 'K1_frac': 0, 'n_jobs': 1, 'random_state': 0}")
2019-11-02 21:32:44,639:INFO:MDO: Running sampling via ('MDO', "{'proportion': 1.0, 'K2': 9, 'K1_frac': 0, 'n_jobs': 1, 'random_state': 0}")


ecoli
Counter({-1: 226, 1: 26})
Counter({-1: 75, 1: 9})
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1  1 -1 -1
  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1  1 -1 -1  1 -1 -1 -1 -1
  1 -1 -1  1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1  1 -1 -1
  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1  1 -1 -1  1 -1 -1 -1 -1
  1 -1 -1  1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
spectrometer
Counter({-1: 364, 1: 34})
Counter({-1: 122, 1: 11})
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1  1  1 -1  1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1
 -1  1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1  1  1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1


'G-MEAN'

Unnamed: 0,base,global,smote,soup,mdo-sv
ecoli,0.649,0.566,0.462,0.846,1.0
spectrometer,0.671,0.729,0.832,0.81,1.0
libras_move,0.699,0.913,0.994,0.907,1.0
arrhythmia,0.909,0.909,0.909,0.904,1.0
oil,0.54,0.542,0.533,0.621,1.0


'ACC'

Unnamed: 0,base,global,smote,soup,mdo-sv
ecoli,0.893,0.893,0.881,0.905,1.0
spectrometer,0.947,0.94,0.932,0.887,1.0
libras_move,0.944,0.989,0.989,0.978,1.0
arrhythmia,0.982,0.982,0.982,0.973,1.0
oil,0.945,0.949,0.919,0.94,1.0
