In [1]:
import os
# Pin every numerical backend (OpenBLAS, MKL, BLIS, OpenMP, TensorFlow) to a
# single thread and quieten TensorFlow (oneDNN optimisations off, C++ log
# level 3). These variables are typically read when the libraries are
# imported, so this statement must stay above the third-party imports below.
# NOTE(review): presumably done so the parallel evaluation workers do not
# oversubscribe the CPU - confirm.
os.environ.update({
    'OPENBLAS_NUM_THREADS': '1',
    'MKL_NUM_THREADS': '1',
    'BLIS_NUM_THREADS': '1',
    'OMP_NUM_THREADS': '1',
    'TF_NUM_INTEROP_THREADS': '1',
    'TF_NUM_INTRAOP_THREADS': '1',
    'TF_ENABLE_ONEDNN_OPTS': '0',
    'TF_CPP_MIN_LOG_LEVEL': '3',
})

import logging
import json

import pandas as pd

import smote_variants
from smote_variants import get_simplex_sampling_oversamplers
from smote_variants.evaluation import evaluate_oversamplers

import common_datasets
from common_datasets.binary_classification import get_filtered_data_loaders, get_data_loaders

# Raise the threshold of the 'smote_variants' logger so that only CRITICAL
# records are emitted during the evaluation.
logger = logging.getLogger('smote_variants')
logger.setLevel(logging.CRITICAL)

# NOTE(review): passing False presumably leaves warning suppression switched
# off, so warnings raised while oversampling stay visible - confirm against
# the smote_variants.config documentation.
# The second call is the cell's last expression, so its return value is what
# the notebook echoes below the cell.
smote_variants.config.suppress_external_warnings(False)
smote_variants.config.suppress_internal_warnings(False)

False

In [2]:
# Pick the oversampler classes to benchmark. The result is iterated later and
# each element's __name__ is used to build the oversampler specifications.
# NOTE(review): the arguments presumably select techniques that support random
# within-simplex sampling over 2-dimensional simplices, limited to the 50
# quickest - confirm against the smote_variants documentation.
oversamplers = get_simplex_sampling_oversamplers(within_simplex_sampling='random', 
                                                    n_dim_range=2, 
                                                    n_quickest=50)

In [3]:
# Echo the selected oversampler classes for inspection.
oversamplers

[smote_variants.oversampling._smote.SMOTE,
 smote_variants.oversampling._g_smote.G_SMOTE,
 smote_variants.oversampling._gaussian_smote.Gaussian_SMOTE,
 smote_variants.oversampling._borderline_smote.Borderline_SMOTE1,
 smote_variants.oversampling._cure_smote.CURE_SMOTE,
 smote_variants.oversampling._borderline_smote.Borderline_SMOTE2,
 smote_variants.oversampling._edge_det_smote.Edge_Det_SMOTE,
 smote_variants.oversampling._sl_graph_smote.SL_graph_SMOTE,
 smote_variants.oversampling._smmo.SMMO,
 smote_variants.oversampling._distance_smote.distance_SMOTE,
 smote_variants.oversampling._cbso.CBSO,
 smote_variants.oversampling._lee.Lee,
 smote_variants.oversampling._smote_tomeklinks.SMOTE_TomekLinks,
 smote_variants.oversampling._smote_rsb.SMOTE_RSB,
 smote_variants.oversampling._assembled_smote.Assembled_SMOTE,
 smote_variants.oversampling._ndo_sampling.NDO_sampling,
 smote_variants.oversampling._oups.OUPS,
 smote_variants.oversampling._nras.NRAS,
 smote_variants.oversampling._ismote.ISMOT

In [3]:
# Select the benchmark problems. The loaders get their own name instead of
# being stored in `datasets` and then overwritten, so the name `datasets`
# only ever refers to loaded data within this cell.
# NOTE(review): the bounds presumably restrict the number of columns, minority
# samples and total samples, keeping the 50 smallest problems sorted by size -
# confirm against the common_datasets documentation.
data_loaders = get_filtered_data_loaders(n_col_bounds=(2, 150),
                                         n_minority_bounds=(10, 10000),
                                         n_bounds=(50, 2500),
                                         n_smallest=50,
                                         sorting='n',
                                         n_from_phenotypes=3)

# Every loader is called without arguments to materialise its dataset; the
# loaded objects are later indexed by 'name'.
datasets = [load() for load in data_loaders]

In [4]:
# Classifier specifications handed to the evaluation, one triple per
# classifier: (module path, class name, constructor keyword arguments).
classifiers = [
    ('sklearn.neighbors', 'KNeighborsClassifier', dict(algorithm='brute', n_jobs=1)),
    ('sklearn.tree', 'DecisionTreeClassifier', dict(random_state=5)),
    ('sklearn.ensemble', 'RandomForestClassifier', dict(random_state=5)),
    ('sklearn.svm', 'SVC', dict(random_state=5, probability=True)),
    ('smote_variants.classifiers', 'MLPClassifierWrapper', dict(random_state=5)),
]

In [5]:
# Settings forwarded to the evaluation as `validator_params`.
validator_params = dict(n_repeats=2, n_splits=5, random_state=5)

# Simplex-sampling options used only by the deterministic oversampler variants.
ss_params = dict(within_simplex_sampling='deterministic',
                 simplex_sampling='deterministic')

# Parameters of the unmodified ("vanilla") oversamplers.
vanilla_params = dict(random_state=5, n_jobs=1)

# Same as the vanilla parameters plus the simplex-sampling options; the
# `ss_params` dict is referenced here, not copied.
deterministic_params = dict(random_state=5, ss_params=ss_params, n_jobs=1)

In [2]:
# Directory used as the evaluation cache: it is passed to the evaluation as
# `cache_path` and read back when the cached results are listed afterwards.
# '~' is expanded to the current user's home directory.
cache_path = os.path.expanduser('~/smote-deterministic')

In [7]:
# creating oversampler specifications
#
# Every selected oversampler is evaluated twice: once with the vanilla
# parameters and once with the deterministic simplex-sampling parameters.
# Each specification is a (package, class name, parameters) triple.

oversampler_list = [('smote_variants', cls.__name__, vanilla_params)
                    for cls in oversamplers]
oversampler_deterministic = [(package, name, deterministic_params)
                             for package, name, _ in oversampler_list]

all_oversamplers = [*oversampler_list, *oversampler_deterministic]

In [8]:
# Sanity check: number of oversampler specifications, then number of datasets,
# one per line.
print(len(all_oversamplers), len(datasets), sep='\n')

90
50


In [9]:
# Collect the 'name' entry of every loaded dataset and list the names in
# sorted order, one per line.
dataset_names = [loaded['name'] for loaded in datasets]

for dataset_name in sorted(dataset_names):
    print(dataset_name)

CM1
PC1
SPECTF
abalone-19_vs_10-11-12-13
abalone9_18
appendicitis
australian
bupa
car-vgood
car_good
cleveland-0_vs_4
crx
dermatology-6
ecoli1
ecoli2
ecoli3
flare-F
glass0
glass1
glass2
haberman
hepatitis
ionosphere
iris0
kddcup-guess_passwd_vs_satan
kddcup-land_vs_portsweep
kr-vs-k-zero_vs_eight
led7digit-0-2-4-6-7-8-9_vs_1
mammographic
monk-2
new_thyroid1
page-blocks-1-3_vs_4
pima
poker-8-9_vs_6
poker-8_vs_6
saheart
shuttle-6_vs_2-3
shuttle-c0-vs-c4
vehicle0
vehicle1
vehicle2
vowel0
wdbc
winequality-red-3_vs_5
winequality-red-4
winequality-red-8_vs_6
wisconsin
yeast1
yeast3
yeast4


In [10]:
# Run the full benchmark: every oversampler specification on every dataset
# with every classifier, after standard scaling. Long-running cell: the saved
# output below ends in a KeyboardInterrupt.
# - parse_results=False: the results are read back from `cache_path` by later
#   cells rather than from the return value.
# - timeout=90: NOTE(review) presumably a per-job limit in seconds; timed-out
#   jobs appear to be recorded as 'TimeOutError' in `oversampling_error` - confirm.
# - n_jobs=4: NOTE(review) presumably the number of parallel workers - confirm.
results = evaluate_oversamplers(datasets=datasets, 
                                oversamplers=all_oversamplers, 
                                classifiers=classifiers,
                                scaler=('sklearn.preprocessing', 'StandardScaler', {}),
                                validator_params=validator_params,
                                cache_path=cache_path,
                                parse_results=False,
                                clean_up=None,
                                timeout=90,
                                n_jobs=4)

2022-08-28 10:26:51.794956: processing dataset: appendicitis
starting  23437 Borderline_SMOTE2starting3
 23443 SN_SMOTE 2starting
 23463 E_SMOTE 1starting
 23481 Borderline_SMOTE1 1
starting 23511 ISMOTE 1
starting 23533 SMOTE_ENN 3starting
 23542 OUPS 3starting
 23558 NEATER 3
starting 23590 SMMOstarting  235962 
ADASYN 0
starting23628  SMMO 3
starting 23641 NRSBoundary_SMOTEstarting  236552 
MWMOTE 4
starting 23684 kmeans_SMOTE 4
starting23705  G_SMOTE 2
starting 23725starting 23732  SL_graph_SMOTEBorderline_SMOTE2  13starting
 23746 
CBSO 2
starting 23783 SOMO 3starting
 23791 Lee starting0 
23806 SMOTE_FRST_2Tstarting  223826
 ANS 4
starting 123858 CURE_SMOTE starting
 23868 startingLee  238861 
ASMOBD starting4 
23897 SL_graph_SMOTE 0
starting 23927 SMMO 2
starting 23954 SMOTE 
starting2 23969 MWMOTE 3
starting23984  SMOTE_ENN 0
starting 24005 startingMOT2LD  240171 
SMOTE_TomekLinks 2
starting 24049MOT2LD  4
starting 24068 ADASYN 4
starting 24088 NEATER 4
starting 24098 distance_

KeyboardInterrupt: 


starting 29537 NDO_sampling 4
starting 29556 SMMO 2
starting 29575 SVM_balance 3
starting 29594 CBSO 1
starting

In [2]:
# List the datasets that have results in the cache directory.
# NOTE: this rebinds `datasets`; from here on it is a dict mapping a dataset
# name to its cache directory (see the echoed value below), not the list of
# loaded datasets used for the evaluation.
datasets = smote_variants.evaluation.datasets_in_cache(cache_path)

In [3]:
# Echo the dataset-name -> cache-directory mapping for inspection.
datasets

{'car_good': '/home/gykovacs/smote-deterministic/car_good',
 'shuttle-c0-vs-c4': '/home/gykovacs/smote-deterministic/shuttle-c0-vs-c4',
 'cleveland-0_vs_4': '/home/gykovacs/smote-deterministic/cleveland-0_vs_4',
 'haberman': '/home/gykovacs/smote-deterministic/haberman',
 'abalone9_18': '/home/gykovacs/smote-deterministic/abalone9_18',
 'wisconsin': '/home/gykovacs/smote-deterministic/wisconsin',
 'vehicle0': '/home/gykovacs/smote-deterministic/vehicle0',
 'abalone-19_vs_10-11-12-13': '/home/gykovacs/smote-deterministic/abalone-19_vs_10-11-12-13',
 'poker-8_vs_6': '/home/gykovacs/smote-deterministic/poker-8_vs_6',
 'hepatitis': '/home/gykovacs/smote-deterministic/hepatitis',
 'flare-F': '/home/gykovacs/smote-deterministic/flare-F',
 'shuttle-6_vs_2-3': '/home/gykovacs/smote-deterministic/shuttle-6_vs_2-3',
 'pima': '/home/gykovacs/smote-deterministic/pima',
 'mammographic': '/home/gykovacs/smote-deterministic/mammographic',
 'dermatology-6': '/home/gykovacs/smote-deterministic/dermatol

In [4]:
# For every cached dataset: load its raw results, write a per-dataset summary
# CSV next to them, and keep only the error/warning bookkeeping columns for
# the combined analysis.
kept_columns = ['oversampling_error', 'dataset', 'oversampler', 'oversampling_warning']

all_data = []
for dataset, dataset_dir in datasets.items():
    print(dataset)
    data = smote_variants.evaluation.load_dataset_data(dataset_dir)

    summary = smote_variants.evaluation.create_summary(data)
    summary_path = os.path.join(dataset_dir, f'summary_{dataset}.csv')
    summary.to_csv(summary_path)

    # The dataset name is taken from each row's fold descriptor.
    data['dataset'] = data['fold_descriptor'].map(lambda descriptor: descriptor['name'])
    all_data.append(data[kept_columns])

car_good
shuttle-c0-vs-c4
cleveland-0_vs_4
haberman
abalone9_18
wisconsin
vehicle0
abalone-19_vs_10-11-12-13
poker-8_vs_6
hepatitis
flare-F
shuttle-6_vs_2-3
pima
mammographic
dermatology-6
kddcup-land_vs_portsweep
winequality-red-8_vs_6
ecoli2
winequality-red-3_vs_5
poker-8-9_vs_6
wdbc
vehicle1
vehicle2
car-vgood
kr-vs-k-zero_vs_eight
crx
yeast3
appendicitis
vowel0
glass1
saheart
bupa
winequality-red-4
ecoli3
monk-2
glass2
page-blocks-1-3_vs_4
ionosphere
kddcup-guess_passwd_vs_satan
led7digit-0-2-4-6-7-8-9_vs_1
yeast1
PC1
glass0
new_thyroid1
australian
iris0
CM1
ecoli1
SPECTF
yeast4


In [5]:
# Stack the per-dataset tables into one frame with a fresh 0..n-1 index.
pdf = pd.concat(all_data, ignore_index=True)

In [6]:
# Preview the first rows of the combined error/warning table.
pdf.head()

Unnamed: 0,oversampling_error,dataset,oversampler,oversampling_warning
0,,car_good,ANS,[]
1,,car_good,ANS,[]
2,,car_good,MOT2LD,[]
3,,car_good,kmeans_SMOTE,"[(""<class 'UserWarning'>"", UserWarning('kmeans..."
4,,car_good,SMOTE_RSB,[]


In [7]:
# Replace the index with a fresh 0..n-1 range, discarding the old one.
# NOTE(review): `pdf` already has such an index when it is built from the
# concatenated tables, so this cell appears to be redundant - confirm and remove.
pdf = pdf.reset_index(drop=True)

In [8]:
# Total number of result rows across all cached datasets.
len(pdf)

100000

In [9]:
# Distinct (error, dataset, oversampler, warning) combinations among the rows
# whose oversampling step recorded an error.
error_columns = ['oversampling_error', 'dataset', 'oversampler', 'oversampling_warning']
has_error = pdf['oversampling_error'].notnull()
pdf.loc[has_error, error_columns].drop_duplicates().values

array([], shape=(0, 4), dtype=object)

In [41]:
# Distinct (error, dataset, oversampler, warning) combinations among the rows
# whose oversampling step recorded an error.
# NOTE(review): this repeats the query of the previous cell, yet its saved
# output differs and its execution count is out of sequence; it presumably
# stems from a different run or cache state - confirm and drop one of the two.
error_columns = ['oversampling_error', 'dataset', 'oversampler', 'oversampling_warning']
has_error = pdf['oversampling_error'].notnull()
pdf.loc[has_error, error_columns].drop_duplicates().values

array([['TimeOutError', 'car_good', 'NEATER', None],
       ['TimeOutError', 'car_good', 'MSYN', None],
       ['TimeOutError', 'poker-8_vs_6', 'AMSCO', None],
       ['TimeOutError', 'mammographic', 'SOMO', None],
       ['TimeOutError', 'mammographic', 'ANS', None],
       ['TimeOutError', 'kddcup-land_vs_portsweep', 'MSYN', None],
       ['TimeOutError', 'kddcup-land_vs_portsweep', 'NRSBoundary_SMOTE',
        None],
       ['TimeOutError', 'poker-8-9_vs_6', 'AMSCO', None],
       ['TimeOutError', 'winequality-red-4', 'AMSCO', None],
       ['TimeOutError', 'yeast1', 'Supervised_SMOTE', None],
       ['TimeOutError', 'PC1', 'AMSCO', None]], dtype=object)