In [4]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold

from imblearn.metrics import geometric_mean_score
from sklearn.metrics import balanced_accuracy_score
import statistics as stats

# Se importan las librerías de AdaBoost y de árboles de decisión 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from imblearn.metrics import geometric_mean_score
from sklearn.metrics import balanced_accuracy_score
from imblearn.ensemble import RUSBoostClassifier

from imblearn.datasets import fetch_datasets

In [5]:
def obtain_data(dataset_name):
    """Fetch a single imbalanced-learn benchmark dataset by name.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset, used both to filter the fetch and as the key
        into the mapping returned by ``fetch_datasets``.

    Returns
    -------
    tuple
        ``(data, target)`` attributes of the selected dataset.
    """
    # Ask only for the requested dataset instead of loading the complete
    # benchmark collection and discarding everything but one entry.
    # NOTE(review): with the filter, an unknown name is expected to be
    # rejected by ``fetch_datasets`` itself rather than by the dict lookup.
    dataset = fetch_datasets(filter_data=(dataset_name,))[dataset_name]
    return dataset.data, dataset.target

def convert_classes(y):
    """Relabel a binary target array in place: majority -> -1, minority -> 1.

    Parameters
    ----------
    y : numpy.ndarray
        Target vector holding exactly two distinct labels. It is modified
        in place.

    Returns
    -------
    tuple
        ``([maj_class, min_class], maj_class, min_class)``, i.e.
        ``([-1, 1], -1, 1)``.

    Raises
    ------
    ValueError
        If ``y`` does not contain exactly two distinct labels.
    """
    maj_class = -1
    min_class = 1

    default_classes = np.unique(y)
    if len(default_classes) != 2:
        raise ValueError(
            "convert_classes expects a binary target, got {} distinct label(s)".format(
                len(default_classes)
            )
        )

    # Compute both masks BEFORE writing anything. Relabelling sequentially
    # breaks when an original label coincides with a target label (e.g. the
    # labels are already {-1, 1} but 1 is the majority): the first write would
    # merge both classes and the second write would then overwrite everything.
    first_mask = y == default_classes[0]
    second_mask = y == default_classes[1]

    # On a tie the second label (in sorted order) is treated as the majority.
    if np.count_nonzero(first_mask) > np.count_nonzero(second_mask):
        maj_mask, min_mask = first_mask, second_mask
    else:
        maj_mask, min_mask = second_mask, first_mask

    y[maj_mask] = maj_class
    y[min_mask] = min_class

    return [maj_class, min_class], maj_class, min_class

def train(X_train, y_train, method_name, base_classifier, T, random_state=None):
    """Build and fit a boosting ensemble on the given training data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed straight to ``fit``.
    method_name : str
        ``'adaboost'`` for ``AdaBoostClassifier`` or ``'RUSBoost'`` for
        ``RUSBoostClassifier`` (which is built with
        ``sampling_strategy='majority'``).
    base_classifier
        Base learner handed to the ensemble.
    T : int
        Number of boosting rounds (``n_estimators``).
    random_state : optional
        Forwarded to the ensemble constructor; defaults to ``None``.

    Returns
    -------
    The fitted ensemble classifier.

    Raises
    ------
    ValueError
        If ``method_name`` is not one of the supported names.
    """
    # The base learner is passed positionally: it is the first constructor
    # argument of both ensembles, while its keyword name differs between
    # library versions (``base_estimator`` vs ``estimator``).
    if method_name == 'adaboost':
        clf = AdaBoostClassifier(
            base_classifier, n_estimators=T, random_state=random_state
        )
    elif method_name == 'RUSBoost':
        clf = RUSBoostClassifier(
            base_classifier,
            n_estimators=T,
            sampling_strategy='majority',
            random_state=random_state,
        )
    else:
        # Fail with a clear message instead of an UnboundLocalError on ``clf``.
        raise ValueError(
            "Unknown method_name {!r}; expected 'adaboost' or 'RUSBoost'".format(
                method_name
            )
        )

    clf.fit(X_train, y_train)
    return clf

def gmean_test(clf, X_test, y_test):
    """Return the geometric mean score of ``clf`` on the test split, in percent.

    Parameters
    ----------
    clf
        Fitted classifier exposing ``predict``.
    X_test, y_test
        Held-out features and their true labels.

    Returns
    -------
    The value of ``geometric_mean_score(y_test, y_pred)`` multiplied by 100.
    """
    # Only the G-mean is returned, so the plain and balanced accuracies that
    # used to be computed here (and discarded) are no longer evaluated; this
    # also avoids the extra prediction pass done by ``clf.score``.
    y_pred = clf.predict(X_test)
    return geometric_mean_score(y_test, y_pred) * 100

def train_ensemble_method(dataset_name,method_name, T=10, k=5):
    """Cross-validate a boosting ensemble on one benchmark dataset.

    The dataset is loaded with ``obtain_data``, its labels are normalised with
    ``convert_classes``, and a depth-1 entropy decision tree is boosted with
    ``train`` on each fold of a ``StratifiedKFold`` split. Each fold is scored
    with ``gmean_test``. Progress and results are printed.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset to load.
    method_name : str
        Ensemble identifier forwarded to ``train``.
    T : int, default 10
        Number of boosting rounds.
    k : int, default 5
        Number of stratified folds.

    Returns
    -------
    tuple
        ``(rend, IR)``: the mean of the per-fold G-mean scores and the
        imbalance ratio ``n_maj / n_min`` of the dataset.
    """
    # Fetch data from the dataset
    X, y = obtain_data(dataset_name)
    print("Dataset of size {}".format(X.shape))

    # Normalise labels to -1 (majority class) and 1 (minority class)
    _, maj_class, min_class = convert_classes(y)

    # Class counts and imbalance ratio; counting on the label mask avoids
    # copying the selected rows of X just to read their number.
    n_maj = int(np.count_nonzero(y == maj_class))
    n_min = int(np.count_nonzero(y == min_class))
    IR = n_maj/n_min
    print("There are {} instances for the majoritary class".format(n_maj))
    print("There are {} instanes for the minoritary class".format(n_min))
    print("IR of the dataset: ",IR)

    # Base learner: a decision stump using the entropy criterion
    dtc = DecisionTreeClassifier(criterion='entropy', max_depth=1)

    kf = StratifiedKFold(n_splits=k)

    # One G-mean score per fold
    gmean = []
    for train_index, test_index in kf.split(X,y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf = train(X_train, y_train, method_name, dtc, T)
        gmean.append(gmean_test(clf, X_test, y_test))

    print(gmean)
    rend = stats.mean(gmean)
    print("Rendimiento del clasificador {}: {}".format(method_name,rend))
    return rend, IR
    

In [6]:
# Names of the benchmark datasets evaluated below, in evaluation order.
imblearn_datasets = [
    'ecoli', 'optical_digits', 'satimage', 'pen_digits', 'abalone',
    'sick_euthyroid', 'spectrometer', 'car_eval_34', 'isolet', 'us_crime',
    'yeast_ml8', 'scene', 'libras_move', 'thyroid_sick', 'coil_2000',
    'arrhythmia', 'solar_flare_m0', 'oil', 'car_eval_4', 'wine_quality',
    'letter_img', 'yeast_me2', 'webpage', 'ozone_level', 'mammography',
    'protein_homo', 'abalone_19',
]

In [7]:
# Evaluate RUSBoost on every benchmark dataset, keeping the mean G-mean and
# the imbalance ratio of each one in two parallel lists.
rendimientos, IRlista = [], []
for dataset in imblearn_datasets:
    print(dataset.upper())
    r, IR = train_ensemble_method(dataset, method_name='RUSBoost')
    rendimientos += [r]
    IRlista += [IR]
    print()


ECOLI
Dataset of size (336, 7)
There are 301 instances for the majoritary class
There are 35 instanes for the minoritary class
IR of the dataset:  8.6
[91.82000516742599, 92.58200997725514, 77.15167498104596, 60.9449400220044, 91.80725150319788]
Rendimiento del clasificador RUSBoost: 82.86117633018587

OPTICAL_DIGITS
Dataset of size (5620, 64)
There are 5066 instances for the majoritary class
There are 554 instanes for the minoritary class
IR of the dataset:  9.144404332129964
[90.8687376818712, 93.04400199759924, 90.58563674138676, 90.19010305483883, 82.46845374588622]
Rendimiento del clasificador RUSBoost: 89.43138664431645

SATIMAGE
Dataset of size (6435, 36)
There are 5809 instances for the majoritary class
There are 626 instanes for the minoritary class
IR of the dataset:  9.279552715654953
[86.73182598420385, 66.64572247023837, 93.45242790999035, 84.17913854755994, 85.67023965257118]
Rendimiento del clasificador RUSBoost: 83.33587091291274

PEN_DIGITS
Dataset of size (10992, 16)


[86.40412817957788, 92.99720566654378, 91.56557511064283, 93.88039618902091, 89.72546342524076]
Rendimiento del clasificador RUSBoost: 90.91455371420523

ABALONE_19
Dataset of size (4177, 10)
There are 4145 instances for the majoritary class
There are 32 instanes for the minoritary class
IR of the dataset:  129.53125
[60.896852758228114, 71.90090256571133, 61.34794672153364, 59.88865153793134, 84.57657253895236]
Rendimiento del clasificador RUSBoost: 67.72218522447136



In [11]:
print("Lista de rendimientos para cada uno de los datasets: ",rendimientos)
print()

# Indices that sort the scores in ascending order: the last five are the best
# datasets, the first five the worst.
rend_sorted_idx = np.argsort(rendimientos)
top5_idx = rend_sorted_idx[-5:]
bottom5_idx = rend_sorted_idx[:5]

# Convert the parallel lists once so they can be indexed with index arrays.
rend_array = np.asarray(rendimientos)
dataset_array = np.asarray(imblearn_datasets)
ir_array = np.asarray(IRlista)

# Best five datasets (the matching IR values are computed but not printed).
best_rend_values = rend_array[top5_idx]
best_rend_datasets = dataset_array[top5_idx]
best_rend_IR = ir_array[top5_idx]
print("Top 5 rendimientos: ",best_rend_values)
print()
print("Datasets correspondientes a los top 5 rendimientos: ",best_rend_datasets)
print()
print()

# Worst five datasets (the matching IR values are computed but not printed).
worst_rend_values = rend_array[bottom5_idx]
worst_rend_datasets = dataset_array[bottom5_idx]
worst_rend_IR = ir_array[bottom5_idx]
print("Peores 5 rendimientos: ",worst_rend_values)
print()
print("Datasets correspondientes a los 5 peores rendimientos: ",worst_rend_datasets)
print()

Lista de rendimientos para cada uno de los datasets:  [82.86117633018587, 89.43138664431645, 83.33587091291274, 89.33459372309083, 77.58663399776752, 93.01107909519996, 80.63272499337798, 91.66054788664458, 89.33651410115655, 79.80352268546791, 49.0262176164586, 61.528503765704, 57.558754061415854, 91.0599538673769, 67.28195526383571, 92.7571215141502, 58.30481651468983, 67.97102659067065, 75.69390353501971, 50.991526483948334, 94.04977990340822, 70.24168270415235, 60.09793040408956, 76.17688492590882, 84.53660342315035, 90.91455371420523, 67.72218522447136]

Top 5 rendimientos:  [91.05995387 91.66054789 92.75712151 93.0110791  94.0497799 ]

Datasets correspondientes a los top 5 rendimientos:  ['thyroid_sick' 'car_eval_34' 'arrhythmia' 'sick_euthyroid' 'letter_img']


Peores 5 rendimientos:  [49.02621762 50.99152648 57.55875406 58.30481651 60.0979304 ]

Datasets correspondientes a los 5 peores rendimientos:  ['yeast_ml8' 'wine_quality' 'libras_move' 'solar_flare_m0' 'webpage']



No se observa una relación clara entre el IR (ratio de desbalanceo) del dataset y el rendimiento (G-mean) del clasificador.

In [10]:
# Average of the per-dataset scores collected in `rendimientos`.
media_rendimientos = stats.mean(rendimientos)
print("La media de rendimiento entre todos los datasets es de :", media_rendimientos)

La media de rendimiento entre todos los datasets es de : 76.77434999565837
