In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold

from imblearn.metrics import geometric_mean_score
from sklearn.metrics import balanced_accuracy_score
import statistics as stats

# Se importan las librerías de AdaBoost y de árboles de decisión 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from imblearn.metrics import geometric_mean_score
from sklearn.metrics import balanced_accuracy_score
from imblearn.ensemble import RUSBoostClassifier

from imblearn.datasets import fetch_datasets

In [2]:
def obtain_data(dataset_name):
    """Fetch a single imbalanced-learn benchmark dataset.

    Only the requested dataset is asked for (via ``filter_data``) instead of
    loading the whole benchmark collection and then indexing into it.

    Args:
        dataset_name: Name of the dataset as accepted by ``fetch_datasets``
            (for example ``'ecoli'``).

    Returns:
        Tuple ``(data, target)`` read from the fetched dataset entry.
    """
    dataset = fetch_datasets(filter_data=(dataset_name,))[dataset_name]
    return dataset.data, dataset.target

def convert_classes(y):
    """Relabel a binary target vector in place: majority -> -1, minority -> 1.

    The two smallest distinct values of ``y`` (as ordered by ``np.unique``) are
    treated as the two classes. Whichever has strictly more samples becomes the
    majority class; on a tie the second value is treated as the majority.

    Args:
        y: NumPy array of labels. It is modified in place.

    Returns:
        Tuple ``([maj_class, min_class], maj_class, min_class)``, i.e.
        ``([-1, 1], -1, 1)``.

    Raises:
        ValueError: If ``y`` holds fewer than two distinct labels.
    """
    maj_class = -1
    min_class = 1

    default_classes = np.unique(y)
    if len(default_classes) < 2:
        raise ValueError(
            "convert_classes needs at least two distinct labels, got {}".format(
                len(default_classes)
            )
        )

    # Build both masks BEFORE writing anything: if an original label already
    # equals -1 or 1, relabelling one class first and then comparing again
    # would sweep up the freshly written values and merge the two classes.
    first_mask = y == default_classes[0]
    second_mask = y == default_classes[1]

    if np.count_nonzero(first_mask) > np.count_nonzero(second_mask):
        maj_mask, min_mask = first_mask, second_mask
    else:
        maj_mask, min_mask = second_mask, first_mask

    y[maj_mask] = maj_class
    y[min_mask] = min_class

    return [maj_class, min_class], maj_class, min_class

def train(X_train, y_train, method_name, base_classifier, T):
    """Build and fit a boosting ensemble on the given training data.

    Args:
        X_train: Training feature matrix passed to ``fit``.
        y_train: Training labels passed to ``fit``.
        method_name: ``'adaboost'`` for ``AdaBoostClassifier`` or
            ``'rusboost'`` for ``RUSBoostClassifier``.
        base_classifier: Estimator used as the weak learner of the ensemble.
        T: Number of boosting rounds (``n_estimators``).

    Returns:
        The fitted ensemble classifier.

    Raises:
        ValueError: If ``method_name`` is not one of the supported methods.
    """
    if method_name == 'adaboost':
        ensemble_cls = AdaBoostClassifier
    elif method_name == 'rusboost':
        ensemble_cls = RUSBoostClassifier
    else:
        # Previously an unknown name fell through to ``clf.fit`` with ``clf``
        # never assigned; fail early with an explicit message instead.
        raise ValueError(
            "Unsupported method_name {!r}; expected 'adaboost' or 'rusboost'".format(
                method_name
            )
        )

    try:
        # Recent scikit-learn / imbalanced-learn releases name the weak
        # learner argument ``estimator``.
        clf = ensemble_cls(estimator=base_classifier, n_estimators=T)
    except TypeError:
        # Older releases only know the former name ``base_estimator``.
        clf = ensemble_cls(base_estimator=base_classifier, n_estimators=T)

    clf.fit(X_train, y_train)
    return clf

def gmean_test(clf, X_test, y_test):
    """Score a fitted classifier on a test split with the geometric mean.

    Only the geometric mean is computed; the accuracy and balanced accuracy
    that used to be calculated here were never returned or stored.

    Args:
        clf: Fitted classifier exposing ``predict``.
        X_test: Test feature matrix.
        y_test: True labels of the test split.

    Returns:
        ``geometric_mean_score(y_test, y_pred)`` scaled to a percentage.
    """
    y_pred = clf.predict(X_test)
    return geometric_mean_score(y_test, y_pred) * 100

def train_ensemble_method(dataset_name,method_name, T=10, k=5):
    """Cross-validate a boosting ensemble on one benchmark dataset.

    Loads the dataset, relabels it to -1 (majority) / 1 (minority), reports the
    class counts and imbalance ratio, and then runs a stratified k-fold
    evaluation where each fold trains a fresh ensemble of depth-1 entropy
    trees and is scored with ``gmean_test``.

    Args:
        dataset_name: Dataset name forwarded to ``obtain_data``.
        method_name: Ensemble method name forwarded to ``train``.
        T: Number of boosting rounds forwarded to ``train``.
        k: Number of stratified folds.

    Returns:
        Tuple ``(rend, IR)``: the mean of the per-fold scores and the
        imbalance ratio (majority count divided by minority count).
    """
    # Fetch the data for the requested dataset.
    X, y = obtain_data(dataset_name)
    print("Dataset of size {}".format(X.shape))

    # Normalise labels to -1 (majority) and 1 (minority); y is changed in place.
    _, maj_class, min_class = convert_classes(y)

    # Count labels directly instead of slicing rows out of X just for a length.
    n_maj = np.count_nonzero(y == maj_class)
    n_min = np.count_nonzero(y == min_class)
    IR = n_maj/n_min
    print("There are {} instances for the majoritary class".format(n_maj))
    print("There are {} instanes for the minoritary class".format(n_min))
    print("IR of the dataset: ",IR)

    # Weak learner: a decision stump split on entropy.
    dtc = DecisionTreeClassifier(criterion='entropy', max_depth=1)

    kf = StratifiedKFold(n_splits=k)

    gmean = []
    for train_index, test_index in kf.split(X,y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf = train(X_train, y_train, method_name, dtc, T)
        gmean.append(gmean_test(clf, X_test, y_test))

    print(gmean)
    rend = stats.mean(gmean)
    print("Rendimiento del clasificador {}: {}".format(method_name,rend))
    return rend, IR
    

In [3]:
# Names of the benchmark datasets to evaluate. The order matters: the result
# lists built later are aligned with this list index by index.
imblearn_datasets = [
    'ecoli', 'optical_digits', 'satimage', 'pen_digits', 'abalone',
    'sick_euthyroid', 'spectrometer', 'car_eval_34', 'isolet', 'us_crime',
    'yeast_ml8', 'scene', 'libras_move', 'thyroid_sick', 'coil_2000',
    'arrhythmia', 'solar_flare_m0', 'oil', 'car_eval_4', 'wine_quality',
    'letter_img', 'yeast_me2', 'webpage', 'ozone_level', 'mammography',
    'protein_homo', 'abalone_19',
]

In [4]:
# Evaluate AdaBoost on every benchmark dataset. The two lists stay aligned
# with imblearn_datasets: position i holds the score and IR of dataset i.
rendimientos, IRlista = [], []
for dataset in imblearn_datasets:
    print(dataset.upper())
    r, IR = train_ensemble_method(dataset, method_name='adaboost')
    rendimientos += [r]
    IRlista += [IR]
    print()


ECOLI
Dataset of size (336, 7)
There are 301 instances for the majoritary class
There are 35 instanes for the minoritary class
IR of the dataset:  8.6
[75.59289460184544, 53.45224838248488, 69.69320524371696, 48.30458915396479, 75.59289460184544]
Rendimiento del clasificador adaboost: 64.5271663967715

OPTICAL_DIGITS
Dataset of size (5620, 64)
There are 5066 instances for the majoritary class
There are 554 instanes for the minoritary class
IR of the dataset:  9.144404332129964
[82.83353798577706, 85.95508552072386, 85.39662638543291, 84.43308197798834, 81.67688390300208]
Rendimiento del clasificador adaboost: 84.05904315458484

SATIMAGE
Dataset of size (6435, 36)
There are 5809 instances for the majoritary class
There are 626 instanes for the minoritary class
IR of the dataset:  9.279552715654953
[68.14785990000041, 37.46438354522076, 81.97812562443112, 62.87763709083991, 0.0]
Rendimiento del clasificador adaboost: 50.09360123209844

PEN_DIGITS
Dataset of size (10992, 16)
There are 993

[0.0, 0.0, 0.0, 0.0, 0.0]
Rendimiento del clasificador adaboost: 0.0



In [5]:
# Rank the datasets by classifier performance (ascending) and show both ends
# of the ranking together with the matching dataset names and IR values.
print("Lista de rendimientos para cada uno de los datasets: ",rendimientos, end="\n\n")

# Convert once so the same index arrays can slice all three sequences.
rend_array = np.array(rendimientos)
dataset_array = np.array(imblearn_datasets)
IR_array = np.array(IRlista)

rend_sorted_idx = np.argsort(rendimientos)
rend_top_idx = rend_sorted_idx[-5:]
rend_bottom_idx = rend_sorted_idx[:5]

# Five highest performances.
best_rend_values = rend_array[rend_top_idx]
best_rend_datasets = dataset_array[rend_top_idx]
best_rend_IR = IR_array[rend_top_idx]
print("Top 5 rendimientos: ",best_rend_values, end="\n\n")
print("Datasets correspondientes a los top 5 rendimientos: ",best_rend_datasets, end="\n\n")
print("IR correspondiente a los top 5 rendimientos: ",best_rend_IR, end="\n\n")

# Five lowest performances.
worst_rend_values = rend_array[rend_bottom_idx]
worst_rend_datasets = dataset_array[rend_bottom_idx]
worst_rend_IR = IR_array[rend_bottom_idx]
print("Peores 5 rendimientos: ",worst_rend_values, end="\n\n")
print("Datasets correspondientes a los 5 peores rendimientos: ",worst_rend_datasets, end="\n\n")
print("IR correspondiente a los 5 peores rendimientos: ",worst_rend_IR)

Lista de rendimientos para cada uno de los datasets:  [64.5271663967715, 84.05904315458484, 50.09360123209844, 77.55826997884893, 0.0, 90.83209729177982, 80.45028164996583, 72.65721668996557, 77.07744420466578, 58.90030069509612, 0.0, 13.843194467222235, 53.75590277626545, 80.87741340471142, 6.3106731777528156, 73.35606366916791, 18.693911854720064, 33.025523320821215, 66.47547346705136, 33.2133690952931, 85.71739681348436, 58.88856701986391, 23.75613642579839, 38.039952979962095, 70.16354916352098, 80.89162707094685, 0.0]

Top 5 rendimientos:  [80.8774134  80.89162707 84.05904315 85.71739681 90.83209729]

Datasets correspondientes a los top 5 rendimientos:  ['thyroid_sick' 'protein_homo' 'optical_digits' 'letter_img'
 'sick_euthyroid']

IR correspondiente a los top 5 rendimientos:  [ 15.32900433 111.46219136   9.14440433  26.2479564    9.79522184]

Peores 5 rendimientos:  [ 0.          0.          0.          6.31067318 13.84319447]

Datasets correspondientes a los 5 peores rendimient

In [6]:
# Rank the datasets by imbalance ratio (ascending) and show both ends of the
# ranking together with the matching dataset names and performances.
print("Lista de IR para cada uno de los datasets: ",IRlista, end="\n\n")

# Convert once so the same index arrays can slice all three sequences.
IR_values_array = np.array(IRlista)
dataset_names_array = np.array(imblearn_datasets)
rend_values_array = np.array(rendimientos)

IR_sorted_idx = np.argsort(IRlista)
IR_top_idx = IR_sorted_idx[-5:]
IR_bottom_idx = IR_sorted_idx[:5]

# Five largest imbalance ratios.
best_IR_values = IR_values_array[IR_top_idx]
best_IR_datasets = dataset_names_array[IR_top_idx]
best_IR_rend = rend_values_array[IR_top_idx]
print("Top 5 IR: ",best_IR_values, end="\n\n")
print("Datasets correspondientes a los top 5 IR: ",best_IR_datasets, end="\n\n")
print("Rendimietno correspondiente a los top 5 IR: ",best_IR_rend, end="\n\n")

# Five smallest imbalance ratios.
worst_IR_values = IR_values_array[IR_bottom_idx]
worst_IR_datasets = dataset_names_array[IR_bottom_idx]
worst_IR_rend = rend_values_array[IR_bottom_idx]
print("5 peores IR: ",worst_IR_values, end="\n\n")
print("Datasets correspondientes a los peores 5 IR: ",worst_IR_datasets, end="\n\n")
print("Rendimietno correspondiente a los peores 5 IR: ",worst_IR_rend)

Lista de IR para cada uno de los datasets:  [8.6, 9.144404332129964, 9.279552715654953, 9.418957345971563, 9.682864450127877, 9.795221843003413, 10.8, 11.895522388059701, 11.995, 12.293333333333333, 12.57865168539326, 12.598870056497175, 14.0, 15.329004329004329, 15.761092150170649, 17.08, 19.426470588235293, 21.853658536585368, 25.584615384615386, 25.76502732240437, 26.247956403269754, 28.098039215686274, 34.45361875637105, 33.73972602739726, 42.011538461538464, 111.4621913580247, 129.53125]

Top 5 IR:  [ 33.73972603  34.45361876  42.01153846 111.46219136 129.53125   ]

Datasets correspondientes a los top 5 IR:  ['ozone_level' 'webpage' 'mammography' 'protein_homo' 'abalone_19']

Rendimietno correspondiente a los top 5 IR:  [38.03995298 23.75613643 70.16354916 80.89162707  0.        ]

5 peores IR:  [8.6        9.14440433 9.27955272 9.41895735 9.68286445]

Datasets correspondientes a los peores 5 IR:  ['ecoli' 'optical_digits' 'satimage' 'pen_digits' 'abalone']

Rendimietno correspond

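No se aprecia una relación clara entre el IR (ratio de desbalanceo) del dataset y el rendimiento (media geométrica) del clasificador: por ejemplo, protein_homo (IR ≈ 111.5) alcanza un rendimiento de 80.9, mientras que abalone (IR ≈ 9.7) y abalone_19 (IR ≈ 129.5) obtienen 0.0.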

In [7]:
# Overall average of the per-dataset performance scores.
media_rendimiento = stats.mean(rendimientos)
print("La media de rendimiento entre todos los datasets es de :",media_rendimiento)

La media de rendimiento entre todos los datasets es de : 51.59867318519848
