In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold

from imblearn.metrics import geometric_mean_score
from sklearn.metrics import balanced_accuracy_score
import statistics as stats

# Se importan las librerías de AdaBoost y de árboles de decisión 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from imblearn.metrics import geometric_mean_score
from sklearn.metrics import balanced_accuracy_score
from imblearn.ensemble import RUSBoostClassifier

from imblearn.datasets import fetch_datasets

import math

In [2]:
from collections import Counter

import numpy as np
from sklearn.base import is_regressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble.forest import BaseForest
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
from sklearn.tree.tree import BaseDecisionTree
from sklearn.utils import check_array
from sklearn.utils import check_random_state
from sklearn.utils import check_X_y
#from sklearn.utils import shuffle


class SMOTE(object):
    """Implementation of Synthetic Minority Over-Sampling Technique (SMOTE).

    SMOTE performs oversampling of the minority class by picking target
    minority class samples and their nearest minority class neighbors and
    generating new samples that linearly combine features of each target
    sample with features of its selected minority class neighbors [1].

    Parameters
    ----------
    k_neighbors : int, optional (default=5)
        Number of nearest neighbors.
    random_state : int, numpy.random.RandomState or None, optional (default=None)
        If int, seed of the private random number generator owned by this
        instance. If a RandomState, that generator is used directly. If None,
        a private generator seeded from OS entropy is created.

    Notes
    -----
    The generator is created once, in ``__init__``, and advanced by every
    call to `sample`. The process-wide ``np.random`` state is never reseeded,
    and successive `sample` calls on one instance draw fresh random numbers
    while the whole sequence stays reproducible for a given seed.

    References
    ----------
    .. [1] N. V. Chawla, K. W. Bowyer, L. O. Hall, and P. Kegelmeyer. "SMOTE:
           Synthetic Minority Over-Sampling Technique." Journal of Artificial
           Intelligence Research (JAIR), 2002.
    """

    def __init__(self, k_neighbors=5, random_state=None):
        self.k = k_neighbors
        self.random_state = random_state
        # Private generator: seeding here (not inside `sample`) keeps the
        # global NumPy RNG untouched and lets consecutive calls differ.
        if isinstance(random_state, np.random.RandomState):
            self._rng = random_state
        else:
            self._rng = np.random.RandomState(random_state)

    def sample(self, n_samples):
        """Generate samples.

        Parameters
        ----------
        n_samples : int
            Number of new synthetic samples.

        Returns
        -------
        S : array, shape = [n_samples, n_features]
            Returns synthetic samples.

        Notes
        -----
        `fit` must have been called first; this method reads ``self.X``,
        ``self.n_features`` and ``self.neigh`` set there.
        """
        rng = self._rng

        S = np.zeros(shape=(n_samples, self.n_features))
        # Calculate synthetic samples.
        for i in range(n_samples):
            # Pick a random minority sample as the interpolation origin.
            j = rng.randint(0, self.X.shape[0])

            # Find the NN for each sample.
            # Drop the first returned neighbor, which is expected to be the
            # query sample itself.
            nn = self.neigh.kneighbors(self.X[j].reshape(1, -1),
                                       return_distance=False)[:, 1:]
            nn_index = rng.choice(nn[0])

            # New point lies on the segment between the sample and the
            # chosen neighbor, at a random fraction `gap` in [0, 1).
            dif = self.X[nn_index] - self.X[j]
            gap = rng.random_sample()

            S[i, :] = self.X[j, :] + gap * dif[:]

        return S

    def fit(self, X):
        """Train model based on input data.

        Parameters
        ----------
        X : array-like, shape = [n_minority_samples, n_features]
            Holds the minority samples.

        Returns
        -------
        self : object
            Returns self.
        """
        self.X = X
        self.n_minority_samples, self.n_features = self.X.shape

        # Learn nearest neighbors. One extra neighbor is requested because
        # `sample` discards the first one returned for each query.
        self.neigh = NearestNeighbors(n_neighbors=self.k + 1)
        self.neigh.fit(self.X)

        return self


class SMOTEBoost(AdaBoostClassifier):
    """Implementation of SMOTEBoost.

    SMOTEBoost introduces data sampling into the AdaBoost algorithm by
    oversampling the minority class using SMOTE on each boosting iteration [1].
    This implementation inherits methods from the scikit-learn
    AdaBoostClassifier class, only modifying the `fit` method.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        Number of new synthetic samples per boosting step.
    k_neighbors : int, optional (default=5)
        Number of nearest neighbors.
    base_estimator : object, optional (default=DecisionTreeClassifier)
        The base estimator from which the boosted ensemble is built.
        Support for sample weighting is required, as well as proper `classes_`
        and `n_classes_` attributes.
    n_estimators : int, optional (default=50)
        The maximum number of estimators at which boosting is terminated.
        In case of perfect fit, the learning procedure is stopped early.
    learning_rate : float, optional (default=1.)
        Learning rate shrinks the contribution of each classifier by
        ``learning_rate``. There is a trade-off between ``learning_rate`` and
        ``n_estimators``.
    algorithm : {'SAMME', 'SAMME.R'}, optional (default='SAMME.R')
        If 'SAMME.R' then use the SAMME.R real boosting algorithm.
        ``base_estimator`` must support calculation of class probabilities.
        If 'SAMME' then use the SAMME discrete boosting algorithm.
        The SAMME.R algorithm typically converges faster than SAMME,
        achieving a lower test error with fewer boosting iterations.
    random_state : int or None, optional (default=None)
        If int, random_state is the seed used by the random number generator.
        If None, the random number generator is the RandomState instance used
        by np.random.

    References
    ----------
    .. [1] N. V. Chawla, A. Lazarevic, L. O. Hall, and K. W. Bowyer.
           "SMOTEBoost: Improving Prediction of the Minority Class in
           Boosting." European Conference on Principles of Data Mining and
           Knowledge Discovery (PKDD), 2003.
    """

    def __init__(self,
                 n_samples=100,
                 k_neighbors=5,
                 base_estimator=None,
                 n_estimators=50,
                 learning_rate=1.,
                 algorithm='SAMME.R',
                 random_state=None):

        self.n_samples = n_samples
        # Every constructor argument is kept as a same-named attribute so the
        # scikit-learn parameter introspection (get_params / clone / repr)
        # can read it back.
        self.k_neighbors = k_neighbors
        self.smote = SMOTE(k_neighbors=k_neighbors,
                           random_state=random_state)

        # `algorithm` has to be forwarded: the parent constructor assigns
        # self.algorithm itself, so a value set only on this side would be
        # replaced by the parent's default.
        super(SMOTEBoost, self).__init__(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            algorithm=algorithm,
            random_state=random_state)

    def fit(self, X, y, sample_weight=None, minority_target=None):
        """Build a boosted classifier/regressor from the training set (X, y),
        performing SMOTE during each boosting step.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrix can be CSC, CSR, COO,
            DOK, or LIL. COO, DOK, and LIL are converted to CSR. The dtype is
            forced to DTYPE from tree._tree if the base classifier of this
            ensemble weighted boosting classifier is a tree or forest.
        y : array-like of shape = [n_samples]
            The target values (class labels in classification, real numbers in
            regression).
        sample_weight : array-like of shape = [n_samples], optional
            Sample weights. If None, the sample weights are initialized to
            1 / n_samples.
        minority_target : int
            Minority class label. If None, the least frequent label in `y`
            is used.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If `algorithm` is not supported, `learning_rate` is not positive,
            or the supplied `sample_weight` does not sum to a positive value.

        Notes
        -----
        Based on the scikit-learn v0.18 AdaBoostClassifier and
        BaseWeightBoosting `fit` methods.
        """
        # Check that algorithm is supported.
        if self.algorithm not in ('SAMME', 'SAMME.R'):
            raise ValueError("algorithm %s is not supported" % self.algorithm)

        # Check parameters.
        if self.learning_rate <= 0:
            raise ValueError("learning_rate must be greater than zero")

        if (self.base_estimator is None or
                isinstance(self.base_estimator, (BaseDecisionTree,
                                                 BaseForest))):
            DTYPE = np.float64  # from fast_dict.pxd
            dtype = DTYPE
            accept_sparse = 'csc'
        else:
            dtype = None
            accept_sparse = ['csr', 'csc']

        X, y = check_X_y(X, y, accept_sparse=accept_sparse, dtype=dtype,
                         y_numeric=is_regressor(self))

        if sample_weight is None:
            # Initialize weights to 1 / n_samples.
            sample_weight = np.empty(X.shape[0], dtype=np.float64)
            sample_weight[:] = 1. / X.shape[0]
        else:
            sample_weight = check_array(sample_weight, ensure_2d=False)

            # The positivity check must look at the raw total: once the
            # weights are divided by their sum, the sum is always 1 (or NaN)
            # and the check could never trigger.
            weight_total = sample_weight.sum(dtype=np.float64)
            if weight_total <= 0:
                raise ValueError(
                    "Attempting to fit with a non-positive "
                    "weighted number of samples.")

            # Normalize existing weights.
            sample_weight = sample_weight / weight_total

        if minority_target is None:
            # Determine the minority class label (least frequent label).
            stats_c_ = Counter(y)
            self.minority_target = min(stats_c_, key=stats_c_.get)
        else:
            self.minority_target = minority_target

        # Check parameters.
        self._validate_estimator()

        # Clear any previous fit results.
        self.estimators_ = []
        self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64)
        self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64)

        random_state = check_random_state(self.random_state)

        for iboost in range(self.n_estimators):
            X_min = X[np.where(y == self.minority_target)]

            # SMOTE step.
            # SMOTE asks for k + 1 neighbors (the sample itself plus k
            # others), so strictly more than k minority samples are needed.
            if len(X_min) > self.smote.k:
                self.smote.fit(X_min)
                X_syn = self.smote.sample(self.n_samples)
                # Use the label dtype of y so non-int64 labels are preserved.
                y_syn = np.full(X_syn.shape[0], fill_value=self.minority_target,
                                dtype=y.dtype)

                # Normalize synthetic sample weights based on current training set.
                sample_weight_syn = np.empty(X_syn.shape[0], dtype=np.float64)
                sample_weight_syn[:] = 1. / X.shape[0]

                # Combine the original and synthetic samples.
                # NOTE(review): X and y are rebound to the augmented arrays,
                # so synthetic rows accumulate across boosting rounds (the
                # training set grows by n_samples rows per round and later
                # SMOTE steps interpolate between earlier synthetic points).
                # The SMOTEBoost paper appears to discard synthetic samples
                # after each round - confirm this accumulation is intended.
                X = np.vstack((X, X_syn))
                y = np.append(y, y_syn)

                # Combine the weights and renormalize them to sum to 1.
                sample_weight = \
                    np.append(sample_weight, sample_weight_syn).reshape(-1, 1)
                sample_weight = \
                    np.squeeze(normalize(sample_weight, axis=0, norm='l1'))

                # X, y, sample_weight = shuffle(X, y, sample_weight,
                #                              random_state=random_state)

            # Boosting step.
            sample_weight, estimator_weight, estimator_error = self._boost(
                iboost,
                X, y,
                sample_weight,
                random_state)

            # Early termination.
            if sample_weight is None:
                break

            self.estimator_weights_[iboost] = estimator_weight
            self.estimator_errors_[iboost] = estimator_error

            # Stop if error is zero.
            if estimator_error == 0:
                break

            sample_weight_sum = np.sum(sample_weight)

            # Stop if the sum of sample weights has become non-positive.
            if sample_weight_sum <= 0:
                break

            if iboost < self.n_estimators - 1:
                # Normalize.
                sample_weight /= sample_weight_sum

        return self



In [3]:
def obtain_data(dataset_name):
    """Load one imbalanced-learn benchmark dataset by name.

    Only the requested dataset is fetched (via ``filter_data``) instead of
    loading the whole benchmark collection and then picking one entry.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset as known to ``imblearn.datasets.fetch_datasets``.

    Returns
    -------
    tuple
        ``(data, target)`` taken from the fetched dataset object.
    """
    dataset = fetch_datasets(filter_data=(dataset_name,))[dataset_name]
    return dataset.data, dataset.target

def convert_classes(y):
    """Relabel a binary target vector in place to -1 (majority) / 1 (minority).

    The two class masks are computed before any value is written. Writing
    the first label and only then comparing for the second one would merge
    both classes whenever an original label coincides with one of the
    target labels (e.g. labels {-1, 1} where 1 is the majority class).

    Parameters
    ----------
    y : numpy.ndarray
        Target vector; it is modified in place. Only the first two values
        returned by ``np.unique(y)`` are relabelled.

    Returns
    -------
    tuple
        ``([maj_class, min_class], maj_class, min_class)``, i.e.
        ``([-1, 1], -1, 1)``.
    """
    default_classes = np.unique(y)
    maj_class = -1
    min_class = 1

    # Snapshot both memberships before mutating y.
    first_mask = y == default_classes[0]
    second_mask = y == default_classes[1]

    # On a tie the second unique label is treated as the majority class.
    if np.count_nonzero(first_mask) > np.count_nonzero(second_mask):
        maj_mask, min_mask = first_mask, second_mask
    else:
        maj_mask, min_mask = second_mask, first_mask

    y[maj_mask] = maj_class
    y[min_mask] = min_class

    return [maj_class, min_class], maj_class, min_class

def train(X_train, y_train, method_name, base_classifier, T, min_class, IR):
    """Build and fit one of the supported boosting ensembles.

    Parameters
    ----------
    X_train, y_train : array-like
        Training samples and labels, passed to the classifier's ``fit``.
    method_name : {'adaboost', 'RUSBoost', 'SMOTEBoost'}
        Which ensemble to build.
    base_classifier : estimator
        Weak learner handed to the ensemble as ``base_estimator``.
    T : int
        Number of boosting rounds (``n_estimators``).
    min_class : label
        Minority class label; used to size the SMOTE step for 'SMOTEBoost'.
    IR : float
        Imbalance ratio. Currently unused; kept for interface compatibility.

    Returns
    -------
    clf : estimator
        The fitted ensemble.

    Raises
    ------
    ValueError
        If `method_name` is not one of the supported names.
    """
    if method_name == 'adaboost':
        clf = AdaBoostClassifier(base_estimator=base_classifier, n_estimators=T)
    elif method_name == 'RUSBoost':
        clf = RUSBoostClassifier(base_estimator=base_classifier, n_estimators=T,
                                 sampling_strategy='majority')
    elif method_name == 'SMOTEBoost':
        # One synthetic sample per minority training sample in each round.
        n_syn = X_train[y_train == min_class].shape[0]
        clf = SMOTEBoost(n_samples=n_syn, base_estimator=base_classifier,
                         n_estimators=T)
    else:
        # Fail with a clear message instead of hitting an unbound `clf` below.
        raise ValueError(
            "Unknown method_name {!r}; expected 'adaboost', 'RUSBoost' "
            "or 'SMOTEBoost'".format(method_name))

    clf.fit(X_train, y_train)
    return clf

def gmean_test(clf, X_test, y_test):
    """Return the geometric-mean score of a fitted classifier, as a percentage.

    Parameters
    ----------
    clf : estimator
        Fitted classifier exposing ``predict``.
    X_test, y_test : array-like
        Held-out samples and their true labels.

    Returns
    -------
    float
        ``geometric_mean_score(y_test, clf.predict(X_test)) * 100``.
    """
    # Predict once; accuracy and balanced accuracy were previously computed
    # here but never used, costing an extra prediction pass.
    y_pred = clf.predict(X_test)
    return geometric_mean_score(y_test, y_pred) * 100

def train_ensemble_method(dataset_name,method_name, T=10, k=5):
    """Cross-validate one boosting method on one benchmark dataset.

    Loads the dataset, relabels it to -1 (majority) / 1 (minority), prints
    its size, class counts and imbalance ratio, then runs a stratified
    k-fold evaluation with a depth-1 entropy decision tree as weak learner.

    Parameters
    ----------
    dataset_name : str
        Dataset name passed to `obtain_data`.
    method_name : str
        Ensemble name passed to `train`.
    T : int, optional (default=10)
        Number of boosting rounds.
    k : int, optional (default=5)
        Number of stratified folds.

    Returns
    -------
    rend : float
        Mean of the per-fold geometric-mean scores (percentages).
    IR : float
        Imbalance ratio: majority count divided by minority count.
    """
    # Fetch data from the dataset.
    X, y = obtain_data(dataset_name)
    print("Dataset of size {}".format(X.shape))

    # Relabel (in place) to -1 for the majority and 1 for the minority class.
    _, maj_class, min_class = convert_classes(y)

    # Class counts and imbalance ratio; counting the mask avoids copying
    # the matching rows of X just to read their number.
    n_maj = int(np.count_nonzero(y == maj_class))
    n_min = int(np.count_nonzero(y == min_class))
    IR = n_maj/n_min
    print("There are {} instances for the majoritary class".format(n_maj))
    print("There are {} instanes for the minoritary class".format(n_min))
    print("IR of the dataset: ",IR)

    # Weak learner: a decision stump split on entropy.
    dtc = DecisionTreeClassifier(criterion='entropy', max_depth=1)

    kf = StratifiedKFold(n_splits=k)

    fold_gmeans = []
    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf = train(X_train, y_train, method_name, dtc, T, min_class, IR)
        fold_gmeans.append(gmean_test(clf, X_test, y_test))

    print(fold_gmeans)
    rend = stats.mean(fold_gmeans)
    print("Rendimiento del clasificador {}: {}".format(method_name,rend))
    return rend, IR
    

In [4]:
# Benchmark dataset names evaluated by this notebook, in evaluation order.
# Each name is handed to train_ensemble_method by the loop that follows.
imblearn_datasets = [
    'ecoli', 'optical_digits', 'satimage',
    'pen_digits', 'abalone', 'sick_euthyroid',
    'spectrometer', 'car_eval_34', 'isolet',
    'us_crime', 'yeast_ml8', 'scene',
    'libras_move', 'thyroid_sick', 'coil_2000',
    'arrhythmia', 'solar_flare_m0', 'oil',
    'car_eval_4', 'wine_quality', 'letter_img',
    'yeast_me2', 'webpage', 'ozone_level',
    'mammography', 'protein_homo', 'abalone_19',
]

In [5]:
# Evaluate SMOTEBoost on every benchmark dataset, collecting the mean score
# and the imbalance ratio of each one at matching positions of two lists.
rendimientos, IRlista = [], []
for dataset in imblearn_datasets:
    print(dataset.upper())
    r, IR = train_ensemble_method(dataset, 'SMOTEBoost')
    rendimientos += [r]
    IRlista += [IR]
    print()


ECOLI
Dataset of size (336, 7)
There are 301 instances for the majoritary class
There are 35 instanes for the minoritary class
IR of the dataset:  8.6
[100.0, 92.58200997725514, 78.52812659593165, 68.1385143869247, 91.80725150319788]
Rendimiento del clasificador SMOTEBoost: 86.21118049266187

OPTICAL_DIGITS
Dataset of size (5620, 64)
There are 5066 instances for the majoritary class
There are 554 instanes for the minoritary class
IR of the dataset:  9.144404332129964
[88.02415434556443, 86.16589683840994, 89.82253520786301, 85.22931420856592, 82.902446042826]
Rendimiento del clasificador SMOTEBoost: 86.42886932864586

SATIMAGE
Dataset of size (6435, 36)
There are 5809 instances for the majoritary class
There are 626 instanes for the minoritary class
IR of the dataset:  9.279552715654953
[71.33977268644419, 64.99040045624426, 82.55806906439852, 82.13166877332114, 84.14497487158144]
Rendimiento del clasificador SMOTEBoost: 77.03297717039791

PEN_DIGITS
Dataset of size (10992, 16)
There a

KeyboardInterrupt: 

In [None]:
# Scores collected by the evaluation loop, in the order the datasets ran.
print("Lista de rendimientos para cada uno de los datasets: ",rendimientos)
print()

# Positions of the scores ordered from lowest to highest.
rend_sorted_idx = np.argsort(rendimientos)

# Five highest scores (ascending, best last) with the matching dataset
# names and imbalance ratios.
best_rend_values, best_rend_datasets, best_rend_IR = (
    np.array(series)[rend_sorted_idx[-5:]]
    for series in (rendimientos, imblearn_datasets, IRlista)
)
print("Top 5 rendimientos: ",best_rend_values)
print()
print("Datasets correspondientes a los top 5 rendimientos: ",best_rend_datasets)
print()
# Disabled: print("IR correspondiente a los top 5 rendimientos: ",best_rend_IR)
print()


# Five lowest scores (ascending, worst first) with the matching dataset
# names and imbalance ratios.
worst_rend_values, worst_rend_datasets, worst_rend_IR = (
    np.array(series)[rend_sorted_idx[:5]]
    for series in (rendimientos, imblearn_datasets, IRlista)
)
print("Peores 5 rendimientos: ",worst_rend_values)
print()
print("Datasets correspondientes a los 5 peores rendimientos: ",worst_rend_datasets)
print()
# Disabled: print("IR correspondiente a los 5 peores rendimientos: ",worst_rend_IR)

No se observa una relación clara entre el IR (ratio de desbalanceo) del dataset y el rendimiento del clasificador.

In [None]:
# Overall score: arithmetic mean of the per-dataset results in `rendimientos`.
print("La media de rendimiento entre todos los datasets es de :",stats.mean(rendimientos))

In [None]:
# Mean of `rendimientos`, reported under the "n_samples default" label.
# NOTE(review): this prints whatever `rendimientos` currently holds; presumably
# the evaluation loop was re-run with that SMOTEBoost setting first - confirm.
print("n_samples default\nLa media de rendimiento entre todos los datasets es de :",stats.mean(rendimientos))

In [None]:
# Mean of `rendimientos`, reported under the "n_samples n+*(floor(ir)-1)" label.
# NOTE(review): this prints whatever `rendimientos` currently holds; presumably
# the evaluation loop was re-run with that SMOTEBoost setting first - confirm.
print("n_samples n+*(floor(ir)-1)\nLa media de rendimiento entre todos los datasets es de :",stats.mean(rendimientos))