# Ada-Boost (AdaB)

In [37]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split

In [38]:
# Se importan las librerías de AdaBoost y de árboles de decisión 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from imblearn.datasets import fetch_datasets

In [67]:
# Request only the dataset that is actually used; fetch_datasets() without a
# filter returns the whole benchmark collection just to pick one entry.
abalone = fetch_datasets(filter_data=('abalone',))['abalone']
X, y = abalone.data, abalone.target


In [49]:
# Kept under its own names: every recorded result below was produced with the
# abalone X, y, and rebinding X, y here would silently switch the dataset when
# the notebook is run from top to bottom.
satimage = fetch_datasets(filter_data=('satimage',))['satimage']
X_satimage, y_satimage = satimage.data, satimage.target

In [50]:
def mostrar(X, y, clasificador=None, title=None):
    """
    Scatter-plot the examples in X (coloured by y) and, when a classifier is
    given, shade its decision regions behind them.

    Only the first two columns of X are drawn. The classifier is evaluated on
    a 200x200 grid of two-column points, so it is expected to accept
    two-feature input.

    :param X: Matrix with the examples to show (columns 0 and 1 are plotted)
    :param y: Vector with the outputs of the examples, used as point colour
    :param clasificador: Fitted scikit-learn style classifier exposing
        ``predict``; when None only the points are drawn
    :param title: Figure title; "Ejemplos de Train" is used when None
    :return: None
    """
    # Colour maps: light shades for the regions, bold shades for the points.
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#FFFFAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#FFFF00', '#0000FF'])

    plt.figure(figsize=(10,8))
    plt.subplot(111)

    if clasificador is not None:
        def _padded_range(values):
            # Widen by 10% of the magnitude of each bound. Using abs() keeps
            # the window growing outwards for negative coordinates too
            # (min * 0.9 would move a negative minimum INTO the data range);
            # for non-negative data this equals min*0.9-0.05 and max*1.1.
            lo, hi = values.min(), values.max()
            return lo - 0.1 * abs(lo) - 0.05, hi + 0.1 * abs(hi)

        x_min, x_max = _padded_range(X[:, 0])
        y_min, y_max = _padded_range(X[:, 1])
        xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200),
                             np.linspace(y_min, y_max, 200))

        # Classify every grid point, then fold the predictions back into the
        # grid shape so they can be drawn as filled contours.
        Z = clasificador.predict(np.hstack((xx.reshape(-1,1),yy.reshape(-1,1))))
        Z = Z.reshape(xx.shape)
        plt.contourf(xx, yy, Z, cmap=cmap_light, alpha=0.2)

    # Draw the examples on top of the regions.
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, s=60)
    plt.xlabel('Variable 1')
    plt.ylabel('Variable 2')
    if title is None:
        plt.title("Ejemplos de Train")
    else:
        plt.title(title)

    if clasificador is not None:
        # Clamp the axes to the evaluated grid.
        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())

    plt.show()
    return

In [68]:
# Sorted distinct label values found in y; the relabelling cell below indexes
# this array as [0] and [1], i.e. it expects exactly two classes.
default_classes = np.unique(y)
print(default_classes)

[-1  1]


In [69]:
# Canonical labels used by the rest of the notebook.
maj_class = -1
min_class = 1

# Build both masks BEFORE writing anything. Relabelling with two sequential
# `y[y == old] = new` statements lets the second statement re-match rows the
# first one has just written (e.g. labels [-1, 1] with -1 as the minority
# would collapse every row into a single class).
first_mask = y == default_classes[0]
second_mask = y == default_classes[1]
if np.count_nonzero(first_mask) > np.count_nonzero(second_mask):
    y[first_mask] = maj_class
    y[second_mask] = min_class
else:
    y[second_mask] = maj_class
    y[first_mask] = min_class

print("There are {} instances for the majoritary class".format(np.count_nonzero(y == maj_class)))
print("There are {} instanes for the minoritary class".format(np.count_nonzero(y == min_class)))
classes = [maj_class,min_class]

There are 3786 instances for the majoritary class
There are 391 instanes for the minoritary class


In [70]:
# Stratified hold-out split with a fixed seed, so both partitions keep the
# class proportions of y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=0)
# Number of features of the dataset
D = X_train.shape[1]

# Report the class balance of the training partition.
for message, label in (
        ("There are {} instances for the majoritary class", maj_class),
        ("There are {} instanes for the minoritary class", min_class)):
    print(message.format(sum(y_train == label)))

There are 2839 instances for the majoritary class
There are 293 instanes for the minoritary class


In [71]:
# Number of base classifiers in the AdaBoost ensemble
numClasificadoresBase = 10
# Base learner: a depth-1 decision tree (a stump) split on entropy, wrapped by
# the AdaBoost constructor
dtc = DecisionTreeClassifier(criterion='entropy', max_depth=1)
adaboost = AdaBoostClassifier(base_estimator=dtc, n_estimators=numClasificadoresBase)
# Train the ensemble on the training partition
# NOTE(review): no random_state is passed here, so results may vary between runs
adaboost.fit(X_train,y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy',
                                                         max_depth=1),
                   n_estimators=10)

In [72]:
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import balanced_accuracy_score

# One entry per base classifier, each a percentage rounded to two decimals:
# plain accuracy, geometric mean and balanced accuracy on the TEST partition.
listaAcc, listaGmean, listaBAcc = [], [], []

for i, estimator in enumerate(adaboost.estimators_):
    # Predictions of this single base classifier (not of the whole ensemble)
    y_pred = estimator.predict(X_test)

    gmean = round(100 * geometric_mean_score(y_test, y_pred), 2)
    bAcc = round(100 * balanced_accuracy_score(y_test, y_pred), 2)
    acc = round(100 * estimator.score(X_test, y_test), 2)

    listaAcc.append(acc)
    listaGmean.append(gmean)
    listaBAcc.append(bAcc)

    # Per-classifier plotting is disabled: mostrar() evaluates the classifier
    # on a two-column grid, while these estimators were fitted on X_train.
    #titulo = 'Clasificador {}, accuracy: {}%'.format(i, acc)
    #mostrar(X_test,y_test,clasificador = adaboost.estimators_[i],title=titulo)

In [73]:
# Per-base-classifier test metrics (percentages), in boosting order.
print("Estimators' accuracies: ",listaAcc)
print("Estimators' Gmeans: ",listaGmean)
print("Estimators' Balanced accuracies: ",listaBAcc)

Estimators' accuracies:  [90.62, 31.0, 12.54, 53.78, 79.23, 40.48, 49.0, 47.94, 68.9, 31.0]
Estimators' Gmeans:  [0.0, 48.56, 18.85, 17.32, 74.62, 58.16, 19.52, 64.47, 8.8, 48.56]
Estimators' Balanced accuracies:  [50.0, 61.02, 51.28, 31.96, 74.82, 66.24, 30.23, 69.91, 38.47, 61.02]


Para mostrar los puntos en el scatter plot se deben escoger solo dos atributos.
NOTA: NO SÉ HASTA QUÉ PUNTO TIENE SENTIDO NI SI ESTÁ BIEN.

In [74]:
# X_train_2features = X_train[:,:2]
# X_test_2features = X_test[:,:2]
# # Se define el número de clasificadores base de AdaBoost (numClasificadoresBase)
# numClasificadoresBase = 10
# # Llamada al constructor del clasificador AdaBoost
# dtc = DecisionTreeClassifier(criterion='entropy', max_depth=1)
# adaboost = AdaBoostClassifier(base_estimator=dtc, n_estimators=numClasificadoresBase)
# # Entrenamiento del clasificador creado
# # <RELLENAR>
# adaboost.fit(X_train_2features,y_train)
# # Lista para almacenar el accuracy de cada clasificador base
# listaAcc = []
# # Por cada clasificador base
# for i in range(len(adaboost.estimators_)):
#     # Se calcula el porcentaje de acierto del clasificador base correspondiente: adaboost.estimators_[i]
#     # Redondear a dos decimales
#     acc = round(adaboost.estimators_[i].score(X_test_2features,y_test)*100,2)
#     # Se añade a la lista de accuracies
#     listaAcc.append(acc)
#     # Establecemos el título de la figura con el número de clasificador y su precisión en train
#     titulo = 'Clasificador {}, accuracy: {}%'.format(i, acc)
#     # Mostramos la figura con los datos de train y la frontera del clasificador correspondiente
#     # <RELLENAR>
#     mostrar(X_test_2features,y_test,clasificador = adaboost.estimators_[i],title=titulo)
# print(listaAcc)

In [75]:
# Se calcula el porcentaje de acierto de AdaBoost
acc = adaboost.score(X_test,y_test)*100
print(acc)
y_pred = adaboost.predict(X_test)
gmean = geometric_mean_score(y_test, y_pred)*100
print(gmean)
bAcc = balanced_accuracy_score(y_test, y_pred)*100
print(bAcc)

90.622009569378
10.096190602031177
50.457405771178585


# SMOTE-Boost (SBO)

In [78]:
from collections import Counter

import numpy as np
from sklearn.base import is_regressor
from sklearn.ensemble import AdaBoostClassifier
# from sklearn.ensemble.forest import BaseForest
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
#from sklearn.tree.tree import BaseDecisionTree
from sklearn.utils import check_random_state
from sklearn.utils import check_X_y
#from sklearn.utils import shuffle


class SMOTE(object):
    """Implementation of Synthetic Minority Over-Sampling Technique (SMOTE).
    SMOTE performs oversampling of the minority class by picking target 
    minority class samples and their nearest minority class neighbors and 
    generating new samples that linearly combine features of each target 
    sample with features of its selected minority class neighbors [1].
    Parameters
    ----------
    k_neighbors : int, optional (default=5)
        Number of nearest neighbors.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state seeds a private generator that is re-created on
        every call to `sample`, so repeated calls draw the same sequence.
        If a RandomState instance, it is used directly.
        If None, a freshly, unpredictably seeded private generator is used.
        The process-wide numpy generator is never reseeded.
    References
    ----------
    .. [1] N. V. Chawla, K. W. Bowyer, L. O. Hall, and P. Kegelmeyer. "SMOTE:
           Synthetic Minority Over-Sampling Technique." Journal of Artificial
           Intelligence Research (JAIR), 2002.
    """

    def __init__(self, k_neighbors=5, random_state=None):
        self.k = k_neighbors
        self.random_state = random_state

    def _make_rng(self):
        """Return the generator used by one `sample` call."""
        if isinstance(self.random_state, np.random.RandomState):
            return self.random_state
        # A private RandomState yields the same draws as seeding the global
        # generator with the same int, without the global side effect.
        return np.random.RandomState(self.random_state)

    def sample(self, n_samples):
        """Generate samples.
        Parameters
        ----------
        n_samples : int
            Number of new synthetic samples.
        Returns
        -------
        S : array, shape = [n_samples, n_features]
            Returns synthetic samples.
        """
        rng = self._make_rng()

        S = np.zeros(shape=(n_samples, self.n_features))
        # Calculate synthetic samples.
        for i in range(n_samples):
            # Pick the minority sample to start from.
            j = rng.randint(0, self.X.shape[0])

            # Find the NN for each sample.
            # Column 0 is dropped on the assumption that the closest match of
            # a fitted point is the point itself.
            nn = self.neigh.kneighbors(self.X[j].reshape(1, -1),
                                       return_distance=False)[:, 1:]
            nn_index = rng.choice(nn[0])

            # Interpolate a random fraction of the way towards the neighbor.
            dif = self.X[nn_index] - self.X[j]
            gap = rng.random_sample()

            S[i, :] = self.X[j, :] + gap * dif[:]

        return S

    def fit(self, X):
        """Train model based on input data.
        Parameters
        ----------
        X : array-like, shape = [n_minority_samples, n_features]
            Holds the minority samples.
        Returns
        -------
        self : object
            Returns self.
        """
        self.X = X
        self.n_minority_samples, self.n_features = self.X.shape

        # Learn nearest neighbors. One extra neighbor is requested because
        # `sample` discards the first match of every query.
        self.neigh = NearestNeighbors(n_neighbors=self.k + 1)
        self.neigh.fit(self.X)

        return self


class SMOTEBoost(AdaBoostClassifier):
    """Implementation of SMOTEBoost.
    SMOTEBoost introduces data sampling into the AdaBoost algorithm by
    oversampling the minority class using SMOTE on each boosting iteration [1].
    This implementation inherits methods from the scikit-learn 
    AdaBoostClassifier class, only modifying the `fit` method.
    Parameters
    ----------
    n_samples : int, optional (default=100)
        Number of new synthetic samples per boosting step.
    k_neighbors : int, optional (default=5)
        Number of nearest neighbors.
    base_estimator : object, optional (default=DecisionTreeClassifier)
        The base estimator from which the boosted ensemble is built.
        Support for sample weighting is required, as well as proper `classes_`
        and `n_classes_` attributes.
    n_estimators : int, optional (default=50)
        The maximum number of estimators at which boosting is terminated.
        In case of perfect fit, the learning procedure is stopped early.
    learning_rate : float, optional (default=1.)
        Learning rate shrinks the contribution of each classifier by
        ``learning_rate``. There is a trade-off between ``learning_rate`` and
        ``n_estimators``.
    algorithm : {'SAMME', 'SAMME.R'}, optional (default='SAMME.R')
        If 'SAMME.R' then use the SAMME.R real boosting algorithm.
        ``base_estimator`` must support calculation of class probabilities.
        If 'SAMME' then use the SAMME discrete boosting algorithm.
        The SAMME.R algorithm typically converges faster than SAMME,
        achieving a lower test error with fewer boosting iterations.
    random_state : int or None, optional (default=None)
        If int, random_state is the seed used by the random number generator.
        If None, the random number generator is the RandomState instance used
        by np.random.
    References
    ----------
    .. [1] N. V. Chawla, A. Lazarevic, L. O. Hall, and K. W. Bowyer.
           "SMOTEBoost: Improving Prediction of the Minority Class in
           Boosting." European Conference on Principles of Data Mining and
           Knowledge Discovery (PKDD), 2003.
    """

    def __init__(self,
                 n_samples=100,
                 k_neighbors=5,
                 base_estimator=None,
                 n_estimators=50,
                 learning_rate=1.,
                 algorithm='SAMME.R',
                 random_state=None):

        self.n_samples = n_samples
        # Stored under the constructor argument's own name so that
        # get_params(), clone() and repr() report the real value.
        self.k_neighbors = k_neighbors
        self.smote = SMOTE(k_neighbors=k_neighbors,
                           random_state=random_state)

        # `algorithm` has to be forwarded: the parent constructor assigns
        # self.algorithm itself and would otherwise reset it to its default.
        super(SMOTEBoost, self).__init__(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            algorithm=algorithm,
            random_state=random_state)

    def fit(self, X, y, sample_weight=None, minority_target=None):
        """Build a boosted classifier/regressor from the training set (X, y),
        performing SMOTE during each boosting step.
        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrix can be CSC, CSR, COO,
            DOK, or LIL. COO, DOK, and LIL are converted to CSR. The dtype is
            forced to DTYPE from tree._tree if the base classifier of this
            ensemble weighted boosting classifier is a tree or forest.
        y : array-like of shape = [n_samples]
            The target values (class labels in classification, real numbers in
            regression).
        sample_weight : array-like of shape = [n_samples], optional
            Sample weights. If None, the sample weights are initialized to
            1 / n_samples.
        minority_target : int
            Minority class label. If None, the least frequent label in `y`
            is used.
        Returns
        -------
        self : object
            Returns self.
        Raises
        ------
        ValueError
            If `algorithm` is not supported, `learning_rate` is not positive,
            or the given sample weights do not sum to a positive value.
        Notes
        -----
        Based on the scikit-learn v0.18 AdaBoostClassifier and
        BaseWeightBoosting `fit` methods.
        The synthetic samples of every step are appended to the training set
        and stay there for the following steps.
        """
        # Check that algorithm is supported.
        if self.algorithm not in ('SAMME', 'SAMME.R'):
            raise ValueError("algorithm %s is not supported" % self.algorithm)

        # Check parameters.
        if self.learning_rate <= 0:
            raise ValueError("learning_rate must be greater than zero")

        if self.base_estimator is None:
            tree_based = True
        else:
            # Imported lazily, and only on this path, so the default
            # configuration does not depend on these base classes at all.
            try:
                from sklearn.tree import BaseDecisionTree
                from sklearn.ensemble._forest import BaseForest
            except ImportError:  # older scikit-learn module layout
                from sklearn.tree.tree import BaseDecisionTree
                from sklearn.ensemble.forest import BaseForest
            tree_based = isinstance(self.base_estimator,
                                    (BaseDecisionTree, BaseForest))

        if tree_based:
            DTYPE = np.float64  # from fast_dict.pxd
            dtype = DTYPE
            accept_sparse = 'csc'
        else:
            dtype = None
            accept_sparse = ['csr', 'csc']

        X, y = check_X_y(X, y, accept_sparse=accept_sparse, dtype=dtype,
                         y_numeric=is_regressor(self))

        if sample_weight is None:
            # Initialize weights to 1 / n_samples.
            sample_weight = np.empty(X.shape[0], dtype=np.float64)
            sample_weight[:] = 1. / X.shape[0]
        else:
            from sklearn.utils import check_array
            sample_weight = check_array(sample_weight, ensure_2d=False)
            # Normalize existing weights.
            sample_weight = sample_weight / sample_weight.sum(dtype=np.float64)

            # Check that the sample weights sum is positive.
            if sample_weight.sum() <= 0:
                raise ValueError(
                    "Attempting to fit with a non-positive "
                    "weighted number of samples.")

        if minority_target is None:
            # Determine the minority class label.
            stats_c_ = Counter(y)
            self.minority_target = min(stats_c_, key=stats_c_.get)
        else:
            self.minority_target = minority_target

        # Check parameters.
        self._validate_estimator()

        # Clear any previous fit results.
        self.estimators_ = []
        self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64)
        self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64)

        random_state = check_random_state(self.random_state)

        for iboost in range(self.n_estimators):
            X_min = X[np.where(y == self.minority_target)]

            # SMOTE step. The sampler queries k + 1 neighbors, so strictly
            # more than k minority samples are needed for the query to work.
            if len(X_min) > self.smote.k:
                self.smote.fit(X_min)
                X_syn = self.smote.sample(self.n_samples)
                # Use y's own dtype so non-integer labels survive unchanged.
                y_syn = np.full(X_syn.shape[0], fill_value=self.minority_target,
                                dtype=y.dtype)

                # Normalize synthetic sample weights based on current training set.
                sample_weight_syn = np.empty(X_syn.shape[0], dtype=np.float64)
                sample_weight_syn[:] = 1. / X.shape[0]

                # Combine the original and synthetic samples.
                X = np.vstack((X, X_syn))
                y = np.append(y, y_syn)

                # Combine the weights.
                sample_weight = \
                    np.append(sample_weight, sample_weight_syn).reshape(-1, 1)
                sample_weight = \
                    np.squeeze(normalize(sample_weight, axis=0, norm='l1'))

                # X, y, sample_weight = shuffle(X, y, sample_weight,
                #                              random_state=random_state)

            # Boosting step.
            sample_weight, estimator_weight, estimator_error = self._boost(
                iboost,
                X, y,
                sample_weight,
                random_state)

            # Early termination.
            if sample_weight is None:
                break

            self.estimator_weights_[iboost] = estimator_weight
            self.estimator_errors_[iboost] = estimator_error

            # Stop if error is zero.
            if estimator_error == 0:
                break

            sample_weight_sum = np.sum(sample_weight)

            # Stop if the sum of sample weights has become non-positive.
            if sample_weight_sum <= 0:
                break

            if iboost < self.n_estimators - 1:
                # Normalize.
                sample_weight /= sample_weight_sum

        return self

In [79]:
from collections import Counter

import numpy as np
from sklearn.base import is_regressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble.forest import BaseForest
from sklearn.preprocessing import normalize
from sklearn.tree.tree import BaseDecisionTree
from sklearn.utils import check_random_state
from sklearn.utils import check_X_y
#from sklearn.utils import shuffle


class RandomUnderSampler(object):
    """Implementation of random undersampling (RUS).
    Undersample the majority class(es) by randomly picking samples with or
    without replacement.
    Parameters
    ----------
    with_replacement : bool, optional (default=True)
        Undersample with replacement.
    return_indices : bool, optional (default=False)
        Whether or not to return the indices of the samples randomly selected
        from the majority class.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state seeds a private generator that is re-created on
        every call to `sample`, so repeated calls draw the same sequence.
        If a RandomState instance, it is used directly.
        If None, a freshly, unpredictably seeded private generator is used.
        The process-wide numpy generator is never reseeded.
    """

    def __init__(self, with_replacement=True, return_indices=False,
                 random_state=None):
        self.return_indices = return_indices
        self.with_replacement = with_replacement
        self.random_state = random_state

    def sample(self, n_samples):
        """Perform undersampling.
        Parameters
        ----------
        n_samples : int
            Number of samples to remove. Values larger than the number of
            fitted majority samples remove all of them.
        Returns
        -------
        S : array, shape = [n_majority_samples - n_samples, n_features]
            The majority samples that were kept.
        idx : array, shape = [n_majority_samples - n_samples]
            Row indices of the kept samples within the fitted array. Only
            returned (as the second element of a tuple) when
            `return_indices` is True.
        """
        if isinstance(self.random_state, np.random.RandomState):
            rng = self.random_state
        else:
            # A private RandomState yields the same draws as seeding the
            # global generator with the same int, without the side effect.
            rng = np.random.RandomState(self.random_state)

        # Cannot remove more samples than there are.
        n_remove = min(n_samples, self.n_majority_samples)

        idx = rng.choice(self.n_majority_samples,
                         size=self.n_majority_samples - n_remove,
                         replace=self.with_replacement)

        if self.return_indices:
            return (self.X[idx], idx)
        return self.X[idx]

    def fit(self, X):
        """Train model based on input data.
        Parameters
        ----------
        X : array-like, shape = [n_majority_samples, n_features]
            Holds the majority samples.
        Returns
        -------
        self : object
            Returns self.
        """
        self.X = X
        self.n_majority_samples, self.n_features = self.X.shape

        return self


class RUSBoost(AdaBoostClassifier):
    """Implementation of RUSBoost.
    RUSBoost introduces data sampling into the AdaBoost algorithm by
    undersampling the majority class using random undersampling (with or
    without replacement) on each boosting iteration [1].
    This implementation inherits methods from the scikit-learn 
    AdaBoostClassifier class, only modifying the `fit` method.
    Parameters
    ----------
    n_samples : int, optional (default=100)
        Number of majority class samples to remove per boosting step.
    min_ratio : float (default=1.0)
        Minimum ratio of majority to minority class samples to keep; the
        number of removed samples is reduced so the majority class does not
        drop below ``int(n_minority * min_ratio)`` samples.
    with_replacement : bool, optional (default=True)
        Undersample with replacement.
    base_estimator : object, optional (default=DecisionTreeClassifier)
        The base estimator from which the boosted ensemble is built.
        Support for sample weighting is required, as well as proper `classes_`
        and `n_classes_` attributes.
    n_estimators : int, optional (default=50)
        The maximum number of estimators at which boosting is terminated.
        In case of perfect fit, the learning procedure is stopped early.
    learning_rate : float, optional (default=1.)
        Learning rate shrinks the contribution of each classifier by
        ``learning_rate``. There is a trade-off between ``learning_rate`` and
        ``n_estimators``.
    algorithm : {'SAMME', 'SAMME.R'}, optional (default='SAMME.R')
        If 'SAMME.R' then use the SAMME.R real boosting algorithm.
        ``base_estimator`` must support calculation of class probabilities.
        If 'SAMME' then use the SAMME discrete boosting algorithm.
        The SAMME.R algorithm typically converges faster than SAMME,
        achieving a lower test error with fewer boosting iterations.
    random_state : int or None, optional (default=None)
        If int, random_state is the seed used by the random number generator.
        If None, the random number generator is the RandomState instance used
        by np.random.
    References
    ----------
    .. [1] C. Seiffert, T. M. Khoshgoftaar, J. V. Hulse, and A. Napolitano.
           "RUSBoost: Improving Classification Performance when Training Data
           is Skewed". International Conference on Pattern Recognition
           (ICPR), 2008.
    """

    def __init__(self,
                 n_samples=100,
                 min_ratio=1.0,
                 with_replacement=True,
                 base_estimator=None,
                 n_estimators=50,
                 learning_rate=1.,
                 algorithm='SAMME.R',
                 random_state=None):

        self.n_samples = n_samples
        self.min_ratio = min_ratio
        # Stored under the constructor argument's own name so that
        # get_params(), clone() and repr() report the real value.
        self.with_replacement = with_replacement
        self.rus = RandomUnderSampler(with_replacement=with_replacement,
                                      return_indices=True,
                                      random_state=random_state)

        # `algorithm` has to be forwarded: the parent constructor assigns
        # self.algorithm itself and would otherwise reset it to its default.
        super(RUSBoost, self).__init__(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            algorithm=algorithm,
            random_state=random_state)

    def fit(self, X, y, sample_weight=None, minority_target=None):
        """Build a boosted classifier/regressor from the training set (X, y),
        performing random undersampling during each boosting step.
        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrix can be CSC, CSR, COO,
            DOK, or LIL. COO, DOK, and LIL are converted to CSR. The dtype is
            forced to DTYPE from tree._tree if the base classifier of this
            ensemble weighted boosting classifier is a tree or forest.
        y : array-like of shape = [n_samples]
            The target values (class labels in classification, real numbers in
            regression).
        sample_weight : array-like of shape = [n_samples], optional
            Sample weights. If None, the sample weights are initialized to
            1 / n_samples.
        minority_target : int
            Minority class label. If None, the least frequent label in `y`
            is used.
        Returns
        -------
        self : object
            Returns self.
        Raises
        ------
        ValueError
            If `algorithm` is not supported, `learning_rate` is not positive,
            or the given sample weights do not sum to a positive value.
        Notes
        -----
        Based on the scikit-learn v0.18 AdaBoostClassifier and
        BaseWeightBoosting `fit` methods.
        Each step undersamples the training set left by the previous step,
        so the majority class shrinks cumulatively across boosting steps.
        """
        # Check that algorithm is supported.
        if self.algorithm not in ('SAMME', 'SAMME.R'):
            raise ValueError("algorithm %s is not supported" % self.algorithm)

        # Check parameters.
        if self.learning_rate <= 0:
            raise ValueError("learning_rate must be greater than zero")

        if (self.base_estimator is None or
                isinstance(self.base_estimator, (BaseDecisionTree,
                                                 BaseForest))):
            DTYPE = np.float64  # from fast_dict.pxd
            dtype = DTYPE
            accept_sparse = 'csc'
        else:
            dtype = None
            accept_sparse = ['csr', 'csc']

        X, y = check_X_y(X, y, accept_sparse=accept_sparse, dtype=dtype,
                         y_numeric=is_regressor(self))

        if sample_weight is None:
            # Initialize weights to 1 / n_samples.
            sample_weight = np.empty(X.shape[0], dtype=np.float64)
            sample_weight[:] = 1. / X.shape[0]
        else:
            # Imported lazily: only this branch needs it.
            from sklearn.utils import check_array
            sample_weight = check_array(sample_weight, ensure_2d=False)
            # Normalize existing weights.
            sample_weight = sample_weight / sample_weight.sum(dtype=np.float64)

            # Check that the sample weights sum is positive.
            if sample_weight.sum() <= 0:
                raise ValueError(
                    "Attempting to fit with a non-positive "
                    "weighted number of samples.")

        if minority_target is None:
            # Determine the minority class label.
            stats_c_ = Counter(y)
            self.minority_target = min(stats_c_, key=stats_c_.get)
        else:
            self.minority_target = minority_target

        # Check parameters.
        self._validate_estimator()

        # Clear any previous fit results.
        self.estimators_ = []
        self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64)
        self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64)

        random_state = check_random_state(self.random_state)

        # Working copy of the hyper-parameter. It is lowered when the ratio
        # floor is reached and keeps that value for later steps, but the
        # value the user configured on the estimator is left untouched.
        n_remove = self.n_samples

        for iboost in range(self.n_estimators):
            # Random undersampling step.
            is_min = y == self.minority_target
            is_maj = ~is_min
            X_maj = X[is_maj]
            X_min = X[is_min]
            self.rus.fit(X_maj)

            n_maj = X_maj.shape[0]
            n_min = X_min.shape[0]
            n_maj_floor = int(n_min * self.min_ratio)
            if n_maj - n_remove < n_maj_floor:
                n_remove = n_maj - n_maj_floor
            X_rus, X_idx = self.rus.sample(n_remove)

            # Select the labels and weights of exactly the rows that were kept.
            y_rus = y[is_maj][X_idx]
            y_min = y[is_min]

            sample_weight_rus = sample_weight[is_maj][X_idx]
            sample_weight_min = sample_weight[is_min]

            # Combine the minority and majority class samples.
            X = np.vstack((X_rus, X_min))
            y = np.append(y_rus, y_min)

            # Combine the weights.
            sample_weight = \
                np.append(sample_weight_rus, sample_weight_min).reshape(-1, 1)
            sample_weight = \
                np.squeeze(normalize(sample_weight, axis=0, norm='l1'))

            # X, y, sample_weight = shuffle(X, y, sample_weight,
            #                              random_state=random_state)

            # Boosting step.
            sample_weight, estimator_weight, estimator_error = self._boost(
                iboost,
                X, y,
                sample_weight,
                random_state)

            # Early termination.
            if sample_weight is None:
                break

            self.estimator_weights_[iboost] = estimator_weight
            self.estimator_errors_[iboost] = estimator_error

            # Stop if error is zero.
            if estimator_error == 0:
                break

            sample_weight_sum = np.sum(sample_weight)

            # Stop if the sum of sample weights has become non-positive.
            if sample_weight_sum <= 0:
                break

            if iboost < self.n_estimators - 1:
                # Normalize.
                sample_weight /= sample_weight_sum

        return self

In [84]:
from imblearn.ensemble import RUSBoostClassifier
from sklearn.metrics import classification_report
target_names = ['-1','1']

# Candidates to compare: plain AdaBoost over the stump defined earlier, the
# two hand-written samplers-in-the-loop, and imbalanced-learn's RUSBoost.
algorithms = [
    AdaBoostClassifier(base_estimator=dtc, n_estimators=numClasificadoresBase),
    SMOTEBoost(n_estimators=10,k_neighbors = 5),
    RUSBoost(n_estimators=10),
    RUSBoostClassifier(),
]

for algorithm in algorithms:
    # Fit on the training partition, report per-class metrics on the test one.
    algorithm.fit(X_train, y_train)
    y_pred = algorithm.predict(X_test)
    print("\n{}\n".format(algorithm))
    print(classification_report(y_test, y_pred,
                                target_names=target_names))


AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy',
                                                         max_depth=1),
                   n_estimators=10)

              precision    recall  f1-score   support

          -1       0.91      1.00      0.95       947
           1       0.50      0.01      0.02        98

    accuracy                           0.91      1045
   macro avg       0.70      0.50      0.49      1045
weighted avg       0.87      0.91      0.86      1045


SMOTEBoost(k_neighbors=None, n_estimators=10)

              precision    recall  f1-score   support

          -1       0.95      0.85      0.90       947
           1       0.30      0.59      0.39        98

    accuracy                           0.83      1045
   macro avg       0.62      0.72      0.65      1045
weighted avg       0.89      0.83      0.85      1045


RUSBoost(n_estimators=10, with_replacement=None)

              precision    recall  f1-score   support

     




RUSBoostClassifier()

              precision    recall  f1-score   support

          -1       0.98      0.74      0.84       947
           1       0.25      0.83      0.38        98

    accuracy                           0.75      1045
   macro avg       0.61      0.79      0.61      1045
weighted avg       0.91      0.75      0.80      1045



# KEEL DATASETS

## Glass

In [120]:
def intLabel(c):
    """Convert a textual class label into a numeric one.

    Used as a ``np.loadtxt`` converter for the class column of the KEEL files.

    :param c: Raw label token. May be ``bytes`` (what older NumPy versions hand
        to converters) or ``str`` (the default since NumPy 2.0).
    :return: ``-1`` when the token is ``negative`` (ignoring surrounding
        whitespace), ``1`` for anything else.
    """
    # Normalise to str so the comparison works whichever type loadtxt passes;
    # latin1 maps every byte to a character, so decoding cannot fail.
    if isinstance(c, (bytes, bytearray)):
        c = c.decode('latin1')
    # Strip so that tokens such as " negative" (files using ", " separators read
    # with a "," delimiter) are still recognised.
    return -1 if str(c).strip() == 'negative' else 1

In [108]:
# Load the KEEL "glass-0-1-2-3_vs_4-5-6" file: header lines start with '@' and are
# skipped as comments; column 9 holds the textual class label.
filepath= "../keel_datasets/glass/glass-0-1-2-3_vs_4-5-6.dat"
# np.loadtxt only accepts single-character delimiters since NumPy 1.23, so the
# original ", " separator is read as "," instead. Numeric fields tolerate the
# leading blank; the label token is stripped before it reaches intLabel.
dataset = np.loadtxt(filepath,comments='@',delimiter=",",
          converters = {9: lambda raw: intLabel(raw.strip())})
# Every column but the last is a feature; the last column is the numeric label.
X, y = dataset[:,:-1],dataset[:,-1]
print(X)
print(y)

[[ 1.51588824 12.87795     3.43036    ...  8.04468     0.
   0.1224    ]
 [ 1.5176423  12.9777      3.53812    ...  8.52888     0.
   0.        ]
 [ 1.52212996 14.20795     3.82099    ...  9.5726      0.
   0.        ]
 ...
 [ 1.51837126 14.321       3.25974    ...  5.78508     1.62855
   0.        ]
 [ 1.51657164 14.7998      0.         ...  8.2814      1.71045
   0.        ]
 [ 1.51732338 14.95275     0.         ...  8.61496     1.5498
   0.        ]]
[-1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1

In [97]:
maj_class = -1
min_class = 1
if sum(y == default_classes[0]) > sum(y == default_classes[1]):
#     maj_class = default_classes[0]
#     min_class = default_classes[1]
    y[y==default_classes[0]] = maj_class
    y[y==default_classes[1]] = min_class
else:
#     maj_class = default_classes[1]
#     min_class = default_classes[0]
    y[y==default_classes[1]] = maj_class
    y[y==default_classes[0]] = min_class
    
print("There are {} instances for the majoritary class".format(sum(y == maj_class)))
print("There are {} instanes for the minoritary class".format(sum(y == min_class)))
classes = [maj_class,min_class]

There are 163 instances for the majoritary class
There are 51 instanes for the minoritary class


In [110]:
# Stratified hold-out split with a fixed seed, so both partitions keep the
# class proportions of the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,
    random_state=0,
)
# Number of features (columns) of the dataset.
D = X_train.shape[1]

# Class balance inside the training partition.
print("There are {} instances for the majoritary class".format(
    (y_train == maj_class).sum()))
print("There are {} instanes for the minoritary class".format(
    (y_train == min_class).sum()))

There are 122 instances for the majoritary class
There are 38 instanes for the minoritary class


In [116]:
from imblearn.ensemble import RUSBoostClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC
# Class names shown in the report. classification_report pairs them with the
# label values in sorted order, so this assumes the labels are exactly -1 and 1.
target_names = ['-1','1']

# Fit every boosting variant on the training split, then print its hold-out
# classification report and geometric mean score.
# - The base learner is passed positionally: the keyword was renamed from
#   `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4),
#   and the positional form works on both sides of that rename.
# - SMOTEBoost and RUSBoost reuse numClasificadoresBase instead of a literal 10 so
#   their ensemble size cannot drift from the AdaBoost baseline (the recorded
#   AdaBoost repr shows n_estimators=10, i.e. the same value).
# NOTE(review): the recorded output prints SMOTEBoost(k_neighbors=None, ...) even
# though k_neighbors=5 is passed -- confirm the class (defined elsewhere) stores it.
for algorithm in [#AdaBoostClassifier(SVC(), algorithm='SAMME', n_estimators=numClasificadoresBase),  # disabled SVC-based variant
                  AdaBoostClassifier(dtc, n_estimators=numClasificadoresBase),
                  SMOTEBoost(n_estimators=numClasificadoresBase, k_neighbors=5),
                  RUSBoost(n_estimators=numClasificadoresBase),
                  RUSBoostClassifier()]:
    algorithm.fit(X_train, y_train)
    y_pred = algorithm.predict(X_test)
    # Blank line, estimator repr, blank line: keeps the reports visually separated.
    print()
    print(str(algorithm))
    print()
    print(classification_report(y_test, y_pred,
                                target_names=target_names))
    # geometric_mean_score is expected to be in scope from an earlier cell.
    gmean = geometric_mean_score(y_test, y_pred)
    print("Gmean: ",gmean)


AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy',
                                                         max_depth=1),
                   n_estimators=10)

              precision    recall  f1-score   support

          -1       1.00      0.90      0.95        41
           1       0.76      1.00      0.87        13

    accuracy                           0.93        54
   macro avg       0.88      0.95      0.91        54
weighted avg       0.94      0.93      0.93        54

Gmean:  0.9499679070317291

SMOTEBoost(k_neighbors=None, n_estimators=10)

              precision    recall  f1-score   support

          -1       1.00      0.90      0.95        41
           1       0.76      1.00      0.87        13

    accuracy                           0.93        54
   macro avg       0.88      0.95      0.91        54
weighted avg       0.94      0.93      0.93        54

Gmean:  0.9499679070317291

RUSBoost(n_estimators=10, n_samples=0, with_replacement=



## Pima

In [121]:
# Read the KEEL "pima" file: lines starting with '@' are treated as comments,
# fields are comma-separated and column 8 (the class label) goes through intLabel.
filepath = "../keel_datasets/pima/pima.dat"
dataset = np.loadtxt(
    filepath,
    delimiter=",",
    comments='@',
    converters={8: intLabel},
)
# All columns except the last are features; the last one is the label.
X = dataset[:, :-1]
y = dataset[:, -1]
print(X)
print(y)

[[  6.    148.     72.    ...  33.6     0.627  50.   ]
 [  1.     85.     66.    ...  26.6     0.351  31.   ]
 [  8.    183.     64.    ...  23.3     0.672  32.   ]
 ...
 [  5.    121.     72.    ...  26.2     0.245  30.   ]
 [  1.    126.     60.    ...  30.1     0.349  47.   ]
 [  1.     93.     70.    ...  30.4     0.315  23.   ]]
[ 1. -1.  1. -1.  1. -1.  1. -1.  1.  1. -1.  1. -1.  1.  1.  1.  1.  1.
 -1.  1. -1. -1.  1.  1.  1.  1.  1. -1. -1. -1. -1.  1. -1. -1. -1. -1.
 -1.  1.  1.  1. -1. -1. -1.  1. -1.  1. -1. -1.  1. -1. -1. -1. -1.  1.
 -1. -1.  1. -1. -1. -1. -1.  1. -1. -1.  1. -1.  1. -1. -1. -1.  1. -1.
  1. -1. -1. -1. -1. -1.  1. -1. -1. -1. -1. -1.  1. -1. -1. -1.  1. -1.
 -1. -1. -1.  1. -1. -1. -1. -1. -1.  1.  1. -1. -1. -1. -1. -1. -1. -1.
 -1.  1.  1.  1. -1. -1.  1.  1.  1. -1. -1. -1.  1. -1. -1. -1.  1.  1.
 -1. -1.  1.  1.  1.  1.  1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.  1.
 -1. -1. -1. -1. -1. -1. -1. -1.  1. -1.  1.  1. -1. -1. -1.  1. -1. -1.
 -1. -1

In [122]:
maj_class = -1
min_class = 1
if sum(y == default_classes[0]) > sum(y == default_classes[1]):
#     maj_class = default_classes[0]
#     min_class = default_classes[1]
    y[y==default_classes[0]] = maj_class
    y[y==default_classes[1]] = min_class
else:
#     maj_class = default_classes[1]
#     min_class = default_classes[0]
    y[y==default_classes[1]] = maj_class
    y[y==default_classes[0]] = min_class
    
print("There are {} instances for the majoritary class".format(sum(y == maj_class)))
print("There are {} instanes for the minoritary class".format(sum(y == min_class)))
classes = [maj_class,min_class]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,random_state=0)
#number of features of the dataset 
D = X_train.shape[1]

print("There are {} instances for the majoritary class".format(sum(y_train == maj_class)))
print("There are {} instanes for the minoritary class".format(sum(y_train == min_class)))

There are 500 instances for the majoritary class
There are 268 instanes for the minoritary class
There are 375 instances for the majoritary class
There are 201 instanes for the minoritary class


In [123]:
from imblearn.ensemble import RUSBoostClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC
# Class names shown in the report. classification_report pairs them with the
# label values in sorted order, so this assumes the labels are exactly -1 and 1.
target_names = ['-1','1']

# Fit every boosting variant on the training split, then print its hold-out
# classification report and geometric mean score.
# - The base learner is passed positionally: the keyword was renamed from
#   `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4),
#   and the positional form works on both sides of that rename.
# - SMOTEBoost and RUSBoost reuse numClasificadoresBase instead of a literal 10 so
#   their ensemble size cannot drift from the AdaBoost baseline (the recorded
#   AdaBoost repr shows n_estimators=10, i.e. the same value).
# NOTE(review): the recorded output prints SMOTEBoost(k_neighbors=None, ...) even
# though k_neighbors=5 is passed -- confirm the class (defined elsewhere) stores it.
for algorithm in [#AdaBoostClassifier(SVC(), algorithm='SAMME', n_estimators=numClasificadoresBase),  # disabled SVC-based variant
                  AdaBoostClassifier(dtc, n_estimators=numClasificadoresBase),
                  SMOTEBoost(n_estimators=numClasificadoresBase, k_neighbors=5),
                  RUSBoost(n_estimators=numClasificadoresBase),
                  RUSBoostClassifier()]:
    algorithm.fit(X_train, y_train)
    y_pred = algorithm.predict(X_test)
    # Blank line, estimator repr, blank line: keeps the reports visually separated.
    print()
    print(str(algorithm))
    print()
    print(classification_report(y_test, y_pred,
                                target_names=target_names))
    # geometric_mean_score is expected to be in scope from an earlier cell.
    gmean = geometric_mean_score(y_test, y_pred)
    print("Gmean: ",gmean)


AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy',
                                                         max_depth=1),
                   n_estimators=10)

              precision    recall  f1-score   support

          -1       0.81      0.83      0.82       125
           1       0.67      0.64      0.66        67

    accuracy                           0.77       192
   macro avg       0.74      0.74      0.74       192
weighted avg       0.76      0.77      0.76       192

Gmean:  0.7307326113249164

SMOTEBoost(k_neighbors=None, n_estimators=10)

              precision    recall  f1-score   support

          -1       0.88      0.60      0.71       125
           1       0.53      0.85      0.66        67

    accuracy                           0.69       192
   macro avg       0.71      0.73      0.68       192
weighted avg       0.76      0.69      0.69       192

Gmean:  0.7144562696162935

RUSBoost(n_estimators=10, n_samples=0, with_replacement=

