In [3]:
from collections import Counter

import numpy as np
from sklearn.base import is_regressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble.forest import BaseForest
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
from sklearn.tree.tree import BaseDecisionTree
from sklearn.utils import check_array
from sklearn.utils import check_random_state
from sklearn.utils import check_X_y
#from sklearn.utils import shuffle


In [1]:
class SMOTE(object):
    """Implementation of Synthetic Minority Over-Sampling Technique (SMOTE).

    SMOTE performs oversampling of the minority class by picking target
    minority class samples and their nearest minority class neighbors and
    generating new samples that linearly combine features of each target
    sample with features of its selected minority class neighbors [1].

    Parameters
    ----------
    k_neighbors : int, optional (default=5)
        Number of nearest neighbors.
    random_state : int or None, optional (default=None)
        Seed for the random number generator used by `sample`. If None, the
        generator is seeded unpredictably.

    Attributes
    ----------
    k : int
        The value passed as `k_neighbors`.
    X : array, shape = [n_minority_samples, n_features]
        The minority samples stored by `fit`.
    neigh : NearestNeighbors
        Neighbor index built by `fit` with `k + 1` neighbors.

    References
    ----------
    .. [1] N. V. Chawla, K. W. Bowyer, L. O. Hall, and P. Kegelmeyer. "SMOTE:
           Synthetic Minority Over-Sampling Technique." Journal of Artificial
           Intelligence Research (JAIR), 2002.
    """

    def __init__(self, k_neighbors=5, random_state=None):
        self.k = k_neighbors
        self.random_state = random_state

    def fit(self, X):
        """Store the minority samples and index their nearest neighbors.

        Parameters
        ----------
        X : array-like, shape = [n_minority_samples, n_features]
            Holds the minority samples. Must expose a two-element `.shape`.

        Returns
        -------
        self : object
            Returns self.
        """
        self.X = X
        self.n_minority_samples, self.n_features = self.X.shape

        # Ask for k + 1 neighbors: `sample` drops the first column of every
        # query result, treating it as the queried point itself.
        self.neigh = NearestNeighbors(n_neighbors=self.k + 1)
        self.neigh.fit(self.X)

        return self

    def sample(self, n_samples):
        """Generate synthetic minority samples.

        Each synthetic sample is a random point on the segment joining a
        randomly chosen minority sample and one of its randomly chosen
        nearest neighbors.

        Parameters
        ----------
        n_samples : int
            Number of new synthetic samples.

        Returns
        -------
        S : array, shape = [n_samples, n_features]
            Returns synthetic samples.
        """
        # A private generator yields the same stream as seeding the global
        # NumPy generator with `random_state`, but leaves the global state
        # untouched for the rest of the program.
        rng = np.random.RandomState(self.random_state)

        S = np.zeros(shape=(n_samples, self.n_features))

        # Calculate synthetic samples.
        for i in range(n_samples):
            j = rng.randint(0, self.X.shape[0])

            # Find the k nearest neighbors of sample j. The first column is
            # discarded on the assumption that it is sample j itself
            # (NOTE(review): exact duplicates in X may break that assumption).
            nn = self.neigh.kneighbors(self.X[j].reshape(1, -1),
                                       return_distance=False)[:, 1:]
            nn_index = rng.choice(nn[0])

            # Interpolate between sample j and the chosen neighbor.
            diff = self.X[nn_index] - self.X[j]
            gap = rng.random_sample()

            S[i, :] = self.X[j, :] + gap * diff[:]

        return S
    
    

In [None]:
class SMOTEBoost(AdaBoostClassifier):
        """Implementation of SMOTEBoost.
        SMOTEBoost introduces data sampling into the AdaBoost algorithm by
        oversampling the minority class using SMOTE on each boosting iteration [1].
        This implementation inherits methods from the scikit-learn 
        AdaBoostClassifier class, only modifying the `fit` method.
        Parameters
        ----------
        n_samples : int, optional (default=100)
            Number of new synthetic samples per boosting step.
        k_neighbors : int, optional (default=5)
            Number of nearest neighbors.
        base_estimator : object, optional (default=DecisionTreeClassifier)
            The base estimator from which the boosted ensemble is built.
            Support for sample weighting is required, as well as proper `classes_`
            and `n_classes_` attributes.
        n_estimators : int, optional (default=50)
            The maximum number of estimators at which boosting is terminated.
            In case of perfect fit, the learning procedure is stopped early.
        learning_rate : float, optional (default=1.)
            Learning rate shrinks the contribution of each classifier by
            ``learning_rate``. There is a trade-off between ``learning_rate`` and
            ``n_estimators``.
        algorithm : {'SAMME', 'SAMME.R'}, optional (default='SAMME.R')
            If 'SAMME.R' then use the SAMME.R real boosting algorithm.
            ``base_estimator`` must support calculation of class probabilities.
            If 'SAMME' then use the SAMME discrete boosting algorithm.
            The SAMME.R algorithm typically converges faster than SAMME,
            achieving a lower test error with fewer boosting iterations.
        random_state : int or None, optional (default=None)
            If int, random_state is the seed used by the random number generator.
            If None, the random number generator is the RandomState instance used
            by np.random.

        References
        ----------
        .. [1] N. V. Chawla, A. Lazarevic, L. O. Hall, and K. W. Bowyer.
               "SMOTEBoost: Improving Prediction of the Minority Class in
               Boosting." European Conference on Principles of Data Mining and
               Knowledge Discovery (PKDD), 2003.
        """
        
        def __init__(self,
                     n_samples=100,
                     k_neighbors=5,
                     base_estimator=None,
                     n_estimators=50,
                     learning_rate=1.,
                     algorithm='SAMME.R',
                     random_state=None):
            """Store the SMOTE settings and configure the AdaBoost parent.

            Parameters
            ----------
            n_samples : int, optional (default=100)
                Number of new synthetic samples per boosting step.
            k_neighbors : int, optional (default=5)
                Number of nearest neighbors handed to the SMOTE sampler.
            base_estimator : object, optional (default=None)
                Forwarded unchanged to the parent constructor.
            n_estimators : int, optional (default=50)
                Forwarded unchanged to the parent constructor.
            learning_rate : float, optional (default=1.)
                Forwarded unchanged to the parent constructor.
            algorithm : {'SAMME', 'SAMME.R'}, optional (default='SAMME.R')
                Forwarded unchanged to the parent constructor.
            random_state : int or None, optional (default=None)
                Used both for the SMOTE sampler and the parent constructor.
            """
            self.n_samples = n_samples
            # Keep the raw constructor argument as an attribute of the same
            # name so that parameter introspection (get_params / clone) can
            # find it.
            self.k_neighbors = k_neighbors
            self.smote = SMOTE(k_neighbors=k_neighbors,
                               random_state=random_state)

            # `algorithm` has to be forwarded: the parent constructor assigns
            # `self.algorithm` itself, so setting it here beforehand and
            # omitting it from this call would silently reset it to the
            # parent's default.
            super().__init__(
                base_estimator=base_estimator,
                n_estimators=n_estimators,
                learning_rate=learning_rate,
                algorithm=algorithm,
                random_state=random_state)
            
        def fit(self, X, y, sample_weight=None, minority_target=None):
            """Build a boosted classifier/regressor from the training set (X, y),
            performing SMOTE during each boosting step.

            The statements below validate the hyper-parameters and the
            training data, initialise or normalise the sample weights, and
            record the minority class label in `self.minority_target`.

            Parameters
            ----------
            X : {array-like, sparse matrix} of shape = [n_samples, n_features]
                The training input samples. Accepted sparse formats are 'csc'
                when the base estimator is None, a tree or a forest, and
                'csr' / 'csc' otherwise; in the former case the data is also
                converted to float64.
            y : array-like of shape = [n_samples]
                The target values (class labels in classification, real numbers in
                regression).
            sample_weight : array-like of shape = [n_samples], optional
                Sample weights. If None, the sample weights are initialized to
                1 / n_samples. Otherwise they are normalised to sum to one.
            minority_target : int
                Minority class label. If None, the least frequent label in
                `y` is used.

            Returns
            -------
            self : object
                Intended return value. NOTE(review): no boosting loop or
                `return` statement follows the set-up code in the portion of
                the file reviewed -- confirm the method is finished.

            Raises
            ------
            ValueError
                If `algorithm` is not 'SAMME' or 'SAMME.R', if
                `learning_rate` is not positive, or if the supplied sample
                weights do not have a positive sum.

            Notes
            -----
            Based on the scikit-learn v0.18 AdaBoostClassifier and
            BaseWeightBoosting `fit` methods.
            """
            # Check that algorithm is supported.
            if self.algorithm not in ('SAMME', 'SAMME.R'):
                raise ValueError("algorithm %s is not supported" % self.algorithm)

            # Check parameters.
            if self.learning_rate <= 0:
                raise ValueError("learning rate must be greater than zero")

            if (self.base_estimator is None or
                    isinstance(self.base_estimator, (BaseDecisionTree,
                                                     BaseForest))):
                DTYPE = np.float64  # from fast_dict.pxd
                dtype = DTYPE
                accept_sparse = 'csc'
            else:
                dtype = None
                accept_sparse = ['csr', 'csc']

            X, y = check_X_y(X, y, accept_sparse=accept_sparse, dtype=dtype,
                             y_numeric=is_regressor(self))

            if sample_weight is None:
                # Initialize weights to 1/n_samples.
                sample_weight = np.empty(X.shape[0], dtype=np.float64)
                sample_weight[:] = 1. / X.shape[0]
            else:
                sample_weight = check_array(sample_weight, ensure_2d=False)

                # The sum must be tested BEFORE normalising: afterwards it is
                # always 1 (or NaN for a zero total), so the guard could
                # never trigger.
                weight_total = sample_weight.sum(dtype=np.float64)
                if weight_total <= 0:
                    raise ValueError(
                        "Attempting to fit with a non-positive "
                        "weighted number of samples.")

                # Normalize existing weights.
                sample_weight = sample_weight / weight_total

            # Resolve the minority label regardless of how the weights were
            # obtained; it must not depend on `sample_weight` being given.
            if minority_target is None:
                # Determine the minority class label: the least frequent one.
                stats_c_ = Counter(y)
                self.minority_target = min(stats_c_, key=stats_c_.get)
            else:
                self.minority_target = minority_target