From 54f9d1609707146a3d53d30ef69dcf9a6e3c454c Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Mon, 31 Oct 2016 17:14:00 +0100
Subject: [PATCH] [MRG] Apply deprecation SMOTE and ADASYN (#183)

* Apply deprecation SMOTE and ADASYN

* Update doc pipeline
---
 doc/whats_new.rst                |  4 ++
 imblearn/base.py                 | 27 +++++++++++++
 imblearn/over_sampling/adasyn.py | 26 +++++++++----
 imblearn/over_sampling/smote.py  | 65 ++++++++++++++++++++++----------
 imblearn/pipeline.py             |  4 +-
 5 files changed, 97 insertions(+), 29 deletions(-)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 29023772d..6f4d16264 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -14,6 +14,7 @@ Changelog
 
 Bug fixes
 ~~~~~~~~~
+
 - Fixed a bug in :class:`under_sampling.NearMiss` which was not picking the right samples during under-sampling for method 3. By `Guillaume Lemaitre`_.
 - Fixed a bug in :class:`ensemble.EasyEnsemble`, correction of the `random_state` generation. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
 - Fixed a bug in :class:`under_sampling.RepeatedEditedNearestNeighbours`, added an additional stopping criterion to avoid that the minority class becomes a majority class or that a class disappears. By `Guillaume Lemaitre`_.
@@ -53,6 +54,9 @@ API changes summary
 - Two base classes :class:`BaseBinaryclassSampler` and :class:`BaseMulticlassSampler` have been created to handle the target type and raise a warning in case of abnormality. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
 - Move `random_state` to be assigned in the :class:`SamplerMixin` initialization. By `Guillaume Lemaitre`_.
 - Provide estimators instead of parameters in :class:`combine.SMOTEENN` and :class:`combine.SMOTETomek`. Therefore, the list of parameters has been deprecated. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
+- `k` has been deprecated in :class:`over_sampling.ADASYN`. Use `n_neighbors` instead. By `Guillaume Lemaitre`_.
+- `k` and `m` have been deprecated in :class:`over_sampling.SMOTE`. Use `k_neighbors` and `m_neighbors` instead. By `Guillaume Lemaitre`_.
+
 
 Documentation changes
 ~~~~~~~~~~~~~~~~~~~~~
diff --git a/imblearn/base.py b/imblearn/base.py
index 8aa7e679d..954fc5c28 100644
--- a/imblearn/base.py
+++ b/imblearn/base.py
@@ -83,6 +83,10 @@ def fit(self, X, y):
         if hasattr(self, 'size_ngh'):
             self._validate_size_ngh_deprecation()
+        elif hasattr(self, 'k') and not hasattr(self, 'm'):
+            self._validate_k_deprecation()
+        elif hasattr(self, 'k') and hasattr(self, 'm'):
+            self._validate_k_m_deprecation()
 
         self.logger.info('Compute classes statistics ...')
@@ -161,6 +165,10 @@ def sample(self, X, y):
         if hasattr(self, 'size_ngh'):
             self._validate_size_ngh_deprecation()
+        elif hasattr(self, 'k') and not hasattr(self, 'm'):
+            self._validate_k_deprecation()
+        elif hasattr(self, 'k') and hasattr(self, 'm'):
+            self._validate_k_m_deprecation()
 
         return self._sample(X, y)
@@ -212,6 +220,25 @@ def _validate_size_ngh_deprecation(self):
                           ' `n_neighbors` instead.', DeprecationWarning)
             self.n_neighbors = self.size_ngh
 
+    def _validate_k_deprecation(self):
+        """Private function to warn about deprecation of k in ADASYN"""
+        if self.k is not None:
+            warnings.warn('`k` will be replaced in version 0.4. Use'
+                          ' `n_neighbors` instead.', DeprecationWarning)
+            self.n_neighbors = self.k
+
+    def _validate_k_m_deprecation(self):
+        """Private function to warn about deprecation of k and m in SMOTE"""
+        if self.k is not None:
+            warnings.warn('`k` will be replaced in version 0.4. Use'
+                          ' `k_neighbors` instead.', DeprecationWarning)
+            self.k_neighbors = self.k
+
+        if self.m is not None:
+            warnings.warn('`m` will be replaced in version 0.4. Use'
+                          ' `m_neighbors` instead.', DeprecationWarning)
+            self.m_neighbors = self.m
+
     @abstractmethod
     def _sample(self, X, y):
         """Resample the dataset.
diff --git a/imblearn/over_sampling/adasyn.py b/imblearn/over_sampling/adasyn.py
index 7d139d4e2..fc067fcae 100644
--- a/imblearn/over_sampling/adasyn.py
+++ b/imblearn/over_sampling/adasyn.py
@@ -31,7 +31,13 @@ class ADASYN(BaseBinarySampler):
         If None, the random number generator is the RandomState instance used
         by np.random.
 
-    k : int, optional (default=5)
+    k : int, optional (default=None)
         Number of nearest neighbours used to construct synthetic samples.
 
+        NOTE: `k` is deprecated from 0.2 and will be replaced in 0.4.
+        Use ``n_neighbors`` instead.
+
+    n_neighbors : int, optional (default=5)
+        Number of nearest neighbours used to construct synthetic samples.
+
     n_jobs : int, optional (default=1)
@@ -84,12 +90,15 @@ class ADASYN(BaseBinarySampler):
 
     """
 
-    def __init__(self, ratio='auto', random_state=None, k=5, n_jobs=1):
+    def __init__(self, ratio='auto', random_state=None, k=None, n_neighbors=5,
+                 n_jobs=1):
         super(ADASYN, self).__init__(ratio=ratio, random_state=random_state)
         self.k = k
+        self.n_neighbors = n_neighbors
         self.n_jobs = n_jobs
-        self.nearest_neighbour = NearestNeighbors(n_neighbors=self.k + 1,
-                                                  n_jobs=self.n_jobs)
+        self.nearest_neighbour = NearestNeighbors(
+            n_neighbors=self.n_neighbors + 1,
+            n_jobs=self.n_jobs)
 
     def _sample(self, X, y):
         """Resample the dataset.
@@ -130,7 +139,8 @@ def _sample(self, X, y):
         X_min = X[y == self.min_c_]
 
         # Print if verbose is true
-        self.logger.debug('Finding the %s nearest neighbours ...', self.k)
+        self.logger.debug('Finding the %s nearest neighbours ...',
+                          self.n_neighbors)
 
         # Look for k-th nearest neighbours, excluding, of course, the
         # point itself.
@@ -140,7 +150,8 @@ def _sample(self, X, y):
         _, ind_nn = self.nearest_neighbour.kneighbors(X_min)
 
         # Compute the ratio of majority samples next to minority samples
-        ratio_nn = np.sum(y[ind_nn[:, 1:]] == self.maj_c_, axis=1) / self.k
+        ratio_nn = (np.sum(y[ind_nn[:, 1:]] == self.maj_c_, axis=1) /
+                    self.n_neighbors)
         # Check that we found at least some neighbours belonging to the
         # majority class
         if not np.sum(ratio_nn):
@@ -158,7 +169,8 @@ def _sample(self, X, y):
 
         for x_i, x_i_nn, num_sample_i in zip(X_min, ind_nn, num_samples_nn):
             # Pick-up the neighbors wanted
-            nn_zs = random_state.randint(1, high=self.k + 1, size=num_sample_i)
+            nn_zs = random_state.randint(1, high=self.n_neighbors + 1,
+                                         size=num_sample_i)
 
             # Create a new sample
             for nn_z in nn_zs:
diff --git a/imblearn/over_sampling/smote.py b/imblearn/over_sampling/smote.py
index 02168609d..48deb6041 100644
--- a/imblearn/over_sampling/smote.py
+++ b/imblearn/over_sampling/smote.py
@@ -33,10 +33,23 @@ class SMOTE(BaseBinarySampler):
         If None, the random number generator is the RandomState instance used
         by np.random.
 
-    k : int, optional (default=5)
+    k : int, optional (default=None)
         Number of nearest neighbours used to construct synthetic samples.
 
-    m : int, optional (default=10)
+        NOTE: `k` is deprecated from 0.2 and will be replaced in 0.4.
+        Use ``k_neighbors`` instead.
+
+    k_neighbors : int, optional (default=5)
+        Number of nearest neighbours used to construct synthetic samples.
+
+    m : int, optional (default=None)
         Number of nearest neighbours to use to determine if a minority sample
         is in danger.
+
+        NOTE: `m` is deprecated from 0.2 and will be replaced in 0.4.
+        Use ``m_neighbors`` instead.
+
+    m_neighbors : int, optional (default=10)
+        Number of nearest neighbours to use to determine if a minority sample
+        is in danger.
@@ -102,12 +115,15 @@ class SMOTE(BaseBinarySampler):
 
     """
 
-    def __init__(self, ratio='auto', random_state=None, k=5, m=10,
-                 out_step=0.5, kind='regular', n_jobs=-1, **kwargs):
+    def __init__(self, ratio='auto', random_state=None, k=None, k_neighbors=5,
+                 m=None, m_neighbors=10, out_step=0.5, kind='regular',
+                 n_jobs=-1, **kwargs):
         super(SMOTE, self).__init__(ratio=ratio, random_state=random_state)
         self.kind = kind
         self.k = k
+        self.k_neighbors = k_neighbors
         self.m = m
+        self.m_neighbors = m_neighbors
         self.out_step = out_step
         self.n_jobs = n_jobs
         self.kwargs = kwargs
@@ -149,11 +165,11 @@ def _in_danger_noise(self, samples, y, kind='danger'):
 
         if kind == 'danger':
             # Samples are in danger for m/2 <= m' < m
-            return np.bitwise_and(n_maj >= float(self.m) / 2.,
-                                  n_maj < self.m)
+            return np.bitwise_and(n_maj >= float(self.m_neighbors) / 2.,
+                                  n_maj < self.m_neighbors)
         elif kind == 'noise':
             # Samples are noise for m = m'
-            return n_maj == self.m
+            return n_maj == self.m_neighbors
         else:
             raise NotImplementedError
@@ -281,7 +297,8 @@ def _sample(self, X, y):
 
         # If regular SMOTE is to be performed
         if self.kind == 'regular':
 
-            self.logger.debug('Finding the %s nearest neighbours ...', self.k)
+            self.logger.debug('Finding the %s nearest neighbours ...',
+                              self.k_neighbors)
 
             # Look for k-th nearest neighbours, excluding, of course, the
             # point itself.
@@ -312,7 +329,8 @@ def _sample(self, X, y):
 
         if self.kind == 'borderline1' or self.kind == 'borderline2':
 
-            self.logger.debug('Finding the %s nearest neighbours ...', self.m)
+            self.logger.debug('Finding the %s nearest neighbours ...',
+                              self.m_neighbors)
 
             # Find the NNs for all samples in the data set.
             self.nearest_neighbour.fit(X)
@@ -334,7 +352,8 @@ def _sample(self, X, y):
             #
             # We start by changing the number of NNs to consider from m + 1
             # to k + 1
-            self.nearest_neighbour.set_params(**{'n_neighbors': self.k + 1})
+            self.nearest_neighbour.set_params(**{'n_neighbors':
+                                                 self.k_neighbors + 1})
             self.nearest_neighbour.fit(X_min)
 
             # nns...#
@@ -358,7 +377,7 @@ def _sample(self, X, y):
 
                 # Reset the k-neighbours to m+1 neighbours
                 self.nearest_neighbour.set_params(
-                    **{'n_neighbors': self.m + 1})
+                    **{'n_neighbors': self.m_neighbors + 1})
 
                 return X_resampled, y_resampled
@@ -395,7 +414,7 @@ def _sample(self, X, y):
 
                 # Reset the k-neighbours to m+1 neighbours
                 self.nearest_neighbour.set_params(
-                    **{'n_neighbors': self.m + 1})
+                    **{'n_neighbors': self.m_neighbors + 1})
 
                 return X_resampled, y_resampled
@@ -416,7 +435,8 @@ def _sample(self, X, y):
 
             # First, find the nn of all the samples to identify samples
             # in danger and noisy ones
-            self.logger.debug('Finding the %s nearest neighbours ...', self.m)
+            self.logger.debug('Finding the %s nearest neighbours ...',
+                              self.m_neighbors)
 
             # As usual, fit a nearest neighbour model to the data
             self.nearest_neighbour.fit(X)
@@ -439,9 +459,11 @@ def _sample(self, X, y):
                               safety_bool.sum().astype(int))
 
             # Proceed to find support vectors NNs among the minority class
-            self.logger.debug('Finding the %s nearest neighbours ...', self.k)
+            self.logger.debug('Finding the %s nearest neighbours ...',
+                              self.k_neighbors)
 
-            self.nearest_neighbour.set_params(**{'n_neighbors': self.k + 1})
+            self.nearest_neighbour.set_params(**{'n_neighbors':
+                                                 self.k_neighbors + 1})
             self.nearest_neighbour.fit(X_min)
 
             self.logger.debug('Create synthetic samples ...')
@@ -496,7 +518,8 @@ def _sample(self, X, y):
             y_resampled = np.concatenate((y, y_new_1), axis=0)
 
             # Reset the k-neighbours to m+1 neighbours
-            self.nearest_neighbour.set_params(**{'n_neighbors': self.m + 1})
+            self.nearest_neighbour.set_params(**{'n_neighbors':
+                                                 self.m_neighbors + 1})
 
             return X_resampled, y_resampled
@@ -509,8 +532,9 @@ def _get_smote_kind(self):
             # Regular smote does not look for samples in danger, instead it
             # creates synthetic samples directly from the k-th nearest
             # neighbours without filtering
-            self.nearest_neighbour = NearestNeighbors(n_neighbors=self.k + 1,
-                                                      n_jobs=self.n_jobs)
+            self.nearest_neighbour = NearestNeighbors(
+                n_neighbors=self.k_neighbors + 1,
+                n_jobs=self.n_jobs)
         else:
             # Borderline1, 2 and SVM variations of smote must first look for
             # samples that could be considered noise and samples that live
             # near the boundary between the classes. Therefore, before
             # creating synthetic samples from the k-th nns, it first looks
             # for the m nearest neighbors to decide whether or not a sample
             # is noise or near the boundary.
-            self.nearest_neighbour = NearestNeighbors(n_neighbors=self.m + 1,
-                                                      n_jobs=self.n_jobs)
+            self.nearest_neighbour = NearestNeighbors(
+                n_neighbors=self.m_neighbors + 1,
+                n_jobs=self.n_jobs)
 
         # --- SVM smote
         # Unlike the borderline variations, the SVM variation uses the support
diff --git a/imblearn/pipeline.py b/imblearn/pipeline.py
index 79a874129..289d91940 100644
--- a/imblearn/pipeline.py
+++ b/imblearn/pipeline.py
@@ -100,8 +100,8 @@ class Pipeline(pipeline.Pipeline):
     >>> pipeline = Pipeline([('smt', smt), ('pca', pca), ('knn', knn)])
    >>> X_train, X_test, y_train, y_test = tts(X, y, random_state=42)
     >>> pipeline.fit(X_train, y_train)
-    Pipeline(steps=[('smt', SMOTE(k=5, kind='regular', m=10, n_jobs=-1, out_step=0.5, random_state=42,
-    ratio='auto')), ('pca', PCA(copy=True, n_components=None, whiten=False)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
+    Pipeline(steps=[('smt', SMOTE(k=None, k_neighbors=5, kind='regular', m=None, m_neighbors=10,
+    n_jobs=-1, out_step=0.5, random_state=42, ratio='auto')), ('pca', PCA(copy=True, n_components=None, whiten=False)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
     metric_params=None, n_jobs=1, n_neighbors=5, p=2, weights='uniform'))])
     >>> y_hat = pipeline.predict(X_test)
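
Usage sketch (not part of the patch): the snippet below illustrates the deprecation path added in imblearn/base.py, assuming imbalanced-learn at this revision; the toy data built with scikit-learn's make_classification (class weights, sample count, seeds) are illustrative choices, not values taken from the patch. Passing the old keyword still works but emits a DeprecationWarning, and the value is remapped to the new name (`k_neighbors`/`m_neighbors` for SMOTE, `n_neighbors` for ADASYN) when `fit` or `sample` is called.

    import warnings

    from sklearn.datasets import make_classification
    from imblearn.over_sampling import ADASYN, SMOTE

    # A toy imbalanced binary problem; the exact data do not matter here.
    X, y = make_classification(n_classes=2, weights=[0.9, 0.1],
                               n_samples=200, random_state=42)

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        # Deprecated spellings: at fit time, SMOTE remaps `k` to
        # `k_neighbors` and ADASYN remaps `k` to `n_neighbors`,
        # each emitting a DeprecationWarning.
        SMOTE(k=3, random_state=42).fit(X, y)
        ADASYN(k=3, random_state=42).fit(X, y)

    for w in caught:
        print(w.category.__name__, ':', w.message)

    # Preferred spellings introduced by this patch:
    smote = SMOTE(k_neighbors=3, random_state=42)
    adasyn = ADASYN(n_neighbors=3, random_state=42)

Note the design choice visible in the diff: the remapping lives in SamplerMixin.fit and SamplerMixin.sample rather than in each estimator's __init__, matching the existing `size_ngh` deprecation hook, so the deprecated attributes are only translated when the estimator is actually used.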