Skip to content

Commit

Permalink
[MRG] Apply deprecation to SMOTE and ADASYN (scikit-learn-contrib#183)
Browse files Browse the repository at this point in the history
* Apply deprecation to SMOTE and ADASYN

* Update doc pipeline
  • Loading branch information
glemaitre committed Oct 31, 2016
1 parent e8f7874 commit 54f9d16
Show file tree
Hide file tree
Showing 5 changed files with 97 additions and 29 deletions.
4 changes: 4 additions & 0 deletions doc/whats_new.rst
Expand Up @@ -14,6 +14,7 @@ Changelog

Bug fixes
~~~~~~~~~

- Fixed a bug in :class:`under_sampling.NearMiss` which was not picking the right samples during under sampling for the method 3. By `Guillaume Lemaitre`_.
- Fixed a bug in :class:`ensemble.EasyEnsemble`, correction of the `random_state` generation. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
- Fixed a bug in :class:`under_sampling.RepeatedEditedNearestNeighbours`, add additional stopping criterion to avoid that the minority class become a majority class or that a class disappear. By `Guillaume Lemaitre`_.
Expand Down Expand Up @@ -53,6 +54,9 @@ API changes summary
- Two base classes :class:`BaseBinaryclassSampler` and :class:`BaseMulticlassSampler` have been created to handle the target type and raise warning in case of abnormality. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
- Move `random_state` to be assigned in the :class:`SamplerMixin` initialization. By `Guillaume Lemaitre`_.
- Provide estimators instead of parameters in :class:`combine.SMOTEENN` and :class:`combine.SMOTETomek`. Therefore, the list of parameters have been deprecated. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
- `k` has been deprecated in :class:`over_sampling.ADASYN`. Use `n_neighbors` instead. By `Guillaume Lemaitre`_.
- `k` and `m` have been deprecated in :class:`over_sampling.SMOTE`. Use `k_neighbors` and `m_neighbors` instead. By `Guillaume Lemaitre`_.


Documentation changes
~~~~~~~~~~~~~~~~~~~~~
Expand Down
27 changes: 27 additions & 0 deletions imblearn/base.py
Expand Up @@ -83,6 +83,10 @@ def fit(self, X, y):

if hasattr(self, 'size_ngh'):
self._validate_size_ngh_deprecation()
elif hasattr(self, 'k') and not hasattr(self, 'm'):
self._validate_k_deprecation()
elif hasattr(self, 'k') and hasattr(self, 'm'):
self._validate_k_m_deprecation()

self.logger.info('Compute classes statistics ...')

Expand Down Expand Up @@ -161,6 +165,10 @@ def sample(self, X, y):

if hasattr(self, 'size_ngh'):
self._validate_size_ngh_deprecation()
elif hasattr(self, 'k') and not hasattr(self, 'm'):
self._validate_k_deprecation()
elif hasattr(self, 'k') and hasattr(self, 'm'):
self._validate_k_m_deprecation()

return self._sample(X, y)

Expand Down Expand Up @@ -212,6 +220,25 @@ def _validate_size_ngh_deprecation(self):
' `n_neighbors` instead.', DeprecationWarning)
self.n_neighbors = self.size_ngh

def _validate_k_deprecation(self):
    """Warn about the deprecated ``k`` parameter of ADASYN.

    When the legacy ``k`` parameter was set by the user, emit a
    DeprecationWarning and mirror its value into ``n_neighbors`` so
    the rest of the sampler relies on the new attribute name.
    """
    if self.k is None:
        return
    warnings.warn('`k` will be replaced in version 0.4. Use'
                  ' `n_neighbors` instead.', DeprecationWarning)
    # Legacy value takes precedence over the new parameter.
    self.n_neighbors = self.k

def _validate_k_m_deprecation(self):
    """Warn about the deprecated ``k`` and ``m`` parameters of SMOTE.

    If the legacy ``k`` (resp. ``m``) parameter was set, emit a
    DeprecationWarning and copy its value into ``k_neighbors``
    (resp. ``m_neighbors``) so the sampler uses the new attribute
    names downstream.
    """
    # NOTE: the original docstring said "k in ADASYN", which was a
    # copy-paste error — this handler covers SMOTE's k and m.
    if self.k is not None:
        warnings.warn('`k` will be replaced in version 0.4. Use'
                      ' `k_neighbors` instead.', DeprecationWarning)
        self.k_neighbors = self.k

    if self.m is not None:
        warnings.warn('`m` will be replaced in version 0.4. Use'
                      ' `m_neighbors` instead.', DeprecationWarning)
        self.m_neighbors = self.m

@abstractmethod
def _sample(self, X, y):
"""Resample the dataset.
Expand Down
26 changes: 19 additions & 7 deletions imblearn/over_sampling/adasyn.py
Expand Up @@ -31,7 +31,13 @@ class ADASYN(BaseBinarySampler):
If None, the random number generator is the RandomState instance used
by np.random.
k : int, optional (default=5)
k : int, optional (default=None)
Number of nearest neighbours to used to construct synthetic samples.
NOTE: `k` is deprecated from 0.2 and will be replaced in 0.4
Use ``n_neighbors`` instead.
n_neighbors : int, optional (default=5)
Number of nearest neighbours used to construct synthetic samples.
n_jobs : int, optional (default=1)
Expand Down Expand Up @@ -84,12 +90,15 @@ class ADASYN(BaseBinarySampler):
"""

def __init__(self, ratio='auto', random_state=None, k=5, n_jobs=1):
def __init__(self, ratio='auto', random_state=None, k=None, n_neighbors=5,
n_jobs=1):
super(ADASYN, self).__init__(ratio=ratio, random_state=random_state)
self.k = k
self.n_neighbors = n_neighbors
self.n_jobs = n_jobs
self.nearest_neighbour = NearestNeighbors(n_neighbors=self.k + 1,
n_jobs=self.n_jobs)
self.nearest_neighbour = NearestNeighbors(
n_neighbors=self.n_neighbors + 1,
n_jobs=self.n_jobs)

def _sample(self, X, y):
"""Resample the dataset.
Expand Down Expand Up @@ -130,7 +139,8 @@ def _sample(self, X, y):
X_min = X[y == self.min_c_]

# Print if verbose is true
self.logger.debug('Finding the %s nearest neighbours ...', self.k)
self.logger.debug('Finding the %s nearest neighbours ...',
self.n_neighbors)

# Look for k-th nearest neighbours, excluding, of course, the
# point itself.
Expand All @@ -140,7 +150,8 @@ def _sample(self, X, y):
_, ind_nn = self.nearest_neighbour.kneighbors(X_min)

# Compute the ratio of majority samples next to minority samples
ratio_nn = np.sum(y[ind_nn[:, 1:]] == self.maj_c_, axis=1) / self.k
ratio_nn = (np.sum(y[ind_nn[:, 1:]] == self.maj_c_, axis=1) /
self.n_neighbors)
# Check that we found at least some neighbours belonging to the
# majority class
if not np.sum(ratio_nn):
Expand All @@ -158,7 +169,8 @@ def _sample(self, X, y):
for x_i, x_i_nn, num_sample_i in zip(X_min, ind_nn, num_samples_nn):

# Pick-up the neighbors wanted
nn_zs = random_state.randint(1, high=self.k + 1, size=num_sample_i)
nn_zs = random_state.randint(1, high=self.n_neighbors + 1,
size=num_sample_i)

# Create a new sample
for nn_z in nn_zs:
Expand Down
65 changes: 45 additions & 20 deletions imblearn/over_sampling/smote.py
Expand Up @@ -33,10 +33,23 @@ class SMOTE(BaseBinarySampler):
If None, the random number generator is the RandomState instance used
by np.random.
k : int, optional (default=5)
k : int, optional (default=None)
Number of nearest neighbours to used to construct synthetic samples.
m : int, optional (default=10)
NOTE: `k` is deprecated from 0.2 and will be replaced in 0.4
Use ``k_neighbors`` instead.
k_neighbors : int, optional (default=5)
Number of nearest neighbours to used to construct synthetic samples.
m : int, optional (default=None)
Number of nearest neighbours to use to determine if a minority sample
is in danger.
NOTE: `m` is deprecated from 0.2 and will be replaced in 0.4
Use ``m_neighbors`` instead.
m_neighbors : int, optional (default=10)
Number of nearest neighbours to use to determine if a minority sample
is in danger.
Expand Down Expand Up @@ -102,12 +115,15 @@ class SMOTE(BaseBinarySampler):
"""

def __init__(self, ratio='auto', random_state=None, k=5, m=10,
out_step=0.5, kind='regular', n_jobs=-1, **kwargs):
def __init__(self, ratio='auto', random_state=None, k=None, k_neighbors=5,
m=None, m_neighbors=10, out_step=0.5, kind='regular',
n_jobs=-1, **kwargs):
super(SMOTE, self).__init__(ratio=ratio, random_state=random_state)
self.kind = kind
self.k = k
self.k_neighbors = k_neighbors
self.m = m
self.m_neighbors = m_neighbors
self.out_step = out_step
self.n_jobs = n_jobs
self.kwargs = kwargs
Expand Down Expand Up @@ -149,11 +165,11 @@ def _in_danger_noise(self, samples, y, kind='danger'):

if kind == 'danger':
# Samples are in danger for m/2 <= m' < m
return np.bitwise_and(n_maj >= float(self.m) / 2.,
n_maj < self.m)
return np.bitwise_and(n_maj >= float(self.m_neighbors) / 2.,
n_maj < self.m_neighbors)
elif kind == 'noise':
# Samples are noise for m = m'
return n_maj == self.m
return n_maj == self.m_neighbors
else:
raise NotImplementedError

Expand Down Expand Up @@ -281,7 +297,8 @@ def _sample(self, X, y):
# If regular SMOTE is to be performed
if self.kind == 'regular':

self.logger.debug('Finding the %s nearest neighbours ...', self.k)
self.logger.debug('Finding the %s nearest neighbours ...',
self.k_neighbors)

# Look for k-th nearest neighbours, excluding, of course, the
# point itself.
Expand Down Expand Up @@ -312,7 +329,8 @@ def _sample(self, X, y):

if self.kind == 'borderline1' or self.kind == 'borderline2':

self.logger.debug('Finding the %s nearest neighbours ...', self.m)
self.logger.debug('Finding the %s nearest neighbours ...',
self.m_neighbors)

# Find the NNs for all samples in the data set.
self.nearest_neighbour.fit(X)
Expand All @@ -334,7 +352,8 @@ def _sample(self, X, y):
#
# We start by changing the number of NNs to consider from m + 1
# to k + 1
self.nearest_neighbour.set_params(**{'n_neighbors': self.k + 1})
self.nearest_neighbour.set_params(**{'n_neighbors':
self.k_neighbors + 1})
self.nearest_neighbour.fit(X_min)

# nns...#
Expand All @@ -358,7 +377,7 @@ def _sample(self, X, y):

# Reset the k-neighbours to m+1 neighbours
self.nearest_neighbour.set_params(
**{'n_neighbors': self.m + 1})
**{'n_neighbors': self.m_neighbors + 1})

return X_resampled, y_resampled

Expand Down Expand Up @@ -395,7 +414,7 @@ def _sample(self, X, y):

# Reset the k-neighbours to m+1 neighbours
self.nearest_neighbour.set_params(
**{'n_neighbors': self.m + 1})
**{'n_neighbors': self.m_neighbors + 1})

return X_resampled, y_resampled

Expand All @@ -416,7 +435,8 @@ def _sample(self, X, y):

# First, find the nn of all the samples to identify samples
# in danger and noisy ones
self.logger.debug('Finding the %s nearest neighbours ...', self.m)
self.logger.debug('Finding the %s nearest neighbours ...',
self.m_neighbors)

# As usual, fit a nearest neighbour model to the data
self.nearest_neighbour.fit(X)
Expand All @@ -439,9 +459,11 @@ def _sample(self, X, y):
safety_bool.sum().astype(int))

# Proceed to find support vectors NNs among the minority class
self.logger.debug('Finding the %s nearest neighbours ...', self.k)
self.logger.debug('Finding the %s nearest neighbours ...',
self.k_neighbors)

self.nearest_neighbour.set_params(**{'n_neighbors': self.k + 1})
self.nearest_neighbour.set_params(**{'n_neighbors':
self.k_neighbors + 1})
self.nearest_neighbour.fit(X_min)

self.logger.debug('Create synthetic samples ...')
Expand Down Expand Up @@ -496,7 +518,8 @@ def _sample(self, X, y):
y_resampled = np.concatenate((y, y_new_1), axis=0)

# Reset the k-neighbours to m+1 neighbours
self.nearest_neighbour.set_params(**{'n_neighbors': self.m + 1})
self.nearest_neighbour.set_params(**{'n_neighbors':
self.m_neighbors + 1})

return X_resampled, y_resampled

Expand All @@ -509,17 +532,19 @@ def _get_smote_kind(self):
# Regular smote does not look for samples in danger, instead it
# creates synthetic samples directly from the k-th nearest
# neighbours with not filtering
self.nearest_neighbour = NearestNeighbors(n_neighbors=self.k + 1,
n_jobs=self.n_jobs)
self.nearest_neighbour = NearestNeighbors(
n_neighbors=self.k_neighbors + 1,
n_jobs=self.n_jobs)
else:
# Borderline1, 2 and SVM variations of smote must first look for
# samples that could be considered noise and samples that live
# near the boundary between the classes. Therefore, before
# creating synthetic samples from the k-th nns, it first look
# for m nearest neighbors to decide whether or not a sample is
# noise or near the boundary.
self.nearest_neighbour = NearestNeighbors(n_neighbors=self.m + 1,
n_jobs=self.n_jobs)
self.nearest_neighbour = NearestNeighbors(
n_neighbors=self.m_neighbors + 1,
n_jobs=self.n_jobs)

# --- SVM smote
# Unlike the borderline variations, the SVM variation uses the support
Expand Down
4 changes: 2 additions & 2 deletions imblearn/pipeline.py
Expand Up @@ -100,8 +100,8 @@ class Pipeline(pipeline.Pipeline):
>>> pipeline = Pipeline([('smt', smt), ('pca', pca), ('knn', knn)])
>>> X_train, X_test, y_train, y_test = tts(X, y, random_state=42)
>>> pipeline.fit(X_train, y_train)
Pipeline(steps=[('smt', SMOTE(k=5, kind='regular', m=10, n_jobs=-1, out_step=0.5, random_state=42,
ratio='auto')), ('pca', PCA(copy=True, n_components=None, whiten=False)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
Pipeline(steps=[('smt', SMOTE(k=None, k_neighbors=5, kind='regular', m=None, m_neighbors=10,
n_jobs=-1, out_step=0.5, random_state=42, ratio='auto')), ('pca', PCA(copy=True, n_components=None, whiten=False)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=1, n_neighbors=5, p=2,
weights='uniform'))])
>>> y_hat = pipeline.predict(X_test)
Expand Down

0 comments on commit 54f9d16

Please sign in to comment.