Skip to content

Commit

Permalink
[MRG] Apply deprecation to SMOTE and ADASYN (scikit-learn-contrib#183)
Browse files Browse the repository at this point in the history
* Apply deprecation to SMOTE and ADASYN

* Update doc pipeline
  • Loading branch information
glemaitre committed Oct 31, 2016
1 parent e8f7874 commit 54f9d16
Show file tree
Hide file tree
Showing 5 changed files with 97 additions and 29 deletions.
4 changes: 4 additions & 0 deletions doc/whats_new.rst
Expand Up @@ -14,6 +14,7 @@ Changelog

Bug fixes
~~~~~~~~~

- Fixed a bug in :class:`under_sampling.NearMiss` which was not picking the right samples during under sampling for the method 3. By `Guillaume Lemaitre`_.
- Fixed a bug in :class:`ensemble.EasyEnsemble`, correction of the `random_state` generation. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
- Fixed a bug in :class:`under_sampling.RepeatedEditedNearestNeighbours`, add additional stopping criterion to avoid that the minority class become a majority class or that a class disappear. By `Guillaume Lemaitre`_.
Expand Down Expand Up @@ -53,6 +54,9 @@ API changes summary
- Two base classes :class:`BaseBinaryclassSampler` and :class:`BaseMulticlassSampler` have been created to handle the target type and raise warning in case of abnormality. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
- Move `random_state` to be assigned in the :class:`SamplerMixin` initialization. By `Guillaume Lemaitre`_.
- Provide estimators instead of parameters in :class:`combine.SMOTEENN` and :class:`combine.SMOTETomek`. Therefore, the list of parameters have been deprecated. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
- `k` has been deprecated in :class:`over_sampling.ADASYN`. Use `n_neighbors` instead. By `Guillaume Lemaitre`_.
- `k` and `m` have been deprecated in :class:`over_sampling.SMOTE`. Use `k_neighbors` and `m_neighbors` instead. By `Guillaume Lemaitre`_.


Documentation changes
~~~~~~~~~~~~~~~~~~~~~
Expand Down
27 changes: 27 additions & 0 deletions imblearn/base.py
Expand Up @@ -83,6 +83,10 @@ def fit(self, X, y):

if hasattr(self, 'size_ngh'):
self._validate_size_ngh_deprecation()
elif hasattr(self, 'k') and not hasattr(self, 'm'):
self._validate_k_deprecation()
elif hasattr(self, 'k') and hasattr(self, 'm'):
self._validate_k_m_deprecation()

self.logger.info('Compute classes statistics ...')

Expand Down Expand Up @@ -161,6 +165,10 @@ def sample(self, X, y):

if hasattr(self, 'size_ngh'):
self._validate_size_ngh_deprecation()
elif hasattr(self, 'k') and not hasattr(self, 'm'):
self._validate_k_deprecation()
elif hasattr(self, 'k') and hasattr(self, 'm'):
self._validate_k_m_deprecation()

return self._sample(X, y)

Expand Down Expand Up @@ -212,6 +220,25 @@ def _validate_size_ngh_deprecation(self):
' `n_neighbors` instead.', DeprecationWarning)
self.n_neighbors = self.size_ngh

def _validate_k_deprecation(self):
    """Warn about the deprecated ``k`` parameter of ADASYN.

    When the legacy ``k`` parameter was set by the user, emit a
    DeprecationWarning and mirror its value into ``n_neighbors`` so
    the rest of the sampler relies on the new attribute name.
    """
    if self.k is None:
        return
    warnings.warn('`k` will be replaced in version 0.4. Use'
                  ' `n_neighbors` instead.', DeprecationWarning)
    # Legacy value takes precedence over the new parameter.
    self.n_neighbors = self.k

def _validate_k_m_deprecation(self):
    """Warn about the deprecated ``k`` and ``m`` parameters of SMOTE.

    If the legacy ``k`` (resp. ``m``) parameter was set, emit a
    DeprecationWarning and copy its value into ``k_neighbors``
    (resp. ``m_neighbors``) so the sampler uses the new attribute
    names downstream.
    """
    # NOTE: the original docstring said "k in ADASYN", which was a
    # copy-paste error — this handler covers SMOTE's k and m.
    if self.k is not None:
        warnings.warn('`k` will be replaced in version 0.4. Use'
                      ' `k_neighbors` instead.', DeprecationWarning)
        self.k_neighbors = self.k

    if self.m is not None:
        warnings.warn('`m` will be replaced in version 0.4. Use'
                      ' `m_neighbors` instead.', DeprecationWarning)
        self.m_neighbors = self.m

@abstractmethod
def _sample(self, X, y):
"""Resample the dataset.
Expand Down
26 changes: 19 additions & 7 deletions imblearn/over_sampling/adasyn.py
Expand Up @@ -31,7 +31,13 @@ class ADASYN(BaseBinarySampler):
If None, the random number generator is the RandomState instance used
by np.random.
k : int, optional (default=5)
k : int, optional (default=None)
Number of nearest neighbours to used to construct synthetic samples.
NOTE: `k` is deprecated from 0.2 and will be replaced in 0.4
Use ``n_neighbors`` instead.
n_neighbors : int, optional (default=5)
Number of nearest neighbours used to construct synthetic samples.
n_jobs : int, optional (default=1)
Expand Down Expand Up @@ -84,12 +90,15 @@ class ADASYN(BaseBinarySampler):
"""

def __init__(self, ratio='auto', random_state=None, k=5, n_jobs=1):
def __init__(self, ratio='auto', random_state=None, k=None, n_neighbors=5,
n_jobs=1):
super(ADASYN, self).__init__(ratio=ratio, random_state=random_state)
self.k = k
self.n_neighbors = n_neighbors
self.n_jobs = n_jobs
self.nearest_neighbour = NearestNeighbors(n_neighbors=self.k + 1,
n_jobs=self.n_jobs)
self.nearest_neighbour = NearestNeighbors(
n_neighbors=self.n_neighbors + 1,
n_jobs=self.n_jobs)

def _sample(self, X, y):
"""Resample the dataset.
Expand Down Expand Up @@ -130,7 +139,8 @@ def _sample(self, X, y):
X_min = X[y == self.min_c_]

# Print if verbose is true
self.logger.debug('Finding the %s nearest neighbours ...', self.k)
self.logger.debug('Finding the %s nearest neighbours ...',
self.n_neighbors)

# Look for k-th nearest neighbours, excluding, of course, the
# point itself.
Expand All @@ -140,7 +150,8 @@ def _sample(self, X, y):
_, ind_nn = self.nearest_neighbour.kneighbors(X_min)

# Compute the ratio of majority samples next to minority samples
ratio_nn = np.sum(y[ind_nn[:, 1:]] == self.maj_c_, axis=1) / self.k
ratio_nn = (np.sum(y[ind_nn[:, 1:]] == self.maj_c_, axis=1) /
self.n_neighbors)
# Check that we found at least some neighbours belonging to the
# majority class
if not np.sum(ratio_nn):
Expand All @@ -158,7 +169,8 @@ def _sample(self, X, y):
for x_i, x_i_nn, num_sample_i in zip(X_min, ind_nn, num_samples_nn):

# Pick-up the neighbors wanted
nn_zs = random_state.randint(1, high=self.k + 1, size=num_sample_i)
nn_zs = random_state.randint(1, high=self.n_neighbors + 1,
size=num_sample_i)

# Create a new sample
for nn_z in nn_zs:
Expand Down
65 changes: 45 additions & 20 deletions imblearn/over_sampling/smote.py
Expand Up @@ -33,10 +33,23 @@ class SMOTE(BaseBinarySampler):
If None, the random number generator is the RandomState instance used
by np.random.
k : int, optional (default=5)
k : int, optional (default=None)
Number of nearest neighbours to used to construct synthetic samples.
m : int, optional (default=10)
NOTE: `k` is deprecated from 0.2 and will be replaced in 0.4
Use ``k_neighbors`` instead.
k_neighbors : int, optional (default=5)
Number of nearest neighbours to used to construct synthetic samples.
m : int, optional (default=None)
Number of nearest neighbours to use to determine if a minority sample
is in danger.
NOTE: `m` is deprecated from 0.2 and will be replaced in 0.4
Use ``m_neighbors`` instead.
m_neighbors : int, optional (default=10)
Number of nearest neighbours to use to determine if a minority sample
is in danger.
Expand Down Expand Up @@ -102,12 +115,15 @@ class SMOTE(BaseBinarySampler):
"""

def __init__(self, ratio='auto', random_state=None, k=5, m=10,
out_step=0.5, kind='regular', n_jobs=-1, **kwargs):
def __init__(self, ratio='auto', random_state=None, k=None, k_neighbors=5,
m=None, m_neighbors=10, out_step=0.5, kind='regular',
n_jobs=-1, **kwargs):
super(SMOTE, self).__init__(ratio=ratio, random_state=random_state)
self.kind = kind
self.k = k
self.k_neighbors = k_neighbors
self.m = m
self.m_neighbors = m_neighbors
self.out_step = out_step
self.n_jobs = n_jobs
self.kwargs = kwargs
Expand Down Expand Up @@ -149,11 +165,11 @@ def _in_danger_noise(self, samples, y, kind='danger'):

if kind == 'danger':
# Samples are in danger for m/2 <= m' < m
return np.bitwise_and(n_maj >= float(self.m) / 2.,
n_maj < self.m)
return np.bitwise_and(n_maj >= float(self.m_neighbors) / 2.,
n_maj < self.m_neighbors)
elif kind == 'noise':
# Samples are noise for m = m'
return n_maj == self.m
return n_maj == self.m_neighbors
else:
raise NotImplementedError

Expand Down Expand Up @@ -281,7 +297,8 @@ def _sample(self, X, y):
# If regular SMOTE is to be performed
if self.kind == 'regular':

self.logger.debug('Finding the %s nearest neighbours ...', self.k)
self.logger.debug('Finding the %s nearest neighbours ...',
self.k_neighbors)

# Look for k-th nearest neighbours, excluding, of course, the
# point itself.
Expand Down Expand Up @@ -312,7 +329,8 @@ def _sample(self, X, y):

if self.kind == 'borderline1' or self.kind == 'borderline2':

self.logger.debug('Finding the %s nearest neighbours ...', self.m)
self.logger.debug('Finding the %s nearest neighbours ...',
self.m_neighbors)

# Find the NNs for all samples in the data set.
self.nearest_neighbour.fit(X)
Expand All @@ -334,7 +352,8 @@ def _sample(self, X, y):
#
# We start by changing the number of NNs to consider from m + 1
# to k + 1
self.nearest_neighbour.set_params(**{'n_neighbors': self.k + 1})
self.nearest_neighbour.set_params(**{'n_neighbors':
self.k_neighbors + 1})
self.nearest_neighbour.fit(X_min)

# nns...#
Expand All @@ -358,7 +377,7 @@ def _sample(self, X, y):

# Reset the k-neighbours to m+1 neighbours
self.nearest_neighbour.set_params(
**{'n_neighbors': self.m + 1})
**{'n_neighbors': self.m_neighbors + 1})

return X_resampled, y_resampled

Expand Down Expand Up @@ -395,7 +414,7 @@ def _sample(self, X, y):

# Reset the k-neighbours to m+1 neighbours
self.nearest_neighbour.set_params(
**{'n_neighbors': self.m + 1})
**{'n_neighbors': self.m_neighbors + 1})

return X_resampled, y_resampled

Expand All @@ -416,7 +435,8 @@ def _sample(self, X, y):

# First, find the nn of all the samples to identify samples
# in danger and noisy ones
self.logger.debug('Finding the %s nearest neighbours ...', self.m)
self.logger.debug('Finding the %s nearest neighbours ...',
self.m_neighbors)

# As usual, fit a nearest neighbour model to the data
self.nearest_neighbour.fit(X)
Expand All @@ -439,9 +459,11 @@ def _sample(self, X, y):
safety_bool.sum().astype(int))

# Proceed to find support vectors NNs among the minority class
self.logger.debug('Finding the %s nearest neighbours ...', self.k)
self.logger.debug('Finding the %s nearest neighbours ...',
self.k_neighbors)

self.nearest_neighbour.set_params(**{'n_neighbors': self.k + 1})
self.nearest_neighbour.set_params(**{'n_neighbors':
self.k_neighbors + 1})
self.nearest_neighbour.fit(X_min)

self.logger.debug('Create synthetic samples ...')
Expand Down Expand Up @@ -496,7 +518,8 @@ def _sample(self, X, y):
y_resampled = np.concatenate((y, y_new_1), axis=0)

# Reset the k-neighbours to m+1 neighbours
self.nearest_neighbour.set_params(**{'n_neighbors': self.m + 1})
self.nearest_neighbour.set_params(**{'n_neighbors':
self.m_neighbors + 1})

return X_resampled, y_resampled

Expand All @@ -509,17 +532,19 @@ def _get_smote_kind(self):
# Regular smote does not look for samples in danger, instead it
# creates synthetic samples directly from the k-th nearest
# neighbours with not filtering
self.nearest_neighbour = NearestNeighbors(n_neighbors=self.k + 1,
n_jobs=self.n_jobs)
self.nearest_neighbour = NearestNeighbors(
n_neighbors=self.k_neighbors + 1,
n_jobs=self.n_jobs)
else:
# Borderline1, 2 and SVM variations of smote must first look for
# samples that could be considered noise and samples that live
# near the boundary between the classes. Therefore, before
# creating synthetic samples from the k-th nns, it first look
# for m nearest neighbors to decide whether or not a sample is
# noise or near the boundary.
self.nearest_neighbour = NearestNeighbors(n_neighbors=self.m + 1,
n_jobs=self.n_jobs)
self.nearest_neighbour = NearestNeighbors(
n_neighbors=self.m_neighbors + 1,
n_jobs=self.n_jobs)

# --- SVM smote
# Unlike the borderline variations, the SVM variation uses the support
Expand Down
4 changes: 2 additions & 2 deletions imblearn/pipeline.py
Expand Up @@ -100,8 +100,8 @@ class Pipeline(pipeline.Pipeline):
>>> pipeline = Pipeline([('smt', smt), ('pca', pca), ('knn', knn)])
>>> X_train, X_test, y_train, y_test = tts(X, y, random_state=42)
>>> pipeline.fit(X_train, y_train)
Pipeline(steps=[('smt', SMOTE(k=5, kind='regular', m=10, n_jobs=-1, out_step=0.5, random_state=42,
ratio='auto')), ('pca', PCA(copy=True, n_components=None, whiten=False)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
Pipeline(steps=[('smt', SMOTE(k=None, k_neighbors=5, kind='regular', m=None, m_neighbors=10,
n_jobs=-1, out_step=0.5, random_state=42, ratio='auto')), ('pca', PCA(copy=True, n_components=None, whiten=False)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=1, n_neighbors=5, p=2,
weights='uniform'))])
>>> y_hat = pipeline.predict(X_test)
Expand Down

0 comments on commit 54f9d16

Please sign in to comment.