forked from scikit-learn-contrib/imbalanced-learn
/
_neighbourhood_cleaning_rule.py
258 lines (211 loc) · 9.39 KB
/
_neighbourhood_cleaning_rule.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
"""Class performing under-sampling based on the neighbourhood cleaning rule."""
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# Christos Aridas
# License: MIT
import numbers
import warnings
from collections import Counter
import numpy as np
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.utils import _safe_indexing
from ...utils import Substitution
from ...utils._docstring import _n_jobs_docstring
from ...utils._param_validation import HasMethods, Hidden, Interval, StrOptions
from ..base import BaseCleaningSampler
from ._edited_nearest_neighbours import EditedNearestNeighbours
# Names of the sample-selection strategies ("all", "mode") that `kind_sel`
# accepts; not referenced elsewhere in the visible part of this module.
SEL_KIND = ("all", "mode")
@Substitution(
    sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring,
    n_jobs=_n_jobs_docstring,
)
class NeighbourhoodCleaningRule(BaseCleaningSampler):
    """Undersample based on the neighbourhood cleaning rule.

    This class uses ENN and a k-NN to remove noisy samples from the datasets.

    Read more in the :ref:`User Guide <condensed_nearest_neighbors>`.

    Parameters
    ----------
    {sampling_strategy}

    edited_nearest_neighbours : estimator object, default=None
        The :class:`~imblearn.under_sampling.EditedNearestNeighbours` (ENN)
        object to clean the dataset. If `None`, a default ENN is created with
        `kind_sel="mode"` and `n_neighbors=n_neighbors`.

    n_neighbors : int or estimator object, default=3
        If ``int``, size of the neighbourhood to consider to compute the
        K-nearest neighbors. If object, an estimator that inherits from
        :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to
        find the nearest-neighbors. By default, it will be a 3-NN.

    kind_sel : {{"all", "mode"}}, default='all'
        Strategy to use in order to exclude samples in the ENN sampling.

        - If ``'all'``, all neighbours will have to agree with the samples of
          interest to not be excluded.
        - If ``'mode'``, the majority vote of the neighbours will be used in
          order to exclude a sample.

        The strategy `"all"` will be less conservative than `'mode'`. Thus,
        more samples will be removed when `kind_sel="all"` generally.

        .. deprecated:: 0.12
           `kind_sel` is deprecated in 0.12 and will be removed in 0.14.
           Currently the parameter has no effect and corresponds always to the
           `"all"` strategy.

    threshold_cleaning : float, default=0.5
        Threshold used to decide whether or not a class is considered during
        the cleaning after applying ENN. A class targeted by
        `sampling_strategy` will be considered during cleaning when:

        Ci > C x T ,

        where Ci and C are the number of samples in the class and in the
        minority class, respectively, and T is the threshold.

    {n_jobs}

    Attributes
    ----------
    sampling_strategy_ : dict
        Dictionary containing the information to sample the dataset. The keys
        correspond to the class labels from which to sample and the values
        are the number of samples to sample.

    edited_nearest_neighbours_ : estimator object
        The edited nearest neighbour object used to make the first resampling.

    nn_ : estimator object
        Validated K-nearest Neighbours object created from `n_neighbors` parameter.

    classes_to_clean_ : list
        The classes considered with under-sampling by `nn_` in the second cleaning
        phase.

    sample_indices_ : ndarray of shape (n_new_samples,)
        Indices of the samples selected.

        .. versionadded:: 0.4

    n_features_in_ : int
        Number of features in the input dataset.

        .. versionadded:: 0.9

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during `fit`. Defined only when `X` has feature
        names that are all strings.

        .. versionadded:: 0.10

    See Also
    --------
    EditedNearestNeighbours : Undersample by editing noisy samples.

    Notes
    -----
    See the original paper: [1]_.

    Supports multi-class resampling. A one-vs.-rest scheme is used when
    sampling a class as proposed in [1]_.

    References
    ----------
    .. [1] J. Laurikkala, "Improving identification of difficult small classes
       by balancing class distribution," Springer Berlin Heidelberg, 2001.

    Examples
    --------
    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.under_sampling import NeighbourhoodCleaningRule
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 900, 0: 100}})
    >>> ncr = NeighbourhoodCleaningRule()
    >>> X_res, y_res = ncr.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    Resampled dataset shape Counter({{1: 888, 0: 100}})
    """

    # Constraints for the constructor parameters, extending those declared on
    # the base class.
    _parameter_constraints: dict = {
        **BaseCleaningSampler._parameter_constraints,
        "edited_nearest_neighbours": [
            HasMethods(["fit_resample"]),
            None,
        ],
        "n_neighbors": [
            Interval(numbers.Integral, 1, None, closed="left"),
            HasMethods(["kneighbors", "kneighbors_graph"]),
        ],
        "kind_sel": [StrOptions({"all", "mode"}), Hidden(StrOptions({"deprecated"}))],
        "threshold_cleaning": [Interval(numbers.Real, 0, None, closed="neither")],
        "n_jobs": [numbers.Integral, None],
    }

    def __init__(
        self,
        *,
        sampling_strategy="auto",
        edited_nearest_neighbours=None,
        n_neighbors=3,
        kind_sel="deprecated",
        threshold_cleaning=0.5,
        n_jobs=None,
    ):
        super().__init__(sampling_strategy=sampling_strategy)
        self.edited_nearest_neighbours = edited_nearest_neighbours
        self.n_neighbors = n_neighbors
        self.kind_sel = kind_sel
        self.threshold_cleaning = threshold_cleaning
        self.n_jobs = n_jobs

    def _validate_estimator(self):
        """Create the objects required by NCR.

        Sets `nn_`, the k-NN estimator used in the second cleaning phase, and
        `edited_nearest_neighbours_`, the ENN sampler used in the first one.
        """
        if isinstance(self.n_neighbors, numbers.Integral):
            self.nn_ = KNeighborsClassifier(
                n_neighbors=self.n_neighbors, n_jobs=self.n_jobs
            )
        elif isinstance(self.n_neighbors, NearestNeighbors):
            # backward compatibility when passing a NearestNeighbors object
            # NOTE(review): presumably such objects were configured with one
            # extra neighbour to account for the query sample itself, hence
            # the `- 1` -- confirm.
            self.nn_ = KNeighborsClassifier(
                n_neighbors=self.n_neighbors.n_neighbors - 1, n_jobs=self.n_jobs
            )
        else:
            # any other estimator is used as provided, on a fresh clone
            self.nn_ = clone(self.n_neighbors)

        if self.edited_nearest_neighbours is None:
            self.edited_nearest_neighbours_ = EditedNearestNeighbours(
                sampling_strategy=self.sampling_strategy,
                n_neighbors=self.n_neighbors,
                kind_sel="mode",
                n_jobs=self.n_jobs,
            )
        else:
            self.edited_nearest_neighbours_ = clone(self.edited_nearest_neighbours)

    def _fit_resample(self, X, y):
        """Remove the samples flagged by ENN (A1) and by the k-NN rule (A2).

        Parameters
        ----------
        X : array-like
            Samples to clean.

        y : ndarray
            Target of each sample in `X`; it is indexed and compared as a
            NumPy array.

        Returns
        -------
        X_resampled : array-like
            The samples of `X` that are neither in A1 nor in A2.

        y_resampled : array-like
            The corresponding targets.
        """
        if self.kind_sel != "deprecated":
            warnings.warn(
                "`kind_sel` is deprecated in 0.12 and will be removed in 0.14. "
                "It already has not effect and corresponds to the `'all'` option.",
                FutureWarning,
            )
        self._validate_estimator()

        # A1: the samples that ENN discards, i.e. the complement of the indices
        # that ENN selected
        self.edited_nearest_neighbours_.fit_resample(X, y)
        index_not_a1 = self.edited_nearest_neighbours_.sample_indices_
        index_a1 = np.ones(y.shape, dtype=bool)
        index_a1[index_not_a1] = False
        index_a1 = np.flatnonzero(index_a1)

        # clean the neighborhood
        target_stats = Counter(y)
        class_minority = min(target_stats, key=target_stats.get)
        # compute which classes to consider for cleaning for the A2 group
        min_n_samples = target_stats[class_minority] * self.threshold_cleaning
        self.classes_to_clean_ = [
            c
            for c, n_samples in target_stats.items()
            if c in self.sampling_strategy_ and n_samples > min_n_samples
        ]

        self.nn_.fit(X, y)

        class_minority_indices = np.flatnonzero(y == class_minority)
        X_minority = _safe_indexing(X, class_minority_indices)
        y_minority = _safe_indexing(y, class_minority_indices)

        y_pred_minority = self.nn_.predict(X_minority)
        # The minority samples are part of the data `nn_` was fitted on, so one
        # additional neighbour is requested and the first column (expected to be
        # the query sample itself) is dropped.
        neighbors_to_minority_indices = self.nn_.kneighbors(
            X_minority, n_neighbors=self.nn_.n_neighbors + 1, return_distance=False
        )[:, 1:]

        # A2: neighbours of the misclassified minority samples that belong to
        # one of the classes to clean. `np.unique` flattens and deduplicates.
        mask_misclassified_minority = y_pred_minority != y_minority
        candidate_indices = np.unique(
            neighbors_to_minority_indices[mask_misclassified_minority]
        )
        # Cast to the dtype of `y`: an empty list would otherwise become a
        # float64 array, unrelated to the type of the labels.
        classes_to_clean = np.asarray(self.classes_to_clean_, dtype=y.dtype)
        index_a2 = candidate_indices[np.isin(y[candidate_indices], classes_to_clean)]

        # both index arrays are integer arrays, so their union is as well
        union_a1_a2 = np.union1d(index_a1, index_a2)
        selected_samples = np.ones(y.shape, dtype=bool)
        selected_samples[union_a1_a2] = False
        self.sample_indices_ = np.flatnonzero(selected_samples)

        return (
            _safe_indexing(X, self.sample_indices_),
            _safe_indexing(y, self.sample_indices_),
        )

    def _more_tags(self):
        """Return the estimator tags specific to this sampler."""
        return {"sample_indices": True}