Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions instance_selection/_CNN.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,10 @@
import numpy as np
import pandas as pd

from .utils import transform, delete_multiple_element
from .utils import delete_multiple_element, transform


class CNN:

"""
Hart, P. (1968). The condensed nearest neighbor rule (corresp.). IEEE
transactions on information theory, 14(3), 515-516.
Expand Down Expand Up @@ -54,7 +53,7 @@ def filter(self, samples, y):
samples = transform(samples, y)
store_classes, indexes = np.unique(samples.target, return_index=True)
store_classes = store_classes.tolist()
store = [samples['data'][x] for x in indexes]
store = [samples["data"][x] for x in indexes]

handbag = []

Expand Down Expand Up @@ -82,8 +81,8 @@ def filter(self, samples, y):
delete_multiple_element(handbag, indexes)
del handbag
samples = pd.DataFrame(store, columns=self.x_attr)
y = pd.DataFrame(np.array(store_classes, dtype=object).flatten().astype(
int))
y = pd.DataFrame(
np.array(store_classes, dtype=object).flatten().astype(int))

return samples, y

Expand All @@ -106,4 +105,4 @@ class of the sample in the store that is closest to the sample
euc = np.array(euc)
euc_nn = np.amin(euc)
index_nn = np.ravel(np.where(euc == euc_nn))
return store_classes[index_nn[0]]
return store_classes[index_nn[0]]
99 changes: 57 additions & 42 deletions instance_selection/_DROP3.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@


class DROP3:

"""
Wilson, D. R., & Martinez, T. R. (2000). Reduction techniques for
instance-based learning algorithms. Machine learning, 38(3), 257-286.
Expand Down Expand Up @@ -63,11 +62,17 @@ def filter(self, samples, y):
:param y: DataFrame.
:return: the input dataset with the remaining samples.
"""
initial_distances, initial_samples, initial_targets, knn, \
samples_info = self._create_variables(samples, y)

self._find_associates(initial_distances, initial_samples,
initial_targets, knn, samples_info)
(
initial_distances,
initial_samples,
initial_targets,
knn,
samples_info,
) = self._create_variables(samples, y)

self._find_associates(
initial_distances, initial_samples, initial_targets, knn, samples_info
)

initial_distances.sort(key=lambda x: x[2], reverse=True)

Expand All @@ -79,32 +84,37 @@ def filter(self, samples, y):
with_, without = self._with_without(tuple(x_sample), samples_info)

if without >= with_:
initial_distances = initial_distances[:index_x - removed] + \
initial_distances[index_x - removed + 1:]
initial_distances = (
initial_distances[: index_x - removed]
+ initial_distances[index_x - removed + 1:]
)
removed += 1

for a_associate_of_x in samples_info[(tuple(x_sample))][1]:
a_neighs, remaining_samples = self._remove_from_neighs(
a_associate_of_x, initial_distances,
samples_info, x_sample)
a_associate_of_x, initial_distances, samples_info, x_sample
)

knn = NearestNeighbors(
n_neighbors=self.nearest_neighbors + 2,
n_jobs=1, p=self.power_parameter)
n_jobs=1,
p=self.power_parameter,
)
knn.fit(remaining_samples)
_, neigh_ind = knn.kneighbors([a_associate_of_x])
possible_neighs = [initial_distances[x][0] for x in
neigh_ind[0]]
possible_neighs = [initial_distances[x][0]
for x in neigh_ind[0]]

self._find_new_neighs(a_associate_of_x, a_neighs,
possible_neighs, samples_info)
self._find_new_neighs(
a_associate_of_x, a_neighs, possible_neighs, samples_info
)

new_neigh = a_neighs[-1]
samples_info[tuple(new_neigh)][1].append(
a_associate_of_x)
samples_info[tuple(new_neigh)][1].append(a_associate_of_x)

samples = pd.DataFrame([x for x, _, _ in initial_distances],
columns=self.x_attr)
samples = pd.DataFrame(
[x for x, _, _ in initial_distances], columns=self.x_attr
)
y = pd.DataFrame([x for _, x, _ in initial_distances])

return samples, y
Expand All @@ -122,23 +132,24 @@ def _create_variables(self, samples, y):
self.x_attr = samples.keys()
samples = transform(samples, y)
s = copy.deepcopy(samples)
initial_samples = s['data']
initial_targets = s['target']
initial_samples, samples_index = np.unique(ar=initial_samples,
return_index=True, axis=0)
initial_samples = s["data"]
initial_targets = s["target"]
initial_samples, samples_index = np.unique(
ar=initial_samples, return_index=True, axis=0
)
initial_targets = initial_targets[samples_index]
knn = NearestNeighbors(n_neighbors=self.nearest_neighbors + 2, n_jobs=1,
p=self.power_parameter)
knn = NearestNeighbors(
n_neighbors=self.nearest_neighbors + 2, n_jobs=1, p=self.power_parameter
)
knn.fit(initial_samples)
samples_info = {tuple(x): [[], [], y] for x, y in zip(initial_samples,
initial_targets)}
samples_info = {
tuple(x): [[], [], y] for x, y in zip(initial_samples, initial_targets)
}
initial_distances = []
return initial_distances, initial_samples, initial_targets, knn, \
samples_info
return initial_distances, initial_samples, initial_targets, knn, samples_info

@staticmethod
def _find_new_neighs(a_associate_of_x, a_neighs, possible_neighs,
samples_info):
def _find_new_neighs(a_associate_of_x, a_neighs, possible_neighs, samples_info):
"""
> The function takes a sample, finds its neighbors, and then checks if
any of the neighbors are not already in the list of neighbors. If
Expand All @@ -162,8 +173,9 @@ def _find_new_neighs(a_associate_of_x, a_neighs, possible_neighs,
samples_info[tuple(a_associate_of_x)][0] = a_neighs

@staticmethod
def _remove_from_neighs(a_associate_of_x, initial_distances,
samples_info, x_sample):
def _remove_from_neighs(
a_associate_of_x, initial_distances, samples_info, x_sample
):
"""
> It removes the sample `x_sample` from the list of neighbors of
`a_associate_of_x` and returns the updated list of neighbors of
Expand Down Expand Up @@ -191,8 +203,9 @@ def _remove_from_neighs(a_associate_of_x, initial_distances,
return a_neighs, remaining_samples

@staticmethod
def _find_associates(initial_distances, initial_samples, initial_targets,
knn, samples_info):
def _find_associates(
initial_distances, initial_samples, initial_targets, knn, samples_info
):
"""
For each sample in the initial set, find the closest sample from the
other class and store it in the initial_distances list
Expand Down Expand Up @@ -245,12 +258,13 @@ def _with_without(x_sample, samples_info):
associates_targets = [samples_info[tuple(x)][2] for x in x_associates]
associates_neighs = [samples_info[tuple(x)][0] for x in x_associates]

for _, a_target, a_neighs in zip(x_associates,
associates_targets,
associates_neighs):
for _, a_target, a_neighs in zip(
x_associates, associates_targets, associates_neighs
):

neighs_targets = np.ravel(np.array([samples_info[tuple(x)][2] for x
in a_neighs])).astype(int)
neighs_targets = np.ravel(
np.array([samples_info[tuple(x)][2] for x in a_neighs])
).astype(int)
neighs_targets = neighs_targets.tolist()

count = np.bincount(neighs_targets[:-1])
Expand All @@ -261,8 +275,9 @@ def _with_without(x_sample, samples_info):
for index_a, neigh in enumerate(a_neighs):
if np.array_equal(neigh, x_sample):
break
count = np.bincount(neighs_targets[:index_a] + neighs_targets[
index_a + 1:])
count = np.bincount(
neighs_targets[:index_a] + neighs_targets[index_a + 1:]
)
max_class = np.where(count == np.amax(count))[0][0]
if max_class == a_target:
without += 1
Expand Down
48 changes: 25 additions & 23 deletions instance_selection/_ENN.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@


class ENN:

"""
Wilson, D. L. (1972). Asymptotic properties of nearest neighbor rules
using edited data. IEEE Transactions on Systems, Man, and
Expand Down Expand Up @@ -62,12 +61,12 @@ def _neighs(self, s_samples, s_targets, index, removed):
"""
x_sample = s_samples[index - removed]
x_target = s_targets[index - removed]
knn = NearestNeighbors(n_jobs=-1,
n_neighbors=self.nearest_neighbors, p=2)
samples_not_x = s_samples[:index - removed] + s_samples[
index - removed + 1:]
targets_not_x = s_targets[:index - removed] + s_targets[
index - removed + 1:]
knn = NearestNeighbors(
n_jobs=-1, n_neighbors=self.nearest_neighbors, p=2)
samples_not_x = s_samples[: index - removed] + \
s_samples[index - removed + 1:]
targets_not_x = s_targets[: index - removed] + \
s_targets[index - removed + 1:]
knn.fit(samples_not_x)
_, neigh_ind = knn.kneighbors([x_sample])

Expand All @@ -88,16 +87,18 @@ def filter(self, samples, y):
"""
self.x_attr = samples.keys()
samples = transform(samples, y)
size = len(samples['data'])
s_samples = list(samples['data'])
s_targets = list(samples['target'])
size = len(samples["data"])
s_samples = list(samples["data"])
s_targets = list(samples["target"])
removed = 0

for index in range(size):
_, x_target, targets_not_x, samples_not_x, neigh_ind = \
self._neighs(s_samples, s_targets, index, removed)
_, x_target, targets_not_x, samples_not_x, neigh_ind = self._neighs(
s_samples, s_targets, index, removed
)
y_targets = np.ravel(
np.array([targets_not_x[x] for x in neigh_ind[0]])).astype(int)
np.array([targets_not_x[x] for x in neigh_ind[0]])
).astype(int)
count = np.bincount(y_targets)
max_class = np.where(count == np.amax(count))[0][0]
if max_class != x_target:
Expand All @@ -110,8 +111,7 @@ def filter(self, samples, y):

return samples, y

def filter_original_complete(self, original, original_y, complete,
complete_y):
def filter_original_complete(self, original, original_y, complete, complete_y):
"""
Modification of the Wilson Editing algorithm.

Expand All @@ -129,17 +129,19 @@ def filter_original_complete(self, original, original_y, complete,
:return: the input dataset with the remaining samples.
"""
self.x_attr = original.keys()
original, complete = transform_original_complete(original, original_y,
complete, complete_y)
size = len(complete['data'])
s_samples = list(complete['data'])
s_targets = list(complete['target'])
o_samples = list(original['data'])
original, complete = transform_original_complete(
original, original_y, complete, complete_y
)
size = len(complete["data"])
s_samples = list(complete["data"])
s_targets = list(complete["target"])
o_samples = list(original["data"])
removed = 0

for index in range(size):
x_sample, x_target, targets_not_x, samples_not_x, neigh_ind = \
self._neighs(s_samples, s_targets, index, removed)
x_sample, x_target, targets_not_x, samples_not_x, neigh_ind = self._neighs(
s_samples, s_targets, index, removed
)
y_targets = [targets_not_x[x] for x in neigh_ind[0]]
count = np.bincount(np.ravel(y_targets))
max_class = np.where(count == np.amax(count))[0][0]
Expand Down
Loading