Skip to content

Commit

Permalink
Merge pull request #89 from ealcobaca/missing-vals
Browse files Browse the repository at this point in the history
Handling more special cases in data encoding
  • Loading branch information
ealcobaca committed Jul 2, 2020
2 parents 80a0fc8 + 573475c commit 45530ec
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 20 deletions.
35 changes: 25 additions & 10 deletions pymfe/_internal.py
Original file line number Diff line number Diff line change
Expand Up @@ -1430,7 +1430,15 @@ def transform_cat_gray(data_categoric: np.ndarray) -> t.Optional[np.ndarray]:

formula = "~ 0 + {}".format(" + ".join(dummy_attr_names))

return np.asarray(patsy.dmatrix(formula, named_data))
try:
enc_data = patsy.dmatrix(formula, named_data, NA_action="raise")
return np.asarray(enc_data, dtype=float)

except patsy.PatsyError:
raise ValueError("Categorical data encoding of type 'gray' has no "
"support for missing values. Please handle the "
"missing data manually before fitting it into the "
"MFE model.")


def transform_cat_onehot(
Expand All @@ -1451,14 +1459,21 @@ def transform_cat_onehot(
for attr_ind in np.arange(num_col):
cur_attr = data_categoric[:, attr_ind, np.newaxis]

if not use_all_columns and np.unique(cur_attr).size <= 1:
if not use_all_columns and len(set(cur_attr.ravel())) <= 1:
raise ValueError("This type of one-hot encoding does not "
"support features with 1 or less distinct "
"values. Drop the {}th categorical feature "
"or select another encoding strategy.".format(
attr_ind + 1))

one_cat_attrs.append(ohe.fit_transform(cur_attr))
try:
one_cat_attrs.append(ohe.fit_transform(cur_attr))

except ValueError:
raise ValueError("Categorical data encoding of type 'one-hot' has "
"no support for missing values. Please handle the"
" missing data manually before fitting it into "
"the MFE model.")

return np.hstack(one_cat_attrs)

Expand All @@ -1467,23 +1482,23 @@ def _equal_freq_discretization(data: np.ndarray,
num_bins: int,
tol: float = 1e-8) -> np.ndarray:
"""Discretize a 1-D numeric array into an equal-frequency histogram."""
perc_interval = 100.0 / num_bins
perc_range = np.arange(perc_interval, 100, perc_interval)
hist_divs = np.percentile(data, perc_range)
hist_divs = np.quantile(data, np.linspace(0, 1, num_bins + 1)[1:])

# Sometimes 'hist_divs' is not appropriate: for example, when all values
# are constant, 'hist_divs' ends up holding repeated values.
# To avoid partitions bounded by the same value, we check that all
# partition boundaries are different. Unfortunately, this leads to a
# non-equal-frequency discretization.
aux = len(hist_divs)
diffs = np.append(True, np.diff(hist_divs))
hist_divs = hist_divs[diffs > tol]
if aux != len(hist_divs):
prev_size = hist_divs.size

hist_divs = hist_divs[np.append(True, np.diff(hist_divs) > tol)]

if prev_size != hist_divs.size:
warnings.warn("It is not possible make equal discretization")

hist_divs = np.unique(hist_divs)

return np.digitize(x=data, bins=hist_divs, right=True)


Expand Down
17 changes: 7 additions & 10 deletions pymfe/mfe.py
Original file line number Diff line number Diff line change
Expand Up @@ -726,12 +726,11 @@ def _set_data_categoric(self, transform_num: bool,
data_cat = self.X[:, self._attr_indexes_cat]

if transform_num:
data_num_discretized = _internal.transform_num(
data_num_disc = _internal.transform_num(
self.X[:, self._attr_indexes_num], num_bins=num_bins)

if data_num_discretized is not None:
data_cat = np.concatenate((data_cat, data_num_discretized),
axis=1)
if data_num_disc is not None and data_num_disc.size > 0:
data_cat = np.hstack((data_cat, data_num_disc))

return data_cat

Expand Down Expand Up @@ -803,19 +802,18 @@ def _set_data_numeric(

if transform_cat:
if transform_cat == "gray":
categorical_dummies = _internal.transform_cat_gray(
cat_dummies = _internal.transform_cat_gray(
self.X[:, self._attr_indexes_cat])

else:
_use_all_ohe_columns = transform_cat == "one-hot-full"

categorical_dummies = _internal.transform_cat_onehot(
cat_dummies = _internal.transform_cat_onehot(
self.X[:, self._attr_indexes_cat],
use_all_columns=_use_all_ohe_columns)

if categorical_dummies is not None:
data_num = np.concatenate((data_num, categorical_dummies),
axis=1).astype(float)
if cat_dummies is not None and cat_dummies.size > 0:
data_num = np.hstack((data_num, cat_dummies)).astype(float)

if rescale:
data_num = _internal.rescale_data(
Expand Down Expand Up @@ -984,7 +982,6 @@ def fit(self,
TypeError
If X or y (or both) is neither a :obj:`list` nor a :obj:`np.ndarray`
object.
"""
if verbose >= 2:
print("Fitting data into model... ", end="")
Expand Down
24 changes: 24 additions & 0 deletions tests/test_architecture.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,18 @@ def test_no_cat_transformation(self):
mfe.fit(X.values, y.values, transform_cat=None)
assert mfe._custom_args_ft["N"].size == 0

def test_gray_encoding_missing_value(self):
    """Fitting with gray encoding must raise when the data holds a NaN.

    A single entry of the loaded dataset is overwritten with ``np.nan``
    and ``MFE.fit`` is then expected to raise ``ValueError`` when called
    with ``transform_cat="gray"``.
    """
    features, target = utils.load_xy(1)

    # Work on a copy so the loaded dataset itself is not modified.
    features = np.copy(features.values)
    features[5, 0] = np.nan

    model = MFE()

    with pytest.raises(ValueError):
        model.fit(features, target.values, transform_cat="gray")

def test_one_hot_encoding_01(self):
X, y = utils.load_xy(1)
mfe = MFE()
Expand Down Expand Up @@ -316,6 +328,18 @@ def test_one_hot_encoding_04(self):
with pytest.raises(ValueError):
mfe.fit(X=X, y=y, transform_cat="one-hot")

def test_ohe_full_encoding_missing_value(self):
    """Fitting with one-hot encoding must raise when the data holds a NaN.

    A single entry of the loaded dataset is overwritten with ``np.nan``
    and ``MFE.fit`` is then expected to raise ``ValueError`` when called
    with ``transform_cat="one-hot"``.
    """
    # NOTE(review): the test name mentions "ohe_full", yet the encoding
    # requested below is "one-hot" rather than "one-hot-full" -- confirm
    # which strategy this test is meant to cover.
    features, target = utils.load_xy(1)

    # Work on a copy so the loaded dataset itself is not modified.
    features = np.copy(features.values)
    features[5, 0] = np.nan

    model = MFE()

    with pytest.raises(ValueError):
        model.fit(features, target.values, transform_cat="one-hot")

@pytest.mark.parametrize("confidence", (0.95, 0.99))
def test_extract_with_confidence(self, confidence):
X, y = utils.load_xy(2)
Expand Down

0 comments on commit 45530ec

Please sign in to comment.