
Commit

Merge pull request #87 from ealcobaca/new-one-hot-enc
One-hot encoding option with only 'k-1' features
ealcobaca committed Jul 1, 2020
2 parents f477dc1 + b8f6cf6 commit b2d0f50
Showing 3 changed files with 90 additions and 27 deletions.
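
In short: for a categorical attribute with `k` distinct values, the `one-hot` option now produces `k-1` encoded columns, while the new `one-hot-full` option keeps all `k`. A minimal sketch of the difference on toy data (the `sparse` keyword matches the scikit-learn API targeted by this diff; newer releases name it `sparse_output`):

    import numpy as np
    import sklearn.preprocessing

    # One categorical attribute with k = 3 distinct values.
    col = np.array(["a", "b", "c", "b"]).reshape(-1, 1)

    # Traditional one-hot encoding ("one-hot-full"): k columns.
    full = sklearn.preprocessing.OneHotEncoder(sparse=False).fit_transform(col)

    # k-1 encoding ("one-hot" after this commit): the first column is dropped.
    km1 = sklearn.preprocessing.OneHotEncoder(
        drop="first", sparse=False).fit_transform(col)

    print(full.shape)  # (4, 3)
    print(km1.shape)   # (4, 2)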
28 changes: 21 additions & 7 deletions pymfe/_internal.py
@@ -150,6 +150,7 @@
 VALID_TRANSFORM_CAT = (
     "gray",
     "one-hot",
+    "one-hot-full",
 )

 _RESCALE_SCALERS = {
@@ -1432,21 +1433,34 @@ def transform_cat_gray(data_categoric: np.ndarray) -> t.Optional[np.ndarray]:
     return np.asarray(patsy.dmatrix(formula, named_data))


-def transform_cat_onehot(data_categoric: np.ndarray) -> t.Optional[np.ndarray]:
+def transform_cat_onehot(
+        data_categoric: np.ndarray,
+        use_all_columns: bool = True) -> t.Optional[np.ndarray]:
     """Transform categorical data using one-hot encoding."""
     if data_categoric.size == 0:
         return None

     _, num_col = data_categoric.shape

-    ohe = sklearn.preprocessing.OneHotEncoder(sparse=False)
+    _drop = None if use_all_columns else "first"

-    one_cat_attrs = np.hstack([
-        ohe.fit_transform(data_categoric[:, attr_ind, np.newaxis])
-        for attr_ind in np.arange(num_col)
-    ])
+    ohe = sklearn.preprocessing.OneHotEncoder(drop=_drop, sparse=False)

-    return one_cat_attrs
+    one_cat_attrs = []  # type: t.List[np.ndarray]
+
+    for attr_ind in np.arange(num_col):
+        cur_attr = data_categoric[:, attr_ind, np.newaxis]
+
+        if not use_all_columns and np.unique(cur_attr).size <= 1:
+            raise ValueError("This type of one-hot encoding does not "
+                             "support features with 1 or less distinct "
+                             "values. Drop the {}th categorical feature "
+                             "or select another encoding strategy.".format(
+                                 attr_ind + 1))
+
+        one_cat_attrs.append(ohe.fit_transform(cur_attr))
+
+    return np.hstack(one_cat_attrs)


 def _equal_freq_discretization(data: np.ndarray,
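
A usage sketch of the new helper on toy data (the import path assumes the installed pymfe package; the shapes follow from the per-attribute column counts described above):

    import numpy as np
    from pymfe import _internal

    # Two categorical attributes: k = 3 and k = 2 distinct values.
    cat = np.array([["a", "x"],
                    ["b", "y"],
                    ["c", "x"]])

    # Default ("one-hot-full" path): k columns per attribute -> 3 + 2 = 5.
    print(_internal.transform_cat_onehot(cat).shape)  # (3, 5)

    # k-1 path ("one-hot"): (3 - 1) + (2 - 1) = 3 columns.
    print(_internal.transform_cat_onehot(cat, use_all_columns=False).shape)  # (3, 3)

    # A constant attribute would get k - 1 = 0 columns, so the k-1 path rejects it.
    const = np.array([["a", "1"],
                      ["b", "1"],
                      ["c", "1"]])
    _internal.transform_cat_onehot(const, use_all_columns=False)  # raises ValueError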
67 changes: 48 additions & 19 deletions pymfe/mfe.py
@@ -737,26 +737,31 @@ def _set_data_categoric(self, transform_num: bool,

     def _set_data_numeric(
             self,
-            transform_cat: str,
+            transform_cat: str = None,
             rescale: t.Optional[str] = None,
             rescale_args: t.Optional[t.Dict[str, t.Any]] = None) -> np.ndarray:
         """Returns numeric data from the fitted dataset.

         Parameters
         ----------
-        transform_cat: :obj:`bool`
+        transform_cat: :obj:`str`, optional
             If `gray`, then all categoric-type data will be binarized with a
             model matrix strategy. If `one-hot`, then all categoric-type
-            data will be transformed using the one-hot encoding strategy.
-            If None, then categorical attributes are not transformed.
+            data will be transformed using the k-1 one-hot encoding strategy
+            (the traditional one-hot encoding with its first column dropped).
+            If `one-hot-full`, the strategy used is the one-hot encoding
+            with all encoded features (`k` features for an attribute with `k`
+            unique values; not recommended, as the `dummy variable trap` may
+            cause multicollinearity problems). If None, then the categorical
+            attributes are not transformed.

         rescale : :obj:`str`, optional
-            Check ``fit`` documentation for more information about this
-            parameter.
+            Check the documentation of the method ``fit`` for more information
+            about this.

         rescale_args : :obj:`dict`, optional
-            Check ``fit`` documentation for more information about this
-            parameter.
+            Check the documentation of the method ``fit`` for more information
+            about this.

         Returns
         -------
@@ -775,8 +780,8 @@ def _set_data_numeric(
             this method.

         ValueError
-            If `transform_cat` is neither None nor a value among `one-hot` and
-            `gray`.
+            If `transform_cat` is not in the set {None, `one-hot`, `gray`,
+            `one-hot-full`}.
         """
         if self.X is None:
             raise TypeError("It is necessary to fit valid data into the "
@@ -802,8 +807,11 @@ def _set_data_numeric(
                     self.X[:, self._attr_indexes_cat])

             else:
+                _use_all_ohe_columns = transform_cat == "one-hot-full"
+
                 categorical_dummies = _internal.transform_cat_onehot(
-                    self.X[:, self._attr_indexes_cat])
+                    self.X[:, self._attr_indexes_cat],
+                    use_all_columns=_use_all_ohe_columns)

             if categorical_dummies is not None:
                 data_num = np.concatenate((data_num, categorical_dummies),
@@ -859,16 +867,37 @@ def fit(self,
             binarized ones.

             If `one-hot`, categorical attributes are binarized using one-hot
-            encoding.
+            encoding with `k-1` features for a categorical attribute with `k`
+            distinct values. This algorithm works as follows:

-            If `gray`, categorical attributes are binarized using a model
-            matrix.
+            For each categorical attribute C:
+            1. Encode C with traditional one-hot encoding.
+            2. Arbitrarily drop the first column of the encoding result.

-            The formula used for this transformation is just the union (+) of
-            all categoric attributes using formula language from ``patsy``
-            package API, removing the intercept terms:
-            ``~ 0 + A_1 + ... + A_n``, where ``n`` is the number of attributes
-            and A_i is the ith categoric attribute, 1 <= i <= n.
+            The unique value previously represented by the k-length vector
+            [1, 0, ..., 0] will now be represented by the (k-1)-length vector
+            [0, 0, ..., 0]. Note that all other unique values will also now be
+            represented by (k-1)-length vectors (the first `0` is dropped out).
+
+            This algorithm avoids the `dummy variable trap`, which may cause
+            multicollinearity problems due to the unnecessary extra feature.
+            Note that the decision of dropping the very first encoded feature
+            is arbitrary, as any other encoded feature could have been dropped
+            instead.
+
+            If `gray`, categorical attributes are binarized using a model
+            matrix. The formula used for this transformation is just the union
+            (+) of all categoric attributes using formula language from the
+            `patsy` package API, removing the intercept terms:
+            `~ 0 + A_1 + ... + A_n`, where `n` is the number of attributes and
+            `A_i` is the ith categoric attribute, 1 <= i <= n.
+
+            If `one-hot-full`, categorical attributes are binarized using
+            one-hot encoding with `k` features for a categorical attribute
+            with `k` distinct values. This option is not recommended due to
+            the `dummy variable trap`, which may cause multicollinearity
+            problems due to an extra unnecessary variable (a label can be
+            encoded using the null vector [0, ..., 0]^T).

             If None, then categorical attributes are not transformed.
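
A fit-level sketch of the strategies documented above, on hypothetical toy data (`cat_cols` is used here to mark the categorical column explicitly, assuming the existing pymfe `fit` signature):

    import numpy as np
    from pymfe.mfe import MFE

    # One numeric column and one categorical column with k = 3 distinct values.
    X = np.array([[1.0, 0],
                  [2.0, 1],
                  [3.0, 2],
                  [4.0, 0]])
    y = np.array([0, 1, 0, 1])

    # "one-hot": the numeric data gains k - 1 = 2 encoded columns.
    MFE().fit(X, y, transform_cat="one-hot", cat_cols=[1])

    # "one-hot-full": the numeric data gains k = 3 encoded columns instead.
    MFE().fit(X, y, transform_cat="one-hot-full", cat_cols=[1])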
22 changes: 21 additions & 1 deletion tests/test_architecture.py
@@ -284,11 +284,20 @@ def test_one_hot_encoding_01(self):
         mfe = MFE()
         mfe.fit(X.values, y.values, transform_cat="one-hot")

-        exp_value = np.sum([np.unique(attr).size for attr in X.values.T])
+        exp_value = np.sum([np.unique(attr).size - 1 for attr in X.values.T])

         assert mfe._custom_args_ft["N"].shape[1] == exp_value

     def test_one_hot_encoding_02(self):
+        X, y = utils.load_xy(1)
+        mfe = MFE()
+        mfe.fit(X.values, y.values, transform_cat="one-hot-full")
+
+        exp_value = np.sum([np.unique(attr).size for attr in X.values.T])
+
+        assert mfe._custom_args_ft["N"].shape[1] == exp_value
+
+    def test_one_hot_encoding_03(self):
         X, y = utils.load_xy(2)
         mfe = MFE()
         mfe.fit(X.values, y.values, transform_cat="one-hot")
@@ -297,6 +306,16 @@ def test_one_hot_encoding_02(self):

         assert mfe._custom_args_ft["N"].shape[1] == exp_value

+    def test_one_hot_encoding_04(self):
+        X, y = utils.load_xy(2)
+        mfe = MFE()
+
+        X = np.hstack((X.values, np.ones((y.size, 1), dtype=str)))
+        y = y.values
+
+        with pytest.raises(ValueError):
+            mfe.fit(X=X, y=y, transform_cat="one-hot")
+
     @pytest.mark.parametrize("confidence", (0.95, 0.99))
     def test_extract_with_confidence(self, confidence):
         X, y = utils.load_xy(2)
@@ -462,6 +481,7 @@ def test_extract_from_model_invalid4(self):
         with pytest.raises(ValueError):
             MFE(groups="general").extract_from_model(model)

+
 class TestArchitectureWarnings:
     def test_feature_warning1(self):
         """Test exception handling of feature extraction."""
