Skip to content

Commit

Permalink
Merge pull request #74 from ealcobaca/one-hot-encoding
Browse files Browse the repository at this point in the history
Categorical attributes one-hot encoding option
  • Loading branch information
ealcobaca committed Mar 6, 2020
2 parents c5f0045 + a118a17 commit 44344a5
Show file tree
Hide file tree
Showing 6 changed files with 92 additions and 16 deletions.
24 changes: 23 additions & 1 deletion pymfe/_internal.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,11 @@
"total_summ",
)

# Accepted strategies for the 'transform_cat' argument: how categorical
# attributes are encoded into numeric columns (see MFE.fit).
VALID_TRANSFORM_CAT = (
    "gray",
    "one-hot",
)

_RESCALE_SCALERS = {
"standard": sklearn.preprocessing.StandardScaler,
"min-max": sklearn.preprocessing.MinMaxScaler,
Expand Down Expand Up @@ -1312,7 +1317,7 @@ def timeit(func: t.Callable, *args) -> t.Tuple[t.Any, float]:
return ret_val, time_total


def transform_cat(data_categoric: np.ndarray) -> t.Optional[np.ndarray]:
def transform_cat_gray(data_categoric: np.ndarray) -> t.Optional[np.ndarray]:
"""Transform categorical data using a model matrix.
The formula used for this transformation is just the union (+) of all cat-
Expand Down Expand Up @@ -1342,6 +1347,23 @@ def transform_cat(data_categoric: np.ndarray) -> t.Optional[np.ndarray]:
return np.asarray(patsy.dmatrix(formula, named_data))


def transform_cat_onehot(data_categoric: np.ndarray) -> t.Optional[np.ndarray]:
    """Transform categorical data using one-hot encoding.

    Parameters
    ----------
    data_categoric : :obj:`np.ndarray`
        Categorical data as a 2-D array (instances x attributes).

    Returns
    -------
    :obj:`np.ndarray` or :obj:`NoneType`
        Dense array containing one binary column per distinct category
        of every attribute, in attribute order. :obj:`NoneType` if
        ``data_categoric`` is empty.
    """
    if data_categoric.size == 0:
        return None

    # OneHotEncoder natively handles 2-D input: it encodes every column
    # and concatenates the dummy columns in attribute order, which is
    # exactly equivalent to (but cheaper than) fitting one encoder per
    # column and horizontally stacking the results.
    ohe = sklearn.preprocessing.OneHotEncoder(sparse=False)

    return ohe.fit_transform(data_categoric)


def _equal_freq_discretization(data: np.ndarray,
num_bins: int,
tol: float = 1e-8) -> np.ndarray:
Expand Down
48 changes: 36 additions & 12 deletions pymfe/mfe.py
Original file line number Diff line number Diff line change
Expand Up @@ -727,16 +727,18 @@ def _set_data_categoric(self, transform_num: bool,

def _set_data_numeric(
self,
transform_cat: bool,
transform_cat: str,
rescale: t.Optional[str] = None,
rescale_args: t.Optional[t.Dict[str, t.Any]] = None) -> np.ndarray:
"""Returns numeric data from the fitted dataset.
Parameters
----------
transform_cat: :obj:`str`, optional
If True, then all categoric-type data will be binarized with a
model matrix strategy.
If `gray`, then all categoric-type data will be binarized with a
model matrix strategy. If `one-hot`, then all categoric-type
data will be transformed using the one-hot encoding strategy.
If None, then categorical attributes are not transformed.
rescale : :obj:`str`, optional
Check ``fit`` documentation for more information about this
Expand All @@ -761,6 +763,10 @@ def _set_data_numeric(
:obj:`NoneType`. This can be avoided passing valid data to fit and
first calling ``_fill_col_ind_by_type`` instance method before
this method.
ValueError
If `transform_cat` is neither None nor a value among `one-hot` and
`gray`.
"""
if self.X is None:
raise TypeError("It is necessary to fit valid data into the "
Expand All @@ -772,11 +778,22 @@ def _set_data_numeric(
"attributes. Please be sure to call method "
'"_fill_col_ind_by_type" before this method.')

if (transform_cat is not None and
transform_cat not in _internal.VALID_TRANSFORM_CAT):
raise ValueError("Invalid 'transform_cat' value ('{}'). Must be "
"a value in {}.".format(
transform_cat, _internal.VALID_TRANSFORM_CAT))

data_num = self.X[:, self._attr_indexes_num]

if transform_cat:
categorical_dummies = _internal.transform_cat(
self.X[:, self._attr_indexes_cat])
if transform_cat == "gray":
categorical_dummies = _internal.transform_cat_gray(
self.X[:, self._attr_indexes_cat])

else:
categorical_dummies = _internal.transform_cat_onehot(
self.X[:, self._attr_indexes_cat])

if categorical_dummies is not None:
data_num = np.concatenate((data_num, categorical_dummies),
Expand All @@ -792,7 +809,7 @@ def fit(self,
X: t.Sequence,
y: t.Sequence,
transform_num: bool = True,
transform_cat: bool = True,
transform_cat: str = "gray",
rescale: t.Optional[str] = None,
rescale_args: t.Optional[t.Dict[str, t.Any]] = None,
cat_cols: t.Optional[t.Union[str, t.Iterable[int]]] = "auto",
Expand Down Expand Up @@ -820,19 +837,26 @@ def fit(self,
discretized ones. If False, then numeric attributes are ignored for
categorical-only meta-features.
transform_cat : :obj:`bool`, optional
If True, categorical attributes are binarized using a model matrix
to use when alongside numerical data while extracting numeric-only
metafeatures. Note that categoric-only features still uses the
original categoric values, not the binarized ones. If False, then
categorical attributes are ignored for numeric-only metafeatures.
transform_cat : :obj:`str`, optional
Transform categorical data to use alongside numerical data while
extracting numeric-only metafeatures. Note that categoric-only
features still uses the original categoric values, and not the
binarized ones.
If `one-hot`, categorical attributes are binarized using one-hot
encoding.
If `gray`, categorical attributes are binarized using a model
matrix.
The formula used for this transformation is just the union (+) of
all categoric attributes using formula language from ``patsy``
package API, removing the intercept terms:
``~ 0 + A_1 + ... + A_n``, where ``n`` is the number of attributes
and A_i is the ith categoric attribute, 1 <= i <= n.
If None, then categorical attributes are not transformed.
rescale : :obj:`str`, optional
If :obj:`NoneType`, the model keeps all numeric data with its
original values. Otherwise, this argument can assume one of the
Expand Down
4 changes: 2 additions & 2 deletions pymfe/statistical.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,8 +215,8 @@ def _calc_can_cors(
At most min(num_classes, num_attr) canonical correlations are
kept.
"""
y_bin = sklearn.preprocessing.OneHotEncoder().fit_transform(
y.reshape(-1, 1)).todense()
y_bin = sklearn.preprocessing.OneHotEncoder(
sparse=False).fit_transform(y.reshape(-1, 1))

num_classes, num_attr = y_bin.shape[1], N.shape[1]
# Note: 'n_components' is a theoretical upper bound, so it is not
Expand Down
24 changes: 24 additions & 0 deletions tests/test_architecture.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,3 +222,27 @@ def test_parse_valid_metafeatures(self, groups):
names, _ = mfe.parse_by_group(groups, res)

assert not set(names).symmetric_difference(target_mtf)

def test_no_cat_transformation(self):
    """Numeric data must be empty when transform_cat is None.

    NOTE(review): presumably dataset 1 has only categorical
    attributes, so disabling the transformation leaves no numeric
    data at all -- confirm against the fixture.
    """
    X, y = load_xy(1)
    model = MFE().fit(X.values, y.values, transform_cat=None)

    assert model._custom_args_ft["N"].size == 0

def test_one_hot_encoding_01(self):
    """One-hot encoding must produce one column per distinct category."""
    X, y = load_xy(1)
    model = MFE().fit(X.values, y.values, transform_cat="one-hot")

    expected_num_cols = sum(
        np.unique(column).size for column in X.values.T)

    assert model._custom_args_ft["N"].shape[1] == expected_num_cols

def test_one_hot_encoding_02(self):
    """One-hot encoding of dataset 2 must keep the original column count."""
    X, y = load_xy(2)
    model = MFE().fit(X.values, y.values, transform_cat="one-hot")

    expected_num_cols = X.values.shape[1]

    assert model._custom_args_ft["N"].shape[1] == expected_num_cols
6 changes: 6 additions & 0 deletions tests/test_errors_warnings.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,3 +293,9 @@ def test_error__set_data_numeric(self):
mfe = MFE()
mfe.X = np.array([])
mfe._set_data_numeric(True)

def test_invalid_cat_transf(self):
    """fit must raise ValueError for an unknown 'transform_cat' value."""
    X, y = load_xy(0)

    # Construct the model outside the 'raises' context so that only
    # the 'fit' call can satisfy the expected exception; otherwise a
    # ValueError raised by the constructor would make the test pass
    # without exercising the validation under test.
    mfe = MFE()

    with pytest.raises(ValueError):
        mfe.fit(X.values, y.values, transform_cat="invalid")
2 changes: 1 addition & 1 deletion tests/test_scaling.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def test_output_lengths_2(self, dt_id, scaler, exp_mean, exp_var, exp_min,
exp_max):
X, y = load_xy(dt_id)
model = MFE().fit(
X=X.values, y=y.values, rescale=scaler, transform_cat=False)
X=X.values, y=y.values, rescale=scaler, transform_cat=None)

numeric_data = model._custom_args_ft["N"]

Expand Down

0 comments on commit 44344a5

Please sign in to comment.