This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

treat NaN value as a category for categorical value and temporarily use CategoricalEncoder for Neither type (#183)

* treat NaN value as a category for categorical value and temporarily use CategoricalEncoder for Neither type
jzhang-gp committed Dec 19, 2019
1 parent c013774 commit 79a96b3
Showing 9 changed files with 196 additions and 49 deletions.
2 changes: 2 additions & 0 deletions foreshadow/concrete/internals/__init__.py
@@ -17,6 +17,7 @@
from foreshadow.concrete.internals.labelencoder import ( # noqa: F403, F401
FixedLabelEncoder,
)
from foreshadow.concrete.internals.nan_filler import NaNFiller # noqa: F401
from foreshadow.concrete.internals.notransform import NoTransform # noqa: F401
from foreshadow.concrete.internals.tfidf import ( # noqa: F403, F401
FixedTfidfVectorizer,
@@ -48,4 +49,5 @@
"DropCleaner",
"StandardJsonFlattener",
"NoTransform",
"NaNFiller",
] + c_all
53 changes: 53 additions & 0 deletions foreshadow/concrete/internals/nan_filler.py
@@ -0,0 +1,53 @@
"""Fill NaNs."""

import numpy as np

from foreshadow.base import BaseEstimator, TransformerMixin
from foreshadow.utils import Constant
from foreshadow.wrapper import pandas_wrap


@pandas_wrap
class NaNFiller(BaseEstimator, TransformerMixin):
"""Fill NaN values in data."""

def __init__(self, fill_value=Constant.NAN_FILL_VALUE):
self.fill_value = fill_value

def fit(self, X, y=None):
"""Empty fit.
Args:
X: input observations
y: input labels
Returns:
self
"""
return self

def transform(self, X, y=None):
"""Fill nans in a column with defined fill_value.
Args:
X (:obj:`pandas.DataFrame`): X data
y: input labels
Returns:
:obj:`pandas.DataFrame`: Transformed data
"""
return X.fillna(self.fill_value)

def inverse_transform(self, X):
"""Reverse nan filling transform.
Args:
X (:obj:`numpy.ndarray`): Transformed X data
Returns:
:obj:`numpy.ndarray`: Original data
"""
return X.replace(to_replace=Constant.NAN_FILL_VALUE, value=np.nan)
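
A minimal usage sketch of the new filler, mirroring the test added further down; it is not part of this commit and assumes NaNFiller is importable as exported in the __init__ changes above.

    # Hypothetical usage sketch (not part of this commit).
    import numpy as np
    import pandas as pd

    from foreshadow.concrete import NaNFiller

    df = pd.DataFrame({"col": ["a", np.nan, "b"]})

    filler = NaNFiller()  # fill_value defaults to Constant.NAN_FILL_VALUE, i.e. "NaN"
    filled = filler.transform(df)  # NaN becomes the string "NaN"
    restored = filler.inverse_transform(filled)  # "NaN" becomes NaN again
    assert df.equals(restored)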
9 changes: 8 additions & 1 deletion foreshadow/config.py
@@ -25,7 +25,14 @@
# "Numeric": {"Preprocessor": ["Imputer", "Scaler"]},
"Categorical": {"Preprocessor": ["CategoricalEncoder"]},
"Text": {"Preprocessor": ["TextEncoder"]},
"Neither": {"Preprocessor": ["NeitherProcessor"]},
# "Neither": {"Preprocessor": ["NeitherProcessor"]},
    # TODO we have to use CategoricalEncoder for the Neither type temporarily,
    # as some columns of the Neither type have missing data. By default,
    # number-like columns are treated by the estimator as numerical while
    # string-like columns are treated as categorical. The Numeric
    # preprocessing fails on the second case, while the Categorical
    # preprocessing works for both cases.
    "Neither": {"Preprocessor": ["CategoricalEncoder"]},
}
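
A hedged illustration of the failure mode the TODO above describes; it is not part of this commit, and scikit-learn's StandardScaler stands in for the Numeric preprocessing.

    # Hypothetical illustration: a string-like "Neither" column with missing
    # data breaks a numeric preprocessing step, while a categorical treatment
    # (fill the NaN, then encode) handles it.
    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    col = pd.DataFrame({"neither": ["x1", np.nan, "x2"]})

    try:
        StandardScaler().fit_transform(col)  # numeric path: cannot convert strings
    except ValueError as exc:
        print("Numeric preprocessing fails:", exc)

    print(pd.get_dummies(col["neither"].fillna("NaN")))  # categorical path works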


102 changes: 64 additions & 38 deletions foreshadow/smart/all.py
@@ -12,7 +12,7 @@
import pandas as pd
import scipy.stats as ss

from foreshadow.concrete import Imputer, NoTransform
from foreshadow.concrete import Imputer, NaNFiller, NoTransform
from foreshadow.concrete.externals import (
HashingEncoder,
MinMaxScaler,
@@ -97,11 +97,33 @@ def pick_transformer(self, X, y=None, **fit_params):
return distributions[best_dist]


def will_remove_uncommon(X, temp_uncommon_remover):
    """Check if the transformer will modify the data.

    Uses current settings.

    Args:
        X: input observations column
        temp_uncommon_remover: transformer

    Returns:
        (tuple) bool and category counts

    """
    X = check_df(X, single_column=True).iloc[:, 0].values
    out = temp_uncommon_remover.fit_transform(X).values.ravel()

    return (
        not (np.array_equal(X, out) | (pd.isnull(X) & pd.isnull(out))).all(),
        pd.unique(out).size,
    )
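
A small call sketch for the helper above, not part of this commit; the import paths are assumptions based on this diff.

    # Hypothetical sketch: the helper reports whether the remover would change
    # the column and how many categories remain afterwards.
    import pandas as pd

    from foreshadow.concrete import UncommonRemover  # assumed import path
    from foreshadow.smart.all import will_remove_uncommon

    column = pd.DataFrame({"cat": ["a"] * 120 + ["b"] * 79 + ["c"]})
    remover = UncommonRemover(threshold=0.01)
    changed, category_count = will_remove_uncommon(column, remover)
    print(changed, category_count)  # True once the rare "c" would be merged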


class CategoricalEncoder(SmartTransformer):
"""Automatically encode categorical features.
If there are less than 30 categories, then OneHotEncoder is used, if there
are more then HashingEncoder is used. If the columns containing a
If there are no more than 30 categories, then OneHotEncoder is used;
if there are more, then HashingEncoder is used. If the columns containing a
delimiter exceed delim_cuttoff, then a DummyEncoder is used (set the cutoff to
-1 to force). If used in a y_var context, LabelEncoder is used.
@@ -117,29 +139,6 @@ def __init__(self, unique_num_cutoff=30, merge_thresh=0.01, **kwargs):
self.merge_thresh = merge_thresh
super().__init__(**kwargs)

def will_transform(self, X, temp_ur):
"""Check if the transformer will modify the data.
Uses current settings.
Args:
X: input observations column
temp_ur: transformer
Returns:
(tuple) bool and category counts
"""
X = check_df(X, single_column=True).iloc[:, 0].values
out = temp_ur.fit_transform(X).values.ravel()

return (
not (
np.array_equal(X, out) | (pd.isnull(X) & pd.isnull(out))
).all(),
pd.unique(out).size,
)

def pick_transformer(self, X, y=None, **fit_params):
"""Determine the appropriate encoding method for an input dataset.
@@ -153,37 +152,64 @@ def pick_transformer(self, X, y=None, **fit_params):
An initialized encoding transformer
"""
# NaN is treated as a separate category. In order to take it into
# account during the encoder selection, we fill the NaN values with
# the string "NaN". The final pipeline has a pre-defined filler as
# its first step, which will take effect during the real
# transformation.
X = X.fillna("NaN")
data = X.iloc[:, 0]
unique_count = len(data.value_counts())

# TODO performance drag. We may want to apply sampling to this part
# and to the uncommon remover.
# Calculate stats for DummyEncoder
delimeters = [",", ";", "\t"]
delim_count = [
len(list(data.astype("str").str.get_dummies(sep=d)))
for d in delimeters
]
delim_diff = min(delim_count) - len(list(pd.get_dummies(data)))
temp_ur = UncommonRemover(threshold=self.merge_thresh)
will_reduce, reduce_count = self.will_transform(X, temp_ur)

# Calculate stats for UncommonRemover
temp_uncommon_remover = UncommonRemover(threshold=self.merge_thresh)
will_reduce, potential_reduced_count = will_remove_uncommon(
X, temp_uncommon_remover
)

ohe = OneHotEncoder(
return_df=True, use_cat_names=True, handle_unknown="ignore"
)

final_pipeline = SerializablePipeline(
[("fill_na", NaNFiller(fill_value="NaN"))]
)

if self.y_var:
return LabelEncoder()
elif delim_diff < 0:
delim = delimeters[delim_count.index(min(delim_count))]
return DummyEncoder(delimeter=delim)
final_pipeline.steps.append(
("dummy_encodeer", DummyEncoder(delimeter=delim))
)
elif unique_count <= self.unique_num_cutoff:
return ohe
elif (reduce_count <= self.unique_num_cutoff) and will_reduce:
return SerializablePipeline(
[
("ur", UncommonRemover(threshold=self.merge_thresh)),
("ohe", ohe),
]
final_pipeline.steps.append(("one_hot_encoder", ohe))
elif (
potential_reduced_count <= self.unique_num_cutoff
) and will_reduce:
final_pipeline.steps.append(
(
"uncommon_remover",
UncommonRemover(threshold=self.merge_thresh),
)
)
final_pipeline.steps.append(("one_hot_encoder", ohe))
else:
return HashingEncoder(n_components=30)
final_pipeline.steps.append(
("hash_encoder", HashingEncoder(n_components=30))
)

return final_pipeline
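
For context, a hedged sketch (not part of this commit) of what the reworked pick_transformer now returns, matching the updated tests below: a SerializablePipeline whose first step fills NaNs.

    # Hypothetical sketch, following the tests updated below.
    import numpy as np
    import pandas as pd

    from foreshadow.smart import CategoricalEncoder

    data = pd.DataFrame({"cat": ["a", "b", np.nan, "a", "c"]})
    pipeline = CategoricalEncoder().fit(data).transformer
    print([name for name, _ in pipeline.steps])  # e.g. ['fill_na', 'one_hot_encoder']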


class SimpleImputer(SmartTransformer):
@@ -481,6 +507,6 @@ def _can_fit(self, transformer, X, y=None, sampling_threshold=0.1):
# TODO change to ValueError once TFIDF is fixed.
# logging.warning("Error during fit: ".format(str(e)))
logging.warning(
"Revert to NoTransform for Neither " "Type temporarily."
"Revert to NoTransform for Neither type temporarily."
)
return NoTransform()
Empty file.
@@ -3,6 +3,37 @@
from foreshadow.utils.testing import get_file_path


def test_nan_filler():
    import pandas as pd
    import numpy as np

    from foreshadow.concrete import NaNFiller
    from foreshadow.utils import Constant

    data = pd.DataFrame(
        {
            "a": ["123", "a", "b", np.nan],
            "b": [np.nan, "q", "w", "v"],
            "c": [np.nan, "1", "0", "1"],
        }
    )

    check = pd.DataFrame(
        {
            "a": ["123", "a", "b", Constant.NAN_FILL_VALUE],
            "b": [Constant.NAN_FILL_VALUE, "q", "w", "v"],
            "c": [Constant.NAN_FILL_VALUE, "1", "0", "1"],
        }
    )

    filler = NaNFiller()
    df_transformed = filler.transform(data)
    assert check.equals(df_transformed)

    df_original = filler.inverse_transform(df_transformed)
    assert data.equals(df_original)


def test_dummy_encoder():
import pandas as pd

40 changes: 30 additions & 10 deletions foreshadow/tests/test_transformers/test_smart/test_smart.py
@@ -170,27 +170,35 @@ def test_smart_encoder_less_than_30_levels():

from foreshadow.smart import CategoricalEncoder
from foreshadow.concrete import OneHotEncoder
from foreshadow.concrete import NaNFiller
from foreshadow.pipeline import SerializablePipeline

np.random.seed(0)
leq_30_random_data = np.random.choice(30, size=500)
smart_coder = CategoricalEncoder()
assert isinstance(
smart_coder.fit(leq_30_random_data).transformer, OneHotEncoder
)
transformer = smart_coder.fit(leq_30_random_data).transformer
assert isinstance(transformer, SerializablePipeline)
assert isinstance(transformer.steps[0][1], NaNFiller)
assert isinstance(transformer.steps[1][1], OneHotEncoder)
assert len(transformer.steps) == 2


def test_smart_encoder_more_than_30_levels():
import numpy as np

from foreshadow.smart import CategoricalEncoder
from foreshadow.concrete import HashingEncoder
from foreshadow.concrete import NaNFiller
from foreshadow.pipeline import SerializablePipeline

np.random.seed(0)
gt_30_random_data = np.random.choice(31, size=500)
smart_coder = CategoricalEncoder()
assert isinstance(
smart_coder.fit(gt_30_random_data).transformer, HashingEncoder
)
transformer = smart_coder.fit(gt_30_random_data).transformer
assert isinstance(transformer, SerializablePipeline)
assert isinstance(transformer.steps[0][1], NaNFiller)
assert isinstance(transformer.steps[1][1], HashingEncoder)
assert len(transformer.steps) == 2


def test_smart_encoder_more_than_30_levels_that_reduces():
@@ -357,23 +365,35 @@ def test_smart_encoder_delimmited():
import pandas as pd
from foreshadow.smart import CategoricalEncoder
from foreshadow.concrete import DummyEncoder
from foreshadow.concrete import NaNFiller
from foreshadow.pipeline import SerializablePipeline

data = pd.DataFrame({"test": ["a", "a,b,c", "a,b", "a,c"]})
smart_coder = CategoricalEncoder()
assert isinstance(smart_coder.fit(data).transformer, DummyEncoder)
transformer = smart_coder.fit(data).transformer

assert isinstance(transformer, SerializablePipeline)
assert isinstance(transformer.steps[0][1], NaNFiller)
assert isinstance(transformer.steps[1][1], DummyEncoder)
assert len(transformer.steps) == 2


def test_smart_encoder_more_than_30_levels_with_overwritten_cutoff():
import numpy as np
from foreshadow.smart import CategoricalEncoder
from foreshadow.concrete import OneHotEncoder
from foreshadow.concrete import NaNFiller
from foreshadow.pipeline import SerializablePipeline

np.random.seed(0)
gt_30_random_data = np.random.choice(31, size=500)
smart_coder = CategoricalEncoder(unique_num_cutoff=35)
assert isinstance(
smart_coder.fit(gt_30_random_data).transformer, OneHotEncoder
)
transformer = smart_coder.fit(gt_30_random_data).transformer

assert isinstance(transformer, SerializablePipeline)
assert isinstance(transformer.steps[0][1], NaNFiller)
assert isinstance(transformer.steps[1][1], OneHotEncoder)
assert len(transformer.steps) == 2


def test_smart_financial_cleaner_us():
2 changes: 2 additions & 0 deletions foreshadow/utils/__init__.py
@@ -10,6 +10,7 @@
)
from foreshadow.utils.constants import (
ConfigKey,
Constant,
DefaultConfig,
EstimatorFamily,
ProblemType,
@@ -56,4 +57,5 @@
"Override",
"ConfigKey",
"DefaultConfig",
"Constant",
]
6 changes: 6 additions & 0 deletions foreshadow/utils/constants.py
@@ -28,3 +28,9 @@ class ConfigKey:

N_JOBS = "n_jobs"
PROCESSED_DATA_EXPORT_PATH = "processed_data_export_path"


class Constant:
"""General constants in Foreshadow."""

NAN_FILL_VALUE = "NaN"
