Skip to content
This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

Commit

Permalink
Add OneHotEncoder Ability to collapse infrequent values issue 26 (#31)
Browse files Browse the repository at this point in the history
* Add UncommonRemover internal transformer to remove infrequent values in a categorical column
* Re-format smart transformers to have constructors and document parameters
* Add UncommonRemover as a preprocessing step to OneHotEncoder
* Add relevant tests

* Address CR requests and fix pandas wrapper
* Change the internal implementation of UncommonRemover to pandas
* Add field to check_df to validate single column DataFrame and add
  relevant tests
* Update smart encoder to simplify categorical column if it can using UncommonRemover
  • Loading branch information
adithyabsk committed Jan 19, 2019
1 parent 3b3b017 commit 6579deb
Show file tree
Hide file tree
Showing 7 changed files with 190 additions and 34 deletions.
42 changes: 36 additions & 6 deletions foreshadow/tests/test_transformers/test_internal.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def test_transformer_onehotencoder_fit_transform():
from foreshadow.transformers.externals import OneHotEncoder

df = pd.DataFrame({"neat": ["apple", "apple", "orange", "apple", "orange"]})
ohe = OneHotEncoder(use_cat_names=True, cols=["neat"], handle_unknown="ignore")
ohe = OneHotEncoder(use_cat_names=True, handle_unknown="ignore")
assert ohe.fit(df) == ohe
assert list(ohe.transform(df)) == [
"neat_OneHotEncoder_neat_apple",
Expand All @@ -137,11 +137,7 @@ def test_transformer_onehotencoder_fit_transform_keep_cols():

df = pd.DataFrame({"neat": ["apple", "apple", "orange", "apple", "orange"]})
ohe = OneHotEncoder(
keep_columns=True,
name="encoder",
use_cat_names=True,
cols=["neat"],
handle_unknown="ignore",
keep_columns=True, name="encoder", use_cat_names=True, handle_unknown="ignore"
)
assert ohe.fit(df) == ohe
assert list(ohe.transform(df)) == [
Expand Down Expand Up @@ -212,3 +208,37 @@ def test_drop_transformer_string_input():
assert np.array_equal(
x.values.ravel(), DropFeature().fit_transform(x).values.ravel()
)


def test_uncommon_remover_integers():
    """Rare integers are merged into the default or a custom replacement."""
    import numpy as np
    import pandas as pd
    from foreshadow.transformers.internals import UncommonRemover

    # Three values occur once each; 1 and 3 occur 400 times each.
    rare = [0, 2, 10]
    common = [1] * 400 + [3] * 400
    frame = pd.DataFrame({"A": np.array(rare + common)})

    default_out = UncommonRemover().fit_transform(frame)
    custom_out = UncommonRemover(replacement=1).fit_transform(frame)

    expected_default = np.array(["UncommonRemover_Other", 1, 3], dtype="object")
    observed_default = pd.unique(default_out.values.ravel())
    assert np.array_equal(observed_default, expected_default)

    observed_custom = pd.unique(custom_out.values.ravel())
    assert np.array_equal(observed_custom, np.array([1, 3]))


def test_uncommon_remover_strings():
    """Rare strings are merged into the default or a custom replacement."""
    import numpy as np
    import pandas as pd
    from foreshadow.transformers.internals import UncommonRemover

    # "A", "B" and "C" occur once each; "D" and "E" occur 400 times each.
    rare = ["A", "B", "C"]
    common = ["D"] * 400 + ["E"] * 400
    frame = pd.DataFrame({"A": np.array(rare + common)})

    default_out = UncommonRemover().fit_transform(frame)
    custom_out = UncommonRemover(replacement="D").fit_transform(frame)

    expected_default = np.array(["UncommonRemover_Other", "D", "E"], dtype="object")
    observed_default = pd.unique(default_out.values.ravel())
    assert np.array_equal(observed_default, expected_default)

    observed_custom = pd.unique(custom_out.values.ravel())
    assert np.array_equal(observed_custom, np.array(["D", "E"]))
14 changes: 14 additions & 0 deletions foreshadow/tests/test_transformers/test_smart.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,20 @@ def test_smart_encoder_more_than_30_levels():
assert isinstance(smart_coder.fit(gt_30_random_data), HashingEncoder)


def test_smart_encoder_more_than_30_levels_that_reduces():
    """The last step picked by Encoder is a OneHotEncoder for this data.

    The data holds 500 draws from 29 levels plus six values that occur once.
    """
    import numpy as np

    from foreshadow.transformers.smart import Encoder
    from foreshadow.transformers.externals import OneHotEncoder

    np.random.seed(0)
    frequent_levels = np.random.choice(29, size=500)
    rare_levels = np.array([31, 32, 33, 34, 35, 36])
    gt_30_random_data = np.concatenate([frequent_levels, rare_levels])

    fitted = Encoder().fit(gt_30_random_data)
    final_step = fitted.steps[-1][1]
    assert isinstance(final_step, OneHotEncoder)


def test_smart_encoder_y_var():
import numpy as np
import pandas as pd
Expand Down
19 changes: 19 additions & 0 deletions foreshadow/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,25 @@ def test_check_df_raises_on_invalid():
)


def test_check_df_passthrough_none():
    """check_df hands None back unchanged when ignore_none is set."""
    from foreshadow.utils import check_df

    result = check_df(None, ignore_none=True)
    assert result is None


def test_check_df_single_column():
    """check_df rejects multi-column input when single_column is requested."""
    import numpy as np
    from foreshadow.utils import check_df

    # A 4x2 array converts to a two-column frame, which must be refused.
    input_arr = np.arange(8).reshape((4, 2))

    with pytest.raises(ValueError) as e:
        # The call has to raise, so its result is deliberately not bound.
        check_df(input_arr, single_column=True)

    # Checked after the ``with`` block so the assertion actually executes.
    assert str(e.value) == ("Input Dataframe must have only one column")


def test_module_not_installed():
from foreshadow.utils import check_module_installed

Expand Down
50 changes: 50 additions & 0 deletions foreshadow/transformers/internals/uncommonremover.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import pandas as pd
import numpy as np
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.base import TransformerMixin, BaseEstimator

from foreshadow.utils import check_df


class UncommonRemover(BaseEstimator, TransformerMixin):
    """Merge uncommon values in a categorical column into a single value.

    Note: values that were not seen during fitting are merged as well.

    Args:
        threshold (float): values whose count is at most this fraction of
            the number of fitted rows are merged into one replacement value
        replacement (Optional): value with which to replace uncommon values

    """

    def __init__(self, threshold=0.01, replacement="UncommonRemover_Other"):
        self.threshold = threshold
        self.replacement = replacement

    def fit(self, X, y=None):
        """Find the uncommon values of the column.

        Args:
            X (:obj:`pandas.DataFrame`): single-column input dataframe
            y: unused

        Returns:
            (self) object instance

        """
        column = check_df(X, single_column=True).iloc[:, 0]

        counts = column.value_counts()
        # Every value observed while fitting; anything else is unseen later.
        self.values_ = counts.index.values.tolist()
        # Values whose count does not exceed threshold * number of rows.
        cutoff = self.threshold * column.size
        self.merge_values_ = counts[counts <= cutoff].index.values.tolist()

        return self

    def transform(self, X, y=None):
        """Replace uncommon and unseen values with the replacement value.

        The input is not modified; a new single-column dataframe is built.

        Args:
            X (:obj:`pandas.DataFrame`): single-column input dataframe
            y: unused

        Returns:
            :obj:`pandas.DataFrame`: single-column dataframe with the
            uncommon and unseen values replaced

        """
        column = check_df(X, single_column=True).iloc[:, 0]
        check_is_fitted(self, ["values_", "merge_values_"])

        to_merge = column.isin(self.merge_values_) | ~column.isin(self.values_)
        # ``where`` returns a new Series, so the caller's frame is never
        # overwritten through the column selected from it.
        return column.where(~to_merge, self.replacement).to_frame()
71 changes: 54 additions & 17 deletions foreshadow/transformers/smart.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from sklearn.pipeline import Pipeline

from ..transformers.base import SmartTransformer
from ..transformers.internals import BoxCox, FancyImputer, DummyEncoder
from ..transformers.internals import BoxCox, FancyImputer, UncommonRemover, DummyEncoder
from ..transformers.externals import (
MinMaxScaler,
StandardScaler,
Expand All @@ -22,6 +22,8 @@
OneHotEncoder,
)

from foreshadow.utils import check_df


class Scaler(SmartTransformer):
"""Automatically Scales Numerical Features
Expand All @@ -30,9 +32,16 @@ class Scaler(SmartTransformer):
StandardScaler is used, if it is uniform, MinMaxScaler is used, and if neither
distribution fits then a BoxCox transformation is applied and a RobustScaler
is used.
Args:
p_val (float): p value cutoff for the ks-test
"""

def _get_transformer(self, X, y=None, p_val_cutoff=0.05, **fit_params):
def __init__(self, p_val=0.05, **kwargs):
    """Store the p value cutoff and pass the remaining kwargs to the parent.

    Args:
        p_val (float): p value cutoff for the ks-test
        **kwargs: forwarded unchanged to the parent constructor

    """
    self.p_val = p_val
    super().__init__(**kwargs)

def _get_transformer(self, X, y=None, **fit_params):
data = X.iloc[:, 0]
# statistically invalid but good enough measure of relative closeness
# ks-test does not allow estimated parameters
Expand All @@ -42,7 +51,7 @@ def _get_transformer(self, X, y=None, p_val_cutoff=0.05, **fit_params):
dist = getattr(ss.distributions, d)
p_vals[d] = ss.kstest(data, d, args=dist.fit(data)).pvalue
best_dist = max(p_vals, key=p_vals.get)
best_dist = best_dist if p_vals[best_dist] >= p_val_cutoff else None
best_dist = best_dist if p_vals[best_dist] >= self.p_val else None
if best_dist is None:
return Pipeline([("box_cox", BoxCox()), ("robust_scaler", RobustScaler())])
else:
Expand All @@ -53,36 +62,61 @@ class Encoder(SmartTransformer):
"""Automatically Encodes Categorical Features
If there are less than 30 categories, then OneHotEncoder is used, if there are more
then HashingEncoder is used. If the columns containing a delimmeter exceed delim_cuttoff then a
DummyEncoder is used (set cutoff to -1 to force). If used in a y_var context, LabelEncoder is used.
then HashingEncoder is used. If the columns containing a delimiter exceed
delim_cutoff then a DummyEncoder is used (set cutoff to -1 to force). If used
in a y_var context, LabelEncoder is used.
Args:
unique_num_cutoff (float): number of allowable unique categories
merge_thresh (float): threshold passed into UncommonRemover if selected
"""

def _get_transformer(self, X, y=None, unique_num_cutoff=30, **fit_params):
def __init__(self, unique_num_cutoff=30, merge_thresh=0.01, **kwargs):
    """Store the encoder settings and pass the remaining kwargs to the parent.

    Args:
        unique_num_cutoff (float): number of allowable unique categories
        merge_thresh (float): threshold passed into UncommonRemover
        **kwargs: forwarded unchanged to the parent constructor

    """
    self.unique_num_cutoff = unique_num_cutoff
    self.merge_thresh = merge_thresh
    super().__init__(**kwargs)

def will_transform(self, X, temp_ur):
    """Check whether ``temp_ur`` would modify the single-column input.

    Args:
        X: single-column data accepted by ``check_df``
        temp_ur: transformer whose ``fit_transform`` output is compared
            against the input values

    Returns:
        (tuple) bool that is True when at least one value changes, and the
        number of unique values in the transformed output

    """
    before = check_df(X, single_column=True).iloc[:, 0].values
    after = temp_ur.fit_transform(before).values.ravel()

    # np.array_equal collapses the comparison to one bool, so compare
    # element by element and treat a pair of nulls as equal.
    both_null = pd.isnull(before) & pd.isnull(after)
    unchanged = (before == after) | both_null

    return (not np.all(unchanged), pd.unique(after).size)

def _get_transformer(self, X, y=None, **fit_params):
data = X.iloc[:, 0]
col_name = X.columns[0]
unique_count = len(data.value_counts())

delimeters = [",", ";", "\t"]
delim_count = [
len(list(data.astype("str").str.get_dummies(sep=d))) for d in delimeters
]
delim_diff = min(delim_count) - len(list(pd.get_dummies(data)))
temp_ur = UncommonRemover(threshold=self.merge_thresh)
will_reduce, reduce_count = self.will_transform(X, temp_ur)
ohe = OneHotEncoder(return_df=True, use_cat_names=True, handle_unknown="ignore")

if self.y_var:
return LabelEncoder()
elif delim_diff < 0:
delim = delimeters[delim_count.index(min(delim_count))]
return DummyEncoder(delimeter=delim)
elif unique_count <= unique_num_cutoff:
return OneHotEncoder(
cols=[col_name],
return_df=True,
use_cat_names=True,
handle_unknown="ignore",
elif unique_count <= self.unique_num_cutoff:
return ohe
elif (reduce_count <= self.unique_num_cutoff) and will_reduce:
return Pipeline(
[("ur", UncommonRemover(threshold=self.merge_thresh)), ("ohe", ohe)]
)
else:
return HashingEncoder(n_components=30, cols=[col_name])
return HashingEncoder(n_components=30)


class SimpleImputer(SmartTransformer):
Expand All @@ -91,7 +125,10 @@ class SimpleImputer(SmartTransformer):
Performs z-score test to determine whether to use mean or median imputation. If
too many data points are missing then imputation is not attempted in favor of
multiple imputation later in the pipeline.
Args:
threshold (float): threshold of missing data where to use these
strategies
"""

def __init__(self, threshold=0.1, **kwargs):
Expand All @@ -103,13 +140,13 @@ def _choose_simple(self, X):

# Uses modified z score method http://colingorrie.github.io/outlier-detection.html
# Assumes data has standard distribution
threshold = 3.5
z_threshold = 3.5

med_y = np.median(X)
mad_y = np.median(np.abs(np.subtract(X, med_y)))
z_scor = [0.6745 * (y - med_y) / mad_y for y in X]

z_bool = np.where(np.abs(z_scor) > threshold)[0].shape[0] / X.shape[0] > 0.05
z_bool = np.where(np.abs(z_scor) > z_threshold)[0].shape[0] / X.shape[0] > 0.05

if z_bool:
return FancyImputer("SimpleFill", fill_method="median")
Expand Down
8 changes: 3 additions & 5 deletions foreshadow/transformers/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,13 +226,9 @@ def pandas_wrapper(self, func, df, *args, **kwargs):
"""

stack = inspect.stack()
caller = None

caller = stack[1][0].f_locals["self"].__class__
current = inspect.currentframe()
calframe = inspect.getouterframes(current, 3)
# import pdb
# pdb.set_trace()
if calframe[2][3] != "pandas_wrapper":
return func(self, df, *args, **kwargs)

Expand Down Expand Up @@ -272,6 +268,8 @@ def pandas_wrapper(self, func, df, *args, **kwargs):
]
return pd.concat([df, out], axis=1)

out.from_transformer = True

return out

# If output is numpy array (transform has occurred)
Expand Down Expand Up @@ -299,7 +297,7 @@ def pandas_wrapper(self, func, df, *args, **kwargs):
}
df = df.assign(**kw)

df.from_transformer = True
df.from_transformer = True

return df

Expand Down
20 changes: 14 additions & 6 deletions foreshadow/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,14 @@
PipelineStep = {"NAME": 0, "CLASS": 1, "COLS": 2}


def check_df(input_data, ignore_none=False):
"""Convert non dataframe inputs into dataframes.
def check_df(input_data, ignore_none=False, single_column=False):
"""Convert non dataframe inputs into dataframes. (or series)
Args:
input_data (:obj:`pandas.DataFrame`, :obj:`numpy.ndarray`, list):
input to convert
ignore_none (bool): allow None to pass through check_df
single_column (bool): raise ValueError unless the frame has exactly one column
Returns:
:obj:`pandas.DataFrame`: Converted and validated input dataframes
Expand All @@ -26,22 +28,28 @@ def check_df(input_data, ignore_none=False):
if input_data is None and ignore_none:
return None

ret_df = None
if isinstance(input_data, pd.DataFrame):
if len(input_data.columns) > len(set(input_data.columns)):
warnings.warn("Columns are not all uniquely named, automatically resolving")
input_data.columns = pd.io.parsers.ParserBase(
{"names": input_data.columns}
)._maybe_dedup_names(input_data.columns)
return input_data
ret_df = input_data
elif isinstance(input_data, pd.Series):
return input_data.to_frame()
elif isinstance(input_data, np.ndarray) or isinstance(input_data, list):
return pd.DataFrame(input_data)
ret_df = input_data.to_frame()
elif isinstance(input_data, np.ndarray) or isinstance(input_data, (list, tuple)):
ret_df = pd.DataFrame(input_data)
else:
raise ValueError(
"Invalid input type, neither pd.DataFrame, pd.Series, np.ndarray, nor list"
)

if single_column and len(ret_df.columns) != 1:
raise ValueError("Input Dataframe must have only one column")

return ret_df


def check_module_installed(name):
"""Checks whether a module is available for import"""
Expand Down

0 comments on commit 6579deb

Please sign in to comment.