Skip to content
This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

Commit

Permalink
Merge pull request #30 from georgianpartners/issue_13
Browse files Browse the repository at this point in the history
Fixes #13
  • Loading branch information
alexrallen committed Jan 14, 2019
2 parents 9a4b577 + e81aabb commit 171c35c
Show file tree
Hide file tree
Showing 4 changed files with 144 additions and 3 deletions.
47 changes: 47 additions & 0 deletions foreshadow/tests/test_transformers/test_internal.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,53 @@
import pytest


def test_dummy_encoder():
    """Check that DummyEncoder expands comma-delimited cells into indicators.

    Every distinct value found in the ``test`` column is expected to get its
    own 0/1 column, named ``test_DummyEncoder_<value>``.
    """
    import pandas as pd

    from foreshadow.transformers.internals import DummyEncoder

    data = pd.DataFrame({"test": ["a", "a,b,c", "a,b", "a,c"]})
    encoder = DummyEncoder()
    encoder.fit(data)
    result = encoder.transform(data)

    expected = pd.DataFrame(
        {
            "test_DummyEncoder_a": [1, 1, 1, 1],
            "test_DummyEncoder_b": [0, 1, 1, 0],
            "test_DummyEncoder_c": [0, 1, 0, 1],
        }
    )

    assert expected.equals(result)


def test_dummy_encoder_other():
    """Check that infrequent values are pooled into a single "other" column.

    With ``other_cutoff=0.25``, the values ``d`` and ``e`` (each present in
    only one of the seven rows) are expected to be merged into
    ``test_DummyEncoder_other`` instead of getting their own columns.
    """
    import pandas as pd

    from foreshadow.transformers.internals import DummyEncoder

    data = pd.DataFrame(
        {"test": ["a", "a,b,c", "a,b", "a,c,d", "a,b,c", "a,b,c", "a,b,c,e"]}
    )
    encoder = DummyEncoder(other_cutoff=0.25)
    encoder.fit(data)
    result = encoder.transform(data)

    expected = pd.DataFrame(
        {
            "test_DummyEncoder_a": [1, 1, 1, 1, 1, 1, 1],
            "test_DummyEncoder_b": [0, 1, 1, 0, 1, 1, 1],
            "test_DummyEncoder_c": [0, 1, 0, 1, 1, 1, 1],
            "test_DummyEncoder_other": [0, 0, 0, 1, 0, 0, 1],
        }
    )

    assert expected.equals(result)


def test_box_cox():
import numpy as np
import pandas as pd
Expand Down
11 changes: 11 additions & 0 deletions foreshadow/tests/test_transformers/test_smart.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,17 @@ def test_preprocessor_hashencoder_no_name_collision():
assert len(set(output.columns)) == 60


def test_smart_encoder_delimmited():
    """Check that the smart Encoder picks DummyEncoder for delimited data.

    Fitting ``Encoder`` on a column of comma-delimited values is expected to
    return a ``DummyEncoder`` instance.
    """
    import pandas as pd
    from foreshadow.transformers.smart import Encoder
    from foreshadow.transformers.internals import DummyEncoder

    data = pd.DataFrame({"test": ["a", "a,b,c", "a,b", "a,c"]})
    smart_coder = Encoder()
    assert isinstance(smart_coder.fit(data), DummyEncoder)


def test_smart_encoder_more_than_30_levels_with_overwritten_cutoff():
import numpy as np
from foreshadow.transformers.smart import Encoder
Expand Down
71 changes: 71 additions & 0 deletions foreshadow/transformers/internals/dummyencoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted


class DummyEncoder(BaseEstimator, TransformerMixin):
    """Dummy encode delimited data held in the first column of a dataframe.

    Each cell is split on ``delimeter`` and every distinct value becomes a
    0/1 indicator column. Values whose frequency in the fit data is below
    ``other_cutoff`` are pooled into one indicator column named
    ``other_name``.

    Args:
        delimeter (str): Separator between values inside a cell.
        other_cutoff (float): Minimum fraction of fit rows a value must
            appear in to keep its own column.
        other_name (str): Label of the pooled column for infrequent values.
    """

    def __init__(self, delimeter=",", other_cutoff=0.1, other_name="other"):
        self.delimeter = delimeter
        self.other_cutoff = other_cutoff
        self.other_name = other_name

    def fit(self, X, y=None):
        """Determine the dummy categories from the first column of X.

        Sets ``self.categories`` (labels of the output columns) and
        ``self.other`` (values that are pooled under ``other_name``).

        Args:
            X (:obj:`pandas.DataFrame`): Fit data; only the first column is
                read and it must support the ``.str`` accessor.
            y: Unused.

        Returns:
            self
        """
        column = X.iloc[:, 0]
        dummies = column.str.get_dummies(sep=self.delimeter)

        # Fraction of rows in which each value occurs. Kept in a local so
        # that self.other only ever holds its final list form.
        frequency = dummies.fillna(0).sum(axis=0) / dummies.count()
        is_rare = frequency < self.other_cutoff

        self.categories = [c for c in list(dummies) if not is_rare[c]]
        self.other = [c for c in list(dummies) if is_rare[c]]
        if len(self.other) > 0:
            # A single pooled column stands in for all infrequent values.
            self.categories += [self.other_name]

        return self

    def transform(self, X, y=None):
        """Dummy encode the first column of X with the fitted categories.

        Args:
            X (:obj:`pandas.DataFrame`): Data to encode; only the first
                column is read.
            y: Unused.

        Returns:
            :obj:`pandas.DataFrame`: One 0/1 column per entry of
            ``self.categories``. The frame is built from plain lists, so the
            index of X is not carried over.
        """

        check_is_fitted(self, ["categories"])

        # Only the first column is encoded, so map over that Series alone
        # instead of visiting every cell of every column in X.
        column = X.iloc[:, 0]
        encoded = {
            category: column.map(
                separate(category, self.delimeter, self.other, self.other_name)
            ).tolist()
            for category in self.categories
        }

        return pd.DataFrame(encoded)


def separate(cat, delim, other, other_name):
    """Build a per-cell indicator function for one output column.

    Args:
        cat (str): Label of the output column being computed.
        delim (str): Separator between values inside a cell.
        other (iterable of str): Values that are pooled under ``other_name``.
        other_name (str): Label of the pooled column.

    Returns:
        callable: Function taking one cell and returning 1 or 0. When ``cat``
        equals ``other_name`` it returns 1 if the cell contains any value
        from ``other``; otherwise it returns 1 if the cell contains ``cat``.
        Cells that are not strings (e.g. NaN or None) always yield 0.
    """
    # Built once per column rather than once per cell.
    pooled = frozenset(other)

    def sep(X):
        # A missing or non-string cell has nothing to split, so it matches
        # no category instead of raising AttributeError on ``.split``.
        if not isinstance(X, str):
            return 0
        values = X.split(delim)
        if cat == other_name:
            return 0 if pooled.isdisjoint(values) else 1
        return 1 if cat in values else 0

    return sep
18 changes: 15 additions & 3 deletions foreshadow/transformers/smart.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@

import numpy as np
import scipy.stats as ss
import pandas as pd
from sklearn.pipeline import Pipeline

from ..transformers.base import SmartTransformer
from ..transformers.internals import BoxCox, FancyImputer
from ..transformers.internals import BoxCox, FancyImputer, DummyEncoder
from ..transformers.externals import (
MinMaxScaler,
StandardScaler,
Expand Down Expand Up @@ -52,17 +53,28 @@ class Encoder(SmartTransformer):
"""Automatically Encodes Categorical Features
If there are less than 30 categories, then OneHotEncoder is used, if there are more
then HashingEncoder is used. If used in a y_var context, LabelEncoder is used.
then HashingEncoder is used. If the columns containing a delimmeter exceed delim_cuttoff then a
DummyEncoder is used (set cutoff to -1 to force). If used in a y_var context, LabelEncoder is used.
"""

def _get_transformer(self, X, y=None, unique_num_cutoff=30, **fit_params):
data = X.iloc[:, 0]
col_name = X.columns[0]
unique_count = len(data.value_counts())

delimeters = [",", ";", "\t"]
delim_count = [
len(list(data.astype("str").str.get_dummies(sep=d))) for d in delimeters
]
delim_diff = min(delim_count) - len(list(pd.get_dummies(data)))

if self.y_var:
return LabelEncoder()
if unique_count <= unique_num_cutoff:
elif delim_diff < 0:
delim = delimeters[delim_count.index(min(delim_count))]
return DummyEncoder(delimeter=delim)
elif unique_count <= unique_num_cutoff:
return OneHotEncoder(
cols=[col_name],
return_df=True,
Expand Down

0 comments on commit 171c35c

Please sign in to comment.