Merge pull request #30 from georgianpartners/issue_13

Fixes #13
georgian-io-archive · Jan 14, 2019 · 171c35c · 171c35c
2 parents 9a4b577 + e81aabb
commit 171c35c
Show file tree

Hide file tree

Showing 4 changed files with 144 additions and 3 deletions.
diff --git a/foreshadow/tests/test_transformers/test_internal.py b/foreshadow/tests/test_transformers/test_internal.py
@@ -1,6 +1,53 @@
 import pytest
 
 
+def test_dummy_encoder():
+    import numpy as np
+    import pandas as pd
+
+    from foreshadow.transformers.internals import DummyEncoder
+
+    data = pd.DataFrame({"test": ["a", "a,b,c", "a,b", "a,c"]})
+    de = DummyEncoder()
+    de.fit(data)
+    df = de.transform(data)
+
+    check = pd.DataFrame(
+        {
+            "test_DummyEncoder_a": [1, 1, 1, 1],
+            "test_DummyEncoder_b": [0, 1, 1, 0],
+            "test_DummyEncoder_c": [0, 1, 0, 1],
+        }
+    )
+
+    assert check.equals(df)
+
+
+def test_dummy_encoder_other():
+    import numpy as np
+    import pandas as pd
+
+    from foreshadow.transformers.internals import DummyEncoder
+
+    data = pd.DataFrame(
+        {"test": ["a", "a,b,c", "a,b", "a,c,d", "a,b,c", "a,b,c", "a,b,c,e"]}
+    )
+    de = DummyEncoder(other_cutoff=0.25)
+    de.fit(data)
+    df = de.transform(data)
+
+    check = pd.DataFrame(
+        {
+            "test_DummyEncoder_a": [1, 1, 1, 1, 1, 1, 1],
+            "test_DummyEncoder_b": [0, 1, 1, 0, 1, 1, 1],
+            "test_DummyEncoder_c": [0, 1, 0, 1, 1, 1, 1],
+            "test_DummyEncoder_other": [0, 0, 0, 1, 0, 0, 1],
+        }
+    )
+
+    assert check.equals(df)
+
+
 def test_box_cox():
     import numpy as np
     import pandas as pd

diff --git a/foreshadow/tests/test_transformers/test_smart.py b/foreshadow/tests/test_transformers/test_smart.py
@@ -199,6 +199,17 @@ def test_preprocessor_hashencoder_no_name_collision():
     assert len(set(output.columns)) == 60
 
 
+def test_smart_encoder_delimmited():
+    import numpy as np
+    import pandas as pd
+    from foreshadow.transformers.smart import Encoder
+    from foreshadow.transformers.internals import DummyEncoder
+
+    data = pd.DataFrame({"test": ["a", "a,b,c", "a,b", "a,c"]})
+    smart_coder = Encoder()
+    assert isinstance(smart_coder.fit(data), DummyEncoder)
+
+
 def test_smart_encoder_more_than_30_levels_with_overwritten_cutoff():
     import numpy as np
     from foreshadow.transformers.smart import Encoder

diff --git a/foreshadow/transformers/internals/dummyencoder.py b/foreshadow/transformers/internals/dummyencoder.py
@@ -0,0 +1,71 @@
+import numpy as np
+import pandas as pd
+from sklearn.base import TransformerMixin, BaseEstimator
+from sklearn.utils import check_array
+from sklearn.utils.validation import check_is_fitted
+
+
+class DummyEncoder(BaseEstimator, TransformerMixin):
+    """Dummy encodes delimited data within a column of a dataframe"""
+
+    def __init__(self, delimeter=",", other_cutoff=0.1, other_name="other"):
+        self.delimeter = delimeter
+        self.other_cutoff = other_cutoff
+        self.other_name = other_name
+
+    def fit(self, X, y=None):
+        """Determines dummy categories
+
+        Args:
+            X (:obj:`pandas.DataFrame`): Fit data; only the first column is used
+
+        Returns:
+            self
+
+        """
+        X = X.iloc[:, 0]
+        X = X.str.get_dummies(sep=self.delimeter)
+        self.other = (X.fillna(0).sum(axis=0) / X.count()) < self.other_cutoff
+
+        self.categories = [c for c in list(X) if not self.other[c]]
+        self.other = [c for c in list(X) if self.other[c]]
+        if len(self.other) > 0:
+            self.categories += [self.other_name]
+
+        return self
+
+    def transform(self, X, y=None):
+        """Performs Dummy Encoding on data
+
+        Args:
+            X (:obj:`pandas.DataFrame`): X data; only the first column is encoded
+
+        Returns:
+            :obj:`pandas.DataFrame`: Transformed data
+
+        """
+
+        check_is_fitted(self, ["categories"])
+
+        kwargs = {
+            k: X.applymap(separate(k, self.delimeter, self.other, self.other_name))
+            .iloc[:, 0]
+            .tolist()
+            for k in self.categories
+        }
+        df = pd.DataFrame(kwargs)
+
+        return df
+
+
+def separate(cat, delim, other, other_name):
+    def sep(X):
+        if cat == other_name:
+            if set(other) & set(X.split(delim)):
+                return 1
+            return 0
+        if cat in X.split(delim):
+            return 1
+        return 0
+
+    return sep
diff --git a/foreshadow/transformers/smart.py b/foreshadow/transformers/smart.py
@@ -8,10 +8,11 @@
 
 import numpy as np
 import scipy.stats as ss
+import pandas as pd
 from sklearn.pipeline import Pipeline
 
 from ..transformers.base import SmartTransformer
-from ..transformers.internals import BoxCox, FancyImputer
+from ..transformers.internals import BoxCox, FancyImputer, DummyEncoder
 from ..transformers.externals import (
     MinMaxScaler,
     StandardScaler,
@@ -52,17 +53,28 @@ class Encoder(SmartTransformer):
     """Automatically Encodes Categorical Features
 
     If there are less than 30 categories, then OneHotEncoder is used, if there are more
-    then HashingEncoder is used. If used in a y_var context, LabelEncoder is used.
+    then HashingEncoder is used. If the columns containing a delimiter exceed delim_cuttoff then a
+    DummyEncoder is used (set cutoff to -1 to force). If used in a y_var context, LabelEncoder is used.
 
     """
 
     def _get_transformer(self, X, y=None, unique_num_cutoff=30, **fit_params):
         data = X.iloc[:, 0]
         col_name = X.columns[0]
         unique_count = len(data.value_counts())
+
+        delimeters = [",", ";", "\t"]
+        delim_count = [
+            len(list(data.astype("str").str.get_dummies(sep=d))) for d in delimeters
+        ]
+        delim_diff = min(delim_count) - len(list(pd.get_dummies(data)))
+
         if self.y_var:
             return LabelEncoder()
-        if unique_count <= unique_num_cutoff:
+        elif delim_diff < 0:
+            delim = delimeters[delim_count.index(min(delim_count))]
+            return DummyEncoder(delimeter=delim)
+        elif unique_count <= unique_num_cutoff:
             return OneHotEncoder(
                 cols=[col_name],
                 return_df=True,