SmartText, HTMLRemover, related tests, and devops upgrades (#45)

* Fix pytest.ini file
* Update poetry.lock and pyproject.toml to include lxml
* Modified wrapper to accept sklearn vectorizers
* Add SmartText transformer
* Add an HTMLRemover transformer
* Add a ToString transformer
* Convert .ix to .iloc to address deprecation warnings
* Increase speed of param_optimize test
* Abstract column summary so that TextIntent and categorical intent can share it
* Add wrapper ability to handle sparse array output
* Temporarily suppress internal sklearn --> scipy warnings
georgian-io-archive · May 14, 2019 · bb35c7a · bb35c7a
1 parent 0abb2bc
commit bb35c7a
Show file tree

Hide file tree

Showing 19 changed files with 427 additions and 279 deletions.
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,4 @@
+[flake8]
+exclude = .git,__pycache__
+per-file-ignores =
+	foreshadow/transformers/externals.py:F401
diff --git a/.travis.yml b/.travis.yml
@@ -1,39 +1,34 @@
 language: python
-
 cache:
   pip: true
   directories:
     - "$HOME/.cache/pypoetry"
     - "$HOME/.cache/pre-commit"
-
 stages:
   - linting
   - test
-
-before_install:
-    # linux
-  - if [ "$TRAVIS_OS_NAME" == "linux" ]; then sudo apt update; fi
-  - if [ "$TRAVIS_OS_NAME" == "linux" ]; then sudo apt install swig3.0; fi
-  - if [ "$TRAVIS_OS_NAME" == "linux" ]; then sudo ln -s /usr/bin/swig3.0 /usr/bin/swig; fi
-  - pip install poetry
-install:
-  - poetry install -v -E dev
-script:
-  - poetry run tox
-after_success:
-  - poetry run coveralls
-
-env:
-  - FORESHADOW_TESTS="ALL"
-
 jobs:
   include:
-    - python: "3.6"
-
     - stage: linting
       python: "3.6"
       install:
         - pip install pre-commit
         - pre-commit install-hooks
       script:
         - pre-commit run --all-files
+    - stage: test
+      python: "3.6"
+      env:
+        - FORESHADOW_TESTS="ALL"
+      before_install:
+          # linux
+        - if [ "$TRAVIS_OS_NAME" == "linux" ]; then sudo apt update; fi
+        - if [ "$TRAVIS_OS_NAME" == "linux" ]; then sudo apt install swig3.0; fi
+        - if [ "$TRAVIS_OS_NAME" == "linux" ]; then sudo ln -s /usr/bin/swig3.0 /usr/bin/swig; fi
+        - pip install poetry
+      install:
+        - poetry install -v -E dev
+      script:
+        - poetry run tox
+after_success:
+  - poetry run coveralls
diff --git a/foreshadow/intents/__init__.py b/foreshadow/intents/__init__.py
@@ -7,6 +7,7 @@
     CategoricalIntent,
     GenericIntent,
     NumericIntent,
+    TextIntent,
 )
 from foreshadow.intents.registry import registry_eval
 from foreshadow.intents.subnumeric import FinancialIntent
@@ -21,4 +22,5 @@
     "NumericIntent",
     "registry_eval",
     "FinancialIntent",
+    "TextIntent",
 ]
diff --git a/foreshadow/intents/general.py b/foreshadow/intents/general.py
@@ -13,6 +13,7 @@
     MultiImputer,
     Scaler,
     SimpleImputer,
+    SmartText,
 )
 
 
@@ -49,6 +50,14 @@ def _outliers(s, count=10):
     return out_df.loc[out_df["selector"].nlargest(count).index].iloc[:, 0]
 
 
+def _standard_col_summary(df):
+    data = df.iloc[:, 0]
+    nan_num = int(data.isnull().sum())
+    mode, top10 = _mode_freq(data)
+
+    return OrderedDict([("nan", nan_num), ("mode", mode), ("top10", top10)])
+
+
 class GenericIntent(BaseIntent):
     """See base class.
 
@@ -57,7 +66,7 @@ class GenericIntent(BaseIntent):
 
     """
 
-    children = ["NumericIntent", "CategoricalIntent"]
+    children = ["TextIntent", "NumericIntent", "CategoricalIntent"]
     """Matches to CategoricalIntent over NumericIntent"""
 
     single_pipeline_template = []
@@ -103,7 +112,7 @@ class NumericIntent(GenericIntent):
     def is_intent(cls, df):
         """Returns true if data is numeric according to pandas."""
         return (
-            not pd.to_numeric(df.ix[:, 0], errors="coerce")
+            not pd.to_numeric(df.iloc[:, 0], errors="coerce")
             .isnull()
             .values.ravel()
             .all()
@@ -130,10 +139,10 @@ def column_summary(cls, df):
 
         """
 
-        data = df.ix[:, 0]
+        data = df.iloc[:, 0]
         nan_num = int(data.isnull().sum())
         invalid_num = int(
-            pd.to_numeric(df.ix[:, 0], errors="coerce").isnull().sum()
+            pd.to_numeric(df.iloc[:, 0], errors="coerce").isnull().sum()
             - nan_num
         )
         outliers = _outliers(data).values.tolist()
@@ -179,7 +188,7 @@ class CategoricalIntent(GenericIntent):
     @classmethod
     def is_intent(cls, df):
         """Returns true if the majority of data is categorical by uniqueness"""
-        data = df.ix[:, 0]
+        data = df.iloc[:, 0]
         if not np.issubdtype(data.dtype, np.number):
             return True
         else:
@@ -195,10 +204,43 @@ def column_summary(cls, df):
             top10: top 10 most frequent values or empty array if mostly
                 unique [(value, count),...,]
         """
-        data = df.ix[:, 0]
-        nan_num = int(data.isnull().sum())
-        mode, top10 = _mode_freq(data)
 
-        return OrderedDict(
-            [("nan", nan_num), ("mode", mode), ("top10", top10)]
-        )
+        return _standard_col_summary(df)
+
+
+class TextIntent(GenericIntent):
+    """See base class.
+
+    All features can be treated as text
+
+    """
+
+    children = []
+    """No children"""
+
+    single_pipeline_template = [
+        PipelineTemplateEntry("text", SmartText, False)
+    ]
+    """Encodes the column automatically"""
+
+    multi_pipeline_template = []
+    """No multi pipeline"""
+
+    @classmethod
+    def is_intent(cls, df):
+        """Every column can be interpreted as text"""
+
+        return True
+
+    @classmethod
+    def column_summary(cls, df):
+        """Returns standard computed statistics for a TextIntent column
+
+            The following are computed:
+                nan: count of NaNs passed into the dataset
+                mode: mode or np.nan if data is mostly unique
+                top10: top 10 most frequent values or empty array if mostly
+                    unique [(value, count),...,]
+        """
+
+        return _standard_col_summary(df)
diff --git a/foreshadow/tests/test_estimators/test_meta.py b/foreshadow/tests/test_estimators/test_meta.py
@@ -27,11 +27,13 @@ def test_metaestimator_predict():
 def test_metaestimator_predict_proba():
     import numpy as np
 
-    from sklearn.preprocessing import LabelEncoder
     from sklearn.linear_model import LogisticRegression
     from sklearn.model_selection import train_test_split
 
     from foreshadow.estimators import MetaEstimator
+    from foreshadow.transformers.internals import (
+        FixedLabelEncoder as LabelEncoder,
+    )
 
     np.random.seed(0)
 
@@ -51,11 +53,13 @@ def test_metaestimator_predict_proba():
 def test_metaestimator_score():
     import numpy as np
 
-    from sklearn.preprocessing import LabelEncoder
     from sklearn.linear_model import LogisticRegression
     from sklearn.model_selection import train_test_split
 
     from foreshadow.estimators import MetaEstimator
+    from foreshadow.transformers.internals import (
+        FixedLabelEncoder as LabelEncoder,
+    )
 
     np.random.seed(0)
 

diff --git a/foreshadow/tests/test_foreshadow.py b/foreshadow/tests/test_foreshadow.py
@@ -335,37 +335,52 @@ def test_foreshadow_predict_diff_cols():
     )
 
 
-def test_foreshadow_param_optimize_fit():
+@patch("foreshadow.preprocessor.Preprocessor")
+def test_foreshadow_param_optimize_fit(mock_p):
     import pandas as pd
-    from sklearn.base import BaseEstimator
+    from sklearn.base import BaseEstimator, TransformerMixin
     from sklearn.model_selection._search import BaseSearchCV
 
     from foreshadow import Foreshadow
-    from foreshadow.preprocessor import Preprocessor
 
     data = pd.read_csv("./foreshadow/tests/test_data/boston_housing.csv")
 
-    class DummyRegressor(BaseEstimator):
+    class DummyRegressor(BaseEstimator, TransformerMixin):
         def fit(self, X, y):
-            pass
+            return self
 
     class DummySearch(BaseSearchCV):
         def __init__(self, estimator, params):
             self.best_estimator_ = estimator
 
         def fit(self, X, y=None, **fit_params):
-            pass
+            return self
+
+    class DummyPreprocessor(BaseEstimator, TransformerMixin):
+        def fit(self, X, y):
+            return self
 
-    fs = Foreshadow(Preprocessor(), False, DummyRegressor(), DummySearch)
+    mock_p.return_value = DummyPreprocessor()
 
+    fs = Foreshadow(estimator=DummyRegressor(), optimizer=DummySearch)
     x = data.drop(["medv"], axis=1, inplace=False)
     y = data[["medv"]]
 
     fs.fit(x, y)
-    assert isinstance(fs.pipeline.steps[1][1], DummyRegressor)
+    assert isinstance(fs.pipeline.steps[-1][1].estimator, DummyRegressor)
+
+    fs2 = Foreshadow(
+        X_preprocessor=False,
+        y_preprocessor=False,
+        estimator=DummyRegressor(),
+        optimizer=DummySearch,
+    )
+
+    fs2.fit(x, y)
+    assert isinstance(fs2.pipeline.steps[-1][1], DummyRegressor)
 
 
-def test_foreshadow_param_optimize():
+def test_foreshadow_param_optimize():  # TODO: Make this test faster
     import pickle
     import json
 

diff --git a/foreshadow/tests/test_intents/test_general.py b/foreshadow/tests/test_intents/test_general.py
@@ -158,10 +158,10 @@ def test_categorical_intent_is_intent_string():
     assert CategoricalIntent.is_intent(X)
 
 
-def test_categorical_intent_column_summary():
+def test_standard_intent_column_summary():
     import numpy as np
     import pandas as pd
-    from foreshadow.intents import CategoricalIntent
+    from foreshadow.intents.general import _standard_col_summary
 
     X = pd.DataFrame(["test"] * 5 + ["hi"] * 10 + [np.nan] * 5)
     expected_dict = {
@@ -170,4 +170,15 @@ def test_categorical_intent_column_summary():
         "top10": [["hi", 10, 0.5], ["test", 5, 0.25]],
     }
 
-    assert CategoricalIntent.column_summary(X) == expected_dict
+    assert _standard_col_summary(X) == expected_dict
+
+
+def test_standard_intent_column_summary_calls():
+    import numpy as np
+    import pandas as pd
+    from foreshadow.intents import CategoricalIntent, TextIntent
+
+    X = pd.DataFrame(["test"] * 5 + ["hi"] * 10 + [np.nan] * 5)
+
+    CategoricalIntent.column_summary(X)
+    TextIntent.column_summary(X)
diff --git a/foreshadow/tests/test_transformers/test_internal.py b/foreshadow/tests/test_transformers/test_internal.py
@@ -337,3 +337,47 @@ def test_uncommon_remover_strings():
     assert np.array_equal(
         pd.unique(set_replacement.values.ravel()), np.array(["D", "E"])
     )
+
+
+def test_html_remover_basic():
+    import numpy as np
+    import pandas as pd
+    from foreshadow.transformers.internals import HTMLRemover
+
+    df = pd.DataFrame(
+        ["<h1>Header<h1/>", "Normal Text", "<br/><br/>More text"]
+    )
+    df_assert = pd.DataFrame(["Header", "Normal Text", "More text"])
+
+    hr = HTMLRemover()
+
+    assert np.array_equal(
+        hr.fit_transform(df).values.ravel(), df_assert.values.ravel()
+    )
+
+
+def test_html_remover_is_html():
+    from foreshadow.transformers.internals import HTMLRemover
+
+    html = "<b>Real Tag</b> Test"
+    not_html = "<not tag>"
+
+    assert HTMLRemover.is_html(html)
+    assert not HTMLRemover.is_html(not_html)
+
+
+def test_to_string_tf():
+    import numpy as np
+    import pandas as pd
+    from foreshadow.transformers.internals import ToString
+
+    data = [0, 1, 2, 3, np.nan]
+    arr = np.array(data)
+    df = pd.DataFrame(data)
+
+    expected = ["0.0", "1.0", "2.0", "3.0", "nan"]
+
+    ts = ToString()
+
+    assert expected == ts.transform(arr).values.ravel().tolist()
+    assert expected == ts.transform(df).values.ravel().tolist()
diff --git a/foreshadow/tests/test_transformers/test_smart.py b/foreshadow/tests/test_transformers/test_smart.py
@@ -94,7 +94,9 @@ def test_smart_encoder_y_var():
     import pandas as pd
 
     from foreshadow.transformers.smart import Encoder
-    from foreshadow.transformers.externals import LabelEncoder
+    from foreshadow.transformers.internals import (
+        FixedLabelEncoder as LabelEncoder,
+    )
 
     y_df = pd.DataFrame({"A": np.array([1, 2, 10] * 3)})
     smart_coder = Encoder(y_var=True)
@@ -290,3 +292,25 @@ def test_smart_financial_cleaner_eu():
     out = FinancialCleaner().fit_transform(x).values
 
     assert np.all((out == expected) | (pd.isnull(out) == pd.isnull(expected)))
+
+
+def test_smart_text():
+    import numpy as np
+    import pandas as pd
+
+    from foreshadow.transformers.smart import SmartText
+    from foreshadow.transformers.externals import TfidfVectorizer
+    from foreshadow.transformers.internals import HTMLRemover
+
+    X1 = pd.DataFrame(["abc", "def", "1321", "tester"])
+    tf1 = SmartText().fit(X1)
+
+    assert isinstance(tf1, TfidfVectorizer)
+
+    X2 = pd.DataFrame(["<p> Hello </p>", "World", "<h1> Tag </h1>"])
+    tf2 = SmartText().fit(X2)
+
+    assert any(isinstance(tf, HTMLRemover) for n, tf in tf2.steps)
+    assert isinstance(tf2.steps[-1][1], TfidfVectorizer)
+
+    assert SmartText().fit(pd.DataFrame([1, 2, 3, np.nan]))