Skip to content
This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

Commit

Permalink
SmartText, HTMLRemover, related tests, and devops upgrades (#45)
Browse files Browse the repository at this point in the history
* Fix pytest.ini file
* Update poetry.lock and pyproject.toml to include lxml
* Modified wrapper to accept sklearn vectorizers
* Add SmartText transformer
* Add an HTMLRemover transformer
* Add a ToString transformer
* Convert .ix to .iloc to address deprecation warnings
* Increase speed of param_optimize test
* Abstract the column summary so that TextIntent and CategoricalIntent can
  share it
* Add wrapper ability to handle sparse array output
* Temporarily suppress internal sklearn --> scipy warnings
  • Loading branch information
adithyabsk committed May 14, 2019
1 parent 0abb2bc commit bb35c7a
Show file tree
Hide file tree
Showing 19 changed files with 427 additions and 279 deletions.
4 changes: 4 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[flake8]
exclude = .git,__pycache__
per-file-ignores =
foreshadow/transformers/externals.py:F401
37 changes: 16 additions & 21 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,39 +1,34 @@
language: python

cache:
pip: true
directories:
- "$HOME/.cache/pypoetry"
- "$HOME/.cache/pre-commit"

stages:
- linting
- test

before_install:
# linux
- if [ "$TRAVIS_OS_NAME" == "linux" ]; then sudo apt update; fi
- if [ "$TRAVIS_OS_NAME" == "linux" ]; then sudo apt install swig3.0; fi
- if [ "$TRAVIS_OS_NAME" == "linux" ]; then sudo ln -s /usr/bin/swig3.0 /usr/bin/swig; fi
- pip install poetry
install:
- poetry install -v -E dev
script:
- poetry run tox
after_success:
- poetry run coveralls

env:
- FORESHADOW_TESTS="ALL"

jobs:
include:
- python: "3.6"

- stage: linting
python: "3.6"
install:
- pip install pre-commit
- pre-commit install-hooks
script:
- pre-commit run --all-files
- stage: test
python: "3.6"
env:
- FORESHADOW_TESTS="ALL"
before_install:
# linux
- if [ "$TRAVIS_OS_NAME" == "linux" ]; then sudo apt update; fi
- if [ "$TRAVIS_OS_NAME" == "linux" ]; then sudo apt install swig3.0; fi
- if [ "$TRAVIS_OS_NAME" == "linux" ]; then sudo ln -s /usr/bin/swig3.0 /usr/bin/swig; fi
- pip install poetry
install:
- poetry install -v -E dev
script:
- poetry run tox
after_success:
- poetry run coveralls
2 changes: 2 additions & 0 deletions foreshadow/intents/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
CategoricalIntent,
GenericIntent,
NumericIntent,
TextIntent,
)
from foreshadow.intents.registry import registry_eval
from foreshadow.intents.subnumeric import FinancialIntent
Expand All @@ -21,4 +22,5 @@
"NumericIntent",
"registry_eval",
"FinancialIntent",
"TextIntent",
]
64 changes: 53 additions & 11 deletions foreshadow/intents/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
MultiImputer,
Scaler,
SimpleImputer,
SmartText,
)


Expand Down Expand Up @@ -49,6 +50,14 @@ def _outliers(s, count=10):
return out_df.loc[out_df["selector"].nlargest(count).index].iloc[:, 0]


def _standard_col_summary(df):
    """Build the shared summary statistics for the first column of ``df``.

    Args:
        df: DataFrame whose first column (selected by position) is
            summarized.

    Returns:
        OrderedDict with, in order, the keys ``nan`` (number of null
        entries, as an int), ``mode`` and ``top10`` (both taken from
        ``_mode_freq``).
    """
    column = df.iloc[:, 0]
    null_count = int(column.isnull().sum())
    # _mode_freq is a helper defined outside this block; it is unpacked
    # here as a (mode, top10) pair.
    most_common, top_values = _mode_freq(column)

    summary = OrderedDict()
    summary["nan"] = null_count
    summary["mode"] = most_common
    summary["top10"] = top_values
    return summary


class GenericIntent(BaseIntent):
"""See base class.
Expand All @@ -57,7 +66,7 @@ class GenericIntent(BaseIntent):
"""

children = ["NumericIntent", "CategoricalIntent"]
children = ["TextIntent", "NumericIntent", "CategoricalIntent"]
"""Matches to CategoricalIntent over NumericIntent"""

single_pipeline_template = []
Expand Down Expand Up @@ -103,7 +112,7 @@ class NumericIntent(GenericIntent):
def is_intent(cls, df):
"""Returns true if data is numeric according to pandas."""
return (
not pd.to_numeric(df.ix[:, 0], errors="coerce")
not pd.to_numeric(df.iloc[:, 0], errors="coerce")
.isnull()
.values.ravel()
.all()
Expand All @@ -130,10 +139,10 @@ def column_summary(cls, df):
"""

data = df.ix[:, 0]
data = df.iloc[:, 0]
nan_num = int(data.isnull().sum())
invalid_num = int(
pd.to_numeric(df.ix[:, 0], errors="coerce").isnull().sum()
pd.to_numeric(df.iloc[:, 0], errors="coerce").isnull().sum()
- nan_num
)
outliers = _outliers(data).values.tolist()
Expand Down Expand Up @@ -179,7 +188,7 @@ class CategoricalIntent(GenericIntent):
@classmethod
def is_intent(cls, df):
"""Returns true if the majority of data is categorical by uniqueness"""
data = df.ix[:, 0]
data = df.iloc[:, 0]
if not np.issubdtype(data.dtype, np.number):
return True
else:
Expand All @@ -195,10 +204,43 @@ def column_summary(cls, df):
top10: top 10 most frequent values or empty array if mostly
unique [(value, count),...,]
"""
data = df.ix[:, 0]
nan_num = int(data.isnull().sum())
mode, top10 = _mode_freq(data)

return OrderedDict(
[("nan", nan_num), ("mode", mode), ("top10", top10)]
)
return _standard_col_summary(df)


class TextIntent(GenericIntent):
    """See base class.

    Catch-all intent: any feature can be treated as text.
    """

    children = []
    """This intent has no child intents"""

    single_pipeline_template = [
        PipelineTemplateEntry("text", SmartText, False)
    ]
    """Single pipeline built around SmartText to encode the column"""

    multi_pipeline_template = []
    """No multi pipeline is defined for this intent"""

    @classmethod
    def is_intent(cls, df):
        """Always returns True: every column can be interpreted as text."""
        return True

    @classmethod
    def column_summary(cls, df):
        """Returns the standard computed statistics for a TextIntent column.

        Delegates to ``_standard_col_summary``, which reports:
            nan: count of nans passed into the dataset
            mode: mode or np.nan if data is mostly unique
            top10: top 10 most frequent values or empty array if mostly
                unique [(value, count),...,]
        """
        return _standard_col_summary(df)
8 changes: 6 additions & 2 deletions foreshadow/tests/test_estimators/test_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,13 @@ def test_metaestimator_predict():
def test_metaestimator_predict_proba():
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from foreshadow.estimators import MetaEstimator
from foreshadow.transformers.internals import (
FixedLabelEncoder as LabelEncoder,
)

np.random.seed(0)

Expand All @@ -51,11 +53,13 @@ def test_metaestimator_predict_proba():
def test_metaestimator_score():
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from foreshadow.estimators import MetaEstimator
from foreshadow.transformers.internals import (
FixedLabelEncoder as LabelEncoder,
)

np.random.seed(0)

Expand Down
33 changes: 24 additions & 9 deletions foreshadow/tests/test_foreshadow.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,37 +335,52 @@ def test_foreshadow_predict_diff_cols():
)


def test_foreshadow_param_optimize_fit():
@patch("foreshadow.preprocessor.Preprocessor")
def test_foreshadow_param_optimize_fit(mock_p):
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection._search import BaseSearchCV

from foreshadow import Foreshadow
from foreshadow.preprocessor import Preprocessor

data = pd.read_csv("./foreshadow/tests/test_data/boston_housing.csv")

class DummyRegressor(BaseEstimator):
class DummyRegressor(BaseEstimator, TransformerMixin):
def fit(self, X, y):
pass
return self

class DummySearch(BaseSearchCV):
def __init__(self, estimator, params):
self.best_estimator_ = estimator

def fit(self, X, y=None, **fit_params):
pass
return self

class DummyPreprocessor(BaseEstimator, TransformerMixin):
def fit(self, X, y):
return self

fs = Foreshadow(Preprocessor(), False, DummyRegressor(), DummySearch)
mock_p.return_value = DummyPreprocessor()

fs = Foreshadow(estimator=DummyRegressor(), optimizer=DummySearch)
x = data.drop(["medv"], axis=1, inplace=False)
y = data[["medv"]]

fs.fit(x, y)
assert isinstance(fs.pipeline.steps[1][1], DummyRegressor)
assert isinstance(fs.pipeline.steps[-1][1].estimator, DummyRegressor)

fs2 = Foreshadow(
X_preprocessor=False,
y_preprocessor=False,
estimator=DummyRegressor(),
optimizer=DummySearch,
)

fs2.fit(x, y)
assert isinstance(fs2.pipeline.steps[-1][1], DummyRegressor)


def test_foreshadow_param_optimize():
def test_foreshadow_param_optimize(): # TODO: Make this test faster
import pickle
import json

Expand Down
17 changes: 14 additions & 3 deletions foreshadow/tests/test_intents/test_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,10 +158,10 @@ def test_categorical_intent_is_intent_string():
assert CategoricalIntent.is_intent(X)


def test_categorical_intent_column_summary():
def test_standard_intent_column_summary():
import numpy as np
import pandas as pd
from foreshadow.intents import CategoricalIntent
from foreshadow.intents.general import _standard_col_summary

X = pd.DataFrame(["test"] * 5 + ["hi"] * 10 + [np.nan] * 5)
expected_dict = {
Expand All @@ -170,4 +170,15 @@ def test_categorical_intent_column_summary():
"top10": [["hi", 10, 0.5], ["test", 5, 0.25]],
}

assert CategoricalIntent.column_summary(X) == expected_dict
assert _standard_col_summary(X) == expected_dict


def test_standard_intent_column_summary_calls():
    """Smoke test: column_summary runs for both intents on a string column."""
    import numpy as np
    import pandas as pd
    from foreshadow.intents import CategoricalIntent, TextIntent

    values = ["test"] * 5 + ["hi"] * 10 + [np.nan] * 5
    frame = pd.DataFrame(values)

    # Only checks that the calls complete; return values are not inspected.
    for intent in (CategoricalIntent, TextIntent):
        intent.column_summary(frame)
44 changes: 44 additions & 0 deletions foreshadow/tests/test_transformers/test_internal.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,3 +337,47 @@ def test_uncommon_remover_strings():
assert np.array_equal(
pd.unique(set_replacement.values.ravel()), np.array(["D", "E"])
)


def test_html_remover_basic():
    """HTMLRemover.fit_transform is expected to strip tags from each cell."""
    import numpy as np
    import pandas as pd
    from foreshadow.transformers.internals import HTMLRemover

    raw = pd.DataFrame(
        ["<h1>Header<h1/>", "Normal Text", "<br/><br/>More text"]
    )
    expected = pd.DataFrame(["Header", "Normal Text", "More text"])

    remover = HTMLRemover()
    cleaned = remover.fit_transform(raw)

    # Compare flattened values so only the cell contents matter.
    assert np.array_equal(cleaned.values.ravel(), expected.values.ravel())


def test_html_remover_is_html():
    """is_html should accept a real tag and reject a bracketed non-tag."""
    from foreshadow.transformers.internals import HTMLRemover

    with_real_tag = "<b>Real Tag</b> Test"
    without_real_tag = "<not tag>"

    detected = HTMLRemover.is_html(with_real_tag)
    assert detected

    rejected = HTMLRemover.is_html(without_real_tag)
    assert not rejected


def test_to_string_tf():
    """ToString.transform should stringify ndarray and DataFrame input."""
    import numpy as np
    import pandas as pd
    from foreshadow.transformers.internals import ToString

    values = [0, 1, 2, 3, np.nan]
    as_array = np.array(values)
    as_frame = pd.DataFrame(values)

    # The nan in the input makes the data float, hence the "0.0"-style text.
    expected = ["0.0", "1.0", "2.0", "3.0", "nan"]

    transformer = ToString()

    for container in (as_array, as_frame):
        result = transformer.transform(container)
        assert expected == result.values.ravel().tolist()
26 changes: 25 additions & 1 deletion foreshadow/tests/test_transformers/test_smart.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,9 @@ def test_smart_encoder_y_var():
import pandas as pd

from foreshadow.transformers.smart import Encoder
from foreshadow.transformers.externals import LabelEncoder
from foreshadow.transformers.internals import (
FixedLabelEncoder as LabelEncoder,
)

y_df = pd.DataFrame({"A": np.array([1, 2, 10] * 3)})
smart_coder = Encoder(y_var=True)
Expand Down Expand Up @@ -290,3 +292,25 @@ def test_smart_financial_cleaner_eu():
out = FinancialCleaner().fit_transform(x).values

assert np.all((out == expected) | (pd.isnull(out) == pd.isnull(expected)))


def test_smart_text():
    """SmartText.fit is expected to resolve to a TfidfVectorizer-based step.

    Plain text should yield a TfidfVectorizer directly; text containing HTML
    should yield a pipeline with an HTMLRemover step that ends in a
    TfidfVectorizer; numeric input should still fit to something truthy.
    """
    import numpy as np
    import pandas as pd

    from foreshadow.transformers.smart import SmartText
    from foreshadow.transformers.externals import TfidfVectorizer
    from foreshadow.transformers.internals import HTMLRemover

    plain = pd.DataFrame(["abc", "def", "1321", "tester"])
    plain_tf = SmartText().fit(plain)

    assert isinstance(plain_tf, TfidfVectorizer)

    markup = pd.DataFrame(["<p> Hello </p>", "World", "<h1> Tag </h1>"])
    markup_tf = SmartText().fit(markup)

    has_remover = any(
        isinstance(step, HTMLRemover) for _, step in markup_tf.steps
    )
    assert has_remover
    final_step = markup_tf.steps[-1][1]
    assert isinstance(final_step, TfidfVectorizer)

    assert SmartText().fit(pd.DataFrame([1, 2, 3, np.nan]))

0 comments on commit bb35c7a

Please sign in to comment.