Skip to content
This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

Commit

Permalink
Integrate autointent (#176)
Browse files Browse the repository at this point in the history
* Cleaned version with NoTransform Neither processor

Disable TFIDF and use NoTransform for NeitherType

Integrating updated auto intent resolving
  • Loading branch information
jzhang-gp committed Nov 21, 2019
1 parent 2a77c84 commit 7b0d201
Show file tree
Hide file tree
Showing 40 changed files with 2,843 additions and 33 deletions.
6 changes: 6 additions & 0 deletions foreshadow/concrete/internals/cleaners/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from foreshadow.base import BaseEstimator, TransformerMixin
from foreshadow.exceptions import InvalidDataFrame
from foreshadow.logging import logging
from foreshadow.metrics import MetricWrapper, avg_col_regex, regex_rows
from foreshadow.utils import check_df

Expand Down Expand Up @@ -86,12 +87,15 @@ def metric_score(self, X):
float: confidence value.
"""
# TODO can we also do a sampling here?
logging.info("Calculating scores....")
scores = []
for metric_wrapper, weight in self.confidence_computation.items():
scores.append(
metric_wrapper.calculate(X, cleaner=self.transform_row)
* weight
)
logging.info("End calculating scores...")
return sum(scores)

def transform_row(self, row_of_feature, return_tuple=True):
Expand Down Expand Up @@ -185,7 +189,9 @@ def transform(self, X, y=None):
# over each row for a given column on my own, which requires me to
# leave

logging.info("Starting cleaning rows...")
out = X[X.columns[0]].apply(self.transform_row, return_tuple=False)
logging.info("Ending cleaning rows...")
# access single column as series and apply the list of
# transformations to each row in the series.
if any(
Expand Down
1 change: 1 addition & 0 deletions foreshadow/concrete/internals/cleaners/drop.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def drop_transform(text):
Otherwise: None, original text.
"""
# TODO may want to optimize, no need for regex.
regex = "^$"
text = str(text)
res = re.search(regex, text)
Expand Down
1 change: 1 addition & 0 deletions foreshadow/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
# "Numeric": {"Preprocessor": ["Imputer", "Scaler"]},
"Categorical": {"Preprocessor": ["CategoricalEncoder"]},
"Text": {"Preprocessor": ["TextEncoder"]},
"Neither": {"Preprocessor": ["NeitherProcessor"]},
}


Expand Down
10 changes: 9 additions & 1 deletion foreshadow/intents/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,16 @@
from .base import BaseIntent
from .categorical import Categorical
from .intent_type import IntentType
from .neither import Neither
from .numeric import Numeric
from .text import Text


__all__ = ["Categorical", "Numeric", "Text", "BaseIntent", "IntentType"]
__all__ = [
"Categorical",
"Numeric",
"Text",
"BaseIntent",
"Neither",
"IntentType",
]
59 changes: 59 additions & 0 deletions foreshadow/intents/neither.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""Neither Numerical Nor Categorical."""

from foreshadow.metrics import (
MetricWrapper,
has_long_text,
is_numeric,
is_string,
num_valid,
unique_heur,
)
from foreshadow.utils import standard_col_summary

from .base import BaseIntent


class Neither(BaseIntent):
    """Intent for columns that are neither numerical nor categorical.

    For the time being this intent behaves like the Text intent: the
    column is simply cast to strings.

    """

    # Each metric contributes an equal 0.2 share of the confidence score.
    confidence_computation = {
        MetricWrapper(num_valid): 0.2,
        MetricWrapper(unique_heur): 0.2,
        MetricWrapper(is_numeric, invert=True): 0.2,
        MetricWrapper(is_string): 0.2,
        MetricWrapper(has_long_text): 0.2,
    }

    def fit(self, X, y=None, **fit_params):
        """Do nothing; this intent has no state to learn.

        Args:
            X: The input data
            y: The response variable
            **fit_params: Additional parameters for the fit

        Returns:
            self

        """
        return self

    def transform(self, X, y=None):
        """Cast the input to its string representation.

        Args:
            X: The input data
            y: The response variable

        Returns:
            The input with every value converted to ``str``.

        """
        as_text = X.astype(str)
        return as_text

    @classmethod
    def column_summary(cls, df):  # noqa
        summary = standard_col_summary(df)
        return summary
4 changes: 3 additions & 1 deletion foreshadow/smart/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
CategoricalEncoder,
FinancialCleaner,
MultiImputer,
NeitherProcessor,
Scaler,
SimpleImputer,
TextEncoder,
Expand All @@ -13,7 +14,7 @@
from foreshadow.smart.feature_reducer import FeatureReducer
from foreshadow.smart.feature_summarizer import FeatureSummarizer # noqa: F401
from foreshadow.smart.flatten import Flatten # noqa: F401
from foreshadow.smart.intentresolver import IntentResolver
from foreshadow.smart.intent_resolving import IntentResolver
from foreshadow.smart.smart import SmartTransformer # noqa: F401


Expand All @@ -24,6 +25,7 @@
"Scaler",
"SimpleImputer",
"TextEncoder",
"NeitherProcessor",
"Flatten",
"Cleaner",
"IntentResolver",
Expand Down
109 changes: 108 additions & 1 deletion foreshadow/smart/all.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import pandas as pd
import scipy.stats as ss

from foreshadow.concrete import Imputer
from foreshadow.concrete import Imputer, NoTransform
from foreshadow.concrete.externals import (
HashingEncoder,
MinMaxScaler,
Expand All @@ -32,6 +32,7 @@
ToString,
UncommonRemover,
)
from foreshadow.logging import logging
from foreshadow.pipeline import SerializablePipeline
from foreshadow.utils import check_df

Expand Down Expand Up @@ -371,3 +372,109 @@ def pick_transformer(self, X, y=None, **fit_params):
return tfidf
else:
return SerializablePipeline(steps)


class NeitherProcessor(SmartTransformer):
    """A temporary processor for columns resolved to the Neither intent.

    A TF-IDF based text transformer is assembled first; if it cannot be
    fitted on a sample of the data, NoTransform is used instead.

    Args:
        html_cutoff (float): Ratio of rows flagged by
            ``HTMLRemover.is_html`` that must be exceeded before an
            ``HTMLRemover`` step is added.
        **kwargs: Forwarded to ``SmartTransformer.__init__``.

    """

    def __init__(self, html_cutoff=0.4, **kwargs):
        self.html_cutoff = html_cutoff

        super().__init__(**kwargs)

    def pick_transformer(self, X, y=None, **fit_params):
        """Determine the appropriate preprocessing method for Neither intent.

        Args:
            X (:obj:`pandas.DataFrame`): Input X data
            y (:obj: 'pandas.DataFrame'): labels Y for data
            **fit_params (dict): Parameters to apply to transformers when
                fitting

        Returns:
            The TF-IDF based transformer if it can be fitted on a sample of
            X, otherwise a NoTransform.

        """
        return self._pick_transformer(X, y, **fit_params)

    def _pick_transformer(self, X, y=None, **fit_params):
        """Assemble the text transformer and check that it can be fitted.

        Args:
            X (:obj:`pandas.DataFrame`): Input X data
            y (:obj: 'pandas.DataFrame'): labels Y for data
            **fit_params (dict): Parameters to apply to transformers when
                fitting

        Returns:
            The result of ``_can_fit``: either the assembled transformer
            (already fitted on a sample) or a NoTransform.

        """
        # Only the first column of X is inspected.
        data = X.iloc[:, 0]

        steps = []

        # Add a string conversion step unless every value is already a str.
        if (data.dtype.type is not np.str_) and not all(
            isinstance(i, str) for i in data
        ):
            steps.append(("num", ToString()))

        html_ratio = (
            data.astype("str").apply(HTMLRemover.is_html).sum()
        ) / len(data)
        if html_ratio > self.html_cutoff:
            steps.append(("hr", HTMLRemover()))

        # TODO: find heuristic for finding optimal values for values
        tfidf = TfidfVectorizer(
            decode_error="replace",
            strip_accents="unicode",
            stop_words="english",
            ngram_range=(1, 2),
            max_df=0.9,
            min_df=0.05,
            max_features=None,
            sublinear_tf=True,
        )
        steps.append(("tfidf", tfidf))

        # A lone TF-IDF step does not need to be wrapped in a pipeline.
        if len(steps) == 1:
            transformer = tfidf
        else:
            transformer = SerializablePipeline(steps)

        return self._can_fit(transformer, X)

    def _can_fit(self, transformer, X, y=None, sampling_threshold=0.1):
        """Check if the TFIDF can be fitted on the sampled data.

        If not, it will default back to NoTransform.

        TODO: At this moment TFIDF is broken so it always default back to
        NoTransform.

        Args:
            transformer: selected transformer with TFIDF vectorizor
            X: the data frame
            y: the y variable data frame (currently unused)
            sampling_threshold: the threshold of the sampling

        Returns:
            Either the original transformer or the NoTransform

        """
        if len(X) * sampling_threshold < 30:
            # the rule of 30 to be statistically significant
            sampling_threshold = 1
        sample_df = X.sample(
            frac=sampling_threshold, replace=True, random_state=1
        )
        try:
            transformer.fit(sample_df)
        except Exception as e:
            # TODO change to ValueError once TFIDF is fixed.
            # Record why the fit failed instead of discarding the error, so
            # the fallback below can be diagnosed from the logs.
            logging.warning("Error during fit: {}".format(str(e)))
            logging.warning(
                "Revert to NoTransform for Neither " "Type temporarily."
            )
            return NoTransform()
        return transformer
3 changes: 3 additions & 0 deletions foreshadow/smart/cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,11 @@ def pick_transformer(self, X, y=None, **fit_params):
Best data cleaning transformer.
"""
# TODO do we want to parallize this step?
cleaners = config.get_cleaners(cleaners=True)
best_score = 0
best_cleaner = None
logging.info("Picking cleaners...")
for cleaner in cleaners:
cleaner = cleaner()
score = cleaner.metric_score(X)
Expand All @@ -43,6 +45,7 @@ def pick_transformer(self, X, y=None, **fit_params):
best_cleaner = cleaner
if best_cleaner is None:
return NoTransform()
logging.info("Picked...")
return best_cleaner

def should_force_reresolve_based_on_override(self, X):
Expand Down
5 changes: 5 additions & 0 deletions foreshadow/smart/intent_resolving/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""Intent resolver definition."""
from foreshadow.smart.intent_resolving.intentresolver import IntentResolver


__all__ = ["IntentResolver"]
47 changes: 47 additions & 0 deletions foreshadow/smart/intent_resolving/core/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# automl_research
Code repository for AutoML research to support Foreshadow project

---

## Feature Type Inference (Intent Resolution)
When analyzing raw data set feature columns in `Foreshadow`, the type (intent) of each feature column has to be known a priori to select the appropriate feature transformation downstream.

The goal of this research project is to build an intent resolver that can separate numerical and categorical raw feature columns. More classes can be added in the future.

### Installation
This library was developed on Python 3.6.8 and uses the same package dependencies as `Foreshadow` as of Oct. 17, 2019.

To install additional package dependencies for research-based functionalities, run the following:
```
pip install -r research_requirements.txt
```

### Usage
The functionality of this library is exposed through the `IntentResolver` class API as shown below. The class outputs a prediction of "Numerical", "Categorical" or "Neither" for each raw feature column. Predictions with confidences lower than the `threshold` parameter (default = 0.7) in the `.predict` method are set to "Neither".

```
import pandas as pd
from lib import IntentResolver
# Initialise object
raw = pd.read_csv('path_to_dataset.csv', encoding='latin', low_memory=False)
resolver = IntentResolver(raw)
# Predict intent
# Outputs a pd.Series of predicted intents
resolver.predict()
# OR: Predict intent with confidences at a lower threshold (i.e. less rigorous prediction)
# Outputs a pd.DataFrame of predicted intent and confidences
resolver.predict(threshold=0.6, return_conf=True)
```


### Data Sources
- [Original Meta Data Set (OMDS)](https://github.com/pvn25/ML-Data-Prep-Zoo/tree/master/ML%20Schema%20Inference/Data)
- [360 Raw Data Sets (RDSs)](https://drive.google.com/file/d/1HGmDRBSZg-Olym2envycHPkb3uwVWHJX/view) (Sourced from the [GitHub README.md](https://github.com/pvn25/ML-Data-Prep-Zoo/tree/master/ML%20Schema%20Inference))


### References
1. V. Shah, P. Kumar, K. Yang, and A. Kumar, “Towards semi-automatic ML feature type inference.”
2. N. Hynes, D. Sculley, and M. Terry, “The data linter: Lightweight, automated sanity checking for ML data sets,” in NIPS MLSys Workshop, 2017.
8 changes: 8 additions & 0 deletions foreshadow/smart/intent_resolving/core/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
"""Module containing the core IntentResolver logic to be used in production."""
from . import heuristics, io
from .data_set_parsers import DataFrameDataSetParser
from .intent_resolver import IntentResolver
from .secondary_featurizers import (
FeaturizerCurator,
factory as featurizer_factory,
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""Module containing data set parser class definitions."""
from .base_data_set_parser import DataSetParser
from .dataframe_data_set_parser import DataFrameDataSetParser

0 comments on commit 7b0d201

Please sign in to comment.