Integrate autointent (#176)

* Cleaned version with NoTransform Neither processor. Disable TFIDF and use NoTransform for NeitherType. Integrating updated auto intent resolving.
georgian-io-archive · Nov 21, 2019 · 7b0d201 · 7b0d201
1 parent 2a77c84
commit 7b0d201
Show file tree

Hide file tree

Showing 40 changed files with 2,843 additions and 33 deletions.
diff --git a/foreshadow/concrete/internals/cleaners/base.py b/foreshadow/concrete/internals/cleaners/base.py
@@ -6,6 +6,7 @@
 
 from foreshadow.base import BaseEstimator, TransformerMixin
 from foreshadow.exceptions import InvalidDataFrame
+from foreshadow.logging import logging
 from foreshadow.metrics import MetricWrapper, avg_col_regex, regex_rows
 from foreshadow.utils import check_df
 
@@ -86,12 +87,15 @@ def metric_score(self, X):
             float: confidence value.
 
         """
+        # TODO can we also do a sampling here?
+        logging.info("Calculating scores....")
         scores = []
         for metric_wrapper, weight in self.confidence_computation.items():
             scores.append(
                 metric_wrapper.calculate(X, cleaner=self.transform_row)
                 * weight
             )
+        logging.info("End calculating scores...")
         return sum(scores)
 
     def transform_row(self, row_of_feature, return_tuple=True):
@@ -185,7 +189,9 @@ def transform(self, X, y=None):
         # over each row for a given column on my own, which requires me to
         # leave
 
+        logging.info("Starting cleaning rows...")
         out = X[X.columns[0]].apply(self.transform_row, return_tuple=False)
+        logging.info("Ending cleaning rows...")
         # access single column as series and apply the list of
         # transformations to each row in the series.
         if any(

diff --git a/foreshadow/concrete/internals/cleaners/drop.py b/foreshadow/concrete/internals/cleaners/drop.py
@@ -19,6 +19,7 @@ def drop_transform(text):
         Otherwise: None, original text.
 
     """
+    # TODO may want to optimize, no need for regex.
     regex = "^$"
     text = str(text)
     res = re.search(regex, text)

diff --git a/foreshadow/config.py b/foreshadow/config.py
@@ -25,6 +25,7 @@
     # "Numeric": {"Preprocessor": ["Imputer", "Scaler"]},
     "Categorical": {"Preprocessor": ["CategoricalEncoder"]},
     "Text": {"Preprocessor": ["TextEncoder"]},
+    "Neither": {"Preprocessor": ["NeitherProcessor"]},
 }
 
 

diff --git a/foreshadow/intents/__init__.py b/foreshadow/intents/__init__.py
@@ -2,8 +2,16 @@
 from .base import BaseIntent
 from .categorical import Categorical
 from .intent_type import IntentType
+from .neither import Neither
 from .numeric import Numeric
 from .text import Text
 
 
-__all__ = ["Categorical", "Numeric", "Text", "BaseIntent", "IntentType"]
+__all__ = [
+    "Categorical",
+    "Numeric",
+    "Text",
+    "BaseIntent",
+    "Neither",
+    "IntentType",
+]
diff --git a/foreshadow/intents/neither.py b/foreshadow/intents/neither.py
@@ -0,0 +1,59 @@
+"""Neither Numerical Nor Categorical."""
+
+from foreshadow.metrics import (
+    MetricWrapper,
+    has_long_text,
+    is_numeric,
+    is_string,
+    num_valid,
+    unique_heur,
+)
+from foreshadow.utils import standard_col_summary
+
+from .base import BaseIntent
+
+
+class Neither(BaseIntent):
+    """Intent for columns treated as neither numeric nor categorical.
+
+    For now it mimics the Text intent.
+    """
+
+    confidence_computation = {
+        MetricWrapper(num_valid): 0.2,
+        MetricWrapper(unique_heur): 0.2,
+        MetricWrapper(is_numeric, invert=True): 0.2,
+        MetricWrapper(is_string): 0.2,
+        MetricWrapper(has_long_text): 0.2,
+    }
+
+    def fit(self, X, y=None, **fit_params):
+        """No-op fit; nothing is learned from the data.
+
+        Args:
+            X: The input data (unused)
+            y: The response variable (unused)
+            **fit_params: Additional parameters for the fit (unused)
+
+        Returns:
+            self
+
+        """
+        return self
+
+    def transform(self, X, y=None):
+        """Cast the input to string via ``X.astype(str)``.
+
+        Args:
+            X: The input data
+            y: The response variable (unused)
+
+        Returns:
+            X with all values converted to str.
+
+        """
+        return X.astype(str)
+
+    @classmethod
+    def column_summary(cls, df):  # noqa
+        return standard_col_summary(df)
diff --git a/foreshadow/smart/__init__.py b/foreshadow/smart/__init__.py
@@ -4,6 +4,7 @@
     CategoricalEncoder,
     FinancialCleaner,
     MultiImputer,
+    NeitherProcessor,
     Scaler,
     SimpleImputer,
     TextEncoder,
@@ -13,7 +14,7 @@
 from foreshadow.smart.feature_reducer import FeatureReducer
 from foreshadow.smart.feature_summarizer import FeatureSummarizer  # noqa: F401
 from foreshadow.smart.flatten import Flatten  # noqa: F401
-from foreshadow.smart.intentresolver import IntentResolver
+from foreshadow.smart.intent_resolving import IntentResolver
 from foreshadow.smart.smart import SmartTransformer  # noqa: F401
 
 
@@ -24,6 +25,7 @@
     "Scaler",
     "SimpleImputer",
     "TextEncoder",
+    "NeitherProcessor",
     "Flatten",
     "Cleaner",
     "IntentResolver",

diff --git a/foreshadow/smart/all.py b/foreshadow/smart/all.py
@@ -12,7 +12,7 @@
 import pandas as pd
 import scipy.stats as ss
 
-from foreshadow.concrete import Imputer
+from foreshadow.concrete import Imputer, NoTransform
 from foreshadow.concrete.externals import (
     HashingEncoder,
     MinMaxScaler,
@@ -32,6 +32,7 @@
     ToString,
     UncommonRemover,
 )
+from foreshadow.logging import logging
 from foreshadow.pipeline import SerializablePipeline
 from foreshadow.utils import check_df
 
@@ -371,3 +372,109 @@ def pick_transformer(self, X, y=None, **fit_params):
             return tfidf
         else:
             return SerializablePipeline(steps)
+
+
+class NeitherProcessor(SmartTransformer):
+    """Temporary processor for Neither; may fall back to NoTransform."""
+
+    def __init__(self, html_cutoff=0.4, **kwargs):
+        self.html_cutoff = html_cutoff
+
+        super().__init__(**kwargs)
+
+    def pick_transformer(self, X, y=None, **fit_params):
+        """Determine the appropriate preprocessing method for Neither intent.
+
+        Args:
+            X (:obj:`pandas.DataFrame`): Input X data
+            y (:obj:`pandas.DataFrame`): labels Y for data
+            **fit_params (dict): Parameters to apply to transformers when
+                fitting
+
+        Returns:
+            Whatever ``_pick_transformer`` returns for the same arguments
+
+        """
+        return self._pick_transformer(X, y, **fit_params)
+
+    def _pick_transformer(self, X, y=None, **fit_params):
+        """Build a TFIDF-based text transformer and check that it can fit.
+
+        Args:
+            X (:obj:`pandas.DataFrame`): Input X data
+            y (:obj:`pandas.DataFrame`): labels Y for data
+            **fit_params (dict): Parameters to apply to transformers when
+                fitting
+
+        Returns:
+            The TFIDF transformer/pipeline, or NoTransform if it fails to fit
+
+        """
+        data = X.iloc[:, 0]
+
+        steps = []
+
+        if (data.dtype.type is not np.str_) and not all(
+            [isinstance(i, str) for i in data]
+        ):
+            steps.append(("num", ToString()))
+
+        html_ratio = (
+            data.astype("str").apply(HTMLRemover.is_html).sum()
+        ) / len(data)
+        if html_ratio > self.html_cutoff:
+            steps.append(("hr", HTMLRemover()))
+
+        # TODO: find a heuristic for choosing optimal parameter values
+        tfidf = TfidfVectorizer(
+            decode_error="replace",
+            strip_accents="unicode",
+            stop_words="english",
+            ngram_range=(1, 2),
+            max_df=0.9,
+            min_df=0.05,
+            max_features=None,
+            sublinear_tf=True,
+        )
+        steps.append(("tfidf", tfidf))
+
+        if len(steps) == 1:
+            transformer = tfidf
+        else:
+            transformer = SerializablePipeline(steps)
+
+        return self._can_fit(transformer, X)
+
+    def _can_fit(self, transformer, X, y=None, sampling_threshold=0.1):
+        """Check if the TFIDF can be fitted on the sampled data.
+
+        If not, it will default back to NoTransform.
+        TODO: At this moment TFIDF is broken so it always defaults back to
+         NoTransform.
+
+        Args:
+            transformer: selected transformer with TFIDF vectorizer
+            X: the data frame
+            y: the y variable data frame (unused)
+            sampling_threshold: fraction of rows of X to sample for the fit
+
+        Returns:
+            The transformer (already fit on the sample) or a new NoTransform
+
+        """
+        if len(X) * sampling_threshold < 30:
+            # below the rule-of-30 sample size: sample with frac=1 instead
+            sampling_threshold = 1
+        sample_df = X.sample(
+            frac=sampling_threshold, replace=True, random_state=1
+        )
+        try:
+            transformer.fit(sample_df)
+            return transformer
+        except Exception:
+            # TODO change to ValueError once TFIDF is fixed.
+            # logging.warning("Error during fit: ".format(str(e)))
+            logging.warning(
+                "Revert to NoTransform for Neither " "Type temporarily."
+            )
+            return NoTransform()
diff --git a/foreshadow/smart/cleaner.py b/foreshadow/smart/cleaner.py
@@ -32,9 +32,11 @@ def pick_transformer(self, X, y=None, **fit_params):
             Best data cleaning transformer.
 
         """
+        # TODO do we want to parallelize this step?
         cleaners = config.get_cleaners(cleaners=True)
         best_score = 0
         best_cleaner = None
+        logging.info("Picking cleaners...")
         for cleaner in cleaners:
             cleaner = cleaner()
             score = cleaner.metric_score(X)
@@ -43,6 +45,7 @@ def pick_transformer(self, X, y=None, **fit_params):
                 best_cleaner = cleaner
         if best_cleaner is None:
             return NoTransform()
+        logging.info("Picked...")
         return best_cleaner
 
     def should_force_reresolve_based_on_override(self, X):

diff --git a/foreshadow/smart/intent_resolving/__init__.py b/foreshadow/smart/intent_resolving/__init__.py
@@ -0,0 +1,5 @@
+"""Intent resolver definition."""
+from foreshadow.smart.intent_resolving.intentresolver import IntentResolver
+
+
+__all__ = ["IntentResolver"]
diff --git a/foreshadow/smart/intent_resolving/core/README.md b/foreshadow/smart/intent_resolving/core/README.md
@@ -0,0 +1,47 @@
+# automl_research
+Code repository for AutoML research to support Foreshadow project
+
+---
+
+## Feature Type Inference (Intent Resolution)
+When analyzing raw data set feature columns in `Foreshadow`, the type (intent) of each feature column has to be known a priori to select the appropriate feature transformation downstream.
+
+The goal of this research project is to build an intent resolver that can separate numerical and categorical raw feature columns. More classes can be added in the future.
+
+### Installation
+This library was developed on Python 3.6.8 and uses the same package dependencies as `Foreshadow` as of Oct. 17, 2019.
+
+To install additional package dependencies for research-based functionalities, run the following:
+```
+pip install -r research_requirements.txt
+```
+
+### Usage
+The functionality of this library is exposed through the `IntentResolver` class API as shown below. The class outputs a prediction of "Numerical", "Categorical" or "Neither" for each raw feature column. Predictions with confidences lower than the `threshold` parameter (default = 0.7) in the `.predict` method are set to "Neither".
+
+```
+import pandas as pd
+from lib import IntentResolver
+
+# Initialise object
+raw = pd.read_csv('path_to_dataset.csv', encoding='latin', low_memory=False)
+resolver = IntentResolver(raw)
+
+# Predict intent
+# Outputs a pd.Series of predicted intents
+resolver.predict()
+
+# OR: Predict intent with confidences at a lower threshold (i.e. less rigorous prediction)
+# Outputs a pd.DataFrame of predicted intent and confidences
+resolver.predict(threshold=0.6, return_conf=True)
+```
+
+
+### Data Sources
+- [Original Meta Data Set (OMDS)](https://github.com/pvn25/ML-Data-Prep-Zoo/tree/master/ML%20Schema%20Inference/Data)
+- [360 Raw Data Sets (RDSs)](https://drive.google.com/file/d/1HGmDRBSZg-Olym2envycHPkb3uwVWHJX/view) (Sourced from the [GitHub README.md](https://github.com/pvn25/ML-Data-Prep-Zoo/tree/master/ML%20Schema%20Inference))
+
+
+### References
+1. V. Shah, P. Kumar, K. Yang, and A. Kumar, “Towards semi-automatic ML feature type inference.”
+2. N. Hynes, D. Sculley, and M. Terry, “The data linter: Lightweight, automated sanity checking for ml data sets,” in NIPS MLSys Workshop, 2017.
diff --git a/foreshadow/smart/intent_resolving/core/__init__.py b/foreshadow/smart/intent_resolving/core/__init__.py
@@ -0,0 +1,8 @@
+"""Module containing the core IntentResolver logic to be used in production."""
+from . import heuristics, io
+from .data_set_parsers import DataFrameDataSetParser
+from .intent_resolver import IntentResolver
+from .secondary_featurizers import (
+    FeaturizerCurator,
+    factory as featurizer_factory,
+)
diff --git a/foreshadow/smart/intent_resolving/core/data_set_parsers/__init__.py b/foreshadow/smart/intent_resolving/core/data_set_parsers/__init__.py
@@ -0,0 +1,3 @@
+"""Module containing data set parser class definitions."""
+from .base_data_set_parser import DataSetParser
+from .dataframe_data_set_parser import DataFrameDataSetParser