Skip to content
This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

Commit

Permalink
Issue 104 intent mapper and metric features (#113)
Browse files Browse the repository at this point in the history
* Add Intent mapper
* Setup newintents directory and add new style intents
* Add additional functionality to metric (default value and invert)
* Rename _param_mapping to param_mapping
* Remove improper use of patch in and use pytest-mock in the code
* Rename Metric, the class, to MetricWrapper
* Patch bug in the way validate_wrapped worked and add test to verify
functionality.
  • Loading branch information
adithyabsk committed Jul 31, 2019
2 parents 2293792 + 2ec3115 commit f657290
Show file tree
Hide file tree
Showing 39 changed files with 910 additions and 258 deletions.
2 changes: 1 addition & 1 deletion ci/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,5 @@ steps:
checkCoverage: true
coverageFailOption: 'fixed'
coverageType: 'branches'
coverageThreshold: '90'
coverageThreshold: '75'
displayName: 'Check build quality'
30 changes: 26 additions & 4 deletions foreshadow/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,25 @@
"""An end-to-end AutoML package to streamline the datascience workflow."""

from foreshadow.foreshadow import Foreshadow
from foreshadow.preprocessor import Preprocessor
"""An AutoML package to streamline the data science work flow."""

# # Make sure to remove temporary F401
# from foreshadow.foreshadow import Foreshadow
# from foreshadow.preprocessor import Preprocessor
# from foreshadow import console

# This is temporary
import foreshadow.cleaners # noqa: F401
import foreshadow.config # noqa: F401
import foreshadow.console # noqa: F401
import foreshadow.core # noqa: F401
import foreshadow.estimators # noqa: F401
import foreshadow.exceptions # noqa: F401
import foreshadow.foreshadow # noqa: F401
import foreshadow.intents # noqa: F401
import foreshadow.metrics # noqa: F401
import foreshadow.newintents # noqa: F401
import foreshadow.optimizers # noqa: F401
import foreshadow.preprocessor # noqa: F401
import foreshadow.transformers # noqa: F401
import foreshadow.utils # noqa: F401


__doc__ = """
Expand All @@ -27,4 +45,8 @@ def get_version():

__version__ = get_version()

# __all__ = ["Foreshadow", "Preprocessor", "console", "__version__"]

__all__ = ["Foreshadow", "Preprocessor", "console", "__version__"]

del get_version
2 changes: 1 addition & 1 deletion foreshadow/cleaners/data_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
from foreshadow.core.preparerstep import PreparerStep
from foreshadow.exceptions import InvalidDataFrame
from foreshadow.metrics.internals import avg_col_regex, regex_rows
from foreshadow.transformers.core import SmartTransformer
from foreshadow.transformers.core.notransform import NoTransform
from foreshadow.transformers.core.smarttransformer import SmartTransformer
from foreshadow.utils.testing import dynamic_import
from foreshadow.utils.validation import check_df

Expand Down
2 changes: 1 addition & 1 deletion foreshadow/cleaners/internals/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import inspect
import os

from foreshadow.transformers.core import _get_modules
from foreshadow.transformers.core.wrapper import _get_modules


def _get_classes():
Expand Down
39 changes: 23 additions & 16 deletions foreshadow/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,10 @@

DEFAULT_CONFIG = {
"cleaner": [],
"engineerer": {},
"preprocessor": {
"numerical": ["Imputer", "Scaler"],
"categorical": ["CategoricalEncoder"],
"text": ["TextEncoder"],
},
"reducer": {},
"resolver": ["Numeric", "Categoric", "Text"],
"Numeric": {"preprocessor": ["Imputer", "Scaler"]},
"Categoric": {"preprocessor": ["CategoricalEncoder"]},
"Text": {"preprocessor": ["TextEncoder"]},
}

_cfg = {}
Expand Down Expand Up @@ -50,7 +47,7 @@ def get_config(base):


def reset_config():
"""Reset internal configuration
"""Reset internal configuration.
Note:
This is useful in an IDLE setting when the configuration file might
Expand All @@ -61,6 +58,16 @@ def reset_config():
_cfg = {}


def get_intents():
    """Look up the configured intents.

    Returns:
        list: A list of strings for the specific configuration.

    """
    config = resolve_config()
    return config["resolver"]


def resolve_config():
"""Resolve the configuration to actual classes.
Expand All @@ -78,8 +85,6 @@ def resolve_config():
local_path = os.path.abspath("")
local = get_config(local_path)

# import pdb; pdb.set_trace()

global _cfg
if local_path in _cfg:
return _cfg.get(local_path)
Expand All @@ -88,20 +93,22 @@ def resolve_config():
_resolved = {**default, **user, **local}

resolved = {}
for step, data in _resolved.items():
# key is cleaner, resolver, or intent
# all individual steps are converted to classes
for key, data in _resolved.items():
if not len(data):
resolved[step] = data
resolved[key] = data
elif isinstance(data, list):
resolved[step] = [
resolved[key] = [
get_transformer(transformer) for transformer in data
]
elif isinstance(data, dict):
resolved[step] = {
intent: [
resolved[key] = {
step: [
get_transformer(transformer)
for transformer in transformer_list
]
for intent, transformer_list in data.items()
for step, transformer_list in data.items()
}

_cfg[local_path] = resolved
Expand Down
3 changes: 2 additions & 1 deletion foreshadow/console.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split

from foreshadow import Foreshadow, Preprocessor
from foreshadow.estimators import AutoEstimator
from foreshadow.estimators.auto import determine_problem_type
from foreshadow.foreshadow import Foreshadow
from foreshadow.preprocessor import Preprocessor


def generate_model(args): # noqa: C901
Expand Down
3 changes: 3 additions & 0 deletions foreshadow/core/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Core components to foreshadow."""

from foreshadow.core.column_sharer import ColumnSharer
from foreshadow.core.resolver import IntentResolver, Resolver
from foreshadow.core.serializers import (
BaseTransformerSerializer,
ConcreteSerializerMixin,
Expand All @@ -13,4 +14,6 @@
"BaseTransformerSerializer",
"ConcreteSerializerMixin",
"PipelineSerializerMixin",
"IntentResolver",
"Resolver",
]
30 changes: 26 additions & 4 deletions foreshadow/core/column_sharer.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,43 @@
"""Cache utility for foreshadow pipeline workflow to share data."""
import pprint
from collections import MutableMapping, defaultdict


class PrettyDefaultDict(defaultdict):
    """A default dict wrapper that allows simple printing.

    ``defaultdict.__repr__`` wraps output in ``defaultdict(<factory>, {...})``;
    borrowing the plain ``dict`` repr keeps default-factory behavior while
    printing only the mapping contents.
    """

    # Delegate to the plain-dict repr so instances print like {...}.
    __repr__ = dict.__repr__


class ColumnSharer(MutableMapping):
"""Main cache-class to be used as single-instance to share data.
Note:
This object is not thread safe for reads but is thread safe for writes.
.. automethod:: __getitem__
.. automethod:: __setitem__
.. automethod:: __delitem__
.. automethod:: __iter__
.. automethod:: __len__
"""

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.store = defaultdict(lambda: defaultdict(lambda: None)) # will
# have a nested defaultdict for every key, which holds {column:
# key-column info} and gives None by default. It is the users
self.store = PrettyDefaultDict(lambda: PrettyDefaultDict(lambda: None))
# will have a nested PrettyDefaultDict for every key, which holds
# {column: key-column info} and gives None by default. It is the users
# responsibility to make sure returned values are useful.
acceptable_keys = {
"intent": True,
"domain": True,
"metastat": True,
"graph": True,
}
self.__acceptable_keys = defaultdict(lambda: False, acceptable_keys)
self.__acceptable_keys = PrettyDefaultDict(
lambda: False, acceptable_keys
)

def __getitem__(self, key_list):
"""Override getitem to support multi key accessing simultaneously.
Expand Down Expand Up @@ -189,3 +202,12 @@ def __len__(self):
"""
return sum([len(self.store[key]) for key in self.store])

def __str__(self):
    """Get a string representation of the internal store.

    Returns:
        A pretty printed version of the internal store.

    """
    formatted = pprint.pformat(self.store, indent=2)
    return formatted
28 changes: 24 additions & 4 deletions foreshadow/core/preparerstep.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from sklearn.pipeline import Pipeline

from foreshadow.core import logging
from foreshadow.transformers.core import ParallelProcessor
from foreshadow.transformers.core.notransform import NoTransform
from foreshadow.transformers.core.parallelprocessor import ParallelProcessor
from foreshadow.transformers.core.pipeline import (
SingleInputPipeline,
TransformersPipeline,
Expand Down Expand Up @@ -132,7 +132,6 @@ class PreparerStep(BaseEstimator, TransformerMixin):
usable format for ParallelProcessor and given mismatched columns,
can handle that with the flag use_single_pipeline set to True.
The transformer_list represents the mapping from columns to
transformers, in the form of ['name', 'transformer', ['cols']],
where the [cols] are the cols for transformer 'transformer'. These
Expand All @@ -153,7 +152,7 @@ def __init__(
Args:
column_sharer: ColumnSharer instance to be shared across all steps.
use_single_pipeline: Creates pipelines using SinglePipeline
use_single_pipeline: Creates pipelines using SingleInputPipeline
class instead of normal Pipelines. .. #noqa: I102
*args: args to Pipeline constructor.
**kwargs: kwargs to Pipeline constructor.
Expand Down Expand Up @@ -240,7 +239,6 @@ def parallelize_mapping(self, column_mapping):
if self._use_single_pipeline:
PipelineClass = SingleInputPipeline
for group_number in column_mapping:

transformer_list = _check_parallelizable_batch(
column_mapping, group_number, PipelineClass=PipelineClass
)
Expand All @@ -249,6 +247,28 @@ def parallelize_mapping(self, column_mapping):
else: # could be separated and parallelized
final_mapping[group_number] = transformer_list
parallelized[group_number] = True
# pipeline = column_mapping[group_number]
# if isinstance(pipeline['steps'], Pipeline):
# transformer_list = _check_parallelizable_batch(
# column_mapping, group_number, PipelineClass=PipelineClass
# )
# if transformer_list is None: # could not be separated out
# parallelized[group_number] = False
# else: # could be separated and parallelized
# final_mapping[group_number] = transformer_list
# parallelized[group_number] = True
# elif isinstance(pipeline['steps'], BaseEstimator):
# # TODO: This is a hacky way to get non-pipelines to pass
# # passthrough, what I'd like to see is a better abstraction
# # for this.
# transformer_list = [
# "group: {:d}".format(group_number),
# pipeline['steps'],
# pipeline["inputs"][0],
# ]
# final_mapping[group_number] = transformer_list
# parallelized[group_number] = True

if len(final_mapping) < len(column_mapping) and False: # then there
# must be groups of columns that have interdependencies.
# CURRENTLY DISABLED.
Expand Down

0 comments on commit f657290

Please sign in to comment.