Skip to content
This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

Commit

Permalink
Issue 118 Preprocessor (#120)
Browse files Browse the repository at this point in the history
* Add working preprocessor
* Update pre-commit file
* Update config object
* Auto-create a ColumnSharer for each PreparerStep if one is not passed in. This is useful when steps are pulled out and not used as part of the DataPreparer pipeline.
* Make cleaners use config
* fixing setup.
  • Loading branch information
adithyabsk committed Aug 6, 2019
1 parent c8cd26b commit 8a26b35
Show file tree
Hide file tree
Showing 40 changed files with 695 additions and 222 deletions.
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,5 @@ repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.2.3
hooks:
- id: flake8
- id: flake8
additional_dependencies: ['flake8-docstrings', 'darglint', 'pydocstyle<4.0']
2 changes: 1 addition & 1 deletion foreshadow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from foreshadow import console
from foreshadow.foreshadow import Foreshadow
from foreshadow.preparer.preparer import DataPreparer
from foreshadow.preparer import DataPreparer


__doc__ = """
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
from collections import MutableMapping, defaultdict


# TODO: Make this multi processor safe using managers


class PrettyDefaultDict(defaultdict):
"""A default dict wrapper that allows simple printing."""

Expand Down
237 changes: 174 additions & 63 deletions foreshadow/config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
"""Foreshadow system config resolver."""

import json
import os
from collections import MutableMapping

import yaml

Expand All @@ -9,18 +11,23 @@

CONFIG_FILE_NAME = "config.yml"

DEFAULT_CONFIG = {
"cleaner": [],
"resolver": ["Numeric", "Categoric", "Text"],
"Numeric": {"preprocessor": ["Imputer", "Scaler"]},
"Categoric": {"preprocessor": ["CategoricalEncoder"]},
"Text": {"preprocessor": ["TextEncoder"]},
_DEFAULT_CONFIG = {
"Cleaner": {
"Flatteners": ["StandardJsonFlattener"],
"Cleaners": [
"YYYYMMDDDateCleaner",
"DropCleaner",
"DollarFinancialCleaner",
],
},
"Tiebreak": ["Numeric", "Categoric", "Text"],
"Numeric": {"Preprocessor": ["Imputer", "Scaler"]},
"Categoric": {"Preprocessor": ["CategoricalEncoder"]},
"Text": {"Preprocessor": ["TextEncoder"]},
}

_cfg = {}


def get_config(base):
def load_config(base):
"""Try to load configuration data from specific folder path.
Args:
Expand All @@ -46,71 +53,175 @@ def get_config(base):
return data


def reset_config():
"""Reset internal configuration.
class ConfigStore(MutableMapping):
"""Define a single-instance config store with convenience methods.
Note:
This is useful in an IDLE setting when the configuration file might
have been modified but you don't want to reload the system.
Attributes:
system_config: The default system configuration dictionary
user_config: The specific user configuration dictionary
"""
global _cfg
_cfg = {}

def __init__(self, *args, **kwarg):
    """Set up the system config, the user config, and an empty cache.

    Args:
        *args: Accepted but not used.
        **kwarg: Accepted but not used.

    """
    # Resolve the user-level location first, then load whatever is there.
    user_path = get_config_path()
    self.system_config = _DEFAULT_CONFIG
    self.user_config = load_config(user_path)
    # Cache of resolved configurations, keyed by the hash computed in
    # get_config.
    self._cfg_list = {}

def get_config(self):
"""Resolve a config instance.
Returns:
dict: A resolved version of the system configuration that merges \
system, user, and local configuration setups.
"""
local_path = os.path.abspath("")
local_config = load_config(local_path)

# Expand the dictionaries in order of precedence
resolved_strs = {
**self.system_config,
**self.user_config,
**local_config,
}

resolved_hash = hash(json.dumps(resolved_strs, sort_keys=True))

if resolved_hash in self._cfg_list:
return self._cfg_list[resolved_hash]

resolved = {}
# key is cleaner, resolver, or intent
# all individual steps are converted to classes
for key, data in resolved_strs.items():
if not len(data):
resolved[key] = data
elif isinstance(data, list):
resolved[key] = [
get_transformer(transformer) for transformer in data
]
elif isinstance(data, dict):
resolved[key] = {
step: [
get_transformer(transformer)
for transformer in transformer_list
]
for step, transformer_list in data.items()
}

def get_intents():
"""Get the intents defined in the config.
self._cfg_list[resolved_hash] = resolved

Returns:
list: A list of strings for the specific configuration.
return resolved

"""
return resolve_config()["resolver"]
def get_cleaners(self, flatteners=False, cleaners=False):
"""Get cleaner setup.
Args:
flatteners (bool): get flatteners
cleaners (bool): get cleaners
def resolve_config():
"""Resolve the configuration to actual classes.
Returns:
list: A list of all the relevant classes
Note:
The order of resolution is as follows, in increasing order of precedence:
framework, user, local.
Raises:
ValueError: Both flatteners and cleaners cannot be false
Returns:
A dictionary with the same keys as `foreshadow.config.DEFAULT_CONFIG`
with the correct overrides.
"""
if not (flatteners or cleaners):
raise ValueError("Both flatteners and cleaners cannot be false.")

"""
default = DEFAULT_CONFIG
user = get_config(get_config_path())
local_path = os.path.abspath("")
local = get_config(local_path)

global _cfg
if local_path in _cfg:
return _cfg.get(local_path)

# Expand the dictionaries in order of precedence
_resolved = {**default, **user, **local}

resolved = {}
# key is cleaner, resolver, or intent
# all individual steps are converted to classes
for key, data in _resolved.items():
if not len(data):
resolved[key] = data
elif isinstance(data, list):
resolved[key] = [
get_transformer(transformer) for transformer in data
]
elif isinstance(data, dict):
resolved[key] = {
step: [
get_transformer(transformer)
for transformer in transformer_list
]
for step, transformer_list in data.items()
}
config = self.get_config()

flatteners = config["Cleaner"]["Flatteners"] if flatteners else []
cleaners = config["Cleaner"]["Cleaners"] if cleaners else []

return [*flatteners, *cleaners]

def get_intents(self):
"""Get the intent resolution order.
Returns:
list: A list of intent objects in order
"""
return self.get_config()["Tiebreak"]

def get_preprocessor_steps(self, intent):
"""Get the preprocessor list for a given intent.
Args:
intent: A string of the intent to select upon.
Returns:
list: A list of transformation classes for an intent
"""
return self.get_config()[intent]["Preprocessor"]

def clear(self):
"""Clear all cached configuration stores."""
self._cfg_list = {}

def __delitem__(self, key):
"""Delete an item from the config cache for a given hash value.
Args:
key: A hash value for the item to delete
"""
del self._cfg_list[key]

def __getitem__(self, key):
"""Get an item from the config cache for a given hash value.
Args:
key: A hash value for the item to get
Returns:
dict: The configuration for a particular hash value.
"""
return self._cfg_list[key]

def __iter__(self):
"""Get the iterable the config cache.
Yields:
The key, value pairs of hash and its associated configuration.
"""
for data in self._cfg_list:
yield data

def __len__(self):
"""Get the number of hashes saved in the cache.
Returns:
The number of hashes saved in the internal cache.
"""
return len(self._cfg_list)

def __setitem__(self):
"""Values cannot be set in the cache.
Raises:
NotImplementedError: The config cannot be manually set.
"""
raise NotImplementedError("The config cannot be manually set.")

def __eq__(self, other):
"""Check the equality of the cache with another cache instance.
Args:
other: Another cache instance or dictionary.
Returns:
bool: True if the caches are equal, False otherwise
"""
return self._cfg_list == other

_cfg[local_path] = resolved

return resolved
config = ConfigStore()
8 changes: 3 additions & 5 deletions foreshadow/foreshadow.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,11 @@
from sklearn.base import BaseEstimator
from sklearn.model_selection._search import BaseSearchCV

from foreshadow.columnsharer import ColumnSharer
from foreshadow.estimators.auto import AutoEstimator
from foreshadow.estimators.meta import MetaEstimator
from foreshadow.preparer import (
ColumnSharer,
DataPreparer,
SerializablePipeline,
)
from foreshadow.pipeline import SerializablePipeline
from foreshadow.preparer import DataPreparer
from foreshadow.utils import check_df


Expand Down
6 changes: 2 additions & 4 deletions foreshadow/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,7 @@ def regex_rows(feature, cleaner):
"""
f = feature
matched_lens = [
cleaner(f.get_value(i, f.columns[0])).match_lens for i in f.index
]
matched_lens = [cleaner(f.at[i, f.columns[0]]).match_lens for i in f.index]
return sum([min(list_lens) for list_lens in matched_lens]) / len(feature)


Expand Down Expand Up @@ -214,7 +212,7 @@ def avg_col_regex(feature, cleaner, mode=min):
"""
f = feature
matched_lens = [
(cleaner(f.get_value(i, f.columns[0])).match_lens, len(f.iloc[i]))
(cleaner(f.at[i, f.columns[0]]).match_lens, len(f.iloc[i]))
for i in f.index
]
return sum(
Expand Down
File renamed without changes.
5 changes: 2 additions & 3 deletions foreshadow/preparer/pipeline.py → foreshadow/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,8 @@
from sklearn.pipeline import Pipeline, _fit_transform_one # noqa: F401
from sklearn.utils.validation import check_memory # noqa: F401

from foreshadow.serializers import PipelineSerializerMixin

from .parallelprocessor import ParallelProcessor # noqa: F401 see below.
from .parallelprocessor import ParallelProcessor # noqa: F401
from .serializers import PipelineSerializerMixin


# Above imports used in runtime override.
Expand Down
5 changes: 2 additions & 3 deletions foreshadow/preparer/preparer.py → foreshadow/preparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@

from sklearn.pipeline import Pipeline

from foreshadow.preparer.pipeline import PipelineSerializerMixin
from foreshadow.preparer.steps import IntentMapper
from foreshadow.preparer.steps.cleaner import CleanerMapper
from foreshadow.pipeline import PipelineSerializerMixin
from foreshadow.steps import CleanerMapper, IntentMapper


def _none_to_dict(name, val, column_sharer=None):
Expand Down
18 changes: 0 additions & 18 deletions foreshadow/preparer/__init__.py

This file was deleted.

3 changes: 2 additions & 1 deletion foreshadow/smart/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
)
from foreshadow.smart.cleaner import Cleaner # noqa: F401
from foreshadow.smart.flatten import Flatten # noqa: F401
from foreshadow.smart.intentresolver import IntentResolver # noqa: F401
from foreshadow.smart.intentresolver import IntentResolver
from foreshadow.smart.smart import SmartTransformer # noqa: F401


Expand All @@ -23,4 +23,5 @@
"TextEncoder",
"Flatten",
"Cleaner",
"IntentResolver",
]

0 comments on commit 8a26b35

Please sign in to comment.