This repository has been archived by the owner on Jan 9, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Updating DataPreparer Base Classes and Project Restructure
* DataCleaner changes. * Final project restructure, including: tests skipped or changed (some are left failing, to be fixed as we integrate DataPreparer); V1 components removed; V2 file structure in place with a proper import system (some small changes still to be made). * foreshadow.concrete import rollup complete.
- Loading branch information
1 parent
f657290
commit c8cd26b
Showing
115 changed files
with
4,107 additions
and
3,957 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
"""All the concrete transformers provided by foreshadow.""" | ||
|
||
from foreshadow.concrete.externals import * # noqa: F403, F401 | ||
from foreshadow.concrete.externals import __all__ as e_all | ||
from foreshadow.concrete.internals import * # noqa: F403, F401 | ||
from foreshadow.concrete.internals import __all__ as i_all | ||
|
||
|
||
# Public API of the package: the internal names first, then the external ones.
__all__ = [*i_all, *e_all]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
"""External transformers. | ||
All sklearn transformers imported here will be wrapped and made available in | ||
the module :mod:`foreshadow.concrete`. | ||
""" | ||
|
||
from category_encoders import HashingEncoder, OneHotEncoder # noqa: F401 | ||
from sklearn.decomposition import PCA # noqa: F401 | ||
from sklearn.feature_extraction.text import ( # noqa: F401 | ||
TfidfTransformer, | ||
TfidfVectorizer, | ||
) | ||
from sklearn.preprocessing import ( # noqa: F401 | ||
Imputer, | ||
MinMaxScaler, | ||
RobustScaler, | ||
StandardScaler, | ||
) | ||
|
||
from foreshadow.utils import is_transformer | ||
from foreshadow.wrapper import pandas_wrap | ||
|
||
|
||
# Maps a transformer class name to a list of its parameter names.
# NOTE(review): judging by the name, these are the parameters skipped during
# serialization -- confirm against the code that consumes this mapping.
no_serialize_params = {
    "OneHotEncoder": ["cols"],
    "HashingEncoder": ["cols"],
}
|
||
|
||
def _get_modules(classes, globals_, mname):  # TODO auto import all
    # TODO sklearn transformers and test each one generically.
    """Clone, wrap and publish every transformer found in ``classes``.

    Each class accepted by ``is_transformer(..., method="issubclass")`` is
    duplicated as a new class of the same name that derives from the original
    (followed by the original's own bases) and starts with a copy of the
    original's class ``__dict__``. The duplicate's ``__module__`` is set to
    ``mname``, it is passed through ``pandas_wrap``, and the result is stored
    in ``globals_`` under the class name.

    Args:
        classes: A list of candidate classes.
        globals_: The namespace mapping (normally the caller's ``globals()``)
            that receives the wrapped classes.
        mname: The module name assigned to each duplicated class.

    Returns:
        The list of names of the transformers that were wrapped.

    """
    # Phase 1: select the transformers before touching the namespace.
    selected = []
    for candidate in classes:
        if is_transformer(candidate, method="issubclass"):  # noqa: F821
            # flake does not detect the name because it is deleted at module
            # level once setup is complete.
            selected.append(candidate)

    # Phase 2: duplicate, re-home and wrap each selected transformer.
    exported = []
    for original in selected:
        namespace = dict(original.__dict__)
        parents = (original, *original.__bases__)
        clone = type(original.__name__, parents, namespace)
        clone.__module__ = mname
        globals_[clone.__name__] = pandas_wrap(clone)  # noqa: F821
        exported.append(original.__name__)

    return exported
|
||
|
||
def _get_classes():
    """Collect every class bound in this module's global namespace.

    Returns:
        list: The values of ``globals()`` for which ``inspect.isclass`` is
        true, in the order the namespace yields them.

    """
    from inspect import isclass

    return list(filter(isclass, globals().values()))
|
||
|
||
# Wrap every transformer class imported above, publish the wrapped versions in
# this module's namespace, and export their names along with
# ``no_serialize_params``.
__all__ = [
    *_get_modules(_get_classes(), globals(), __name__),
    "no_serialize_params",
]

# Drop the setup helpers so they are not left behind as module attributes.
del _get_modules, _get_classes
del is_transformer, pandas_wrap
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
"""Custom foreshadow defined transformers.""" | ||
from foreshadow.concrete.internals.boxcox import BoxCox # noqa: F401 | ||
from foreshadow.concrete.internals.cleaners import * # noqa: F403, F401 | ||
from foreshadow.concrete.internals.cleaners import __all__ as c_all | ||
from foreshadow.concrete.internals.dropfeature import DropFeature # noqa: F401 | ||
from foreshadow.concrete.internals.dummyencoder import ( # noqa: F403, F401 | ||
DummyEncoder, | ||
) | ||
from foreshadow.concrete.internals.fancyimpute import ( # noqa: F403, F401 | ||
FancyImputer, | ||
) | ||
from foreshadow.concrete.internals.financial import ( # noqa: F401 | ||
ConvertFinancial, | ||
PrepareFinancial, | ||
) | ||
from foreshadow.concrete.internals.htmlremover import HTMLRemover # noqa: F401 | ||
from foreshadow.concrete.internals.labelencoder import ( # noqa: F403, F401 | ||
FixedLabelEncoder, | ||
) | ||
from foreshadow.concrete.internals.notransform import NoTransform # noqa: F401 | ||
from foreshadow.concrete.internals.tfidf import ( # noqa: F403, F401 | ||
FixedTfidfVectorizer, | ||
) | ||
from foreshadow.concrete.internals.tostring import ToString # noqa: F401 | ||
from foreshadow.concrete.internals.uncommonremover import ( # noqa: F403, F401 | ||
UncommonRemover, | ||
) | ||
|
||
|
||
# TODO flake fails here, figure out why. | ||
# hypothesis: flake8 uses the __repr__ which is modified to be | ||
# DFTransformer.HTMLRemover etc. | ||
|
||
# Public names of the internal transformers. The cleaner classes are exported
# through ``c_all`` (the ``__all__`` of the cleaners package), so they are not
# repeated in the literal: listing them here as well put every cleaner name
# into ``__all__`` twice.
__all__ = [
    "BoxCox",
    "DropFeature",
    "DummyEncoder",
    "FancyImputer",
    "ConvertFinancial",
    "PrepareFinancial",
    "HTMLRemover",
    "FixedLabelEncoder",
    "FixedTfidfVectorizer",
    "ToString",
    "UncommonRemover",
    "NoTransform",
] + c_all
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
"""Internal cleaners for handling the cleaning and shaping of data.""" | ||
from foreshadow.concrete.internals.cleaners.datetimes import ( | ||
YYYYMMDDDateCleaner, | ||
) | ||
from foreshadow.concrete.internals.cleaners.drop import DropCleaner | ||
from foreshadow.concrete.internals.cleaners.financial_cleaner import ( | ||
DollarFinancialCleaner, | ||
) | ||
from foreshadow.concrete.internals.cleaners.json_flattener import ( | ||
StandardJsonFlattener, | ||
) | ||
|
||
|
||
# Public names of the cleaners package; each entry matches one of the classes
# imported above.
__all__ = [
    "YYYYMMDDDateCleaner",
    "DropCleaner",
    "DollarFinancialCleaner",
    "StandardJsonFlattener",
]
Oops, something went wrong.