This repository has been archived by the owner on Jan 9, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add financial transformers, smart transformers, and intents * Patch test issues
- Loading branch information
1 parent
6579deb
commit dbeb8d6
Showing
7 changed files
with
293 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
from .subnumeric import * | ||
from .general import * | ||
from .registry import * | ||
from .base import * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
""" | ||
Sub Numeric Intents | ||
""" | ||
|
||
import re | ||
|
||
from .base import PipelineTemplateEntry, TransformerEntry | ||
from .general import NumericIntent | ||
|
||
from ..transformers.internals import DropFeature | ||
from ..transformers.smart import FinancialCleaner, SimpleImputer, Scaler | ||
|
||
|
||
class FinancialIntent(NumericIntent):
    """Matches financial data.

    Handles American (``1,000.5``) and European (``1.000,5``) style numbers,
    bracketed accounting values such as ``(1,000)`` or ``[0.5]``, and an
    optional trailing percent sign.
    """

    children = []

    # NOTE(review): the meaning of the boolean third argument is defined by
    # PipelineTemplateEntry, which is not visible here -- confirm before
    # changing any of the flags.
    single_pipeline_template = [
        PipelineTemplateEntry("dropper", DropFeature, False),
        PipelineTemplateEntry("fin_cleaner", FinancialCleaner, True),
        PipelineTemplateEntry("simple_imputer", SimpleImputer, False),
        PipelineTemplateEntry("scaler", Scaler, True),
    ]
    """Drops, cleans, imputes, and then scales a single financial column"""

    multi_pipeline_template = []
    """No multi-column transformers"""

    @classmethod
    def is_intent(cls, df):
        """Check whether the first column of ``df`` looks like financial data.

        A column matches when more than 20% of its non-null values are matched
        by either the US-style or the European-style number pattern.

        Args:
            df (:obj:`pandas.DataFrame`): data whose first column is tested;
                the column must support the pandas ``.str`` accessor

        Returns:
            bool: True if the column is considered financial data, False
            otherwise (including when the column holds no non-null values)
        """
        us_num = re.compile(
            r"(?<!\S)(\[|\()?((-(?=[0-9\.]))?([0-9](\,(?=[0-9]{3}))?)*((\.(?=[0-9]))|((?<=[0-9]))\.)?[0-9]*)(\)|\])?%?(?!\S)"
        )
        eu_num = re.compile(
            r"(?<!\S)(\[|\()?((-(?=[0-9\,]))?([0-9](\.(?=[0-9]{3}))?)*((\,(?=[0-9]))|((?<=[0-9]))\,)?[0-9]*)(\)|\])?%?(?!\S)"
        )

        data = df.iloc[:, 0].dropna()

        # An all-null column has nothing to match and would otherwise divide
        # by zero below.
        total = len(data)
        if total == 0:
            return False

        if data.str.match(us_num).sum() / total > 0.2:
            return True
        return bool(data.str.match(eu_num).sum() / total > 0.2)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
def test_subnumeric_is_intent():
    """Every supported number style must be recognised as a FinancialIntent.

    Each column holds seven sample strings in one style followed by 200 NaN
    rows, so the check also exercises columns that are mostly missing.
    """
    import numpy as np
    import pandas as pd
    from foreshadow.intents import FinancialIntent

    us_style = ["0", "1.", "1,000", "-.1", "-0.1", "-0.001", "1,000.10000"]
    eu_style = ["0", "1,", "1.000", "-,1", "-0,1", "-0,001", "1.000,10000"]
    accounting_style = [
        "0",
        "1,",
        "1.000",
        "[,1]",
        "(0,1)",
        "[0,001]",
        "(1.000,10000)",
    ]
    percent_style = [
        "0%",
        "1,%",
        "1.000%",
        "[,1]%",
        "(0,1)%",
        "[0,001]%",
        "(1.000,10000)%",
    ]

    # One column per style.
    samples = np.array(
        [us_style, eu_style, accounting_style, percent_style]
    ).T.astype("object")
    padding = np.full((200, 4), np.nan).astype("object")
    frame = pd.DataFrame(np.vstack([samples, padding]))

    for column in frame:
        assert FinancialIntent.is_intent(frame[[column]])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
import re | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from sklearn.base import TransformerMixin, BaseEstimator | ||
from sklearn.utils import check_array | ||
from sklearn.utils.validation import check_is_fitted | ||
|
||
|
||
class PrepareFinancial(BaseEstimator, TransformerMixin):
    """Cleans data in preparation for a financial transformer
    (requires pandas inputs)
    """

    def fit(self, X, y=None):
        """Empty fit; the transformer holds no fitted state.

        Args:
            X: ignored
            y: ignored

        Returns:
            self
        """
        return self

    def transform(self, X, y=None):
        """Cleans string columns to prepare for financial transformer

        For every value, all whitespace is removed and the longest run of
        number-like characters (digits, ``.``, ``,``, ``-``, round and square
        brackets) is kept. Values with no such run become NaN.

        Args:
            X (:obj:`pandas.DataFrame`): X data; every column must support
                the pandas ``.str`` accessor
            y: ignored

        Returns:
            :obj:`pandas.DataFrame`: Transformed data (a copy; ``X`` itself is
            not modified)
        """

        def longest_run(runs):
            # findall yields a list per string value and NaN for anything
            # that is not a string.
            if isinstance(runs, list) and len(runs) > 0:
                return max(runs, key=len)
            return np.nan

        X = X.copy()
        for c in X:
            X[c] = (
                X[c]
                # regex=True is explicit: newer pandas releases default to a
                # literal match, which would leave the whitespace in place.
                .str.replace(r"\s", "", regex=True)  # remove all whitespace
                .str.findall(r"[\d\.\(\[\-\)\]\,]+")  # keep valid characters
                .apply(longest_run)  # match largest found group
            )

        return X
|
||
|
||
class ConvertFinancial(BaseEstimator, TransformerMixin):
    """Converts clean financial data into a numeric format

    Args:
        is_euro (bool): transform as a european number
    """

    def __init__(self, is_euro=False):
        self.is_euro = is_euro
        self.clean_us = r"(?<!\S)(\[|\()?((-(?=[0-9\.]))?([0-9](\,(?=[0-9]{3}))?)*((\.(?=[0-9]))|((?<=[0-9]))\.)?[0-9]*)(\)|\])?(?!\S)"

    def fit(self, X, y=None):
        """Empty fit; the transformer holds no fitted state.

        Args:
            X: ignored
            y: ignored

        Returns:
            self
        """
        return self

    def transform(self, X, y=None):
        """Converts cleaned financial strings into numbers

        Each string value is optionally switched from European to US
        separators, validated against ``self.clean_us``, stripped of
        accounting brackets and thousand separators, and parsed as a number.
        Values that are not strings, fail validation, or cannot be parsed
        become NaN.

        Args:
            X (:obj:`pandas.DataFrame`): X data
            y: ignored

        Returns:
            :obj:`pandas.DataFrame`: Transformed data (a copy; ``X`` itself is
            not modified)
        """
        # Compile once per call instead of once per cell.
        valid_number = re.compile(self.clean_us)
        swap_separators = str.maketrans(",.", ".,")
        # An opening bracket marks an accounting negative; closing brackets
        # and thousand separators are dropped.
        to_plain_number = str.maketrans(
            {"(": "-", "[": "-", ")": None, "]": None, ",": None}
        )

        def normalize(val):
            if not isinstance(val, str):
                return np.nan
            if self.is_euro:
                val = val.translate(swap_separators)
            # Filter for validity
            match = valid_number.match(val)
            if match is None:
                return np.nan
            return match.group().translate(to_plain_number)

        X = X.copy()
        for c in X:
            # Normalising per value avoids the .str accessor, which raises
            # once a column no longer contains any strings.
            X[c] = pd.to_numeric(X[c].apply(normalize), errors="coerce")

        return X
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters