Skip to content
This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

Commit

Permalink
Issue 11 Financial Intent (#28)
Browse files Browse the repository at this point in the history
* Add financial transformers, smart transformers, and intents
* Patch test issues
  • Loading branch information
adithyabsk committed Jan 19, 2019
1 parent 6579deb commit dbeb8d6
Show file tree
Hide file tree
Showing 7 changed files with 293 additions and 1 deletion.
1 change: 1 addition & 0 deletions foreshadow/intents/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .subnumeric import *
from .general import *
from .registry import *
from .base import *
49 changes: 49 additions & 0 deletions foreshadow/intents/subnumeric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
"""
Sub Numeric Intents
"""

import re

from .base import PipelineTemplateEntry, TransformerEntry
from .general import NumericIntent

from ..transformers.internals import DropFeature
from ..transformers.smart import FinancialCleaner, SimpleImputer, Scaler


class FinancialIntent(NumericIntent):
    """Matches financial data.

    Handles American and European style numbers, bracketed accounting
    values and an optional trailing percent sign.
    """

    children = []

    #: Single-column pipeline steps, in order: dropper, financial cleaner,
    #: simple imputer, scaler.
    single_pipeline_template = [
        PipelineTemplateEntry("dropper", DropFeature, False),
        PipelineTemplateEntry("fin_cleaner", FinancialCleaner, True),
        PipelineTemplateEntry("simple_imputer", SimpleImputer, False),
        PipelineTemplateEntry("scaler", Scaler, True),
    ]

    #: No multi-column transformers are defined for this intent.
    multi_pipeline_template = []

    @classmethod
    def is_intent(cls, df):
        """Decide whether the first column of ``df`` looks like financial data.

        Null values are dropped first. The column matches when more than 20%
        of the remaining values match, from the start of the string, either
        the US-style pattern (``,`` thousands, ``.`` decimal) or the EU-style
        pattern (``.`` thousands, ``,`` decimal).

        Args:
            df (:obj:`pandas.DataFrame`): data whose first column is tested.
                NOTE(review): ``.str`` is used on the column, so string/object
                values are presumably expected here -- confirm against callers.

        Returns:
            bool: True if the column matches, False otherwise (including when
            the column has no non-null values).
        """
        us_num = re.compile(
            r"(?<!\S)(\[|\()?((-(?=[0-9\.]))?([0-9](\,(?=[0-9]{3}))?)*((\.(?=[0-9]))|((?<=[0-9]))\.)?[0-9]*)(\)|\])?%?(?!\S)"
        )
        eu_num = re.compile(
            r"(?<!\S)(\[|\()?((-(?=[0-9\,]))?([0-9](\.(?=[0-9]{3}))?)*((\,(?=[0-9]))|((?<=[0-9]))\,)?[0-9]*)(\)|\])?%?(?!\S)"
        )

        data = df.iloc[:, 0].dropna()
        total = len(data)
        if total == 0:
            # Nothing to inspect; also avoids dividing by zero below.
            return False

        # Checked in order so the EU pattern is only evaluated when the US
        # pattern does not already clear the threshold.
        for pattern in (us_num, eu_num):
            if data.str.match(pattern).sum() / total > 0.2:
                return True
        return False
25 changes: 25 additions & 0 deletions foreshadow/tests/test_intents/test_subnumeric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
def test_subnumeric_is_intent():
    """FinancialIntent accepts US, EU, accounting and percent columns."""
    import numpy as np
    import pandas as pd
    from foreshadow.intents import FinancialIntent

    us_style = ["0", "1.", "1,000", "-.1", "-0.1", "-0.001", "1,000.10000"]
    eu_style = ["0", "1,", "1.000", "-,1", "-0,1", "-0,001", "1.000,10000"]
    accounting = ["0", "1,", "1.000", "[,1]", "(0,1)", "[0,001]", "(1.000,10000)"]
    percent = [
        "0%",
        "1,%",
        "1.000%",
        "[,1]%",
        "(0,1)%",
        "[0,001]%",
        "(1.000,10000)%",
    ]

    # One column per number style.
    samples = np.array([us_style, eu_style, accounting, percent]).T.astype("object")
    # 200 all-NaN rows are appended below the seven sample rows.
    padding = np.full((200, 4), np.nan).astype("object")
    frame = pd.DataFrame(np.vstack([samples, padding]))

    for column in frame.columns:
        assert FinancialIntent.is_intent(frame[[column]])
56 changes: 56 additions & 0 deletions foreshadow/tests/test_transformers/test_internal.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,62 @@ def test_drop_transformer_string_input():
)


def test_prepare_financial():
    """PrepareFinancial strips whitespace and keeps the longest numeric run."""
    import numpy as np
    import pandas as pd
    from foreshadow.transformers.internals import PrepareFinancial

    x = pd.DataFrame(
        [
            "Test",
            "(123)",
            " 123",
            "[123]",
            "123,",
            "123.",
            "-123",
            "123,123",
            "ab123.3",
        ]
    )
    expected = pd.DataFrame(
        [np.nan, "(123)", "123", "[123]", "123,", "123.", "-123", "123,123", "123.3"]
    ).values
    out = PrepareFinancial().fit_transform(x).values

    assert out.shape == expected.shape
    # A cell passes when the values are equal or when both are null. The null
    # masks are combined with ``&``: comparing them with ``==`` would also be
    # True for two non-null cells that differ, hiding wrong values.
    assert np.all((out == expected) | (pd.isnull(out) & pd.isnull(expected)))


def test_convert_financial_us():
    """ConvertFinancial parses US-style strings and yields NaN for invalid ones."""
    import numpy as np
    import pandas as pd
    from foreshadow.transformers.internals import ConvertFinancial

    x = pd.DataFrame(
        ["0", "000", "0.9", "[0.9]", "-.3", "30.00", "1,000", "1.000,000", "1.1.1"]
    )
    expected = pd.DataFrame(
        [0.0, 0.0, 0.9, -0.9, -0.3, 30.0, 1000.0, np.nan, np.nan]
    ).values
    out = ConvertFinancial().fit_transform(x).values

    assert out.shape == expected.shape
    # A cell passes when the values are equal or when both are null. The null
    # masks are combined with ``&``: comparing them with ``==`` would also be
    # True for two non-null cells that differ, hiding wrong values.
    assert np.all((out == expected) | (pd.isnull(out) & pd.isnull(expected)))


def test_convert_financial_eu():
    """ConvertFinancial(is_euro=True) parses EU-style strings; invalid ones give NaN."""
    import numpy as np
    import pandas as pd
    from foreshadow.transformers.internals import ConvertFinancial

    x = pd.DataFrame(
        ["0", "000", "0,9", "[0,9]", "-,3", "30,00", "1.000", "1,000.000", "1.1.1"]
    )
    expected = pd.DataFrame(
        [0, 0, 0.9, -0.9, -0.3, 30.0, 1000.0, np.nan, np.nan]
    ).values
    out = ConvertFinancial(is_euro=True).fit_transform(x).values

    assert out.shape == expected.shape
    # A cell passes when the values are equal or when both are null. The null
    # masks are combined with ``&``: comparing them with ``==`` would also be
    # True for two non-null cells that differ, hiding wrong values.
    assert np.all((out == expected) | (pd.isnull(out) & pd.isnull(expected)))


def test_uncommon_remover_integers():
import numpy as np
import pandas as pd
Expand Down
32 changes: 32 additions & 0 deletions foreshadow/tests/test_transformers/test_smart.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,3 +233,35 @@ def test_smart_encoder_more_than_30_levels_with_overwritten_cutoff():
gt_30_random_data = np.random.choice(31, size=500)
smart_coder = Encoder(unique_num_cutoff=35)
assert isinstance(smart_coder.fit(gt_30_random_data), OneHotEncoder)


def test_smart_financial_cleaner_us():
    """FinancialCleaner converts a US-style column to numbers."""
    import numpy as np
    import pandas as pd
    from foreshadow.transformers.smart import FinancialCleaner

    x = pd.DataFrame(
        ["Test", "0", "000", "1,000", "0.9", "[0.9]", "-.3", "30.00", "3,000.35"]
    )
    expected = pd.DataFrame(
        [np.nan, 0.0, 0.0, 1000, 0.9, -0.9, -0.3, 30.0, 3000.35]
    ).values
    out = FinancialCleaner().fit_transform(x).values

    assert out.shape == expected.shape
    # A cell passes when the values are equal or when both are null. The null
    # masks are combined with ``&``: comparing them with ``==`` would also be
    # True for two non-null cells that differ, hiding wrong values.
    assert np.all((out == expected) | (pd.isnull(out) & pd.isnull(expected)))


def test_smart_financial_cleaner_eu():
    """FinancialCleaner converts an EU-style column to numbers."""
    import numpy as np
    import pandas as pd
    from foreshadow.transformers.smart import FinancialCleaner

    x = pd.DataFrame(
        ["Test", "0", "000", "1.000", "0,9", "[0,9]", "-,3", "30,00", "3.000,35"]
    )
    expected = pd.DataFrame(
        [np.nan, 0.0, 0.0, 1000, 0.9, -0.9, -0.3, 30.0, 3000.35]
    ).values
    out = FinancialCleaner().fit_transform(x).values

    assert out.shape == expected.shape
    # A cell passes when the values are equal or when both are null. The null
    # masks are combined with ``&``: comparing them with ``==`` would also be
    # True for two non-null cells that differ, hiding wrong values.
    assert np.all((out == expected) | (pd.isnull(out) & pd.isnull(expected)))
96 changes: 96 additions & 0 deletions foreshadow/transformers/internals/financial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import re

import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted


class PrepareFinancial(BaseEstimator, TransformerMixin):
    """Cleans data in preparation for a financial transformer
    (requires pandas inputs)
    """

    def fit(self, X, y=None):
        """Empty fit; there is nothing to learn.

        Args:
            X (:obj:`pandas.DataFrame`): X data (unused)
            y: ignored

        Returns:
            self
        """
        return self

    def transform(self, X, y=None):
        """Cleans string columns to prepare for financial transformer

        For every column, whitespace is removed from each value and the
        longest run of digits, ``.``, ``,``, ``-`` and round/square brackets
        is kept (the first one on ties). Values without such a run become
        NaN. The input is not modified; a copy is returned.

        Args:
            X (:obj:`pandas.DataFrame`): X data
            y: ignored

        Returns:
            :obj:`pandas.DataFrame`: Transformed data
        """

        X = X.copy()
        for c in X:
            X[c] = (
                X[c]
                # regex=True is explicit: pandas >= 2.0 defaults to a literal
                # match, which would leave whitespace in place.
                .str.replace(r"\s", "", regex=True)  # remove all whitespace
                .str.findall(r"[\d\.\(\[\-\)\]\,]+")  # keep valid characters
                .apply(
                    lambda l: max(l, key=len)  # match largest found group
                    if isinstance(l, list) and len(l) > 0
                    else np.nan
                )
            )

        return X


class ConvertFinancial(BaseEstimator, TransformerMixin):
    """Converts clean financial data into a numeric format

    Args:
        is_euro (bool): transform as a european number
    """

    def __init__(self, is_euro=False):
        self.is_euro = is_euro
        # Pattern for a US-style number (``,`` thousands, ``.`` decimal) with
        # an optional leading minus and optional round/square brackets.
        self.clean_us = r"(?<!\S)(\[|\()?((-(?=[0-9\.]))?([0-9](\,(?=[0-9]{3}))?)*((\.(?=[0-9]))|((?<=[0-9]))\.)?[0-9]*)(\)|\])?(?!\S)"

    def fit(self, X, y=None):
        """Empty fit; there is nothing to learn.

        Args:
            X (:obj:`pandas.DataFrame`): X data (unused)
            y: ignored

        Returns:
            self
        """
        return self

    def transform(self, X, y=None):
        """Converts cleaned financial strings into numbers

        With ``is_euro`` set, ``,`` and ``.`` are swapped first so the
        US-style pattern applies. Values that are not strings or do not match
        the pattern become NaN. An opening bracket is turned into a minus
        sign, closing brackets and thousands separators are removed, and the
        result is parsed with ``pandas.to_numeric`` (unparsable text becomes
        NaN). The input is not modified; a copy is returned.

        Args:
            X (:obj:`pandas.DataFrame`): X data
            y: ignored

        Returns:
            :obj:`pandas.DataFrame`: Transformed data
        """
        # Compile once per call instead of once per value.
        pattern = re.compile(self.clean_us)
        swap_separators = str.maketrans(",.", ".,") if self.is_euro else None

        def get_match_results(val):
            if not isinstance(val, str):
                return np.nan
            if swap_separators is not None:
                val = val.translate(swap_separators)
            match = pattern.match(val)
            if match:
                return match.group()
            return np.nan

        X = X.copy()
        for c in X:
            # Filter for validity. When nothing matches, ``apply`` produces a
            # float64 all-NaN Series on which ``.str`` raises AttributeError,
            # so the result is kept as object dtype.
            valid = X[c].apply(get_match_results).astype(object)

            X[c] = pd.to_numeric(
                # regex=True is explicit: pandas >= 2.0 defaults to a literal
                # match, which would leave the brackets in place.
                valid.str.replace(r"[\(\[]", "-", regex=True)  # accounting to negative
                .str.replace(r"[\]\)]", "", regex=True)
                .str.replace(",", "", regex=False),  # remove thousand separator
                errors="coerce",  # convert to number
            )

        return X
35 changes: 34 additions & 1 deletion foreshadow/transformers/smart.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,22 @@
"""

from copy import deepcopy

import numpy as np
import scipy.stats as ss
import pandas as pd
from sklearn.pipeline import Pipeline

from ..transformers.base import SmartTransformer
from ..transformers.internals import BoxCox, FancyImputer, UncommonRemover, DummyEncoder
from ..transformers.internals import (
BoxCox,
FancyImputer,
PrepareFinancial,
ConvertFinancial,
UncommonRemover,
DummyEncoder,
)
from ..transformers.externals import (
MinMaxScaler,
StandardScaler,
Expand Down Expand Up @@ -184,3 +193,27 @@ def _get_transformer(self, X, y=None, **fit_params):
return self._choose_multi(X)
else:
return Pipeline([("null", None)])


class FinancialCleaner(SmartTransformer):
    """Automatically choose appropriate parameters for a financial column"""

    def _get_transformer(self, X, y=None, **fit_params):
        """Pick the US or EU cleaning pipeline, whichever leaves fewer nulls.

        Both pipelines are tried on ``X``. The EU pipeline is returned only
        when it produces strictly fewer null values than the US pipeline;
        otherwise the US pipeline is returned.
        """
        american = Pipeline(
            [("prepare", PrepareFinancial()), ("convert", ConvertFinancial())]
        )
        european = Pipeline(
            [
                ("prepare", PrepareFinancial()),
                ("convert", ConvertFinancial(is_euro=True)),
            ]
        )

        def missing_after(pipeline):
            # Fit a throwaway copy so the pipeline handed back is not the
            # object that was fitted here.
            return deepcopy(pipeline).fit_transform(X).isnull().values.sum()

        american_missing = missing_after(american)
        european_missing = missing_after(european)

        return european if european_missing < american_missing else american

0 comments on commit dbeb8d6

Please sign in to comment.