Skip to content

Commit

Permalink
base form operates only if at least one alphanumeric char exists (#1581)
Browse files Browse the repository at this point in the history
base form operates only if at least one alphanumeric char exists
  • Loading branch information
Nadav-Barak committed Jun 7, 2022
1 parent 1e77878 commit d36be62
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 13 deletions.
2 changes: 1 addition & 1 deletion deepchecks/tabular/checks/data_integrity/special_chars.py
Expand Up @@ -125,7 +125,7 @@ def _get_special_samples(column_data: pd.Series) -> Union[dict, None]:
return None
samples_to_count = defaultdict(lambda: 0)
for sample in column_data:
if isinstance(sample, str) and len(sample) > 0 and len(string_baseform(sample)) == 0:
if isinstance(sample, str) and len(sample) > 0 and len(string_baseform(sample, True)) == 0:
samples_to_count[sample] = samples_to_count[sample] + 1

return samples_to_count or None
Expand Down
20 changes: 14 additions & 6 deletions deepchecks/utils/strings.py
Expand Up @@ -59,10 +59,10 @@
'generate_check_docs_link',
'widget_to_html_string',
'format_number_if_not_nan',
'get_docs_link',
'get_docs_link'
]

# Creating a translation table for the string.translate() method to be used in string base form method
# Creating a translation table for the string.translate() method to be used in string base-form method
# range() excludes its upper bound, so add 1 to include sys.maxunicode itself (the highest code point).
DEL_CHARS = ''.join(c for c in map(chr, range(sys.maxunicode + 1)) if not c.isalnum())
# Maps the ordinal of every non-alphanumeric character to None, so str.translate() drops it.
DEL_MAP = str.maketrans('', '', DEL_CHARS)

Expand Down Expand Up @@ -243,21 +243,29 @@ def get_random_string(n: int = 5):
return ''.join(random.choices(ascii_uppercase + digits, k=n))


def string_baseform(string: Hashable) -> Hashable:
"""Remove special characters from given string, leaving only a-z, A-Z, 0-9 characters.
def string_baseform(string: Hashable, allow_empty_result: bool = False) -> Hashable:
"""Normalize the string input to a uniform form.
If input is a string containing alphanumeric characters or if allow_empty_result is set to True,
removes all non-alphanumeric characters and converts the remaining characters to lowercase.
Parameters
----------
allow_empty_result : bool , default : False
whether to return an empty string when the input contains no alphanumeric characters; if False, the original input is returned unchanged in that case
string : str
string to remove special characters from
Returns
-------
str
string without special characters
the original input if the condition is not met; otherwise the lowercase alphanumeric characters of the input.
"""
if not isinstance(string, str):
return string
return string.translate(DEL_MAP).lower()
lower_alphanumeric_form = string.translate(DEL_MAP).lower()
if len(lower_alphanumeric_form) > 0 or allow_empty_result:
return lower_alphanumeric_form
else:
return string


def is_string_column(column: pd.Series) -> bool:
Expand Down
3 changes: 1 addition & 2 deletions tests/base/feature_importance_utils_test.py
Expand Up @@ -54,8 +54,7 @@ def test_linear_regression(diabetes):
def test_pipeline(iris_split_dataset_and_model_single_feature):
_, test_ds, clf = iris_split_dataset_and_model_single_feature
feature_importances, fi_type = calculate_feature_importance(clf, test_ds)
print(feature_importances)
assert_that(feature_importances['sepal length (cm)'], equal_to(1))
assert_that(feature_importances['sepal length (cm)'], equal_to(1)) # pylint: disable=e1136
assert_that(feature_importances, has_length(1))
assert_that(fi_type, is_('permutation_importance'))
assert_that(hasattr(clf.steps[-1][1], 'feature_importances_'))
Expand Down
18 changes: 14 additions & 4 deletions tests/tabular/checks/integrity/mixed_nulls_test.py
Expand Up @@ -50,12 +50,12 @@ def test_empty_dataframe():

def test_different_null_types():
# Arrange
data = {'col1': [np.NAN, np.NaN, pd.NA, '$$$$$$$$', 'NULL']}
data = {'col1': [np.NAN, np.NaN, pd.NA, 'value', 'NULL']}
dataframe = pd.DataFrame(data=data)
# Act
result = MixedNulls().run(dataframe)
# Assert
assert_that(result.value, has_entry('col1', has_length(4)))
assert_that(result.value, has_entry('col1', has_length(3)))


def test_null_list_param():
Expand Down Expand Up @@ -131,12 +131,22 @@ def test_mix_value_columns():

def test_single_column_nulls_with_special_characters():
# Arrange
data = {'col1': ['', '#@$', 'Nan!', '#nan', '<NaN>']}
data = {'col1': ['', 'value', 'Nan!', '#nan', '<NaN>']}
dataframe = pd.DataFrame(data=data)
# Act
result = MixedNulls().run(dataframe)
# Assert
assert_that(result.value, has_entry('col1', has_length(5)))
assert_that(result.value, has_entry('col1', has_length(4)))


def test_single_column_nulls_only_special_characters():
# Arrange
data = {'col1': ['', '!@#$', 'Nan!', '#nan', '<NaN>']}
dataframe = pd.DataFrame(data=data)
# Act
result = MixedNulls().run(dataframe)
# Assert
assert_that(result.value, has_entry('col1', has_length(4)))


def test_ignore_columns_single():
Expand Down

0 comments on commit d36be62

Please sign in to comment.