base form operates only if at least one alphanumeric char exists (#1581)

base form operates only if at least one alphanumeric char exists
deepchecks · Jun 7, 2022 · d36be62 · d36be62
1 parent 1e77878
commit d36be62
Show file tree

Hide file tree

Showing 4 changed files with 30 additions and 13 deletions.
diff --git a/deepchecks/tabular/checks/data_integrity/special_chars.py b/deepchecks/tabular/checks/data_integrity/special_chars.py
@@ -125,7 +125,7 @@ def _get_special_samples(column_data: pd.Series) -> Union[dict, None]:
         return None
     samples_to_count = defaultdict(lambda: 0)
     for sample in column_data:
-        if isinstance(sample, str) and len(sample) > 0 and len(string_baseform(sample)) == 0:
+        if isinstance(sample, str) and len(sample) > 0 and len(string_baseform(sample, True)) == 0:
             samples_to_count[sample] = samples_to_count[sample] + 1
 
     return samples_to_count or None

diff --git a/deepchecks/utils/strings.py b/deepchecks/utils/strings.py
@@ -59,10 +59,10 @@
     'generate_check_docs_link',
     'widget_to_html_string',
     'format_number_if_not_nan',
-    'get_docs_link',
+    'get_docs_link'
 ]
 
-# Creating a translation table for the string.translate() method to be used in string base form method
+# Creating a translation table for the string.translate() method to be used in string base-form method
 DEL_CHARS = ''.join(c for c in map(chr, range(sys.maxunicode)) if not c.isalnum())
 DEL_MAP = str.maketrans('', '', DEL_CHARS)
 
@@ -243,21 +243,29 @@ def get_random_string(n: int = 5):
     return ''.join(random.choices(ascii_uppercase + digits, k=n))
 
 
-def string_baseform(string: Hashable) -> Hashable:
-    """Remove special characters from given string, leaving only a-z, A-Z, 0-9 characters.
+def string_baseform(string: Hashable, allow_empty_result: bool = False) -> Hashable:
+    """Normalize the string input to a uniform form.
 
+    If input is a string containing alphanumeric characters or if allow_empty_result is set to True,
+    removes all non-alphanumeric characters and converts the remaining characters to lowercase.
     Parameters
     ----------
+    allow_empty_result : bool , default : False
+        whether to return an empty string (True) or the original input (False) if no alphanumeric characters exist
     string : str
         string to remove special characters from
     Returns
     -------
     str
-        string without special characters
+        the input's alphanumeric characters in lowercase, or the original input unchanged if the condition is not met.
     """
     if not isinstance(string, str):
         return string
-    return string.translate(DEL_MAP).lower()
+    lower_alphanumeric_form = string.translate(DEL_MAP).lower()
+    if len(lower_alphanumeric_form) > 0 or allow_empty_result:
+        return lower_alphanumeric_form
+    else:
+        return string
 
 
 def is_string_column(column: pd.Series) -> bool:

diff --git a/tests/base/feature_importance_utils_test.py b/tests/base/feature_importance_utils_test.py
@@ -54,8 +54,7 @@ def test_linear_regression(diabetes):
 def test_pipeline(iris_split_dataset_and_model_single_feature):
     _, test_ds, clf = iris_split_dataset_and_model_single_feature
     feature_importances, fi_type = calculate_feature_importance(clf, test_ds)
-    print(feature_importances)
-    assert_that(feature_importances['sepal length (cm)'], equal_to(1))
+    assert_that(feature_importances['sepal length (cm)'], equal_to(1))  # pylint: disable=e1136
     assert_that(feature_importances, has_length(1))
     assert_that(fi_type, is_('permutation_importance'))
     assert_that(hasattr(clf.steps[-1][1], 'feature_importances_'))

diff --git a/tests/tabular/checks/integrity/mixed_nulls_test.py b/tests/tabular/checks/integrity/mixed_nulls_test.py
@@ -50,12 +50,12 @@ def test_empty_dataframe():
 
 def test_different_null_types():
     # Arrange
-    data = {'col1': [np.NAN, np.NaN, pd.NA, '$$$$$$$$', 'NULL']}
+    data = {'col1': [np.NAN, np.NaN, pd.NA, 'value', 'NULL']}
     dataframe = pd.DataFrame(data=data)
     # Act
     result = MixedNulls().run(dataframe)
     # Assert
-    assert_that(result.value, has_entry('col1', has_length(4)))
+    assert_that(result.value, has_entry('col1', has_length(3)))
 
 
 def test_null_list_param():
@@ -131,12 +131,22 @@ def test_mix_value_columns():
 
 def test_single_column_nulls_with_special_characters():
     # Arrange
-    data = {'col1': ['', '#@$', 'Nan!', '#nan', '<NaN>']}
+    data = {'col1': ['', 'value', 'Nan!', '#nan', '<NaN>']}
     dataframe = pd.DataFrame(data=data)
     # Act
     result = MixedNulls().run(dataframe)
     # Assert
-    assert_that(result.value, has_entry('col1', has_length(5)))
+    assert_that(result.value, has_entry('col1', has_length(4)))
+
+
+def test_single_column_nulls_only_special_characters():
+    # Arrange
+    data = {'col1': ['', '!@#$', 'Nan!', '#nan', '<NaN>']}
+    dataframe = pd.DataFrame(data=data)
+    # Act
+    result = MixedNulls().run(dataframe)
+    # Assert
+    assert_that(result.value, has_entry('col1', has_length(4)))
 
 
 def test_ignore_columns_single():