condition for outlier check + bug fix in validation
Nadav-Barak committed Jun 7, 2023
1 parent d1af76b commit 59464b8
Showing 6 changed files with 90 additions and 7 deletions.
44 changes: 44 additions & 0 deletions deepchecks/nlp/checks/data_integrity/text_property_outliers.py
@@ -13,13 +13,16 @@

import numpy as np
import pandas as pd
from typing_extensions import Self

from deepchecks import ConditionCategory, ConditionResult
from deepchecks.core import CheckResult, DatasetKind
from deepchecks.core.errors import NotEnoughSamplesError
from deepchecks.nlp import Context, SingleDatasetCheck
from deepchecks.nlp.utils.nlp_plot import get_text_outliers_graph
from deepchecks.utils.dataframes import hide_index_for_display
from deepchecks.utils.outliers import iqr_outliers_range, sharp_drop_outliers_range
from deepchecks.utils.strings import format_percent

__all__ = ['TextPropertyOutliers']

@@ -126,6 +129,7 @@ def run_logic(self, context: Context, dataset_kind: DatasetKind) -> CheckResult:
# we have in the data
'lower_limit': max(lower_limit, min(values_arr)),
'upper_limit': min(upper_limit, max(values_arr)) if is_numeric else None,
'outlier_ratio': len(text_outliers) / len(values_arr)
}

# Create display
@@ -181,3 +185,43 @@ def run_logic(self, context: Context, dataset_kind: DatasetKind) -> CheckResult:
display = None

return CheckResult(result, display=display)

def add_condition_outlier_ratio_less_or_equal(self: Self, threshold: float = 0.05,
properties_to_ignore: t.Optional[t.List[str]] = None) -> Self:
"""Add condition - outlier ratio in every property is less or equal to ratio.
Parameters
----------
threshold : float , default: 0.05
Maximum threshold of outliers ratio per property.
properties_to_ignore : t.Optional[t.List[str]] , default: None
List of properties to ignore for the condition.
"""

def condition(result: t.Dict[str, t.Any]):
failed_properties = []
worst_property = ''
worst_ratio = 0

for property_name, info in result.items():
if properties_to_ignore is not None and property_name in properties_to_ignore:
continue
if info['outlier_ratio'] > threshold:
failed_properties.append(property_name)
if info['outlier_ratio'] > worst_ratio:
worst_property = property_name
worst_ratio = info['outlier_ratio']

if len(failed_properties) > 0:
return ConditionResult(ConditionCategory.FAIL,
f'Found {len(failed_properties)} properties with outlier ratios above threshold.'
f'</br>Property with highest ratio is {worst_property} with outlier ratio of '
f'{format_percent(worst_ratio)}')
else:
return ConditionResult(ConditionCategory.PASS,
f'All properties have outlier ratios below threshold. '
f'Property with highest ratio is {worst_property} with outlier ratio of'
f' {format_percent(worst_ratio)}')

return self.add_condition(f'Outlier ratio in all properties is less or equal than {format_percent(threshold)}',
condition)
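
For orientation, a minimal usage sketch of the new condition method (not part of the commit). It assumes an existing deepchecks TextData object named `dataset`; the 2% threshold and the "Text Length" property name are illustrative:

from deepchecks.nlp.checks import TextPropertyOutliers

# Fail the check if more than 2% of any property's values are outliers,
# skipping the "Text Length" property (illustrative name).
check = TextPropertyOutliers().add_condition_outlier_ratio_less_or_equal(
    threshold=0.02, properties_to_ignore=['Text Length']
)
result = check.run(dataset)  # `dataset` is a pre-built deepchecks TextData object
result.show()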
2 changes: 1 addition & 1 deletion deepchecks/nlp/input_validations.py
@@ -67,7 +67,7 @@ def validate_modify_label(labels: Optional[TTextLabel], task_type: TaskType, exp
' of possible classes.')
labels = [[None]*len(labels[0]) if is_label_none(label_per_sample) else [int(x) for x in label_per_sample]
for label_per_sample in labels]
-elif not all(isinstance(x, (str, int)) or pd.isna(x) for x in labels):  # Classic classification
+elif any(not isinstance(x, (str, np.integer, int)) and not pd.isna(x) for x in labels):
raise DeepchecksValueError('label must be a Sequence of strings or ints (multiclass classification) '
'or a Sequence of Sequences of strings or ints (multilabel classification)')
else:
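
A small illustration (not from the commit) of the bug this one-line change fixes: labels loaded through pandas or numpy arrive as numpy integer types, which fail a plain `isinstance(x, int)` test on most platforms, so the old validation raised on perfectly valid multiclass labels:

import numpy as np
import pandas as pd

labels = pd.Series([0, 1, 2]).to_numpy()  # labels often arrive as numpy integers
x = labels[0]

print(type(x))                                               # <class 'numpy.int64'>
print(isinstance(x, (str, int)) or pd.isna(x))               # False -> old check wrongly raised
print(isinstance(x, (str, np.integer, int)) or pd.isna(x))   # True -> accepted after the fix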
2 changes: 1 addition & 1 deletion deepchecks/nlp/suites/default_suites.py
@@ -59,7 +59,7 @@ def data_integrity(n_samples: int = None,
kwargs = {**non_none_args, **kwargs}
return Suite(
'Data Integrity Suite',
-TextPropertyOutliers(**kwargs),
+TextPropertyOutliers(**kwargs).add_condition_outlier_ratio_less_or_equal(),
UnknownTokens(**kwargs).add_condition_ratio_of_unknown_words_less_or_equal(),
UnderAnnotatedPropertySegments(**kwargs).add_condition_segments_relative_performance_greater_than(),
UnderAnnotatedMetaDataSegments(**kwargs).add_condition_segments_relative_performance_greater_than(),
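
Because the suite change above attaches the condition by default, suite users get it with no extra call. A sketch, assuming the suite is re-exported from deepchecks.nlp.suites (as the tabular suites are) and `dataset` is an existing TextData object:

from deepchecks.nlp.suites import data_integrity

suite = data_integrity()
result = suite.run(dataset)  # TextPropertyOutliers now evaluates its outlier-ratio condition
result.show()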
@@ -14,6 +14,7 @@
* `How Does the Check Work? <#how-does-the-check-work>`__
* `Which Text Properties Are Used? <#which-text-properties-are-used>`__
* `Run the Check <#run-the-check>`__
* `Define a Condition <#define-a-condition>`__
Why Check for Outliers?
@@ -63,13 +64,25 @@

check = TextPropertyOutliers()
result = check.run(dataset)
-result
+result.show()

#%%
# Observe Graphic Result
# ^^^^^^^^^^^^^^^^^^^^^^
# In this example, we can find many tweets that are outliers - For example, in the "average word length" property,
-# we can see that there are tweets with a very large average word length, which is is usually because of missing spaces
-# in the tweet itself, or the fact that tweeter hashtags remained in the data and they don't contain spaces. This
-# could be problematic for the model, as it cannot coprehend the hashtags as words, and it may cause the model to
+# we can see that there are tweets with a very large average word length, which is usually because of missing spaces
+# in the tweet itself, or the fact that Twitter hashtags remained in the data, and they don't contain spaces. This
+# could be problematic for the model, as it cannot comprehend the hashtags as words, and it may cause the model to
# fail on these tweets.
#
# Define a Condition
# ------------------
#
# Now, we define a condition that enforces the outlier ratio in each property to be
# at most 10%. A condition is deepchecks' way to validate model and data quality, and
# it lets you know if anything goes wrong.

check = TextPropertyOutliers()
check.add_condition_outlier_ratio_less_or_equal(0.1)
result = check.run(dataset)
result.show(show_additional_outputs=False)
@@ -152,7 +152,7 @@
# Integrity #2: Text Outliers
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
-# In the “Other” tab, Looking at the Text Outlier check result we can
+# In the “Didn't Pass” tab, by looking at the Text Outlier check result we can
# derive several insights by hovering over the different values and inspecting the outlier texts:
#
# 1. hashtags (‘#…’) are usually several words
26 changes: 26 additions & 0 deletions tests/nlp/checks/data_integrity/text_property_outliers_test.py
@@ -15,6 +15,8 @@

from deepchecks.core.errors import NotEnoughSamplesError
from deepchecks.nlp.checks import TextPropertyOutliers
from deepchecks.utils.strings import format_percent
from tests.base.utils import equal_condition_result


def test_tweet_emotion_properties(tweet_emotion_train_test_textdata):
@@ -51,6 +53,30 @@ def test_tweet_emotion_properties(tweet_emotion_train_test_textdata):
assert_that((expected_series != result_series).sum().sum(), equal_to(0))


def test_tweet_emotion_condition(tweet_emotion_train_test_textdata):
# Arrange
_, test = tweet_emotion_train_test_textdata
check = TextPropertyOutliers().add_condition_outlier_ratio_less_or_equal()
# Act
result = check.run(test)
conditions_decisions = check.conditions_decision(result)

# Assert
assert_that(len(result.value['Sentiment']['indices']), equal_to(65))
assert_that(result.value['Sentiment']['lower_limit'], close_to(-0.90, 0.01))
assert_that(result.value['Sentiment']['upper_limit'], close_to(0.92, 0.01))

assert_that(
conditions_decisions[0],
equal_condition_result(
is_pass=False,
name='Outlier ratio in all properties is less or equal than 5%',
details='Found 1 properties with outlier ratios above threshold.</br>'
'Property with highest ratio is Toxicity with outlier ratio of 16.43%'
) # type: ignore
)


def test_not_enough_samples(tweet_emotion_train_test_textdata):
# Arrange
_, test = tweet_emotion_train_test_textdata
