condition for outlier check + bug fix in validation
Nadav-Barak committed Jun 7, 2023
1 parent d1af76b commit 59464b8
Showing 6 changed files with 90 additions and 7 deletions.
44 changes: 44 additions & 0 deletions deepchecks/nlp/checks/data_integrity/text_property_outliers.py
@@ -13,13 +13,16 @@

import numpy as np
import pandas as pd
from typing_extensions import Self

from deepchecks import ConditionCategory, ConditionResult
from deepchecks.core import CheckResult, DatasetKind
from deepchecks.core.errors import NotEnoughSamplesError
from deepchecks.nlp import Context, SingleDatasetCheck
from deepchecks.nlp.utils.nlp_plot import get_text_outliers_graph
from deepchecks.utils.dataframes import hide_index_for_display
from deepchecks.utils.outliers import iqr_outliers_range, sharp_drop_outliers_range
from deepchecks.utils.strings import format_percent

__all__ = ['TextPropertyOutliers']

@@ -126,6 +129,7 @@ def run_logic(self, context: Context, dataset_kind: DatasetKind) -> CheckResult:
# we have in the data
'lower_limit': max(lower_limit, min(values_arr)),
'upper_limit': min(upper_limit, max(values_arr)) if is_numeric else None,
'outlier_ratio': len(text_outliers) / len(values_arr)
}

# Create display
@@ -181,3 +185,43 @@ def run_logic(self, context: Context, dataset_kind: DatasetKind) -> CheckResult:
display = None

return CheckResult(result, display=display)

def add_condition_outlier_ratio_less_or_equal(self: Self, threshold: float = 0.05,
properties_to_ignore: t.Optional[t.List[str]] = None) -> Self:
"""Add condition - outlier ratio in every property is less or equal to ratio.
Parameters
----------
threshold : float , default: 0.05
Maximum threshold of outliers ratio per property.
properties_to_ignore : t.Optional[t.List[str]] , default: None
List of properties to ignore for the condition.
"""

def condition(result: t.Dict[str, t.Any]):
failed_properties = []
worst_property = ''
worst_ratio = 0

for property_name, info in result.items():
if properties_to_ignore is not None and property_name in properties_to_ignore:
continue
if info['outlier_ratio'] > threshold:
failed_properties.append(property_name)
if info['outlier_ratio'] > worst_ratio:
worst_property = property_name
worst_ratio = info['outlier_ratio']

if len(failed_properties) > 0:
return ConditionResult(ConditionCategory.FAIL,
f'Found {len(failed_properties)} properties with outlier ratios above threshold.'
f'</br>Property with highest ratio is {worst_property} with outlier ratio of '
f'{format_percent(worst_ratio)}')
else:
return ConditionResult(ConditionCategory.PASS,
f'All properties have outlier ratios below threshold. '
f'Property with highest ratio is {worst_property} with outlier ratio of'
f' {format_percent(worst_ratio)}')

return self.add_condition(f'Outlier ratio in all properties is less or equal than {format_percent(threshold)}',
condition)
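
For orientation, a minimal usage sketch of the new condition method (not part of the commit). It assumes an existing deepchecks TextData object named `dataset`; the 2% threshold and the "Text Length" property name are illustrative:

from deepchecks.nlp.checks import TextPropertyOutliers

# Fail the check if more than 2% of any property's values are outliers,
# skipping the "Text Length" property (illustrative name).
check = TextPropertyOutliers().add_condition_outlier_ratio_less_or_equal(
    threshold=0.02, properties_to_ignore=['Text Length']
)
result = check.run(dataset)  # `dataset` is a pre-built deepchecks TextData object
result.show()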
2 changes: 1 addition & 1 deletion deepchecks/nlp/input_validations.py
@@ -67,7 +67,7 @@ def validate_modify_label(labels: Optional[TTextLabel], task_type: TaskType, exp
' of possible classes.')
labels = [[None]*len(labels[0]) if is_label_none(label_per_sample) else [int(x) for x in label_per_sample]
for label_per_sample in labels]
-elif not all(isinstance(x, (str, int)) or pd.isna(x) for x in labels):  # Classic classification
+elif any(not isinstance(x, (str, np.integer, int)) and not pd.isna(x) for x in labels):
raise DeepchecksValueError('label must be a Sequence of strings or ints (multiclass classification) '
'or a Sequence of Sequences of strings or ints (multilabel classification)')
else:
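
A small illustration (not from the commit) of the bug this one-line change fixes: labels loaded through pandas or numpy arrive as numpy integer types, which fail a plain `isinstance(x, int)` test on most platforms, so the old validation raised on perfectly valid multiclass labels:

import numpy as np
import pandas as pd

labels = pd.Series([0, 1, 2]).to_numpy()  # labels often arrive as numpy integers
x = labels[0]

print(type(x))                                               # <class 'numpy.int64'>
print(isinstance(x, (str, int)) or pd.isna(x))               # False -> old check wrongly raised
print(isinstance(x, (str, np.integer, int)) or pd.isna(x))   # True -> accepted after the fix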
2 changes: 1 addition & 1 deletion deepchecks/nlp/suites/default_suites.py
@@ -59,7 +59,7 @@ def data_integrity(n_samples: int = None,
kwargs = {**non_none_args, **kwargs}
return Suite(
'Data Integrity Suite',
-TextPropertyOutliers(**kwargs),
+TextPropertyOutliers(**kwargs).add_condition_outlier_ratio_less_or_equal(),
UnknownTokens(**kwargs).add_condition_ratio_of_unknown_words_less_or_equal(),
UnderAnnotatedPropertySegments(**kwargs).add_condition_segments_relative_performance_greater_than(),
UnderAnnotatedMetaDataSegments(**kwargs).add_condition_segments_relative_performance_greater_than(),
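
Because the suite change above attaches the condition by default, suite users get it with no extra call. A sketch, assuming the suite is re-exported from deepchecks.nlp.suites (as the tabular suites are) and `dataset` is an existing TextData object:

from deepchecks.nlp.suites import data_integrity

suite = data_integrity()
result = suite.run(dataset)  # TextPropertyOutliers now evaluates its outlier-ratio condition
result.show()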
@@ -14,6 +14,7 @@
* `How Does the Check Work? <#how-does-the-check-work>`__
* `Which Text Properties Are Used? <#which-text-properties-are-used>`__
* `Run the Check <#run-the-check>`__
* `Define a Condition <#define-a-condition>`__
Why Check for Outliers?
@@ -63,13 +64,25 @@

check = TextPropertyOutliers()
result = check.run(dataset)
-result
+result.show()

#%%
# Observe Graphic Result
# ^^^^^^^^^^^^^^^^^^^^^^
# In this example, we can find many tweets that are outliers - For example, in the "average word length" property,
-# we can see that there are tweets with a very large average word length, which is is usually because of missing spaces
-# in the tweet itself, or the fact that tweeter hashtags remained in the data and they don't contain spaces. This
-# could be problematic for the model, as it cannot coprehend the hashtags as words, and it may cause the model to
+# we can see that there are tweets with a very large average word length, which is usually because of missing spaces
+# in the tweet itself, or the fact that Twitter hashtags remained in the data, and they don't contain spaces. This
+# could be problematic for the model, as it cannot comprehend the hashtags as words, and it may cause the model to
# fail on these tweets.
#
# Define a Condition
# ------------------
#
# Now, we define a condition that enforces the outlier ratio in each property to be
# at most 10%. A condition is deepchecks' way to validate model and data quality, and
# it lets you know if anything goes wrong.

check = TextPropertyOutliers()
check.add_condition_outlier_ratio_less_or_equal(0.1)
result = check.run(dataset)
result.show(show_additional_outputs=False)
@@ -152,7 +152,7 @@
# Integrity #2: Text Outliers
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
-# In the “Other” tab, Looking at the Text Outlier check result we can
+# In the “Didn't Pass” tab, by looking at the Text Outlier check result we can
# derive several insights by hovering over the different values and inspecting the outlier texts:
#
# 1. hashtags (‘#…’) are usually several words
26 changes: 26 additions & 0 deletions tests/nlp/checks/data_integrity/text_property_outliers_test.py
@@ -15,6 +15,8 @@

from deepchecks.core.errors import NotEnoughSamplesError
from deepchecks.nlp.checks import TextPropertyOutliers
from deepchecks.utils.strings import format_percent
from tests.base.utils import equal_condition_result


def test_tweet_emotion_properties(tweet_emotion_train_test_textdata):
@@ -51,6 +53,30 @@ def test_tweet_emotion_properties(tweet_emotion_train_test_textdata):
assert_that((expected_series != result_series).sum().sum(), equal_to(0))


def test_tweet_emotion_condition(tweet_emotion_train_test_textdata):
# Arrange
_, test = tweet_emotion_train_test_textdata
check = TextPropertyOutliers().add_condition_outlier_ratio_less_or_equal()
# Act
result = check.run(test)
conditions_decisions = check.conditions_decision(result)

# Assert
assert_that(len(result.value['Sentiment']['indices']), equal_to(65))
assert_that(result.value['Sentiment']['lower_limit'], close_to(-0.90, 0.01))
assert_that(result.value['Sentiment']['upper_limit'], close_to(0.92, 0.01))

assert_that(
conditions_decisions[0],
equal_condition_result(
is_pass=False,
name='Outlier ratio in all properties is less or equal than 5%',
details='Found 1 properties with outlier ratios above threshold.</br>'
'Property with highest ratio is Toxicity with outlier ratio of 16.43%'
) # type: ignore
)


def test_not_enough_samples(tweet_emotion_train_test_textdata):
# Arrange
_, test = tweet_emotion_train_test_textdata
