Nb/bug/small updates multi lang usecase #2583

Merged: 7 commits, Jun 7, 2023
44 changes: 44 additions & 0 deletions deepchecks/nlp/checks/data_integrity/text_property_outliers.py
@@ -13,13 +13,16 @@

import numpy as np
import pandas as pd
from typing_extensions import Self

from deepchecks import ConditionCategory, ConditionResult
from deepchecks.core import CheckResult, DatasetKind
from deepchecks.core.errors import NotEnoughSamplesError
from deepchecks.nlp import Context, SingleDatasetCheck
from deepchecks.nlp.utils.nlp_plot import get_text_outliers_graph
from deepchecks.utils.dataframes import hide_index_for_display
from deepchecks.utils.outliers import iqr_outliers_range, sharp_drop_outliers_range
from deepchecks.utils.strings import format_percent

__all__ = ['TextPropertyOutliers']

@@ -126,6 +129,7 @@ def run_logic(self, context: Context, dataset_kind: DatasetKind) -> CheckResult:
# we have in the data
'lower_limit': max(lower_limit, min(values_arr)),
'upper_limit': min(upper_limit, max(values_arr)) if is_numeric else None,
'outlier_ratio': len(text_outliers) / len(values_arr)
}

# Create display
@@ -181,3 +185,43 @@ def run_logic(self, context: Context, dataset_kind: DatasetKind) -> CheckResult:
display = None

return CheckResult(result, display=display)

    def add_condition_outlier_ratio_less_or_equal(self: Self, threshold: float = 0.05,
                                                  properties_to_ignore: t.Optional[t.List[str]] = None) -> Self:
        """Add condition - outlier ratio in every property is less than or equal to the given threshold.

        Parameters
        ----------
        threshold : float , default: 0.05
            Maximum threshold of outliers ratio per property.
        properties_to_ignore : t.Optional[t.List[str]] , default: None
            List of properties to ignore for the condition.
        """

        def condition(result: t.Dict[str, t.Any]):
            failed_properties = []
            worst_property = ''
            worst_ratio = 0

            for property_name, info in result.items():
                if properties_to_ignore is not None and property_name in properties_to_ignore:
                    continue
                if info['outlier_ratio'] > threshold:
                    failed_properties.append(property_name)
                if info['outlier_ratio'] > worst_ratio:
                    worst_property = property_name
                    worst_ratio = info['outlier_ratio']

            if len(failed_properties) > 0:
                return ConditionResult(ConditionCategory.FAIL,
                                       f'Found {len(failed_properties)} properties with outlier ratios above threshold.'
                                       f'</br>Property with highest ratio is {worst_property} with outlier ratio of '
                                       f'{format_percent(worst_ratio)}')
            else:
                return ConditionResult(ConditionCategory.PASS,
                                       f'All properties have outlier ratios below threshold. '
                                       f'Property with highest ratio is {worst_property} with outlier ratio of'
                                       f' {format_percent(worst_ratio)}')

        return self.add_condition(f'Outlier ratio in all properties is less or equal than {format_percent(threshold)}',
                                  condition)
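Reviewer note: a minimal usage sketch of the condition added above. The tweet_emotion loader path and the 'Average Word Length' property name are assumptions taken from deepchecks' docs rather than from this diff; the `result.value` keys mirror the new test further down.

```python
from deepchecks.nlp.checks import TextPropertyOutliers
from deepchecks.nlp.datasets.classification import tweet_emotion

# Demo data with pre-calculated text properties (loader assumed from the docs gallery).
_, test_dataset = tweet_emotion.load_data()

check = TextPropertyOutliers().add_condition_outlier_ratio_less_or_equal(
    threshold=0.05,
    properties_to_ignore=['Average Word Length'],  # assumed property name; excluded from the condition only
)
result = check.run(test_dataset)

# Each property entry now also carries the new 'outlier_ratio' key next to its limits.
print(result.value['Sentiment']['outlier_ratio'])
```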
2 changes: 1 addition & 1 deletion deepchecks/nlp/input_validations.py
@@ -67,7 +67,7 @@ def validate_modify_label(labels: Optional[TTextLabel], task_type: TaskType, exp
' of possible classes.')
labels = [[None]*len(labels[0]) if is_label_none(label_per_sample) else [int(x) for x in label_per_sample]
for label_per_sample in labels]
elif not all(isinstance(x, (str, int)) or pd.isna(x) for x in labels): # Classic classification
elif any(not isinstance(x, (str, np.integer, int)) and not pd.isna(x) for x in labels):
raise DeepchecksValueError('label must be a Sequence of strings or ints (multiclass classification) '
'or a Sequence of Sequences of strings or ints (multilabel classification)')
else:
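A quick note on why the validation line above changed: NumPy integer scalars are not instances of Python's built-in `int`, so integer labels coming out of a pandas/NumPy pipeline failed the old `isinstance(x, (str, int))` check even though they are valid class labels. A minimal illustration:

```python
import numpy as np

label = np.int64(3)
print(isinstance(label, int))         # False - why the old check rejected NumPy integer labels
print(isinstance(label, np.integer))  # True  - accepted by the updated check
```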
2 changes: 1 addition & 1 deletion deepchecks/nlp/suites/default_suites.py
@@ -59,7 +59,7 @@ def data_integrity(n_samples: int = None,
kwargs = {**non_none_args, **kwargs}
return Suite(
'Data Integrity Suite',
TextPropertyOutliers(**kwargs),
TextPropertyOutliers(**kwargs).add_condition_outlier_ratio_less_or_equal(),
UnknownTokens(**kwargs).add_condition_ratio_of_unknown_words_less_or_equal(),
UnderAnnotatedPropertySegments(**kwargs).add_condition_segments_relative_performance_greater_than(),
UnderAnnotatedMetaDataSegments(**kwargs).add_condition_segments_relative_performance_greater_than(),
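With the line above, the default data-integrity suite now ships with the outlier-ratio condition attached, so the check can fail (rather than only display) when any property's outlier ratio exceeds the 5% default. A hedged sketch of how this surfaces to users; the tweet_emotion loader is the same docs-based assumption as in the earlier snippet:

```python
from deepchecks.nlp.suites import data_integrity
from deepchecks.nlp.datasets.classification import tweet_emotion

_, test_dataset = tweet_emotion.load_data()

# TextPropertyOutliers now appears in the suite result with its outlier-ratio condition.
suite_result = data_integrity().run(test_dataset)
suite_result.show()
```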
@@ -14,6 +14,7 @@
* `How Does the Check Work? <#how-does-the-check-work>`__
* `Which Text Properties Are Used? <#which-text-properties-are-used>`__
* `Run the Check <#run-the-check>`__
* `Define a Condition <#define-a-condition>`__


Why Check for Outliers?
@@ -63,13 +64,25 @@

check = TextPropertyOutliers()
result = check.run(dataset)
result
result.show()

#%%
# Observe Graphic Result
# ^^^^^^^^^^^^^^^^^^^^^^
# In this example, we can find many tweets that are outliers - For example, in the "average word length" property,
# we can see that there are tweets with a very large average word length, which is is usually because of missing spaces
# in the tweet itself, or the fact that tweeter hashtags remained in the data and they don't contain spaces. This
# could be problematic for the model, as it cannot coprehend the hashtags as words, and it may cause the model to
# we can see that there are tweets with a very large average word length, which is usually because of missing spaces
# in the tweet itself, or the fact that tweeter hashtags remained in the data, and they don't contain spaces. This
# could be problematic for the model, as it cannot comprehend the hashtags as words, and it may cause the model to
# fail on these tweets.
#
# Define a Condition
# ------------------
#
# Now, we define a condition that enforces the outlier ratio in every property to be
# at most 10%. A condition is deepchecks' way to validate model and data quality, and
# it lets you know if anything goes wrong.

check = TextPropertyOutliers()
check.add_condition_outlier_ratio_less_or_equal(0.1)
result = check.run(dataset)
result.show(show_additional_outputs=False)
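As a follow-up to the gallery example above, the condition outcome can also be inspected programmatically instead of through the rendered display; `conditions_decision` is the API the new test in this PR uses, and `check` and `result` are the objects defined in the snippet above.

```python
# Continuing from the snippet above: inspect the condition decision directly.
for decision in check.conditions_decision(result):
    print(decision)
```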
@@ -152,7 +152,7 @@
# Integrity #2: Text Outliers
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# In the “Other” tab, Looking at the Text Outlier check result we can
# In the “Didn't Pass” tab, by looking at the Text Outlier check result we can
# derive several insights by hovering over the different values and inspecting the outlier texts:
#
# 1. hashtags (‘#…’) are usually several words
26 changes: 26 additions & 0 deletions tests/nlp/checks/data_integrity/text_property_outliers_test.py
@@ -15,6 +15,8 @@

from deepchecks.core.errors import NotEnoughSamplesError
from deepchecks.nlp.checks import TextPropertyOutliers
from deepchecks.utils.strings import format_percent
from tests.base.utils import equal_condition_result


def test_tweet_emotion_properties(tweet_emotion_train_test_textdata):
@@ -51,6 +53,30 @@ def test_tweet_emotion_properties(tweet_emotion_train_test_textdata):
assert_that((expected_series != result_series).sum().sum(), equal_to(0))


def test_tweet_emotion_condition(tweet_emotion_train_test_textdata):
    # Arrange
    _, test = tweet_emotion_train_test_textdata
    check = TextPropertyOutliers().add_condition_outlier_ratio_less_or_equal()
    # Act
    result = check.run(test)
    conditions_decisions = check.conditions_decision(result)

    # Assert
    assert_that(len(result.value['Sentiment']['indices']), equal_to(65))
    assert_that(result.value['Sentiment']['lower_limit'], close_to(-0.90, 0.01))
    assert_that(result.value['Sentiment']['upper_limit'], close_to(0.92, 0.01))

    assert_that(
        conditions_decisions[0],
        equal_condition_result(
            is_pass=False,
            name='Outlier ratio in all properties is less or equal than 5%',
            details='Found 1 properties with outlier ratios above threshold.</br>'
                    'Property with highest ratio is Toxicity with outlier ratio of 16.43%'
        ) # type: ignore
    )


def test_not_enough_samples(tweet_emotion_train_test_textdata):
# Arrange
_, test = tweet_emotion_train_test_textdata