Return nan embeddings for empty samples #2649

Merged 5 commits on Jul 27, 2023

deepchecks/nlp/checks/data_integrity/conflicting_labels.py (+2, -0)
@@ -88,6 +88,8 @@ def _get_labels(self, dataset):
             labels = [tuple(np.where(row == 1)[0]) for row in dataset.label]
         elif dataset.task_type is TaskType.TEXT_CLASSIFICATION:
             labels = dataset.label
+        elif dataset.task_type is TaskType.OTHER:
+            raise DeepchecksValueError('Check is irrelevant when task type is not specified')
         else:
             raise DeepchecksValueError(f'Unknown task type - {dataset.task_type}')
         return labels
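Note: a minimal standalone sketch of the dispatch above, with a stand-in TaskType enum and a plain ValueError in place of DeepchecksValueError (only the diff is visible here, so the surrounding class is assumed). The new branch turns an unspecified task type into an explicit error instead of falling through to the generic 'Unknown task type' message.

from enum import Enum


class TaskType(Enum):
    # stand-in for the deepchecks.nlp TaskType enum, for illustration only
    TEXT_CLASSIFICATION = 'text_classification'
    OTHER = 'other'


def get_labels(task_type, label):
    if task_type is TaskType.TEXT_CLASSIFICATION:
        return label
    if task_type is TaskType.OTHER:
        # task type was never specified, so the check cannot interpret the labels
        raise ValueError('Check is irrelevant when task type is not specified')
    raise ValueError(f'Unknown task type - {task_type}')


assert get_labels(TaskType.TEXT_CLASSIFICATION, ['spam', 'ham']) == ['spam', 'ham']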

deepchecks/nlp/checks/data_integrity/special_characters.py (+3, -0)
@@ -102,6 +102,9 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
                 continue
             if len(sample) > self.max_chars_to_review_per_sample:
                 sample = random.sample(sample, self.max_chars_to_review_per_sample)
+            if len(sample) == 0:
+                percent_special_chars_in_sample[idx] = 0
+                continue
             special_chars_in_sample = [char for char in sample if char in self.special_characters_deny_list]
             percent_special_chars_in_sample[idx] = len(special_chars_in_sample) / len(sample)
             for char in frozenset(special_chars_in_sample):
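Note: the new guard records a ratio of 0 for empty samples, which would otherwise hit a ZeroDivisionError in the line that divides by len(sample). A minimal sketch of the per-sample computation, with an illustrative deny list (the real check uses its own special_characters_deny_list):

SPECIAL_CHARACTERS_DENY_LIST = frozenset('#@^&')  # illustrative only, not the check's real deny list


def percent_special_chars(sample: str) -> float:
    """Share of deny-listed characters in the sample; empty samples count as 0 instead of dividing by zero."""
    if len(sample) == 0:
        return 0
    special_chars_in_sample = [char for char in sample if char in SPECIAL_CHARACTERS_DENY_LIST]
    return len(special_chars_in_sample) / len(sample)


assert percent_special_chars('') == 0
assert percent_special_chars('a#') == 0.5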

@@ -216,6 +216,8 @@ def condition(result: t.Dict[str, t.Any]):
         for property_name, info in result.items():
             if properties_to_ignore is not None and property_name in properties_to_ignore:
                 continue
+            if isinstance(info, str):
+                continue
             if info['outlier_ratio'] > threshold:
                 failed_properties.append(property_name)
             if info['outlier_ratio'] > worst_ratio:
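Note: this guard matters because a per-property entry can hold an error-message string instead of an outlier-info dict (for example when a property could not be processed, such as the all-NaN property exercised by the new test below); indexing a string with ['outlier_ratio'] would raise a TypeError. A minimal sketch of the condition loop, with an assumed error string:

threshold = 0.05
result = {
    'Sentiment': {'outlier_ratio': 0.1643},
    'Subjectivity': 'Outliers could not be calculated',  # assumed error message, for illustration
}

failed_properties = []
for property_name, info in result.items():
    if isinstance(info, str):
        # skip properties whose outlier info is an error message rather than a dict
        continue
    if info['outlier_ratio'] > threshold:
        failed_properties.append(property_name)

assert failed_properties == ['Sentiment']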

deepchecks/nlp/utils/text_embeddings.py (+5, -2)
@@ -186,8 +186,11 @@ def len_safe_get_embedding(list_of_texts, model_name=EMBEDDING_MODEL, max_tokens
             text_lens.append(chunk_lens[idx])
             idx += 1
 
-        text_embedding = np.average(text_embeddings, axis=0, weights=text_lens)
-        text_embedding = text_embedding / np.linalg.norm(text_embedding)  # normalizes length to 1
+        if sum(text_lens) == 0:
+            text_embedding = np.ones((EMBEDDING_DIM, )) * np.nan
+        else:
+            text_embedding = np.average(text_embeddings, axis=0, weights=text_lens)
+            text_embedding = text_embedding / np.linalg.norm(text_embedding)  # normalizes length to 1
         result_embeddings.append(text_embedding.tolist())
 
     return result_embeddings
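Note: this is the core change of the PR. np.average raises ZeroDivisionError when its weights sum to zero, which is exactly what happens for a sample whose text produced no token chunks, so such samples now get an all-NaN vector of the expected embedding dimension that downstream code can detect with np.isnan. A minimal sketch of the fallback (EMBEDDING_DIM = 1536 is only illustrative; the real value comes from the embedding model in use):

import numpy as np

EMBEDDING_DIM = 1536  # illustrative; in the library this matches the chosen embedding model


def combine_chunk_embeddings(chunk_embeddings, chunk_lens):
    """Length-weighted average of chunk embeddings; all-NaN vector when the text had no chunks."""
    if sum(chunk_lens) == 0:
        # empty sample: nothing to average, so return a NaN placeholder of the right shape
        return np.ones((EMBEDDING_DIM,)) * np.nan
    embedding = np.average(chunk_embeddings, axis=0, weights=chunk_lens)
    return embedding / np.linalg.norm(embedding)  # normalize to unit length


empty = combine_chunk_embeddings([], [])
assert empty.shape == (EMBEDDING_DIM,) and np.isnan(empty).all()

weighted = combine_chunk_embeddings([np.ones(EMBEDDING_DIM), np.ones(EMBEDDING_DIM)], [3, 5])
assert np.isclose(np.linalg.norm(weighted), 1.0)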

tests/nlp/checks/data_integrity/special_characters_test.py (+16, -0)
@@ -66,6 +66,22 @@ def test_check_on_clean_dataset(clean_dataset):
     ))  # type: ignore
 
 
+def test_check_on_dataset_with_empty_sample():
+    # Arrange
+    data = TextData(raw_text=['', 'aa'])
+    check = SpecialCharacters().add_condition_samples_ratio_w_special_characters_less_or_equal(0)
+
+    # Act
+    result = check.run(dataset=data)
+
+    # Assert
+    assert_that(result.value, has_entries({
+        'samples_per_special_char': has_length(0),
+        'percent_of_samples_with_special_chars': equal_to(0),
+        'percent_special_chars_per_sample': has_length(2),
+    }))
+
+
 def test_check_on_samples_with_special_characters(dataset_with_special_characters):
     # Arrange
     check = SpecialCharacters().add_condition_samples_ratio_w_special_characters_less_or_equal(

tests/nlp/checks/data_integrity/text_property_outliers_test.py (+27, -1)
@@ -9,7 +9,7 @@
 # ----------------------------------------------------------------------------
 #
 """Test for the NLP TextPropertyOutliers check"""
-
+import numpy as np
 import pandas as pd
 from hamcrest import assert_that, close_to, equal_to
 
@@ -76,6 +76,32 @@ def test_tweet_emotion_condition(tweet_emotion_train_test_textdata):
     )
 
 
+def test_tweet_emotion_condition_property_with_nans(tweet_emotion_train_test_textdata):
+    # Arrange
+    _, test = tweet_emotion_train_test_textdata
+    test = test.copy()
+    test._properties['Subjectivity'] = test._properties['Subjectivity'] * np.nan
+    check = TextPropertyOutliers().add_condition_outlier_ratio_less_or_equal()
+    # Act
+    result = check.run(test)
+    conditions_decisions = check.conditions_decision(result)
+
+    # Assert
+    assert_that(len(result.value['Sentiment']['indices']), equal_to(65))
+    assert_that(result.value['Sentiment']['lower_limit'], close_to(-0.90, 0.01))
+    assert_that(result.value['Sentiment']['upper_limit'], close_to(0.92, 0.01))
+
+    assert_that(
+        conditions_decisions[0],
+        equal_condition_result(
+            is_pass=False,
+            name='Outlier ratio in all properties is less or equal than 5%',
+            details='Found 1 properties with outlier ratios above threshold.</br>'
+                    'Property with highest ratio is Toxicity with outlier ratio of 16.43%'
+        )  # type: ignore
+    )
+
+
 def test_not_enough_samples(tweet_emotion_train_test_textdata):
     # Arrange
     _, test = tweet_emotion_train_test_textdata