Return nan embeddings for empty samples #2649

Merged 5 commits on Jul 27, 2023

deepchecks/nlp/checks/data_integrity/conflicting_labels.py (+2, -0)
@@ -88,6 +88,8 @@ def _get_labels(self, dataset):
             labels = [tuple(np.where(row == 1)[0]) for row in dataset.label]
         elif dataset.task_type is TaskType.TEXT_CLASSIFICATION:
             labels = dataset.label
+        elif dataset.task_type is TaskType.OTHER:
+            raise DeepchecksValueError('Check is irrelevant when task type is not specified')
         else:
             raise DeepchecksValueError(f'Unknown task type - {dataset.task_type}')
         return labels
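Note: a minimal standalone sketch of the dispatch above, with a stand-in TaskType enum and a plain ValueError in place of DeepchecksValueError (only the diff is visible here, so the surrounding class is assumed). The new branch turns an unspecified task type into an explicit error instead of falling through to the generic 'Unknown task type' message.

from enum import Enum


class TaskType(Enum):
    # stand-in for the deepchecks.nlp TaskType enum, for illustration only
    TEXT_CLASSIFICATION = 'text_classification'
    OTHER = 'other'


def get_labels(task_type, label):
    if task_type is TaskType.TEXT_CLASSIFICATION:
        return label
    if task_type is TaskType.OTHER:
        # task type was never specified, so the check cannot interpret the labels
        raise ValueError('Check is irrelevant when task type is not specified')
    raise ValueError(f'Unknown task type - {task_type}')


assert get_labels(TaskType.TEXT_CLASSIFICATION, ['spam', 'ham']) == ['spam', 'ham']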

deepchecks/nlp/checks/data_integrity/special_characters.py (+3, -0)
@@ -102,6 +102,9 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
                 continue
             if len(sample) > self.max_chars_to_review_per_sample:
                 sample = random.sample(sample, self.max_chars_to_review_per_sample)
+            if len(sample) == 0:
+                percent_special_chars_in_sample[idx] = 0
+                continue
             special_chars_in_sample = [char for char in sample if char in self.special_characters_deny_list]
             percent_special_chars_in_sample[idx] = len(special_chars_in_sample) / len(sample)
             for char in frozenset(special_chars_in_sample):
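Note: the new guard records a ratio of 0 for empty samples, which would otherwise hit a ZeroDivisionError in the line that divides by len(sample). A minimal sketch of the per-sample computation, with an illustrative deny list (the real check uses its own special_characters_deny_list):

SPECIAL_CHARACTERS_DENY_LIST = frozenset('#@^&')  # illustrative only, not the check's real deny list


def percent_special_chars(sample: str) -> float:
    """Share of deny-listed characters in the sample; empty samples count as 0 instead of dividing by zero."""
    if len(sample) == 0:
        return 0
    special_chars_in_sample = [char for char in sample if char in SPECIAL_CHARACTERS_DENY_LIST]
    return len(special_chars_in_sample) / len(sample)


assert percent_special_chars('') == 0
assert percent_special_chars('a#') == 0.5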

@@ -216,6 +216,8 @@ def condition(result: t.Dict[str, t.Any]):
         for property_name, info in result.items():
             if properties_to_ignore is not None and property_name in properties_to_ignore:
                 continue
+            if isinstance(info, str):
+                continue
             if info['outlier_ratio'] > threshold:
                 failed_properties.append(property_name)
             if info['outlier_ratio'] > worst_ratio:
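Note: this guard matters because a per-property entry can hold an error-message string instead of an outlier-info dict (for example when a property could not be processed, such as the all-NaN property exercised by the new test below); indexing a string with ['outlier_ratio'] would raise a TypeError. A minimal sketch of the condition loop, with an assumed error string:

threshold = 0.05
result = {
    'Sentiment': {'outlier_ratio': 0.1643},
    'Subjectivity': 'Outliers could not be calculated',  # assumed error message, for illustration
}

failed_properties = []
for property_name, info in result.items():
    if isinstance(info, str):
        # skip properties whose outlier info is an error message rather than a dict
        continue
    if info['outlier_ratio'] > threshold:
        failed_properties.append(property_name)

assert failed_properties == ['Sentiment']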

deepchecks/nlp/utils/text_embeddings.py (+5, -2)
@@ -186,8 +186,11 @@ def len_safe_get_embedding(list_of_texts, model_name=EMBEDDING_MODEL, max_tokens
             text_lens.append(chunk_lens[idx])
             idx += 1
 
-        text_embedding = np.average(text_embeddings, axis=0, weights=text_lens)
-        text_embedding = text_embedding / np.linalg.norm(text_embedding)  # normalizes length to 1
+        if sum(text_lens) == 0:
+            text_embedding = np.ones((EMBEDDING_DIM, )) * np.nan
+        else:
+            text_embedding = np.average(text_embeddings, axis=0, weights=text_lens)
+            text_embedding = text_embedding / np.linalg.norm(text_embedding)  # normalizes length to 1
         result_embeddings.append(text_embedding.tolist())
 
     return result_embeddings
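Note: this is the core change of the PR. np.average raises ZeroDivisionError when its weights sum to zero, which is exactly what happens for a sample whose text produced no token chunks, so such samples now get an all-NaN vector of the expected embedding dimension that downstream code can detect with np.isnan. A minimal sketch of the fallback (EMBEDDING_DIM = 1536 is only illustrative; the real value comes from the embedding model in use):

import numpy as np

EMBEDDING_DIM = 1536  # illustrative; in the library this matches the chosen embedding model


def combine_chunk_embeddings(chunk_embeddings, chunk_lens):
    """Length-weighted average of chunk embeddings; all-NaN vector when the text had no chunks."""
    if sum(chunk_lens) == 0:
        # empty sample: nothing to average, so return a NaN placeholder of the right shape
        return np.ones((EMBEDDING_DIM,)) * np.nan
    embedding = np.average(chunk_embeddings, axis=0, weights=chunk_lens)
    return embedding / np.linalg.norm(embedding)  # normalize to unit length


empty = combine_chunk_embeddings([], [])
assert empty.shape == (EMBEDDING_DIM,) and np.isnan(empty).all()

weighted = combine_chunk_embeddings([np.ones(EMBEDDING_DIM), np.ones(EMBEDDING_DIM)], [3, 5])
assert np.isclose(np.linalg.norm(weighted), 1.0)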

tests/nlp/checks/data_integrity/special_characters_test.py (+16, -0)
@@ -66,6 +66,22 @@ def test_check_on_clean_dataset(clean_dataset):
     ))  # type: ignore
 
 
+def test_check_on_dataset_with_empty_sample():
+    # Arrange
+    data = TextData(raw_text=['', 'aa'])
+    check = SpecialCharacters().add_condition_samples_ratio_w_special_characters_less_or_equal(0)
+
+    # Act
+    result = check.run(dataset=data)
+
+    # Assert
+    assert_that(result.value, has_entries({
+        'samples_per_special_char': has_length(0),
+        'percent_of_samples_with_special_chars': equal_to(0),
+        'percent_special_chars_per_sample': has_length(2),
+    }))
+
+
 def test_check_on_samples_with_special_characters(dataset_with_special_characters):
     # Arrange
     check = SpecialCharacters().add_condition_samples_ratio_w_special_characters_less_or_equal(

tests/nlp/checks/data_integrity/text_property_outliers_test.py (+27, -1)
@@ -9,7 +9,7 @@
 # ----------------------------------------------------------------------------
 #
 """Test for the NLP TextPropertyOutliers check"""
-
+import numpy as np
 import pandas as pd
 from hamcrest import assert_that, close_to, equal_to
 
@@ -76,6 +76,32 @@ def test_tweet_emotion_condition(tweet_emotion_train_test_textdata):
     )
 
 
+def test_tweet_emotion_condition_property_with_nans(tweet_emotion_train_test_textdata):
+    # Arrange
+    _, test = tweet_emotion_train_test_textdata
+    test = test.copy()
+    test._properties['Subjectivity'] = test._properties['Subjectivity'] * np.nan
+    check = TextPropertyOutliers().add_condition_outlier_ratio_less_or_equal()
+    # Act
+    result = check.run(test)
+    conditions_decisions = check.conditions_decision(result)
+
+    # Assert
+    assert_that(len(result.value['Sentiment']['indices']), equal_to(65))
+    assert_that(result.value['Sentiment']['lower_limit'], close_to(-0.90, 0.01))
+    assert_that(result.value['Sentiment']['upper_limit'], close_to(0.92, 0.01))
+
+    assert_that(
+        conditions_decisions[0],
+        equal_condition_result(
+            is_pass=False,
+            name='Outlier ratio in all properties is less or equal than 5%',
+            details='Found 1 properties with outlier ratios above threshold.</br>'
+                    'Property with highest ratio is Toxicity with outlier ratio of 16.43%'
+        )  # type: ignore
+    )
+
+
 def test_not_enough_samples(tweet_emotion_train_test_textdata):
     # Arrange
     _, test = tweet_emotion_train_test_textdata