From b1fa92e02e58dba5f001a7eb81962c1e72ef8794 Mon Sep 17 00:00:00 2001 From: Nir Hutnik <92314933+nirhutnik@users.noreply.github.com> Date: Sun, 18 Jun 2023 11:35:39 +0300 Subject: [PATCH 01/23] Changed model_classes logger warning and made sure tabular only warns once (#2604) * Changed model_classes logger warning, and made sure that tabular only prints that warning once by updating model_classes like in NLP * Added fix for BoostingOverfit error message --- deepchecks/nlp/context.py | 4 ++- .../model_evaluation/boosting_overfit.py | 35 ++++++++++--------- deepchecks/tabular/context.py | 7 ++-- 3 files changed, 27 insertions(+), 19 deletions(-) diff --git a/deepchecks/nlp/context.py b/deepchecks/nlp/context.py index 4641bf5895..31eef63e3b 100644 --- a/deepchecks/nlp/context.py +++ b/deepchecks/nlp/context.py @@ -343,7 +343,9 @@ def model_classes(self) -> t.List: # If in infer_observed_and_model_labels we didn't find classes on model, or user didn't pass any, # then using the observed self._model_classes = self._observed_classes - get_logger().warning('Could not find model\'s classes, using the observed classes') + get_logger().warning('Could not find model\'s classes, using the observed classes. ' + 'In order to make sure the classes used by the model are inferred correctly, ' + 'please use the model_classes argument') return self._model_classes @property diff --git a/deepchecks/tabular/checks/model_evaluation/boosting_overfit.py b/deepchecks/tabular/checks/model_evaluation/boosting_overfit.py index f6ae6dda9a..4879db620e 100644 --- a/deepchecks/tabular/checks/model_evaluation/boosting_overfit.py +++ b/deepchecks/tabular/checks/model_evaluation/boosting_overfit.py @@ -32,10 +32,12 @@ class PartialBoostingModel: """Wrapper for boosting models which limits the number of estimators being used in the prediction.""" - _UNSUPPORTED_MODEL_ERROR = ( - 'Check is relevant for Boosting models of type ' - '{supported_models}, but received model of type {model_type}' - ) + _UNSUPPORTED_MODEL_ERROR = \ + 'Check is relevant for Boosting models of type {supported_models}, but received model of type {model_type}' + + _NO_MODEL_ERROR = \ + 'Check is relevant only when receiving the model, but predictions/probabilities were received instead. ' \ + 'In order to use this check, please pass the model to the run() method.' 
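    # For illustration only - a minimal sketch of the call that the _NO_MODEL_ERROR message above
    # asks for, assuming a fitted sklearn-style boosting model and deepchecks tabular Datasets
    # named train_ds/test_ds (these names are placeholders, not taken from this patch):
    #
    #     from deepchecks.tabular.checks import BoostingOverfit
    #     BoostingOverfit().run(train_ds, test_ds, model=fitted_model)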
_SUPPORTED_CLASSIFICATION_MODELS = ( 'AdaBoostClassifier', @@ -78,6 +80,16 @@ def __init__(self, model, step): else: self.model = model + @classmethod + def _raise_not_supported_model_error(cls, model_class): + if model_class != '_DummyModel': + raise ModelValidationError(cls._UNSUPPORTED_MODEL_ERROR.format( + supported_models=cls._SUPPORTED_MODELS, + model_type=model_class + )) + else: + raise ModelValidationError(cls._NO_MODEL_ERROR) + def predict_proba(self, x): if self.model_class in ['AdaBoostClassifier', 'GradientBoostingClassifier']: return self.model.predict_proba(x) @@ -88,10 +100,7 @@ def predict_proba(self, x): elif self.model_class == 'CatBoostClassifier': return self.model.predict_proba(x, ntree_end=self.step) else: - raise ModelValidationError(self._UNSUPPORTED_MODEL_ERROR.format( - supported_models=self._SUPPORTED_CLASSIFICATION_MODELS, - model_type=self.model_class - )) + self._raise_not_supported_model_error(self.model_class) def predict(self, x): if self.model_class in ['AdaBoostClassifier', 'GradientBoostingClassifier', 'AdaBoostRegressor', @@ -104,10 +113,7 @@ def predict(self, x): elif self.model_class in ['CatBoostClassifier', 'CatBoostRegressor']: return self.model.predict(x, ntree_end=self.step) else: - raise ModelValidationError(self._UNSUPPORTED_MODEL_ERROR.format( - supported_models=self._SUPPORTED_MODELS, - model_type=self.model_class - )) + self._raise_not_supported_model_error(self.model_class) @classmethod def n_estimators(cls, model): @@ -123,10 +129,7 @@ def n_estimators(cls, model): elif model_class in ['CatBoostClassifier', 'CatBoostRegressor']: return model.tree_count_ else: - raise ModelValidationError(cls._UNSUPPORTED_MODEL_ERROR.format( - supported_models=cls._SUPPORTED_MODELS, - model_type=model_class - )) + cls._raise_not_supported_model_error(model_class=model_class) class BoostingOverfit(TrainTestCheck): diff --git a/deepchecks/tabular/context.py b/deepchecks/tabular/context.py index c8b19b712c..48d10320fe 100644 --- a/deepchecks/tabular/context.py +++ b/deepchecks/tabular/context.py @@ -291,8 +291,11 @@ def model_classes(self) -> t.List: """Return ordered list of possible label classes for classification tasks or None for regression.""" if self._model_classes is None and self.task_type in (TaskType.BINARY, TaskType.MULTICLASS): # If in infer_task_type we didn't find classes on model, or user didn't pass any, then using the observed - get_logger().warning('Could not find model\'s classes, using the observed classes') - return self.observed_classes + get_logger().warning('Could not find model\'s classes, using the observed classes. 
' + 'In order to make sure the classes used by the model are inferred correctly, ' + 'please use the model_classes argument') + self._model_classes = self.observed_classes + return self._model_classes @property From c521fc78f976d1a4bf3f2654555d2fdc16cd7bdb Mon Sep 17 00:00:00 2001 From: JKL98ISR Date: Tue, 20 Jun 2023 13:16:48 +0300 Subject: [PATCH 02/23] fix many warnings when using joblib (weak segments) (#2609) * -many-warnings-when * -many-warnings-when --- deepchecks/analytics/anonymous_telemetry.py | 1 + spelling-allowlist.txt | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/deepchecks/analytics/anonymous_telemetry.py b/deepchecks/analytics/anonymous_telemetry.py index f0e0a669b3..304610784b 100644 --- a/deepchecks/analytics/anonymous_telemetry.py +++ b/deepchecks/analytics/anonymous_telemetry.py @@ -47,5 +47,6 @@ def validate_latest_version(): ' Deepchecks is frequently updated with major improvements. You should consider ' 'upgrading via the "python -m pip install --upgrade deepchecks" command.', deepchecks.__version__) + os.environ['DISABLE_DEEPCHECKS_ANONYMOUS_TELEMETRY'] = 'True' # to ignore joblib except Exception: # pylint: disable=broad-except pass diff --git a/spelling-allowlist.txt b/spelling-allowlist.txt index 8ae83ecca4..e365cde865 100644 --- a/spelling-allowlist.txt +++ b/spelling-allowlist.txt @@ -161,4 +161,5 @@ ai idx isnan 'isnan' -ufunc \ No newline at end of file +ufunc +joblib \ No newline at end of file From 18aa661be308ecda1b79c09863a9ed36034cbe2d Mon Sep 17 00:00:00 2001 From: Harsh Jain <136261806+harsh-deepchecks@users.noreply.github.com> Date: Sun, 25 Jun 2023 15:23:53 +0530 Subject: [PATCH 03/23] Not raise DeepchecksProcessError for UnderAnnotated + WeakSegments (#2611) * Not raise DeepchecksProcessError for UnderAnnotated + WeakSegments * Fix to pass condition is message is there * Making annotation_ratio configurable parameter * Added test for more coverage * Update deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com> * Update deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com> * Resolved comments and minor fixes --------- Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com> --- .../under_annotated_segments.py | 28 +++++++++--- .../weak_segments_performance.py | 12 +++-- .../utils/abstracts/weak_segment_abstract.py | 3 ++ .../under_annotated_segments_test.py | 44 ++++++++++++++----- .../weak_segment_performance_test.py | 24 +++++++++- 5 files changed, 91 insertions(+), 20 deletions(-) diff --git a/deepchecks/nlp/checks/data_integrity/under_annotated_segments.py b/deepchecks/nlp/checks/data_integrity/under_annotated_segments.py index 8f64c62b98..4b942657ad 100644 --- a/deepchecks/nlp/checks/data_integrity/under_annotated_segments.py +++ b/deepchecks/nlp/checks/data_integrity/under_annotated_segments.py @@ -18,7 +18,7 @@ from deepchecks import ConditionCategory, ConditionResult from deepchecks.core import CheckResult from deepchecks.core.check_result import DisplayMap -from deepchecks.core.errors import DeepchecksProcessError +from deepchecks.core.errors import NotEnoughSamplesError from deepchecks.nlp import Context, SingleDatasetCheck from deepchecks.nlp.utils.text import break_to_lines_and_trim from deepchecks.nlp.utils.weak_segments import get_relevant_data_table @@ -30,6 +30,10 @@ __all__ = ['UnderAnnotatedMetaDataSegments', 
'UnderAnnotatedPropertySegments'] MAX_SAMPLES_IN_FIGURE = 1000 +# The threshold the UnderAnnotatedSegments considers the data to be well +# annotated and skips the checks +ANNOTATION_RATIO_THRESHOLD = 95.0 +MIN_TEXT_SAMPLES = 10 # Min samples to calculate under annotated segments class UnderAnnotatedSegments(SingleDatasetCheck, WeakSegmentAbstract): @@ -37,8 +41,8 @@ class UnderAnnotatedSegments(SingleDatasetCheck, WeakSegmentAbstract): def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], None], ignore_columns: Union[Hashable, List[Hashable], None], n_top_features: int, - segment_minimum_size_ratio: float, n_samples: int, - categorical_aggregation_threshold: float, n_to_show: int, **kwargs): + segment_minimum_size_ratio: float, n_samples: int, n_to_show: int, + categorical_aggregation_threshold: float, **kwargs): super().__init__(**kwargs) self.segment_by = segment_by self.columns = columns @@ -48,6 +52,7 @@ def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], Non self.n_samples = n_samples self.n_to_show = n_to_show self.categorical_aggregation_threshold = categorical_aggregation_threshold + self.annotation_ratio_threshold = ANNOTATION_RATIO_THRESHOLD def run_logic(self, context: Context, dataset_kind) -> CheckResult: """Run check.""" @@ -59,6 +64,17 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult: n_top_features=self.n_top_features) score_per_sample = pd.Series([1 - is_label_none(x) for x in text_data.label], index=features.index) + annotation_ratio = round(score_per_sample.sum() * 100 / text_data.n_samples, 2) + if annotation_ratio > self.annotation_ratio_threshold: + display_msg = f'Under annotated {self.segment_by} segments check is skipped since your data ' \ + f'annotation ratio is > {self.annotation_ratio_threshold}%. Try increasing the ' \ + 'annotation_ratio_threshold parameter.' + return CheckResult(value={'message': display_msg}, display=[display_msg]) + + if text_data.n_samples < MIN_TEXT_SAMPLES: + raise NotEnoughSamplesError(f'Not enough samples to calculate under annotated {self.segment_by} ' + 'segments. Minimum 10 samples required.') + encoded_dataset = self._target_encode_categorical_features_fill_na(features, score_per_sample, cat_features) @@ -68,9 +84,9 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult: scorer_name='Annotation Ratio') if len(weak_segments) == 0: - raise DeepchecksProcessError('Check was unable to find under annotated segments. This is expected if ' - 'your data is well annotated. If this is not the case, try increasing ' - f'n_samples or supply more {self.segment_by}.') + display_msg = 'Check was unable to find under annotated segments. Try ' \ + f'supplying more {self.segment_by}.' 
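        # Returning a CheckResult whose value is {'message': display_msg}, rather than raising
        # DeepchecksProcessError as before, lets the "segments relative performance" condition
        # pass gracefully - see the matching `if 'message' in result:` branch added to
        # weak_segment_abstract.py later in this patch.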
+ return CheckResult(value={'message': display_msg}, display=[display_msg]) check_result_value = self._generate_check_result_value(weak_segments, cat_features, avg_score) display_msg = f'Showcasing intersections of {self.segment_by} that result in the most ' \ diff --git a/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py b/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py index 8e29d8c1eb..e9c77c96ff 100644 --- a/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py +++ b/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py @@ -17,7 +17,7 @@ from deepchecks.core import CheckResult from deepchecks.core.check_result import DisplayMap -from deepchecks.core.errors import DeepchecksNotSupportedError, DeepchecksProcessError +from deepchecks.core.errors import DeepchecksNotSupportedError, NotEnoughSamplesError from deepchecks.nlp import Context, SingleDatasetCheck from deepchecks.nlp.utils.weak_segments import get_relevant_data_table from deepchecks.tabular.context import _DummyModel @@ -27,6 +27,8 @@ __all__ = ['MetadataSegmentsPerformance', 'PropertySegmentsPerformance'] +MIN_TEXT_SAMPLES = 10 # Min samples to calculate weak segments performance + class WeakSegmentsAbstractText(SingleDatasetCheck, WeakSegmentAbstract): """Check the performance of the model on different segments of the data.""" @@ -55,6 +57,9 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult: text_data = context.get_data_by_kind(dataset_kind) text_data = text_data.sample(self.n_samples, random_state=context.random_state, drop_na_label=True) + if text_data.n_samples < MIN_TEXT_SAMPLES: + raise NotEnoughSamplesError(f'Not enough samples to find weak {self.segment_by} segments.' + f' Minimum {MIN_TEXT_SAMPLES} samples required.') features, cat_features = get_relevant_data_table(text_data, data_type=self.segment_by, columns=self.columns, ignore_columns=self.ignore_columns, n_top_features=self.n_top_features) @@ -105,8 +110,9 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult: dummy_model=dummy_model, scorer=scorer) if len(weak_segments) == 0: - raise DeepchecksProcessError('WeakSegmentsPerformance was unable to train an error model to find weak ' - f'segments. Try increasing n_samples or supply more {self.segment_by}.') + display_msg = 'WeakSegmentsPerformance was unable to train an error model to find weak segments.'\ + f'Try supplying additional {self.segment_by}.' 
+ return CheckResult(value={'message': display_msg}, display=[display_msg]) if context.with_display: display = self._create_heatmap_display(data=encoded_dataset.data, weak_segments=weak_segments, diff --git a/deepchecks/utils/abstracts/weak_segment_abstract.py b/deepchecks/utils/abstracts/weak_segment_abstract.py index 061ae58541..2711715f77 100644 --- a/deepchecks/utils/abstracts/weak_segment_abstract.py +++ b/deepchecks/utils/abstracts/weak_segment_abstract.py @@ -330,6 +330,9 @@ def add_condition_segments_relative_performance_greater_than(self, max_ratio_cha """ def condition(result: Dict) -> ConditionResult: + if 'message' in result: + return ConditionResult(ConditionCategory.PASS, result['message']) + weakest_segment_score = result['weak_segments_list'].iloc[0, 0] scorer_name = result['weak_segments_list'].columns[0].lower() msg = f'Found a segment with {scorer_name} of {format_number(weakest_segment_score, 3)} ' \ diff --git a/tests/nlp/checks/data_integrity/under_annotated_segments_test.py b/tests/nlp/checks/data_integrity/under_annotated_segments_test.py index 109bd8a793..4ff02be71e 100644 --- a/tests/nlp/checks/data_integrity/under_annotated_segments_test.py +++ b/tests/nlp/checks/data_integrity/under_annotated_segments_test.py @@ -12,7 +12,7 @@ import numpy as np from hamcrest import assert_that, calling, close_to, equal_to, has_items, raises -from deepchecks.core.errors import DeepchecksProcessError +from deepchecks.core.errors import NotEnoughSamplesError from deepchecks.nlp.checks import UnderAnnotatedMetaDataSegments, UnderAnnotatedPropertySegments from tests.base.utils import equal_condition_result @@ -55,8 +55,10 @@ def test_tweet_emotion_metadata(tweet_emotion_train_test_textdata): # Assert assert_that(condition_result, has_items( equal_condition_result(is_pass=False, - details='Found a segment with annotation ratio of 0.366 in comparison to an average score of 0.5 in sampled data.', - name='The relative performance of weakest segment is greater than 80% of average model performance.') + details='Found a segment with annotation ratio of 0.366 in comparison to an average ' + 'score of 0.5 in sampled data.', + name='The relative performance of weakest segment is greater than 80% of average ' + 'model performance.') )) assert_that(result.value['avg_score'], close_to(0.5, 0.001)) @@ -89,13 +91,13 @@ def test_tweet_emotion_metadata_fully_annotated(tweet_emotion_train_test_textdat _, test = tweet_emotion_train_test_textdata check = UnderAnnotatedMetaDataSegments().add_condition_segments_relative_performance_greater_than() - # Act & Assert - assert_that( - calling(check.run).with_args(test), - raises(DeepchecksProcessError, 'Check was unable to find under annotated segments. This is expected if ' - 'your data is well annotated. If this is not the case, try increasing ' - 'n_samples or supply more metadata.') - ) + # Act + result = check.run(test) + + # Assert + assert_that(result.value['message'], equal_to('Under annotated metadata segments check is ' + 'skipped since your data annotation ratio is > 95.0%. 
' + 'Try increasing the annotation_ratio_threshold parameter.')) def test_token_classification_dataset(small_wikiann_train_test_text_data): @@ -150,3 +152,25 @@ def test_multilabel_dataset(multilabel_mock_dataset_and_probabilities): assert_that(result.value['avg_score'], close_to(0.5, 0.001)) assert_that(len(result.value['weak_segments_list']), equal_to(4)) assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.326, 0.01)) + + +def test_not_enough_samples(tweet_emotion_train_test_textdata): + # Arrange + _, test = tweet_emotion_train_test_textdata + text_data = test.sample(5) + text_data.label[0] = np.nan + text_data.label[3] = None + + # Act & Assert + property_check = UnderAnnotatedPropertySegments(segment_minimum_size_ratio=0.04) + metadata_check = UnderAnnotatedMetaDataSegments(segment_minimum_size_ratio=0.04) + assert_that( + calling(property_check.run).with_args(text_data), + raises(NotEnoughSamplesError, + 'Not enough samples to calculate under annotated properties segments. Minimum 10 samples required.' + )) + assert_that( + calling(metadata_check.run).with_args(text_data), + raises(NotEnoughSamplesError, + 'Not enough samples to calculate under annotated metadata segments. Minimum 10 samples required.' + )) diff --git a/tests/nlp/checks/model_evaluation/weak_segment_performance_test.py b/tests/nlp/checks/model_evaluation/weak_segment_performance_test.py index a440e8d119..c5cbcb3140 100644 --- a/tests/nlp/checks/model_evaluation/weak_segment_performance_test.py +++ b/tests/nlp/checks/model_evaluation/weak_segment_performance_test.py @@ -14,7 +14,7 @@ import pytest from hamcrest import assert_that, calling, close_to, equal_to, has_items, is_in, matches_regexp, raises -from deepchecks.core.errors import DeepchecksNotSupportedError +from deepchecks.core.errors import DeepchecksNotSupportedError, NotEnoughSamplesError from deepchecks.nlp.checks import MetadataSegmentsPerformance, PropertySegmentsPerformance from tests.base.utils import equal_condition_result @@ -184,3 +184,25 @@ def test_binary_classification(binary_mock_dataset_and_probabilities): assert_that(result.value['avg_score'], close_to(0.447, 0.001)) assert_that(len(result.value['weak_segments_list']), equal_to(6)) assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.34, 0.01)) + + +def test_not_enough_samples(tweet_emotion_train_test_textdata, tweet_emotion_train_test_probabilities): + + _, test = tweet_emotion_train_test_textdata + _, test_probas = tweet_emotion_train_test_probabilities + property_check = PropertySegmentsPerformance(n_top_properties=3) + metadata_check = MetadataSegmentsPerformance(n_top_columns=2) + text_data = test.sample(5) + text_data.label[0] = np.nan + text_data.label[3] = None + + assert_that( + calling(property_check.run).with_args(text_data), + raises(NotEnoughSamplesError, + 'Not enough samples to find weak properties segments. Minimum 10 samples required.' + )) + assert_that( + calling(metadata_check.run).with_args(text_data), + raises(NotEnoughSamplesError, + 'Not enough samples to find weak metadata segments. Minimum 10 samples required.' 
+ )) From dac4bc64d436ceb6fb5b82f1a2d24a64bb09c8ac Mon Sep 17 00:00:00 2001 From: Harsh Jain <136261806+harsh-deepchecks@users.noreply.github.com> Date: Sun, 25 Jun 2023 18:21:40 +0530 Subject: [PATCH 04/23] Added .describe() function to the TextData class (#2606) * Added .describe() function to the TextData class * Added handling for categorical data and added to the doc * Import fix * Small Fix * Pylint and Docstring fix * Fixed few comments and pylint issue * Refactoring of the code and adding (i) to the plot titles * Couple of fixes * Apply same formatting to mean and median * Fix * small fix * Update docs/source/nlp/usage_guides/text_data_object.rst Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com> * Resolved comments * Allow pie and table to display even if properties are not there * Few fixes * Added support for multi-label and token classification * Added various test cases * Pylint fix * Fix pylint * Small fix * Update deepchecks/nlp/text_data.py Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com> * Update deepchecks/nlp/text_data.py Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com> * Update docs/source/nlp/usage_guides/text_data_object.rst Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com> * Resolved comments * Small import fix --------- Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com> --- deepchecks/nlp/text_data.py | 63 ++++ deepchecks/nlp/utils/text_data_plot.py | 324 ++++++++++++++++++ deepchecks/utils/plot.py | 5 + .../nlp/usage_guides/text_data_object.rst | 8 + tests/nlp/test_text_data.py | 86 ++++- 5 files changed, 484 insertions(+), 2 deletions(-) create mode 100644 deepchecks/nlp/utils/text_data_plot.py diff --git a/deepchecks/nlp/text_data.py b/deepchecks/nlp/text_data.py index 9ebe2183cf..2e5c078d39 100644 --- a/deepchecks/nlp/text_data.py +++ b/deepchecks/nlp/text_data.py @@ -23,6 +23,7 @@ validate_raw_text, validate_tokenized_text) from deepchecks.nlp.task_type import TaskType, TTextLabel from deepchecks.nlp.utils.text import break_to_lines_and_trim +from deepchecks.nlp.utils.text_data_plot import text_data_describe_plot from deepchecks.nlp.utils.text_embeddings import calculate_builtin_embeddings from deepchecks.nlp.utils.text_properties import calculate_builtin_properties, get_builtin_properties_types from deepchecks.utils.logger import get_logger @@ -528,6 +529,14 @@ def categorical_properties(self) -> t.List[str]: """Return categorical properties names.""" return self._cat_properties + @property + def numerical_properties(self) -> t.List[str]: + """Return numerical properties names.""" + if self._properties is not None: + return [prop for prop in self._properties.columns if prop not in self._cat_properties] + else: + return [] + @property def task_type(self) -> t.Optional[TaskType]: """Return the task type. @@ -733,6 +742,60 @@ def is_sampled(self, n_samples: t.Optional[int]): return False return self.n_samples > n_samples + def describe(self, n_properties_to_show: t.Optional[int] = 4, properties_to_show: t.Optional[t.List[str]] = None, + max_num_labels_to_show: t.Optional[int] = 5, model_classes: t.Optional[t.List[str]] = None): + """Provide holistic view of the data. + + Generates the following plots: + 1. Label distribution + 2. Statistics about the data such as number of samples, annotation ratio, list of metadata columns, list of + text properties and so on. + 3. 
Property distribution for the text properties defined either by n_properties_to_show or properties_to_show + parameter. + + Parameters + ---------- + n_properties_to_show : int, default: 4 + Number of properties to consider for generating property distribution graphs. If properties_to_show + is provided, this value is ignored. + properties_to_show : List[str], default: None + List of property names to consider for generating property distribution graphs. If None, all the + properties are considered. + max_num_labels_to_show : int, default: 5 + The threshold to display the maximum number of labels on the label distribution pie chart and + display rest of the labels under "Others" category. + model_classes : Optional[List[str]], default: None + List of classes names to use for multi-label display. Only used if the dataset is multi-label. + + Returns + ------- + Displays the Plotly Figure. + """ + prop_names = [] + all_properties_data = pd.DataFrame() + if self._properties is None and properties_to_show is not None: + raise DeepchecksValueError('No properties exist!') + elif self._properties is not None: + if properties_to_show is not None: + prop_names = [prop for prop in properties_to_show if prop in self.properties.columns] + if len(prop_names) != len(properties_to_show): + raise DeepchecksValueError(f'{set(properties_to_show)-set(prop_names)} ' + 'properties does not exist in the TextData object') + else: + prop_names = list(self.properties.columns)[:n_properties_to_show] + all_properties_data = self.properties[prop_names] + + fig = text_data_describe_plot(properties=all_properties_data, n_samples=self.n_samples, + is_multi_label=self.is_multi_label_classification(), task_type=self.task_type, + categorical_metadata=self.categorical_metadata, + numerical_metadata=self.numerical_metadata, + categorical_properties=self.categorical_properties, + numerical_properties=self.numerical_properties, label=self._label, + model_classes=model_classes, + max_num_labels_to_show=max_num_labels_to_show) + + return fig + @contextlib.contextmanager def disable_deepchecks_logger(): diff --git a/deepchecks/nlp/utils/text_data_plot.py b/deepchecks/nlp/utils/text_data_plot.py new file mode 100644 index 0000000000..ca41c45687 --- /dev/null +++ b/deepchecks/nlp/utils/text_data_plot.py @@ -0,0 +1,324 @@ +# ---------------------------------------------------------------------------- +# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com) +# +# This file is part of Deepchecks. +# Deepchecks is distributed under the terms of the GNU Affero General +# Public License (version 3 or later). +# You should have received a copy of the GNU Affero General Public License +# along with Deepchecks. If not, see . 
+# ---------------------------------------------------------------------------- +# +"""A module containing utils for displaying information on TextData object.""" +from typing import List, Optional + +import numpy as np +import pandas as pd +import plotly.graph_objs as go +from plotly.subplots import make_subplots + +from deepchecks.nlp.task_type import TaskType, TTextLabel +from deepchecks.nlp.utils.text import break_to_lines_and_trim +from deepchecks.nlp.utils.text_properties import TEXT_PROPERTIES_DESCRIPTION +from deepchecks.utils.dataframes import un_numpy +from deepchecks.utils.distribution.plot import get_density +from deepchecks.utils.plot import feature_distribution_colors +from deepchecks.utils.strings import format_percent, get_docs_link + +__all__ = ['text_data_describe_plot'] + + +def _calculate_annoation_ratio(label, n_samples, is_mutli_label, task_type): + + if label is None: + return format_percent(0) + if is_mutli_label or task_type == TaskType.TOKEN_CLASSIFICATION: + annotated_count = _calculate_number_of_annotated_samples(label=label, + is_multi_label=is_mutli_label, + task_type=task_type) + return format_percent(annotated_count / n_samples) + else: + return format_percent(pd.notna(label).sum() / n_samples) + + +def _get_table_row_data(n_samples, annotation_ratio, categorical_metadata, numerical_metadata, + categorical_properties, numerical_properties, max_values_to_show: int = 5): + + info_cell = [n_samples, annotation_ratio] + + if categorical_metadata is None or len(categorical_metadata) == 0: + info_cell.append('No categorical metadata') + else: + info_cell.append(', '.join(categorical_metadata) if len(categorical_metadata) <= max_values_to_show + else f'{len(categorical_metadata)} metadata columns') + + if numerical_metadata is None or len(numerical_metadata) == 0: + info_cell.append('No numerical metadata') + else: + info_cell.append(', '.join(numerical_metadata) if len(numerical_metadata) <= max_values_to_show + else f'{len(numerical_metadata)} metadata columns') + + if categorical_properties is None or len(categorical_properties) == 0: + info_cell.append('No categorical properties') + else: + info_cell.append(', '.join(categorical_properties) if len(categorical_properties) <= max_values_to_show + else f'{len(categorical_properties)} properties') + + if numerical_properties is None or len(numerical_properties) == 0: + info_cell.append('No numerical properties') + else: + info_cell.append(', '.join(numerical_properties) if len(numerical_properties) <= max_values_to_show + else f'{len(numerical_properties)} properties') + + return info_cell + + +def _generate_table_trace(n_samples, annotation_ratio, categorical_metadata, numerical_metadata, + categorical_properties, numerical_properties): + data_cell = ['Number of samples', 'Annotation ratio', 'Metadata categorical columns', + 'Metadata numerical columns', 'Categorical properties', 'Numerical properties'] + + info_cell = _get_table_row_data(n_samples=n_samples, annotation_ratio=annotation_ratio, + categorical_metadata=categorical_metadata, numerical_metadata=numerical_metadata, + categorical_properties=categorical_properties, + numerical_properties=numerical_properties, max_values_to_show=7) + + trace = go.Table(header={'fill': {'color': 'white'}}, + cells={'values': [data_cell, info_cell], 'align': ['left'], 'font_size': 12, + 'height': 30}) + return trace + + +def _generate_categorical_distribution_plot(data, property_name): + + dist_counts = data.value_counts(normalize=True).to_dict() + counts = 
list(dist_counts.values()) + categories_list = list(dist_counts.keys()) + cat_df = pd.DataFrame({property_name: counts}, index=[un_numpy(cat) for cat in categories_list]) + trace = go.Bar(x=cat_df.index, y=cat_df[property_name], showlegend=False, + marker={'color': feature_distribution_colors['feature']}, + hovertemplate='Value: %{x}
Frequency: %{y}') + yaxis_layout = dict(type='log', title='Frequency (Log Scale)') + xaxis_layout = dict(title=property_name) + return trace, xaxis_layout, yaxis_layout + + +def _get_distribution_values(data): + mean = data.mean() + median = data.median() + x_range = (data.min(), data.max()) + if all(int(x) == x for x in data if x is not None): + # If the distribution is discrete, we take all the values in it: + xs = sorted(np.unique(data)) + if len(xs) > 50: + # If there are too many values, we take only 50, using a constant interval between them: + xs = list(range(int(xs[0]), int(xs[-1]) + 1, int((xs[-1] - xs[0]) // 50))) + else: + xs = sorted(np.concatenate((np.linspace(x_range[0], x_range[1], 50), + np.quantile(data, q=np.arange(0.02, 1, 0.02)), + [mean, median] + ))) + ixs = np.searchsorted(sorted(data), xs, side='left') + xs = [xs[i] for i in range(len(ixs)) if ixs[i] != ixs[i - 1]] + y_value = get_density(data, xs) + return y_value, xs + + +def _calculate_number_of_annotated_samples(label, is_multi_label, task_type): + + if is_multi_label or task_type == TaskType.TOKEN_CLASSIFICATION: + annotated_count = 0 + for label_data in label: + annotated_count = annotated_count + 1 if len(label_data) > 0 and pd.isna(label_data).sum() == 0 \ + else annotated_count + return annotated_count + else: + return pd.notna(label).sum() + + +def _generate_numeric_distribution_plot(data, x_value, y_value, property_name): + + mean = data.mean() + percentile_90 = data.quantile(0.9) + percentile_10 = data.quantile(0.1) + median = data.median() + + trace = go.Scatter(x=x_value, y=y_value, fill='tozeroy', showlegend=False, + hovertemplate=f'{property_name}: ''%{x}
Density: %{y}', + line={'color': feature_distribution_colors['feature'], + 'shape': 'linear', 'width': 5}) + shapes = [] + annotations = [] + + shapes.append(dict(type='line', x0=mean, y0=0, x1=mean, y1=max(y_value), + line={'color': feature_distribution_colors['measure'], 'dash': 'dash', 'width': 3})) + mean_xpos = mean + max(x_value) * 0.02 if median < mean else mean - max(x_value) * 0.02 + annotations.append(dict(x=mean_xpos, y=max(y_value)/2, text='Mean', showarrow=False, + textangle=-90, font={'size': 12})) + + shapes.append(dict(type='line', x0=median, y0=0, x1=median, y1=max(y_value), + line={'color': feature_distribution_colors['measure'], 'dash': 'dot', 'width': 3})) + median_xpos = median - max(x_value) * 0.02 if median < mean else median + max(x_value) * 0.02 + annotations.append(dict(x=median_xpos, y=max(y_value)/2, text='Median', showarrow=False, + textangle=-90, font={'size': 12})) + + shapes.append(dict(type='line', x0=percentile_10, y0=0, x1=percentile_10, y1=max(y_value), + line={'color': feature_distribution_colors['measure'], 'dash': 'dashdot', 'width': 3})) + annotations.append(dict(x=percentile_10 - max(x_value)*0.02, y=max(y_value)/2, textangle=-90, + text='10th Percentile', showarrow=False, font={'size': 12})) + + shapes.append(dict(type='line', x0=percentile_90, y0=0, x1=percentile_90, y1=max(y_value), + line={'color': feature_distribution_colors['measure'], 'dash': 'dashdot', 'width': 3})) + annotations.append(dict(x=percentile_90 + max(x_value)*0.02, y=max(y_value)/2, textangle=-90, + text='90th Percentile', showarrow=False, font={'size': 12})) + + xaxis_layout = dict(title=property_name) + yaxis_layout = dict(title='Density') + + return trace, shapes, annotations, xaxis_layout, yaxis_layout + + +def text_data_describe_plot(n_samples: int, max_num_labels_to_show: int, + is_multi_label: bool, task_type: str, + properties: pd.DataFrame, + categorical_metadata: Optional[List[str]] = None, + numerical_metadata: Optional[List[str]] = None, + categorical_properties: Optional[List[str]] = None, + numerical_properties: Optional[List[str]] = None, + model_classes: Optional[List[str]] = None, + label: Optional[TTextLabel] = None): + """Return a plotly figure instance. + + Parameters + ---------- + properties: pd.DataFrame + The DataFrame consisting of the text properties data. If no prooperties are there, you can pass an + empty DataFrame as well. + n_samples: int + The total number of samples present in the TextData object. + max_num_labels_to_show : int + The threshold to display the maximum number of labels on the label distribution pie chart and display + rest of the labels under "Others" category. + is_multi_label: bool + A boolean where True denotes that the TextData contains multi labeled data otherwise false. + task_type: str + The task type for the text data. Can be either 'text_classification' or 'token_classification'. + categorical_metadata: Optional[List[str]], default: None + The names of the categorical metadata columns. + numerical_metadata: Optional[List[str]], default: None + The names of the numerical metadata columns. + categorical_properties: Optional[List[str]], default: None + The names of the categorical properties columns. + numerical_properties: Optional[List[str]], default: None + The names of the numerical text properties columns. + label: Optional[TTextLabel], default: None + The label for the text data. Can be either a text_classification label or a token_classification label. + If None, the label distribution graph is not generated. 
+ + - text_classification label - For text classification the accepted label format differs between multilabel and + single label cases. For single label data, the label should be passed as a sequence of labels, with one entry + per sample that can be either a string or an integer. For multilabel data, the label should be passed as a + sequence of sequences, with the sequence for each sample being a binary vector, representing the presence of + the i-th label in that sample. + - token_classification label - For token classification the accepted label format is the IOB format or similar + to it. The Label must be a sequence of sequences of strings or integers, with each sequence corresponding to + a sample in the tokenized text, and exactly the length of the corresponding tokenized text. + model_classes: Optional[List[str]], default: None + List of classes names to use for multi-label display. Only used if the dataset is multi-label. + + Returns + ------- + Plotly Figure instance. + """ + specs = [[{'type': 'pie'}, {'type': 'table'}] if label is not None else [{'type': 'table', 'colspan': 2}, None]] + \ + [[{'type': 'xy', 'colspan': 2}, None] for _ in range(len(properties.columns))] + + subplot_titles = [] + if label is not None: + annotated_samples = _calculate_number_of_annotated_samples(label, is_multi_label, task_type) + subplot_titles.append(f'Label Distribution
<br><sup>Out of {annotated_samples} annotated samples</sup><br><br>
') + + subplot_titles.append('') # Empty title for table figure + if not properties.empty: + for prop_name in properties: + if prop_name in TEXT_PROPERTIES_DESCRIPTION: + subplot_titles.append(f'{prop_name} Property Distribution' + f'
{TEXT_PROPERTIES_DESCRIPTION[prop_name]}') + + fig = make_subplots(rows=len(properties.columns) + 1, cols=2, specs=specs, subplot_titles=subplot_titles, + row_heights=[1.5] + [1.0] * len(properties.columns)) + + # Create label distribution if label is provided + if label is not None: + if is_multi_label: + df_label = pd.DataFrame(label).fillna(0) + if model_classes is not None: + hashmap = {} + for val in label: + model_array = np.array([model_classes[i] for i, val in enumerate(val) if val == 1]) + for class_name in model_array: + hashmap[class_name] = hashmap[class_name] + 1 if class_name in hashmap else 1 + label_counts = pd.Series(list(hashmap.values()), index=list(hashmap)) + else: + label_counts = pd.Series(np.sum(df_label.to_numpy(), axis=0)) + elif task_type == TaskType.TOKEN_CLASSIFICATION: + hashmap = {} + for val in label: + flattened_array = pd.Series(np.array(val).flatten()).fillna('NaN').to_numpy() + unique_values, counts = np.unique(flattened_array, return_counts=True) + for label_value, count in zip(unique_values, counts): + if label_value != 'NaN': + hashmap[label_value] = hashmap[label_value] + count if label_value in hashmap else count + label_counts = pd.Series(list(hashmap.values()), index=list(hashmap)) + else: + label_counts = pd.Series(label).value_counts() + + label_counts.sort_values(ascending=False, inplace=True) + labels_to_display = label_counts[:max_num_labels_to_show] + labels_to_display.index = [break_to_lines_and_trim(str(label)) for label in list(labels_to_display.index)] + count_other_labels = label_counts[max_num_labels_to_show + 1:].sum() + labels_to_display['Others'] = count_other_labels + + # Pie chart for label distribution + fig.add_trace(go.Pie(labels=list(labels_to_display.index), values=list(labels_to_display), + textposition='inside', showlegend=False, textinfo='label+percent', + hovertemplate='%{label}: %{value} samples'), row=1, col=1) + + # Table figure for displaying some statistics + annotation_ratio = _calculate_annoation_ratio(label, n_samples, is_multi_label, task_type) + table_trace = _generate_table_trace(n_samples, annotation_ratio, categorical_metadata, numerical_metadata, + categorical_properties, numerical_properties) + fig.add_trace(table_trace, row=1, col=2 if label is not None else 1) + + # Looping over all the properties to generate respective property distribution graphs + curr_row = 2 # Since row 1 is occupied with Pie and Table + for property_name in properties.columns: + + if property_name in categorical_properties: + # Creating bar plots for categorical properties + trace, xaxis_layout, yaxis_layout = _generate_categorical_distribution_plot( + properties[property_name], property_name + ) + fig.add_trace(trace, row=curr_row, col=1) + fig.update_xaxes(xaxis_layout, row=curr_row, col=1) + fig.update_yaxes(yaxis_layout, row=curr_row, col=1) + else: + # Creating scatter plots for numerical properties + y_value, xs = _get_distribution_values(properties[property_name]) + trace, shapes, annotations, xaxis_layout, yaxis_layout = _generate_numeric_distribution_plot( + properties[property_name], + xs, y_value, property_name + ) + fig.add_trace(trace, row=curr_row, col=1) + + for shape, annotation in zip(shapes, annotations): + fig.add_shape(shape, row=curr_row, col=1) + fig.add_annotation(annotation, row=curr_row, col=1) + + fig.update_yaxes(yaxis_layout, row=curr_row, col=1) + fig.update_xaxes(xaxis_layout, row=curr_row, col=1) + + curr_row += 1 + + fig.update_layout(height=450*(len(properties.columns) + 1)) + return fig diff --git 
a/deepchecks/utils/plot.py b/deepchecks/utils/plot.py index 840a82989a..3325811f02 100644 --- a/deepchecks/utils/plot.py +++ b/deepchecks/utils/plot.py @@ -31,6 +31,11 @@ 'rgb(139, 224, 164)', 'rgb(180, 151, 231)'] +feature_distribution_colors = { + 'measure': '#00008b', # dark blue + 'feature': 'rgba(105, 179, 162, 1)' +} + common_and_outlier_colors = {'common': 'rgba(105, 179, 162, 1)', 'outliers': 'rgba(179, 106, 106, 1)', 'common_fill': 'rgba(105, 179, 162, 0.7)', diff --git a/docs/source/nlp/usage_guides/text_data_object.rst b/docs/source/nlp/usage_guides/text_data_object.rst index 9acb330ebf..20d348a72b 100644 --- a/docs/source/nlp/usage_guides/text_data_object.rst +++ b/docs/source/nlp/usage_guides/text_data_object.rst @@ -67,6 +67,14 @@ into tokens. Useful Functions =================== +Describe data +----------------------------- + +The ``describe()`` function is a great way to get a quick overview of your dataset. Calling the function will display the label distribution, +the distribution of the calculated :ref:`text properties ` and statistical information. You can use the function in the following way: + +>>> text_data.describe() + Calculate Default Properties ----------------------------- diff --git a/tests/nlp/test_text_data.py b/tests/nlp/test_text_data.py index 143a1dab4b..d441d4679c 100644 --- a/tests/nlp/test_text_data.py +++ b/tests/nlp/test_text_data.py @@ -60,7 +60,7 @@ def test_init_mismatched_task_type(): def test_wrong_token_label_format(): # Arrange - tokenized_text = [['a'] ,['b', 'b' ,'b'], ['c', 'c', 'c', 'c']] + tokenized_text = [['a'], ['b', 'b', 'b'], ['c', 'c', 'c', 'c']] label_structure_error = r'label must be a Sequence of Sequences of either strings or integers' @@ -183,7 +183,7 @@ def test_properties(text_classification_dataset_mock): # Act & Assert assert_that(dataset._properties, equal_to(None)) # TODO: Create test for the heavy properties - dataset.calculate_builtin_properties(include_long_calculation_properties = False) + dataset.calculate_builtin_properties(include_long_calculation_properties=False) properties = dataset.properties assert_that(properties.shape[0], equal_to(3)) assert_that(properties.shape[1], equal_to(10)) @@ -273,6 +273,7 @@ def test_set_metadata_with_an_incorrect_list_of_categorical_columns(text_classif ) ) + def test_load_metadata(text_classification_dataset_mock): # Arrange dataset = text_classification_dataset_mock @@ -410,3 +411,84 @@ def test_mixed_builtin_and_mixed_properties(text_classification_dataset_mock): # The custom property is not categorical as we passed an empty list, and the builtin property is not categorical # as defined internally. 
assert_that(dataset.categorical_properties, equal_to(['Language'])) + + +def test_describe_with_properties(text_multilabel_classification_dataset_mock, tweet_emotion_train_test_textdata): + # Arrange + dataset_without_properties = text_multilabel_classification_dataset_mock + dataset_with_properties, _ = tweet_emotion_train_test_textdata + + # Act + figure_without_properties = dataset_without_properties.describe(n_properties_to_show=8) + figure_with_properties_one = dataset_with_properties.describe(n_properties_to_show=3) + figure_with_properties_two = dataset_with_properties.describe(properties_to_show=['Text Length', + 'Language']) + # Assert + assert_that( + calling(dataset_without_properties.describe).with_args(properties_to_show=['Property One']), + raises(DeepchecksValueError, 'No properties exist!') + ) + assert_that(len(figure_without_properties.data), equal_to(2)) + assert_that(len(figure_without_properties.layout.annotations), equal_to(1)) + assert_that(figure_without_properties.data[0].type, equal_to('pie')) + assert_that(figure_without_properties.data[1].type, equal_to('table')) + + assert_that(len(figure_with_properties_one.data), equal_to(5)) + assert_that(len(figure_with_properties_one.layout.annotations), equal_to(16)) + assert_that(figure_with_properties_one.data[0].type, equal_to('pie')) + assert_that(figure_with_properties_one.data[1].type, equal_to('table')) + assert_that(figure_with_properties_one.data[2].type, equal_to('scatter')) + assert_that(figure_with_properties_one.data[3].type, equal_to('scatter')) + assert_that(figure_with_properties_one.data[4].type, equal_to('scatter')) + + assert_that(len(figure_with_properties_two.data), equal_to(4)) + assert_that(len(figure_with_properties_two.layout.annotations), equal_to(7)) + assert_that(figure_with_properties_two.data[0].type, equal_to('pie')) + assert_that(figure_with_properties_two.data[1].type, equal_to('table')) + assert_that(figure_with_properties_two.data[2].type, equal_to('scatter')) + assert_that(figure_with_properties_two.data[3].type, equal_to('bar')) + + +def test_describe_with_multi_label_dataset(text_multilabel_classification_dataset_mock): + # Arrange + dataset = text_multilabel_classification_dataset_mock + + # Act + figure = dataset.describe() + + # Assert + assert_that(len(figure.data), equal_to(2)) + assert_that(len(figure.layout.annotations), equal_to(1)) + assert_that(figure.data[0].type, equal_to('pie')) + assert_that(figure.data[1].type, equal_to('table')) + + +def test_describe_with_single_label_dataset(tweet_emotion_train_test_textdata): + # Arrange + dataset, _ = tweet_emotion_train_test_textdata + + # Act + figure = dataset.describe(n_properties_to_show=2) + + # Assert + assert_that(len(figure.data), equal_to(4)) + # 1 for pie, 2 for scatter, 4*2 lines for mean, median, 10^th percentile and 90^th percentile + assert_that(len(figure.layout.annotations), equal_to(11)) + assert_that(figure.data[0].type, equal_to('pie')) + assert_that(figure.data[1].type, equal_to('table')) + assert_that(figure.data[2].type, equal_to('scatter')) + assert_that(figure.data[3].type, equal_to('scatter')) + + +def test_describe_with_token_classification_dataset(text_token_classification_dataset_mock): + # Arrange + dataset = text_token_classification_dataset_mock + + # Act + figure = dataset.describe() + + # Assert + assert_that(len(figure.data), equal_to(2)) + assert_that(len(figure.layout.annotations), equal_to(1)) + assert_that(figure.data[0].type, equal_to('pie')) + assert_that(figure.data[1].type, 
equal_to('table')) From 17455f5b1f06b884ac08022d741d332fbca04035 Mon Sep 17 00:00:00 2001 From: Nir Hutnik <92314933+nirhutnik@users.noreply.github.com> Date: Fri, 30 Jun 2023 12:47:11 +0300 Subject: [PATCH 05/23] Added is_english property & flag to calculate english-only properties on non-english identified samples (#2616) --- deepchecks/nlp/text_data.py | 8 +++ deepchecks/nlp/utils/text_properties.py | 92 +++++++++++++++---------- tests/nlp/utils/test_properties.py | 40 ++++++++++- 3 files changed, 103 insertions(+), 37 deletions(-) diff --git a/deepchecks/nlp/text_data.py b/deepchecks/nlp/text_data.py index 2e5c078d39..3fa65b62b7 100644 --- a/deepchecks/nlp/text_data.py +++ b/deepchecks/nlp/text_data.py @@ -412,6 +412,7 @@ def calculate_builtin_properties( include_properties: t.Optional[t.List[str]] = None, ignore_properties: t.Optional[t.List[str]] = None, include_long_calculation_properties: bool = False, + ignore_non_english_samples_for_english_properties: bool = True, device: t.Optional[str] = None ): """Calculate the default properties of the dataset. @@ -427,6 +428,12 @@ def calculate_builtin_properties( include_long_calculation_properties : bool, default False Whether to include properties that may take a long time to calculate. If False, these properties will be ignored. + ignore_non_english_samples_for_english_properties : bool, default True + Whether to ignore samples that are not in English when calculating English properties. If False, samples + that are not in English will be calculated as well. This parameter is ignored when calculating non-English + properties. + English-Only properties WILL NOT work properly on non-English samples, and this parameter should be used + only when you are sure that all the samples are in English. device : int, default None The device to use for the calculation. If None, the default device will be used. """ @@ -438,6 +445,7 @@ def calculate_builtin_properties( include_properties=include_properties, ignore_properties=ignore_properties, include_long_calculation_properties=include_long_calculation_properties, + ignore_non_english_samples_for_english_properties=ignore_non_english_samples_for_english_properties, device=device ) diff --git a/deepchecks/nlp/utils/text_properties.py b/deepchecks/nlp/utils/text_properties.py index 04c8bb1aa1..1377c9f0aa 100644 --- a/deepchecks/nlp/utils/text_properties.py +++ b/deepchecks/nlp/utils/text_properties.py @@ -310,7 +310,8 @@ def language( """Return text language, represented as a string.""" if not text: return None - # Not recommended, takes a long time. Here only to enable to call this function from outside: + # Load the model if it wasn't received as a parameter. This is done to avoid loading the model + # each time the function is called. 
if fasttext_model is None: fasttext_model = _get_fasttext_model() @@ -321,6 +322,20 @@ def language( return language_code +def is_english( + text: str, + lang_certainty_threshold: float = 0.8, + fasttext_model: Optional[Dict[object, Any]] = None, + language_property_result: Optional[str] = None +) -> Union[bool, None]: + """Return whether text is in English or not.""" + if not text: + return None + if language_property_result is None: + language_property_result = language(text, lang_certainty_threshold, fasttext_model) + return language_property_result == 'en' + + def sentiment(text: str) -> float: """Return float representing sentiment.""" hash_key = hash_text(text) @@ -605,40 +620,36 @@ class TextProperty(TypedDict): output_type: str -DEFAULT_PROPERTIES: Tuple[TextProperty, ...] = ( - {'name': 'Text Length', 'method': text_length, 'output_type': 'numeric'}, - {'name': 'Average Word Length', 'method': average_word_length, 'output_type': 'numeric'}, - {'name': 'Max Word Length', 'method': max_word_length, 'output_type': 'numeric'}, - {'name': '% Special Characters', 'method': percentage_special_characters, 'output_type': 'numeric'}, - {'name': 'Language', 'method': language, 'output_type': 'categorical'}, - {'name': 'Sentiment', 'method': sentiment, 'output_type': 'numeric'}, - {'name': 'Subjectivity', 'method': subjectivity, 'output_type': 'numeric'}, - {'name': 'Average Words Per Sentence', 'method': average_words_per_sentence, 'output_type': 'numeric'}, - {'name': 'Readability Score', 'method': readability_score, 'output_type': 'numeric'}, - {'name': 'Lexical Density', 'method': lexical_density, 'output_type': 'numeric'}, - {'name': 'Toxicity', 'method': toxicity, 'output_type': 'numeric'}, - {'name': 'Fluency', 'method': fluency, 'output_type': 'numeric'}, - {'name': 'Formality', 'method': formality, 'output_type': 'numeric'}, - {'name': 'Unique Noun Count', 'method': unique_noun_count, 'output_type': 'numeric'}, -) +DEFAULT_PROPERTIES: Tuple[TextProperty, ...] = \ + ( + {'name': 'Text Length', 'method': text_length, 'output_type': 'numeric'}, + {'name': 'Average Word Length', 'method': average_word_length, 'output_type': 'numeric'}, + {'name': 'Max Word Length', 'method': max_word_length, 'output_type': 'numeric'}, + {'name': '% Special Characters', 'method': percentage_special_characters, 'output_type': 'numeric'}, + {'name': 'Language', 'method': language, 'output_type': 'categorical'}, + {'name': 'Sentiment', 'method': sentiment, 'output_type': 'numeric'}, + {'name': 'Subjectivity', 'method': subjectivity, 'output_type': 'numeric'}, + {'name': 'Average Words Per Sentence', 'method': average_words_per_sentence, 'output_type': 'numeric'}, + {'name': 'Readability Score', 'method': readability_score, 'output_type': 'numeric'}, + {'name': 'Lexical Density', 'method': lexical_density, 'output_type': 'numeric'}, + {'name': 'Toxicity', 'method': toxicity, 'output_type': 'numeric'}, + {'name': 'Fluency', 'method': fluency, 'output_type': 'numeric'}, + {'name': 'Formality', 'method': formality, 'output_type': 'numeric'}, + {'name': 'Unique Noun Count', 'method': unique_noun_count, 'output_type': 'numeric'}, + ) -ALL_PROPERTIES: Tuple[TextProperty, ...] 
= ( - {'name': 'URLs Count', 'method': urls_count, 'output_type': 'numeric'}, - {'name': 'Email Addresses Count', 'method': email_addresses_count, - 'output_type': 'numeric'}, - {'name': 'Unique URLs Count', 'method': unique_urls_count, - 'output_type': 'numeric'}, - {'name': 'Unique Email Addresses Count', - 'method': unique_email_addresses_count, 'output_type': 'numeric'}, - {'name': 'Unique Syllables Count', 'method': unique_syllables_count, - 'output_type': 'numeric'}, - {'name': 'Reading Time', 'method': reading_time, - 'output_type': 'numeric'}, - {'name': 'Sentences Count', 'method': sentences_count, - 'output_type': 'numeric'}, - {'name': 'Average Syllable Length', 'method': average_syllable_length, - 'output_type': 'numeric'}, - ) + DEFAULT_PROPERTIES +ALL_PROPERTIES: Tuple[TextProperty, ...] = \ + ( + {'name': 'Is English', 'method': is_english, 'output_type': 'categorical'}, + {'name': 'URLs Count', 'method': urls_count, 'output_type': 'numeric'}, + {'name': 'Email Addresses Count', 'method': email_addresses_count, 'output_type': 'numeric'}, + {'name': 'Unique URLs Count', 'method': unique_urls_count, 'output_type': 'numeric'}, + {'name': 'Unique Email Addresses Count', 'method': unique_email_addresses_count, 'output_type': 'numeric'}, + {'name': 'Unique Syllables Count', 'method': unique_syllables_count, 'output_type': 'numeric'}, + {'name': 'Reading Time', 'method': reading_time, 'output_type': 'numeric'}, + {'name': 'Sentences Count', 'method': sentences_count, 'output_type': 'numeric'}, + {'name': 'Average Syllable Length', 'method': average_syllable_length, 'output_type': 'numeric'}, + ) + DEFAULT_PROPERTIES LONG_RUN_PROPERTIES = ('Toxicity', 'Fluency', 'Formality', 'Unique Noun Count') LARGE_SAMPLE_SIZE = 10_000 @@ -751,6 +762,7 @@ def calculate_builtin_properties( include_properties: Optional[List[str]] = None, ignore_properties: Optional[List[str]] = None, include_long_calculation_properties: bool = False, + ignore_non_english_samples_for_english_properties: bool = True, device: Optional[str] = None, models_storage: Union[pathlib.Path, str, None] = None ) -> Tuple[Dict[str, List[float]], Dict[str, str]]: @@ -772,7 +784,7 @@ def calculate_builtin_properties( '% Special Characters', 'Language', 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Lexical Density', 'Unique Noun Count', 'Readability Score', 'Average Words Per Sentence'] To calculate all the default properties, the include_properties and ignore_properties parameters should - be None. If you pass either include_properties or ignore_properties then the only the properties specified + be None. If you pass either include_properties or ignore_properties then only the properties specified in the list will be calculated or ignored. Note that the properties ['Toxicity', 'Fluency', 'Formality', 'Language', 'Unique Noun Count'] may take a long time to calculate. If include_long_calculation_properties is False, these properties will be @@ -783,6 +795,12 @@ def calculate_builtin_properties( include_long_calculation_properties : bool, default False Whether to include properties that may take a long time to calculate. If False, these properties will be ignored, unless they are specified in the include_properties parameter explicitly. + ignore_non_english_samples_for_english_properties : bool, default True + Whether to ignore samples that are not in English when calculating English properties. If False, samples + that are not in English will be calculated as well. 
This parameter is ignored when calculating non-English + properties. + English-Only properties WILL NOT work properly on non-English samples, and this parameter should be used + only when you are sure that all the samples are in English. device : int, default None The device to use for the calculation. If None, the default device will be used. models_storage : Union[str, pathlib.Path, None], default None @@ -869,11 +887,13 @@ def calculate_builtin_properties( sample_language = run_available_kwargs(language, text=text, **kwargs) if is_language_property_requested: calculated_properties['Language'].append(sample_language) + kwargs['language_property_result'] = sample_language # Pass the language property result to other properties for prop in text_properties: if prop['name'] in import_warnings: # Skip properties that failed to import: calculated_properties[prop['name']].append(np.nan) - elif sample_language != 'en' and prop['name'] in english_properties_names: + elif sample_language != 'en' and prop['name'] in english_properties_names \ + and ignore_non_english_samples_for_english_properties is True: calculated_properties[prop['name']].append(np.nan) else: try: diff --git a/tests/nlp/utils/test_properties.py b/tests/nlp/utils/test_properties.py index 8b4d869587..bd00c2e911 100644 --- a/tests/nlp/utils/test_properties.py +++ b/tests/nlp/utils/test_properties.py @@ -20,7 +20,7 @@ from deepchecks.core.errors import DeepchecksValueError from deepchecks.nlp.utils.text_properties import (MODELS_STORAGE, _sample_for_property, calculate_builtin_properties, - get_transformer_model) + get_transformer_model, is_english) def mock_fn(*args, **kwargs): # pylint: disable=unused-argument @@ -316,6 +316,17 @@ def test_calculate_average_syllable_count(tweet_emotion_train_test_textdata): assert_that(result_none_text['Average Syllable Length'], equal_to([np.nan])) +def test_calcualte_is_english_property(): + data = ['This is a sentence in English.', 'Это предложение на русском языке.'] + result = calculate_builtin_properties(data, include_properties=['Is English'])[0] + assert_that(result['Is English'], equal_to([True, False])) + + +def test_calcualte_is_english_property_without_language_precalculation(): + data = ['This is a sentence in English.', 'Это предложение на русском языке.'] + assert_that([is_english(data[0]), is_english(data[1])], equal_to([True, False])) + + def test_include_properties(): # Arrange @@ -458,6 +469,33 @@ def test_english_only_properties_calculation_with_not_english_samples(): })) # type: ignore +def test_english_only_properties_calculated_for_all_samples(): + # Arrange + text = [ + 'Explicit is better than implicit', + 'Сьогодні чудова погода', + 'London is the capital of Great Britain' + ] + # Act + properties, properties_types = calculate_builtin_properties( + raw_text=text, + include_properties=['Sentiment', 'Language', 'Text Length'], + ignore_non_english_samples_for_english_properties=False + ) + # Assert + assert_that(properties, has_entries({ + 'Sentiment': contains_exactly(close_to(0.5, 0.01), close_to(0.0, 0.01), close_to(0.8, 0.01)), + 'Language': contains_exactly('en', 'uk', 'en'), + 'Text Length': contains_exactly(*[len(it) for it in text]), + })) # type: ignore + assert_that(properties_types, has_entries({ + 'Sentiment': 'numeric', + 'Language': 'categorical', + 'Text Length': 'numeric', + })) # type: ignore + + + def test_sample_for_property(): s = 'all the single ladies. all the single ladies? now put your hands up.' 
sample_words = _sample_for_property(text=s, mode='words', limit=2, random_seed=42) From 3ec7ac86407a7729941603ebbeb0c47ef8ccad4a Mon Sep 17 00:00:00 2001 From: Harsh Jain <136261806+harsh-deepchecks@users.noreply.github.com> Date: Sun, 2 Jul 2023 22:29:15 +0530 Subject: [PATCH 06/23] Update scipy version and broken pillow dependencies in requirements (#2620) * Update scipy version in requirements * Fix * Pillow version fix * Removed deprecated anitaliase in pillow * Fix get_size() from pillow * fixed getsize() to use getbbox() in pillow * Minor fix * Added comment in requirements * small fix --- deepchecks/vision/utils/image_functions.py | 8 +++++--- requirements/dev-requirements.txt | 4 +++- requirements/requirements.txt | 4 +++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/deepchecks/vision/utils/image_functions.py b/deepchecks/vision/utils/image_functions.py index 9d92ed4e3c..54f228ccf3 100644 --- a/deepchecks/vision/utils/image_functions.py +++ b/deepchecks/vision/utils/image_functions.py @@ -225,7 +225,9 @@ def get_font_with_size(text, desired_width): desired_width = max(100, desired_width) jump_size = 100 while jump_size > 1: - if font.getsize(text)[0] < desired_width: + left, _, right, _ = font.getbbox(text) + width = right - left + if width < desired_width: font_size += jump_size else: jump_size = jump_size // 2 @@ -266,8 +268,8 @@ def prepare_thumbnail( # Takes the minimum factor in order for the image to not exceed the size in either width or height factor = min(width_factor, height_factor) size = (int(image.size[0] * factor), int(image.size[1] * factor)) - # Resize the image - image = image.resize(size, pilimage.ANTIALIAS) + # Resize the image by Image.LANCZOS + image = image.resize(size, pilimage.LANCZOS) else: image = ensure_image(image, copy=False) diff --git a/requirements/dev-requirements.txt b/requirements/dev-requirements.txt index 8f70dfe22a..71c64848e9 100644 --- a/requirements/dev-requirements.txt +++ b/requirements/dev-requirements.txt @@ -37,7 +37,9 @@ opencv-python>=4.1.2 Pillow>=7.1.2 PyYAML>=5.3.1 requests>=2.23.0 -scipy>=1.4.1 +# Remove the <=1.10.1 dependency below once sklearn's issue is fixed. The higher version causes +# issues with sklearn's _most_frequent() function using scipy's mode() function +scipy>=1.4.1, <=1.10.1 tqdm>=4.41.0 seaborn>=0.11.0 wandb>=0.12.15,<0.13.0 diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 56d5f0dd6e..9941860c34 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -25,7 +25,9 @@ importlib_metadata>=1.4; python_version < '3.8' # is updated, explicitly add it here statsmodels>=0.11.0; python_version < '3.7' statsmodels>=0.13.5; python_version >= '3.7' -scipy>=1.4.1 +# Remove the <=1.10.1 dependency below once sklearn's issue is fixed. 
The higher version causes +# issues with sklearn's _most_frequent() function using scipy's mode() function +scipy>=1.4.1, <=1.10.1 dataclasses>=0.6; python_version < '3.7' plotly>=5.13.1 matplotlib>=3.3.4 From e481a87a9ddb87c02853c723f51b1b7ab7d22ab1 Mon Sep 17 00:00:00 2001 From: Harsh Jain <136261806+harsh-deepchecks@users.noreply.github.com> Date: Mon, 3 Jul 2023 10:31:01 +0530 Subject: [PATCH 07/23] Improved heatmap display for confusion matrix report (#2617) * Improved heatmap display for confusion matrix report * Resolved Comments * Minor fix --- .../abstracts/confusion_matrix_abstract.py | 55 ++++++++++++------- .../model_evaluation/confusion_matrix.py | 2 +- .../model_evaluation/confusion_matrix_test.py | 24 ++++++-- .../confusion_matrix_report_test.py | 21 ++++++- .../model_evaluation/confusion_matrix_test.py | 18 ++++++ 5 files changed, 95 insertions(+), 25 deletions(-) diff --git a/deepchecks/utils/abstracts/confusion_matrix_abstract.py b/deepchecks/utils/abstracts/confusion_matrix_abstract.py index 800db3459d..47f0b41a76 100644 --- a/deepchecks/utils/abstracts/confusion_matrix_abstract.py +++ b/deepchecks/utils/abstracts/confusion_matrix_abstract.py @@ -31,14 +31,14 @@ def run_confusion_matrix_check(y_pred: np.ndarray, y_true: np.ndarray, with_disp result = confusion_matrix(y_true, y_pred) if with_display: - fig = create_confusion_matrix_figure(result, total_classes, normalize_display) + displays = create_confusion_matrix_figure(result, total_classes, normalize_display) else: - fig = None + displays = None # For accessing the class names from the condition result = pd.DataFrame(result, index=total_classes, columns=total_classes) - return CheckResult(result, display=fig) + return CheckResult(result, display=displays) def create_confusion_matrix_figure(confusion_matrix_data: np.ndarray, classes_names: List[str], @@ -60,29 +60,46 @@ def create_confusion_matrix_figure(confusion_matrix_data: np.ndarray, classes_na confusion matrix figure """ + confusion_matrix_norm = confusion_matrix_data.astype('float') / \ + (confusion_matrix_data.sum(axis=1)[:, np.newaxis] + np.finfo(float).eps) * 100 if normalize_display: - confusion_matrix_norm = confusion_matrix_data.astype('float') / \ - (confusion_matrix_data.sum(axis=1)[:, np.newaxis] + np.finfo(float).eps) * 100 z = np.vectorize(format_number_if_not_nan)(confusion_matrix_norm) - text_template = '%{z}%
(%{text})' - color_bar_title = '% out of
True Values' - plot_title = 'Percent Out of True Values (Count)' else: z = confusion_matrix_data - color_bar_title = None - text_template = '%{text}' - plot_title = 'Value Count' - - fig = go.Figure(data=go.Heatmap( - x=classes_names, y=classes_names, z=z, - text=confusion_matrix_data, texttemplate=text_template)) - fig.data[0].colorbar.title = color_bar_title - fig.update_layout(title=plot_title) + + accuracy_array = np.diag(confusion_matrix_norm).round(decimals=2) + + display = [] + display_msg = f'The overall accuracy of your model is: {round(np.sum(accuracy_array)/len(accuracy_array), 2)}%.' + + if min(accuracy_array) < 100: + display_msg += f'
Best accuracy achieved on samples with {classes_names[np.argmax(accuracy_array)]}' \ + f' label ({np.max(accuracy_array)}%).' + display_msg += f'
Worst accuracy achieved on samples with {classes_names[np.argmin(accuracy_array)]}' \ + f' label ({np.min(accuracy_array)}%).' + display.append(display_msg) + + total_samples = np.nansum(confusion_matrix_data) + percent_data_each_row = np.round(confusion_matrix_norm, decimals=2) + percent_data_each_cell = np.round(np.divide(np.nan_to_num(confusion_matrix_data, nan=0.0), total_samples) * 100, + decimals=2) + percent_data_each_col = (confusion_matrix_data.astype('float') / + (confusion_matrix_data.sum(axis=0)[:, np.newaxis] + + np.finfo(float).eps) * 100).round(decimals=2) + custom_hoverdata = np.dstack((percent_data_each_cell, percent_data_each_row, percent_data_each_col)) + + fig = go.Figure(data=go.Heatmap(x=classes_names, y=classes_names, z=z, customdata=custom_hoverdata, + xgap=1, ygap=1, text=confusion_matrix_data, texttemplate='%{text}', + hovertemplate='% out of all data: %{customdata[0]}%
% out ' + 'of row: %{customdata[1]}%
% out of column: ' + '%{customdata[2]}%', + showscale=False)) + fig.update_layout(title='Confusion Matrix (# Samples)', title_x=0.5) fig.update_layout(height=600) fig.update_xaxes(title='Predicted Value', type='category', scaleanchor='y', constrain='domain') fig.update_yaxes(title='True Value', type='category', constrain='domain', autorange='reversed') - - return fig + display.append(fig) + return display def misclassified_samples_lower_than_condition(value: pd.DataFrame, diff --git a/deepchecks/vision/checks/model_evaluation/confusion_matrix.py b/deepchecks/vision/checks/model_evaluation/confusion_matrix.py index eed268942e..4b76e10f34 100644 --- a/deepchecks/vision/checks/model_evaluation/confusion_matrix.py +++ b/deepchecks/vision/checks/model_evaluation/confusion_matrix.py @@ -145,7 +145,7 @@ def compute(self, context: Context, dataset_kind: DatasetKind = None) -> CheckRe x.append('No overlapping') y.append('No overlapping') - description.append( + description.extend( create_confusion_matrix_figure(confusion_matrix, x, self.normalized) ) else: diff --git a/tests/nlp/checks/model_evaluation/confusion_matrix_test.py b/tests/nlp/checks/model_evaluation/confusion_matrix_test.py index 7c2a3c77d0..dcd818609e 100644 --- a/tests/nlp/checks/model_evaluation/confusion_matrix_test.py +++ b/tests/nlp/checks/model_evaluation/confusion_matrix_test.py @@ -100,16 +100,16 @@ def test_condition_misclassified_samples_lower_than_raises_error(tweet_emotion_t assert_that(result.conditions_results[0], equal_condition_result( is_pass=False, name=f'Misclassified cell size lower than {format_number(-0.1 * 100)}% of the total samples', - details='Exception in condition: DeepchecksValueError: Condition requires the parameter "misclassified_samples_threshold" ' - 'to be between 0 and 1 inclusive but got -0.1', + details='Exception in condition: DeepchecksValueError: Condition requires the parameter ' + '"misclassified_samples_threshold" to be between 0 and 1 inclusive but got -0.1', category=ConditionCategory.ERROR )) assert_that(result.conditions_results[1], equal_condition_result( is_pass=False, name=f'Misclassified cell size lower than {format_number(1.1 * 100)}% of the total samples', - details='Exception in condition: DeepchecksValueError: Condition requires the parameter "misclassified_samples_threshold" ' - 'to be between 0 and 1 inclusive but got 1.1', + details='Exception in condition: DeepchecksValueError: Condition requires the parameter ' + '"misclassified_samples_threshold" to be between 0 and 1 inclusive but got 1.1', category=ConditionCategory.ERROR )) @@ -189,3 +189,19 @@ def test_condition_misclassified_samples_lower_than_fails(tweet_emotion_train_te f'Largest misclassified cell ({format_percent(max_misclassified_samples_ratio)} of the data) ' \ f'is samples with a true value of "{class_names[x]}" and a predicted value of "{class_names[y]}".' )) + + +def test_confusion_matrix_report_display(tweet_emotion_train_test_textdata, tweet_emotion_train_test_predictions): + # Arrange and Act + check = ConfusionMatrixReport() + result = check.run(tweet_emotion_train_test_textdata[0], predictions=tweet_emotion_train_test_predictions[0]) + + # Assert + assert_that(result.display[0], + equal_to('The overall accuracy of your model is: 92.04%.
Best accuracy achieved on samples with ' + 'anger label (96.59%).
Worst accuracy achieved on samples with sadness ' + 'label (88.86%).')) + # First is the text description and second is the heatmap + assert_that(len(result.display), equal_to(2)) + assert_that(len(result.display[1].data), equal_to(1)) + assert_that(result.display[1].data[0].type, equal_to('heatmap')) diff --git a/tests/tabular/checks/model_evaluation/confusion_matrix_report_test.py b/tests/tabular/checks/model_evaluation/confusion_matrix_report_test.py index 610406253f..4f10eec8f2 100644 --- a/tests/tabular/checks/model_evaluation/confusion_matrix_report_test.py +++ b/tests/tabular/checks/model_evaluation/confusion_matrix_report_test.py @@ -10,7 +10,7 @@ # """Contains unit tests for the confusion_matrix_report check.""" import numpy as np -from hamcrest import assert_that, calling, greater_than, has_length, raises +from hamcrest import assert_that, calling, greater_than, has_length, raises, equal_to from deepchecks.core.condition import ConditionCategory from deepchecks.core.errors import DeepchecksNotSupportedError, DeepchecksValueError, ModelValidationError @@ -183,3 +183,22 @@ def test_condition_misclassified_samples_lower_than_fails(iris_split_dataset_and f'Largest misclassified cell ({format_percent(max_misclassified_samples_ratio)} of the data) ' \ f'is samples with a true value of "{class_names[x]}" and a predicted value of "{class_names[y]}".' )) + + +def test_confusion_matrix_report_display(iris_split_dataset_and_model): + # Arrange + _, test, clf = iris_split_dataset_and_model + + # Act + check = ConfusionMatrixReport() + + result = check.run(test, clf) + + # Assert + assert_that(result.display[0], + equal_to('The overall accuracy of your model is: 91.67%.
Best accuracy achieved on samples with ' + '0 label (100.0%).
Worst accuracy achieved on samples with 2 label (75.0%).')) + # # First is the text description and second is the heatmap + assert_that(len(result.display), equal_to(2)) + assert_that(len(result.display[1].data), equal_to(1)) + assert_that(result.display[1].data[0].type, equal_to('heatmap')) \ No newline at end of file diff --git a/tests/vision/checks/model_evaluation/confusion_matrix_test.py b/tests/vision/checks/model_evaluation/confusion_matrix_test.py index a0f441a9e6..8681586f1f 100644 --- a/tests/vision/checks/model_evaluation/confusion_matrix_test.py +++ b/tests/vision/checks/model_evaluation/confusion_matrix_test.py @@ -56,3 +56,21 @@ def test_detection(coco_visiondata_train): # Assert num_of_classes = len(coco_visiondata_train.get_observed_classes()) + 1 # plus no-overlapping assert_that(result.value.shape, le((num_of_classes, num_of_classes))) + + +def test_confusion_matrix_report_display(mnist_visiondata_train): + # Arrange + check = ConfusionMatrixReport() + + # Act + result = check.run(mnist_visiondata_train) + + # Assert + assert_that(result.display[0], equal_to('Showing 10 of 10 classes:')) + assert_that(result.display[1], + equal_to('The overall accuracy of your model is: 97.45%.
Best accuracy achieved on samples with ' + '0 label (100.0%).
Worst accuracy achieved on samples with 9 label (86.96%).')) + # First and second are the text descriptions and third is a heatmap + assert_that(len(result.display), equal_to(3)) + assert_that(len(result.display[2].data), equal_to(1)) + assert_that(result.display[2].data[0].type, equal_to('heatmap')) From 908e4dad66d94cdd19fd087884177e7d7d0157f4 Mon Sep 17 00:00:00 2001 From: Itay Gabbay Date: Tue, 11 Jul 2023 10:02:56 +0300 Subject: [PATCH 08/23] Calculation of NLP properties with batches (#2621) * First commit of supporting batched properties * Adapting the batch_size * Adapting the batch_size * Fixing the pylint adding device_map * Handling one bad sample * Fixes * Adding sample by sample * Fixes * Fixes * It works * Adding variable batch size test * Adding variable batch size test * Fixing pylint * Fixing pylint * resolving PR comments * Adding test assertion --- deepchecks/nlp/utils/text_properties.py | 209 ++++++++++++++++-------- tests/nlp/utils/test_properties.py | 23 +++ 2 files changed, 162 insertions(+), 70 deletions(-) diff --git a/deepchecks/nlp/utils/text_properties.py b/deepchecks/nlp/utils/text_properties.py index 1377c9f0aa..6f09b3dedd 100644 --- a/deepchecks/nlp/utils/text_properties.py +++ b/deepchecks/nlp/utils/text_properties.py @@ -15,6 +15,7 @@ import re import string import warnings +from collections import defaultdict from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union import numpy as np @@ -24,13 +25,13 @@ from nltk import corpus from nltk import download as nltk_download from nltk import sent_tokenize, word_tokenize +from tqdm import tqdm from typing_extensions import TypedDict from deepchecks.core.errors import DeepchecksValueError from deepchecks.nlp.utils.text import cut_string, hash_text, normalize_text, remove_punctuation from deepchecks.utils.function import run_available_kwargs -from deepchecks.utils.ipython import create_progress_bar -from deepchecks.utils.strings import format_list, truncate_string +from deepchecks.utils.strings import format_list __all__ = ['calculate_builtin_properties', 'get_builtin_properties_types'] @@ -166,7 +167,8 @@ def get_transformer_model( # TODO: quantize if 'quantize_model' is True return transformers.AutoModelForSequenceClassification.from_pretrained( model_name, - cache_dir=models_storage + cache_dir=models_storage, + device_map=device ) onnx = _import_optional_property_dependency( @@ -187,12 +189,13 @@ def get_transformer_model( model_path = models_storage / 'onnx' / model_name if model_path.exists(): - return onnx.ORTModelForSequenceClassification.from_pretrained(model_path) + return onnx.ORTModelForSequenceClassification.from_pretrained(model_path, device_map=device) model = onnx.ORTModelForSequenceClassification.from_pretrained( model_name, export=True, - cache_dir=models_storage + cache_dir=models_storage, + device_map=device ) # NOTE: # 'optimum', after exporting/converting a model to the ONNX format, @@ -204,7 +207,7 @@ def get_transformer_model( model_path = models_storage / 'onnx' / 'quantized' / model_name if model_path.exists(): - return onnx.ORTModelForSequenceClassification.from_pretrained(model_path) + return onnx.ORTModelForSequenceClassification.from_pretrained(model_path, device_map=device) not_quantized_model = get_transformer_model( property_name, @@ -214,7 +217,7 @@ def get_transformer_model( models_storage=models_storage ) - quantizer = onnx.ORTQuantizer.from_pretrained(not_quantized_model) + quantizer = onnx.ORTQuantizer.from_pretrained(not_quantized_model, 
device_map=device) quantizer.quantize( save_dir=model_path, @@ -224,7 +227,7 @@ def get_transformer_model( per_channel=False ) ) - return onnx.ORTModelForSequenceClassification.from_pretrained(model_path) + return onnx.ORTModelForSequenceClassification.from_pretrained(model_path, device_map=device) def get_transformer_pipeline( @@ -235,7 +238,7 @@ def get_transformer_pipeline( ): """Return a transformers pipeline for the given model name.""" transformers = _import_optional_property_dependency('transformers', property_name=property_name) - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, device_map=device) model = get_transformer_model( property_name=property_name, model_name=model_name, @@ -358,11 +361,28 @@ def subjectivity(text: str) -> float: return textblob_cache.get(hash_key).subjectivity -def _predict(text: str, classifier, kind: str) -> float: +def _parse_prediction_results(v, kind): + if not v: + return np.nan + elif kind == 'toxicity': + return v['score'] + elif kind == 'fluency': + return v['score'] if v['label'] == 'LABEL_1' else 1 - v['score'] + elif kind == 'formality': + return v['score'] if v['label'] == 'formal' else 1 - v['score'] + else: + raise ValueError('Unsupported value for "kind" parameter') + + +def _predict(text_batch: Sequence[str], classifier, kind: str, batch_size: int) -> Sequence[float]: """Return prediction of huggingface Pipeline classifier.""" - try: - # TODO: make this way smarter, and not just a hack. Count tokens, for a start. Then not just sample sentences. - # If text is longer than classifier context window, sample it: + # TODO: make this way smarter, and not just a hack. Count tokens, for a start. Then not just sample sentences. 
+ # If text is longer than classifier context window, sample it: + text_list_to_predict = [] + reduced_batch_size = batch_size # Initialize the reduced batch size + retry_count = 0 + + for text in text_batch: if len(text) > MAX_CHARS: sentences = _sample_for_property(text, mode='sentences', limit=10, return_as_list=True) text_to_use = '' @@ -374,28 +394,36 @@ def _predict(text: str, classifier, kind: str) -> float: # if even one sentence is too long, use part of the first one: if len(text_to_use) == 0: text_to_use = cut_string(sentences[0], MAX_CHARS) - text = text_to_use - - v = classifier(text) - except Exception: # pylint: disable=broad-except - return np.nan - else: - if not v: - return np.nan - v = v[0] - if kind == 'toxicity': - return v['score'] - elif kind == 'fluency': - label_value = 'LABEL_1' - elif kind == 'formality': - label_value = 'formal' + text_list_to_predict.append(text_to_use) else: - raise ValueError('Unsupported value for "kind" parameter') - return ( - v['score'] - if v['label'] == label_value - else 1 - v['score'] - ) + text_list_to_predict.append(text) + + while reduced_batch_size >= 1: + try: + if reduced_batch_size == 1 or retry_count == 3: + results = [] + for text in text_list_to_predict: + try: + v = classifier(text) + results.append(_parse_prediction_results(v, kind)) + except Exception: # pylint: disable=broad-except + results.append(np.nan) + return results # Return the results if prediction is successful + + v_list = classifier(text_list_to_predict, batch_size=reduced_batch_size) + results = [] + + for v in v_list: + results.append(_parse_prediction_results(v, kind)) + + return results # Return the results if prediction is successful + + except Exception: # pylint: disable=broad-except + reduced_batch_size = max(reduced_batch_size // 2, 1) # Reduce the batch size by half + text_list_to_predict = [] # Clear the list of texts to predict for retry + retry_count += 1 + + return [np.nan] * batch_size # Prediction failed, return NaN values for the original batch size TOXICITY_MODEL_NAME = 'unitary/toxic-bert' @@ -404,42 +432,45 @@ def _predict(text: str, classifier, kind: str) -> float: def toxicity( - text: str, + text_batch: Sequence[str], + batch_size: int = 1, device: Optional[str] = None, models_storage: Union[pathlib.Path, str, None] = None, toxicity_classifier: Optional[object] = None -) -> float: +) -> Sequence[float]: """Return float representing toxicity.""" if toxicity_classifier is None: toxicity_classifier = get_transformer_pipeline( property_name='toxicity', model_name=TOXICITY_MODEL_NAME, device=device, models_storage=models_storage) - return _predict(text, toxicity_classifier, 'toxicity') + return _predict(text_batch, toxicity_classifier, 'toxicity', batch_size) def fluency( - text: str, + text_batch: Sequence[str], + batch_size: int = 1, device: Optional[str] = None, models_storage: Union[pathlib.Path, str, None] = None, fluency_classifier: Optional[object] = None -) -> float: +) -> Sequence[float]: """Return float representing fluency.""" if fluency_classifier is None: fluency_classifier = get_transformer_pipeline( property_name='fluency', model_name=FLUENCY_MODEL_NAME, device=device, models_storage=models_storage) - return _predict(text, fluency_classifier, 'fluency') + return _predict(text_batch, fluency_classifier, 'fluency', batch_size) def formality( - text: str, + text_batch: Sequence[str], + batch_size: int = 1, device: Optional[str] = None, models_storage: Union[pathlib.Path, str, None] = None, formality_classifier: Optional[object] = 
None -) -> float: +) -> Sequence[float]: """Return float representing formality.""" if formality_classifier is None: formality_classifier = get_transformer_pipeline( property_name='formality', model_name=FORMALITY_MODEL_NAME, device=device, models_storage=models_storage) - return _predict(text, formality_classifier, 'formality') + return _predict(text_batch, formality_classifier, 'formality', batch_size) def lexical_density(text: str) -> float: @@ -461,7 +492,7 @@ def lexical_density(text: str) -> float: return round(total_unique_words * 100 / len(all_words), 2) -def unique_noun_count(text: str) -> int: +def unique_noun_count(text: Sequence[str]) -> int: """Return the number of unique noun words in the text.""" if pd.isna(text): return np.nan @@ -614,6 +645,22 @@ def average_syllable_length(text: str, cmudict_dict: dict = None) -> float: return round(syllable_count / sentence_count, 2) +def _batch_wrapper(text_batch: Sequence[str], func: Callable, **kwargs) -> List[Any]: + """Wrap the non-batched properties execution with batches API.""" + results = [] + language_property_result = [] + if 'language_property_result' in kwargs: + language_property_result = kwargs.pop('language_property_result') + + language_property_exists = len(language_property_result) > 0 + + for i, text in enumerate(text_batch): + kwargs['language_property_result'] = language_property_result[i] if language_property_exists else None + results.append(run_available_kwargs(func, text=text, **kwargs)) + + return results + + class TextProperty(TypedDict): name: str method: Callable[..., Sequence[Any]] @@ -652,6 +699,9 @@ class TextProperty(TypedDict): ) + DEFAULT_PROPERTIES LONG_RUN_PROPERTIES = ('Toxicity', 'Fluency', 'Formality', 'Unique Noun Count') + +BATCH_PROPERTIES = ('Toxicity', 'Fluency', 'Formality') + LARGE_SAMPLE_SIZE = 10_000 ENGLISH_ONLY_PROPERTIES = ( @@ -764,7 +814,8 @@ def calculate_builtin_properties( include_long_calculation_properties: bool = False, ignore_non_english_samples_for_english_properties: bool = True, device: Optional[str] = None, - models_storage: Union[pathlib.Path, str, None] = None + models_storage: Union[pathlib.Path, str, None] = None, + batch_size: Optional[int] = 16 ) -> Tuple[Dict[str, List[float]], Dict[str, str]]: """Calculate properties on provided text samples. @@ -807,6 +858,8 @@ def calculate_builtin_properties( A directory to store the models. If not provided, models will be stored in `DEEPCHECKS_LIB_PATH/nlp/.nlp-models`. Also, if a folder already contains relevant resources they are not re-downloaded. + batch_size : int, default 8 + The batch size. 
Returns ------- @@ -870,40 +923,56 @@ def calculate_builtin_properties( ) import_warnings = set() - progress_bar = create_progress_bar( - iterable=list(raw_text), - name='Text Samples Calculation', - unit='Text Sample' - ) - for text in progress_bar: - progress_bar.set_postfix( - {'Sample': truncate_string(text, max_length=20) if text else 'EMPTY STRING'}, - refresh=False - ) - if pd.isna(text): - for prop in text_properties: - calculated_properties[prop['name']].append(np.nan) - continue - sample_language = run_available_kwargs(language, text=text, **kwargs) - if is_language_property_requested: - calculated_properties['Language'].append(sample_language) - kwargs['language_property_result'] = sample_language # Pass the language property result to other properties + for i in tqdm(range(0, len(raw_text), batch_size)): + batch = raw_text[i:i + batch_size] + batch_properties = defaultdict(list) + + # filtering out empty sequences + nan_indices = {i for i, seq in enumerate(batch) if pd.isna(seq) is True} + filtered_sequences = [e for i, e in enumerate(batch) if i not in nan_indices] + samples_language = _batch_wrapper(text_batch=filtered_sequences, func=language, **kwargs) + if is_language_property_requested: + batch_properties['Language'].extend(samples_language) + calculated_properties['Language'].extend(samples_language) + kwargs['language_property_result'] = samples_language # Pass the language property to other properties + + non_english_indices = set() + if ignore_non_english_samples_for_english_properties: + non_english_indices = {i for i, (seq, lang) in enumerate(zip(filtered_sequences, samples_language)) + if lang != 'en'} for prop in text_properties: if prop['name'] in import_warnings: # Skip properties that failed to import: - calculated_properties[prop['name']].append(np.nan) - elif sample_language != 'en' and prop['name'] in english_properties_names \ - and ignore_non_english_samples_for_english_properties is True: - calculated_properties[prop['name']].append(np.nan) + batch_properties[prop['name']].extend([np.nan] * len(batch)) else: + if prop['name'] in english_properties_names \ + and ignore_non_english_samples_for_english_properties is True: + filtered_sequences = [e for i, e in enumerate(filtered_sequences) if i not in non_english_indices] + kwargs['batch_size'] = batch_size try: - value = run_available_kwargs(prop['method'], text=text, **kwargs) - calculated_properties[prop['name']].append(value) + if prop['name'] in BATCH_PROPERTIES: + value = run_available_kwargs(func=prop['method'], text_batch=filtered_sequences, **kwargs) + else: + value = _batch_wrapper(text_batch=filtered_sequences, func=prop['method'], **kwargs) + batch_properties[prop['name']].extend(value) except ImportError as e: warnings.warn(warning_message.format(prop['name'], str(e))) - calculated_properties[prop['name']].append(np.nan) + batch_properties[prop['name']].extend([np.nan] * len(batch)) import_warnings.add(prop['name']) + result_index = 0 + + for index, seq in enumerate(batch): + if index in nan_indices or (index in non_english_indices and + ignore_non_english_samples_for_english_properties and + prop['name'] in english_properties_names): + calculated_properties[prop['name']].append(np.nan) + else: + calculated_properties[prop['name']].append(batch_properties[prop['name']][result_index]) + result_index += 1 + + filtered_sequences = [e for i, e in enumerate(batch) if i not in nan_indices] + # Clear property caches: textblob_cache.clear() words_cache.clear() diff --git 
a/tests/nlp/utils/test_properties.py b/tests/nlp/utils/test_properties.py index bd00c2e911..050f9ebc15 100644 --- a/tests/nlp/utils/test_properties.py +++ b/tests/nlp/utils/test_properties.py @@ -182,6 +182,29 @@ def test_calculate_average_sentence_length_property(tweet_emotion_train_test_tex assert_that(result_none_text['Average Words Per Sentence'], equal_to([np.nan])) +def test_batch_size_change(tweet_emotion_train_test_textdata): + # Arrange + _, test = tweet_emotion_train_test_textdata + test_text = test.text + batch_sizes=[1, 8, 16, 32, 64] + + # Act + for batch in batch_sizes: + result = calculate_builtin_properties(test_text, include_properties=['Average Words Per Sentence', + 'Unique Noun Count'], + batch_size=batch)[0] + result_none_text = calculate_builtin_properties([None], include_properties=['Average Words Per Sentence', + 'Unique Noun Count'], + batch_size=batch)[0] + + # Assert + assert_that(result['Average Words Per Sentence'][0: 10], equal_to([5.667, 7.0, 11.0, 12.0, 8.0, 19.0, 3.0, 9.0, + 11.5, 7.333])) + assert_that(result['Unique Noun Count'][0: 10], equal_to([9, 2, 3, 3, 4, 10, np.nan, 2, 7, 5])) + assert_that(result_none_text['Average Words Per Sentence'], equal_to([np.nan])) + assert_that(result_none_text['Unique Noun Count'], equal_to([np.nan])) + + def test_calculate_readability_score_property(tweet_emotion_train_test_textdata): # Arrange From f3c420f2a957c3f5b97b3cf6e40c632bfa5fe0fb Mon Sep 17 00:00:00 2001 From: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com> Date: Wed, 12 Jul 2023 15:33:33 +0300 Subject: [PATCH 09/23] small refactor in properties calculation (#2627) --- deepchecks/nlp/utils/text_properties.py | 45 ++++++++++++------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/deepchecks/nlp/utils/text_properties.py b/deepchecks/nlp/utils/text_properties.py index 6f09b3dedd..954c03b776 100644 --- a/deepchecks/nlp/utils/text_properties.py +++ b/deepchecks/nlp/utils/text_properties.py @@ -361,25 +361,13 @@ def subjectivity(text: str) -> float: return textblob_cache.get(hash_key).subjectivity -def _parse_prediction_results(v, kind): - if not v: - return np.nan - elif kind == 'toxicity': - return v['score'] - elif kind == 'fluency': - return v['score'] if v['label'] == 'LABEL_1' else 1 - v['score'] - elif kind == 'formality': - return v['score'] if v['label'] == 'formal' else 1 - v['score'] - else: - raise ValueError('Unsupported value for "kind" parameter') - - -def _predict(text_batch: Sequence[str], classifier, kind: str, batch_size: int) -> Sequence[float]: +def predict_on_batch(text_batch: Sequence[str], classifier, + output_formatter: Callable[[Dict[str, Any]], float]) -> Sequence[float]: """Return prediction of huggingface Pipeline classifier.""" # TODO: make this way smarter, and not just a hack. Count tokens, for a start. Then not just sample sentences. 
# If text is longer than classifier context window, sample it: text_list_to_predict = [] - reduced_batch_size = batch_size # Initialize the reduced batch size + reduced_batch_size = len(text_batch) # Initialize the reduced batch size retry_count = 0 for text in text_batch: @@ -405,7 +393,7 @@ def _predict(text_batch: Sequence[str], classifier, kind: str, batch_size: int) for text in text_list_to_predict: try: v = classifier(text) - results.append(_parse_prediction_results(v, kind)) + results.append(output_formatter(v)) except Exception: # pylint: disable=broad-except results.append(np.nan) return results # Return the results if prediction is successful @@ -414,7 +402,7 @@ def _predict(text_batch: Sequence[str], classifier, kind: str, batch_size: int) results = [] for v in v_list: - results.append(_parse_prediction_results(v, kind)) + results.append(output_formatter(v)) return results # Return the results if prediction is successful @@ -423,7 +411,7 @@ def _predict(text_batch: Sequence[str], classifier, kind: str, batch_size: int) text_list_to_predict = [] # Clear the list of texts to predict for retry retry_count += 1 - return [np.nan] * batch_size # Prediction failed, return NaN values for the original batch size + return [np.nan] * len(text_batch) # Prediction failed, return NaN values for the original batch size TOXICITY_MODEL_NAME = 'unitary/toxic-bert' @@ -433,7 +421,6 @@ def _predict(text_batch: Sequence[str], classifier, kind: str, batch_size: int) def toxicity( text_batch: Sequence[str], - batch_size: int = 1, device: Optional[str] = None, models_storage: Union[pathlib.Path, str, None] = None, toxicity_classifier: Optional[object] = None @@ -442,12 +429,15 @@ def toxicity( if toxicity_classifier is None: toxicity_classifier = get_transformer_pipeline( property_name='toxicity', model_name=TOXICITY_MODEL_NAME, device=device, models_storage=models_storage) - return _predict(text_batch, toxicity_classifier, 'toxicity', batch_size) + + def output_formatter(v): + return v['score'] + + return predict_on_batch(text_batch, toxicity_classifier, output_formatter) def fluency( text_batch: Sequence[str], - batch_size: int = 1, device: Optional[str] = None, models_storage: Union[pathlib.Path, str, None] = None, fluency_classifier: Optional[object] = None @@ -456,12 +446,15 @@ def fluency( if fluency_classifier is None: fluency_classifier = get_transformer_pipeline( property_name='fluency', model_name=FLUENCY_MODEL_NAME, device=device, models_storage=models_storage) - return _predict(text_batch, fluency_classifier, 'fluency', batch_size) + + def output_formatter(v): + return v['score'] if v['label'] == 'LABEL_1' else 1 - v['score'] + + return predict_on_batch(text_batch, fluency_classifier, output_formatter) def formality( text_batch: Sequence[str], - batch_size: int = 1, device: Optional[str] = None, models_storage: Union[pathlib.Path, str, None] = None, formality_classifier: Optional[object] = None @@ -470,7 +463,11 @@ def formality( if formality_classifier is None: formality_classifier = get_transformer_pipeline( property_name='formality', model_name=FORMALITY_MODEL_NAME, device=device, models_storage=models_storage) - return _predict(text_batch, formality_classifier, 'formality', batch_size) + + def output_formatter(v): + return v['score'] if v['label'] == 'formal' else 1 - v['score'] + + return predict_on_batch(text_batch, formality_classifier, output_formatter) def lexical_density(text: str) -> float: From d21213dfd1bdf06e4444dfb849177d9ae3eb8996 Mon Sep 17 00:00:00 2001 From: Nadav Barak 
<67195469+Nadav-Barak@users.noreply.github.com> Date: Wed, 12 Jul 2023 17:09:01 +0300 Subject: [PATCH 10/23] bug fix properties calc on single sample batch (#2628) --- deepchecks/nlp/utils/text_properties.py | 2 +- tests/nlp/utils/test_properties.py | 50 ++++++++++++++----------- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/deepchecks/nlp/utils/text_properties.py b/deepchecks/nlp/utils/text_properties.py index 954c03b776..7870cb5f40 100644 --- a/deepchecks/nlp/utils/text_properties.py +++ b/deepchecks/nlp/utils/text_properties.py @@ -392,7 +392,7 @@ def predict_on_batch(text_batch: Sequence[str], classifier, results = [] for text in text_list_to_predict: try: - v = classifier(text) + v = classifier(text)[0] results.append(output_formatter(v)) except Exception: # pylint: disable=broad-except results.append(np.nan) diff --git a/tests/nlp/utils/test_properties.py b/tests/nlp/utils/test_properties.py index 050f9ebc15..3b2dbd71cc 100644 --- a/tests/nlp/utils/test_properties.py +++ b/tests/nlp/utils/test_properties.py @@ -12,7 +12,6 @@ import os import pathlib import timeit -from unittest.mock import patch import numpy as np import pytest @@ -54,6 +53,7 @@ def text_data_fixture(): } return text_data + # TODO: Fix test (problem with pytorch versions) # @patch('deepchecks.nlp.utils.text_properties.run_available_kwargs', mock_fn) # def test_that_warning_is_shown_for_big_datasets(): @@ -99,7 +99,7 @@ def test_average_word_length(): result_empty_string = calculate_builtin_properties([''], include_properties=['Average Word Length'])[0] # Assert - assert_that(result['Average Word Length'], equal_to([19/5, 25/5, 30/7])) + assert_that(result['Average Word Length'], equal_to([19 / 5, 25 / 5, 30 / 7])) assert_that(result_none_text['Average Word Length'], equal_to([np.nan])) assert_that(result_empty_string['Average Word Length'], equal_to([0])) @@ -129,13 +129,12 @@ def test_percentage_special_characters(): result_empty_string = calculate_builtin_properties([''], include_properties=['% Special Characters'])[0] # Assert - assert_that(result['% Special Characters'], equal_to([2/25, 4/34, 0])) + assert_that(result['% Special Characters'], equal_to([2 / 25, 4 / 34, 0])) assert_that(result_none_text['% Special Characters'], equal_to([np.nan])) assert_that(result_empty_string['% Special Characters'], equal_to([0])) def test_calculate_lexical_density_property(tweet_emotion_train_test_textdata): - # Arrange _, test = tweet_emotion_train_test_textdata test_text = test.text @@ -151,7 +150,6 @@ def test_calculate_lexical_density_property(tweet_emotion_train_test_textdata): def test_calculate_unique_noun_count_property(tweet_emotion_train_test_textdata): - # Arrange _, test = tweet_emotion_train_test_textdata test_text = test.text @@ -186,7 +184,7 @@ def test_batch_size_change(tweet_emotion_train_test_textdata): # Arrange _, test = tweet_emotion_train_test_textdata test_text = test.text - batch_sizes=[1, 8, 16, 32, 64] + batch_sizes = [1, 8, 16, 32, 64] # Act for batch in batch_sizes: @@ -206,7 +204,6 @@ def test_batch_size_change(tweet_emotion_train_test_textdata): def test_calculate_readability_score_property(tweet_emotion_train_test_textdata): - # Arrange _, test = tweet_emotion_train_test_textdata test_text = test.text @@ -222,7 +219,6 @@ def test_calculate_readability_score_property(tweet_emotion_train_test_textdata) def test_calculate_count_unique_urls(manual_text_data_for_properties): - # Arrange text_data = manual_text_data_for_properties['url_data'] @@ -236,7 +232,6 @@ def 
test_calculate_count_unique_urls(manual_text_data_for_properties): def test_calculate_count_urls(manual_text_data_for_properties): - # Arrange text_data = manual_text_data_for_properties['url_data'] @@ -250,7 +245,6 @@ def test_calculate_count_urls(manual_text_data_for_properties): def test_calculate_count_unique_email_addresses(manual_text_data_for_properties): - # Arrange text_data = manual_text_data_for_properties['email_data'] @@ -264,7 +258,6 @@ def test_calculate_count_unique_email_addresses(manual_text_data_for_properties) def test_calculate_count_email_addresses(manual_text_data_for_properties): - # Arrange text_data = manual_text_data_for_properties['email_data'] @@ -278,7 +271,6 @@ def test_calculate_count_email_addresses(manual_text_data_for_properties): def test_calculate_count_unique_syllables(tweet_emotion_train_test_textdata): - # Arrange _, test = tweet_emotion_train_test_textdata test_text = test.text @@ -293,7 +285,6 @@ def test_calculate_count_unique_syllables(tweet_emotion_train_test_textdata): def test_calculate_reading_time(tweet_emotion_train_test_textdata): - # Arrange _, test = tweet_emotion_train_test_textdata test_text = test.text @@ -309,7 +300,6 @@ def test_calculate_reading_time(tweet_emotion_train_test_textdata): def test_calculate_sentence_length(tweet_emotion_train_test_textdata): - # Arrange _, test = tweet_emotion_train_test_textdata test_text = test.text @@ -324,7 +314,6 @@ def test_calculate_sentence_length(tweet_emotion_train_test_textdata): def test_calculate_average_syllable_count(tweet_emotion_train_test_textdata): - # Arrange _, test = tweet_emotion_train_test_textdata test_text = test.text @@ -351,7 +340,6 @@ def test_calcualte_is_english_property_without_language_precalculation(): def test_include_properties(): - # Arrange test_text = ['This is simple sentence.'] # Also check capitalization doesn't matter: @@ -369,13 +357,13 @@ def test_include_properties(): raises(DeepchecksValueError)) # Check that raises if property doesn't exist: - assert_that(calling(calculate_builtin_properties).with_args(test_text, include_properties=['Non Existent Property']), - raises(DeepchecksValueError, - r'include_properties contains properties that were not found: \[\'non existent property\'\].')) + assert_that( + calling(calculate_builtin_properties).with_args(test_text, include_properties=['Non Existent Property']), + raises(DeepchecksValueError, + r'include_properties contains properties that were not found: \[\'non existent property\'\].')) def test_ignore_properties(): - # Arrange test_text = ['This is simple sentence.'] expected_properties = ['Text Length', 'Average Word Length', 'Max Word Length', @@ -467,6 +455,27 @@ def test_properties_models_download_into_provided_directory(): assert onnx_model_path.exists() and onnx_model_path.is_dir() +@pytest.mark.skipif( + 'TEST_NLP_PROPERTIES_MODELS_DOWNLOAD' not in os.environ, + reason='The test takes too long to run, provide env var if you want to run it.' 
+) +def test_batch_only_properties_calculation_with_single_samples(): + # Arrange + text = ['Explicit is better than implicit'] + + # Act + properties, properties_types = calculate_builtin_properties( + raw_text=text, batch_size=1, + include_properties=['Formality', 'Text Length'] + ) + + # Assert + assert_that(properties, has_entries({ + 'Formality': contains_exactly(close_to(0.955, 0.01)), + 'Text Length': contains_exactly(*[len(it) for it in text]), + })) # type: ignore + + def test_english_only_properties_calculation_with_not_english_samples(): # Arrange text = [ @@ -518,7 +527,6 @@ def test_english_only_properties_calculated_for_all_samples(): })) # type: ignore - def test_sample_for_property(): s = 'all the single ladies. all the single ladies? now put your hands up.' sample_words = _sample_for_property(text=s, mode='words', limit=2, random_seed=42) From 78eebd2e3a9af8e337299082dda757f5d12ecd47 Mon Sep 17 00:00:00 2001 From: Noam Bressler Date: Fri, 14 Jul 2023 18:03:37 +0300 Subject: [PATCH 11/23] Add long description for properties, and update some of the short ones. (#2629) --- deepchecks/nlp/utils/text_properties.py | 19 ++-- .../usage_guides/nlp_properties_extended.rst | 88 +++++++++++++++++++ 2 files changed, 101 insertions(+), 6 deletions(-) create mode 100644 docs/source/nlp/usage_guides/nlp_properties_extended.rst diff --git a/deepchecks/nlp/utils/text_properties.py b/deepchecks/nlp/utils/text_properties.py index 7870cb5f40..1b45205719 100644 --- a/deepchecks/nlp/utils/text_properties.py +++ b/deepchecks/nlp/utils/text_properties.py @@ -714,18 +714,25 @@ class TextProperty(TypedDict): 'Max Word Length': 'Maximum number of characters in a word', '% Special Characters': 'Percentage of special characters in the text', 'Language': 'Language of the text, using the fasttext language detection model', - 'Sentiment': 'Sentiment of the text, calculated using the TextBlob sentiment analysis model', - 'Subjectivity': 'Subjectivity of the text, calculated using the TextBlob sentiment analysis model', + 'Sentiment': 'Sentiment of the text, calculated using the TextBlob sentiment analysis model.' + ' Ranging from -1 (negative) to 1 (positive)', + 'Subjectivity': 'Subjectivity of the text, calculated using the TextBlob sentiment analysis model. Ranging from 0 ' + '(objective) to 1 (subjective)', 'Average Words Per Sentence': 'Average number of words per sentence in the text', - 'Readability Score': 'A score calculated based on Flesch reading-ease per text sample', + 'Readability Score': 'How easy to read a text sample is, from 0 (hard to read) to 100 (easy).' 
+ ' Based on Flesch reading-ease score', 'Lexical Density': 'Percentage of unique words in the text', - 'Toxicity': 'Toxicity score using unitary/toxic-bert HuggingFace model', - 'Fluency': 'Fluency score using prithivida/parrot_fluency_model HuggingFace model', - 'Formality': 'Formality score using s-nlp/roberta-base-formality-ranker HuggingFace model', + 'Toxicity': 'A measure of how harmful or offensive a text sample is (0 to 1), uses the Detoxify library ' + 'unitary/toxic-bert model', + 'Fluency': 'A measure of the fluency of the text (0 to 1), using the prithivida/parrot_fluency_model' + ' model from the authors of the Parrot Paraphraser library', + 'Formality': 'The formality / register of the text (0 to 1), using the s-nlp/roberta-base-formality-ranker' + ' model by the Skolkovo Institute of Science and Technology', 'Unique Noun Count': 'Number of unique noun words in the text', 'URLs Count': 'Number of URLS per text sample', 'Email Addresses Count': 'Number of email addresses per text sample', 'Unique URLs Count': 'Number of unique URLS per text sample', + 'Is English': 'Whether the text is in English (1) or not (0)', 'Unique Email Addresses Count': 'Number of unique email addresses per text sample', 'Unique Syllables Count': 'Number of unique syllables per text sample', 'Reading Time': 'Time taken in seconds to read a text sample', diff --git a/docs/source/nlp/usage_guides/nlp_properties_extended.rst b/docs/source/nlp/usage_guides/nlp_properties_extended.rst new file mode 100644 index 0000000000..cc730d33a3 --- /dev/null +++ b/docs/source/nlp/usage_guides/nlp_properties_extended.rst @@ -0,0 +1,88 @@ +.. _nlp__properties_ext: + +========== +Properties +========== + +Properties are one-dimensional values that are calculated on each text sample. For example, a property could be simple +text characteristics such as the number of words in the text, or more complex properties such identifying if the +text contains toxic language. + +Link Validly +------------ + +The Link Validity property represents the ratio of number links in the text that are valid links, divided by the total +number of links. A valid link is a link that returns a **200 OK** HTML status when sent a HTTP HEAD request. For text +without links, the property will always return 1 (all links valid). + +Readability Score +----------------- + +A score calculated based on the +`Flesch reading-ease `__, +calculated for each text sample. The score typically ranges from 0 +(very hard to read, requires intense concentration) to 100 (very easy to read) for english text, though in theory the +score can range from -inf to 206.835 for arbitrary strings. + +Toxicity +-------- + +The Toxicity property is a measure of how harmful or offensive a text is. The Toxicity property uses a pre-trained model +called `unitary/toxic-bert `__ on HuggingFace, which is created and +maintained by the `Detoxify `__ team, based on the BERT +architecture and trained on a large corpus of toxic comments. The model assigns a toxicity score to each text, +ranging from 0 (not toxic) to 1 (very toxic). + +Fluency +------- + +The Fluency property is a score between 0 and 1 representing how “well” the input text is written, or how close it is +to being a sample of fluent English text. A value of 0 represents very poorly written text, while 1 represents perfectly +written English. The property uses a pre-trained model called +`prithivida/parrot_fluency_model `__ on HuggingFace, which +was created by the authors of the `Parrot Paraphraser `__ +library. 
+
+Formality
+---------
+
+The Formality model returns a measure of how formal the input text is. It uses a pre-trained model called
+`s-nlp/roberta-base-formality-ranker `__ on HuggingFace,
+trained by the Skolkovo Institute of Science and Technology (s-nlp).
+The model was trained to predict, for English sentences, whether they are formal or informal, where a score of 0
+represents very informal text, and a score of 1 very formal text.
+The model uses the roberta-base architecture, and was trained on
+`GYAFC `__ from
+`Rao and Tetreault, 2018 `__ and online formality corpus from
+`Pavlick and Tetreault, 2016 `__.
+
+Avoided Answer
+--------------
+
+The Avoided Answer property estimates the probability (0 to 1) that the LLM avoided answering a
+question.
+The property uses a pre-trained bert architecture model that was trained on a dataset of questions and LLM answers
+collected from various LLMs, where the model was trained to predict whether the answer is an avoidance or not.
+
+Grounded in Context
+-------------------
+
+The Grounded in Context Score is a measure of how well the LLM output is grounded in the context of the conversation,
+ranging from 0 (not grounded) to 1 (fully grounded).
+In the definition of this property, grounding means that the LLM output is based on the context given to it as part of
+the input, and not on external knowledge, for example knowledge that was present in the LLM training data. A lack
+of grounding can be thought of as a kind of hallucination, as hallucinations are outputs that are neither grounded
+in the context nor true to the real world.
+
+The property is especially useful for evaluating use-cases such as Question Answering, where the LLM is expected to
+answer questions based on the context given to it as part of the input, and not based on external knowledge. An example
+for such a use-case would be Question Answering based on internal company knowledge, where introduction of external
+knowledge (that, for example, may be stale) into the answers is not desired - we can imagine a case in which an LLM is
+asked a question about the company's revenue, and the answer is based on the company's internal financial reports, and
+not on external knowledge such as the company's Wikipedia page. In the context of Question Answering, any answer that
+is not grounded in the context can be considered a hallucination.
+
+The property is calculated by identifying key entities and quantities in the LLM output, such as names, places, dates
+and prices, and then identifying the same entities and quantities in the input given to the LLM.
+The property is calculated as the ratio of the number of entities/quantities in the LLM output that are also in the
+input, divided by the total number of entities/quantities in the LLM output.
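The batched property-calculation API introduced in the patches above can be exercised with a short snippet. The sketch below is illustrative and is not part of any patch in this series: the function and argument names (calculate_builtin_properties, include_properties, batch_size, device, models_storage) follow the signature shown in the diffs, while the sample texts and the comments about expected output are assumptions for demonstration only.

    # Minimal usage sketch (illustrative, not part of the patch series).
    from deepchecks.nlp.utils.text_properties import calculate_builtin_properties

    texts = [
        'Explicit is better than implicit.',
        'Hello! How can I help you today?',
    ]

    # Toxicity, Fluency and Formality are the long-running, batched properties;
    # naming them explicitly in include_properties runs them even while
    # include_long_calculation_properties stays False.
    properties, property_types = calculate_builtin_properties(
        texts,
        include_properties=['Toxicity', 'Fluency', 'Formality'],
        batch_size=16,        # batch size forwarded to the HuggingFace pipelines
        device=None,          # default device; pass a device to run on GPU
        models_storage=None,  # defaults to DEEPCHECKS_LIB_PATH/nlp/.nlp-models
    )

    print(properties['Toxicity'])      # one float (0 to 1) per input text
    print(property_types['Toxicity'])  # expected to be 'numeric'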
From 6017cb96f14c44b5be50d3afb85615d74f181754 Mon Sep 17 00:00:00 2001 From: JKL98ISR Date: Sun, 16 Jul 2023 18:26:34 +0300 Subject: [PATCH 12/23] fix nlp device kwargs (#2632) * fix_device_stuff * reqs --- deepchecks/nlp/utils/text_properties.py | 11 +++++------ requirements/nlp-prop-requirements.txt | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/deepchecks/nlp/utils/text_properties.py b/deepchecks/nlp/utils/text_properties.py index 1b45205719..e6ccfe2031 100644 --- a/deepchecks/nlp/utils/text_properties.py +++ b/deepchecks/nlp/utils/text_properties.py @@ -189,14 +189,13 @@ def get_transformer_model( model_path = models_storage / 'onnx' / model_name if model_path.exists(): - return onnx.ORTModelForSequenceClassification.from_pretrained(model_path, device_map=device) + return onnx.ORTModelForSequenceClassification.from_pretrained(model_path).to(device or -1) model = onnx.ORTModelForSequenceClassification.from_pretrained( model_name, export=True, cache_dir=models_storage, - device_map=device - ) + ).to(device or -1) # NOTE: # 'optimum', after exporting/converting a model to the ONNX format, # does not store it onto disk we need to save it now to not reconvert @@ -207,7 +206,7 @@ def get_transformer_model( model_path = models_storage / 'onnx' / 'quantized' / model_name if model_path.exists(): - return onnx.ORTModelForSequenceClassification.from_pretrained(model_path, device_map=device) + return onnx.ORTModelForSequenceClassification.from_pretrained(model_path).to(device or -1) not_quantized_model = get_transformer_model( property_name, @@ -217,7 +216,7 @@ def get_transformer_model( models_storage=models_storage ) - quantizer = onnx.ORTQuantizer.from_pretrained(not_quantized_model, device_map=device) + quantizer = onnx.ORTQuantizer.from_pretrained(not_quantized_model).to(device or -1) quantizer.quantize( save_dir=model_path, @@ -227,7 +226,7 @@ def get_transformer_model( per_channel=False ) ) - return onnx.ORTModelForSequenceClassification.from_pretrained(model_path, device_map=device) + return onnx.ORTModelForSequenceClassification.from_pretrained(model_path).to(device or -1) def get_transformer_pipeline( diff --git a/requirements/nlp-prop-requirements.txt b/requirements/nlp-prop-requirements.txt index 7641f9ff93..58be61dd0e 100644 --- a/requirements/nlp-prop-requirements.txt +++ b/requirements/nlp-prop-requirements.txt @@ -1,2 +1,2 @@ -optimum[onnxruntime]>=1.7.0 +optimum[onnxruntime]>=1.8.8 fasttext>=0.8.0 From 2ea001e60264891d8b12226004693f75fdec1c62 Mon Sep 17 00:00:00 2001 From: Noam Bressler Date: Mon, 17 Jul 2023 13:20:17 +0300 Subject: [PATCH 13/23] Add examples to new and existing properties (#2634) * Add examples to new and existing properties --- .../usage_guides/nlp_properties_extended.rst | 81 ++++++++++++++++++- 1 file changed, 79 insertions(+), 2 deletions(-) diff --git a/docs/source/nlp/usage_guides/nlp_properties_extended.rst b/docs/source/nlp/usage_guides/nlp_properties_extended.rst index cc730d33a3..9d93039cb0 100644 --- a/docs/source/nlp/usage_guides/nlp_properties_extended.rst +++ b/docs/source/nlp/usage_guides/nlp_properties_extended.rst @@ -8,13 +8,23 @@ Properties are one-dimensional values that are calculated on each text sample. F text characteristics such as the number of words in the text, or more complex properties such identifying if the text contains toxic language. 
-Link Validly ------------- +* `Link Validity <#link-validity>`__ +* `Readability Score <#readability-score>`__ +* `Toxicity <#toxicity>`__ +* `Fluency <#fluency>`__ +* `Formality <#formality>`__ +* `Avoided Answer <#avoided-answer>`__ +* `Grounded in Context <#grounded-in-context>`__ + +Link Validity +------------- The Link Validity property represents the ratio of number links in the text that are valid links, divided by the total number of links. A valid link is a link that returns a **200 OK** HTML status when sent a HTTP HEAD request. For text without links, the property will always return 1 (all links valid). +`Back to Property List <#properties>`__ + Readability Score ----------------- @@ -24,6 +34,8 @@ calculated for each text sample. The score typically ranges from 0 (very hard to read, requires intense concentration) to 100 (very easy to read) for english text, though in theory the score can range from -inf to 206.835 for arbitrary strings. +`Back to Property List <#properties>`__ + Toxicity -------- @@ -33,6 +45,19 @@ maintained by the `Detoxify `__ team, bas architecture and trained on a large corpus of toxic comments. The model assigns a toxicity score to each text, ranging from 0 (not toxic) to 1 (very toxic). +`Back to Property List <#properties>`__ + +Examples +~~~~~~~~ + +================================ ======== +Text Toxicity +================================ ======== +Hello! How can I help you today? 0.0007 +You have been a bad user! 0.301 +I hate you! 0.951 +================================ ======== + Fluency ------- @@ -43,6 +68,19 @@ written English. The property uses a pre-trained model called was created by the authors of the `Parrot Paraphraser `__ library. +`Back to Property List <#properties>`__ + +Examples +~~~~~~~~ + +=============================================================================================================================================================== ======== +Text Fluency +=============================================================================================================================================================== ======== +Natural language processing is an interdisciplinary subfield of linguistics, computer science, and artificial intelligence. 0.97 +Pass on what you have learned. Strength, mastery, hmm… but weakness, folly, failure, also. Yes, failure, most of all. The greatest teacher, failure is. 0.75 +Whispering dreams, forgotten desires, chaotic thoughts, dance with words, meaning elusive, swirling amidst. 0.2 +=============================================================================================================================================================== ======== + Formality --------- @@ -56,6 +94,19 @@ The model uses the roberta-base architecture, and was trained on `Rao and Tetreault, 2018 `__ and online formality corpus from `Pavlick and Tetreault, 2016 `__. +`Back to Property List <#properties>`__ + +Examples +~~~~~~~~ + +================================================================ ======== +Text Formality +================================================================ ======== +I hope this email finds you well 0.79 +I hope this email find you swell 0.28 +What's up doc? 0.14 +================================================================ ======== + Avoided Answer -------------- @@ -64,6 +115,19 @@ question. 
The property uses a pre-trained bert architecture model that was trained on a dataset of questions and LLM answers collected from various LLMs, where the model was trained to predict whether the answer is an avoidance or not. +`Back to Property List <#properties>`__ + +Examples +~~~~~~~~ + +============================================================================ ======== +Question Avoided Answer +============================================================================ ======== +The answer is 42 0.001 +You should ask the appropriate authorities 0.681 +As a Large Language Model trained by Open AI, I can not answer this question 0.994 +============================================================================ ======== + Grounded in Context ------------------- @@ -86,3 +150,16 @@ The property is calculated by identifying key entities and quantities in the LLM and prices, and then identifying the same entities and quantities in the input given to the LLM. The property is calculated as the ratio of the number of entities/quantities in the LLM output that are also in the input, divided by the total number of entities/quantities in the LLM output. + +`Back to Property List <#properties>`__ + +Examples +~~~~~~~~ + +====================================================================================================================== ===================================================== =================== +LLM Input LLM Output Grounded in Context +====================================================================================================================== ===================================================== =================== +Michael Jordan (1963) is an American former professional basketball player and businessman. In what year was he born? He was born in 1963. 1.0 +Michael Jordan (1963) is an American former professional basketball player and businessman. When was Michael born? Michael Jeffrey Jordan was born in 1963 0.5 +Michael Jordan (1963) is an American former professional basketball player and businessman. What did he achieve? He won many NBA championships with the Chicago Bulls 0.0 +====================================================================================================================== ===================================================== =================== From 57238087a8521cefbb06cf481d36c78e5d50b562 Mon Sep 17 00:00:00 2001 From: Noam Bressler Date: Wed, 19 Jul 2023 09:26:39 +0300 Subject: [PATCH 14/23] Amend docs - use spearman rather than Pearson (#2636) --- .../tabular/data_integrity/plot_feature_feature_correlation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/checks/tabular/data_integrity/plot_feature_feature_correlation.py b/docs/source/checks/tabular/data_integrity/plot_feature_feature_correlation.py index 2ac6791381..98526905f9 100644 --- a/docs/source/checks/tabular/data_integrity/plot_feature_feature_correlation.py +++ b/docs/source/checks/tabular/data_integrity/plot_feature_feature_correlation.py @@ -23,7 +23,7 @@ This check works with 2 types of features: categorical and numerical, and uses a different method to calculate the correlation for each combination of feature types: -1. numerical-numerical: `Pearson's correlation coefficient `__ +1. numerical-numerical: `Spearman's correlation coefficient `__ 2. numerical-categorical: `Correlation ratio `__ 3. 
categorical-categorical: `Symmetric Theil's U `__ From 3071ebf54c4ec1ebb5ab14b0bb4608be70659611 Mon Sep 17 00:00:00 2001 From: Noam Bressler Date: Wed, 19 Jul 2023 11:03:30 +0300 Subject: [PATCH 15/23] Differentiate special chars from punctuations (#2635) * Differentiate special chars from punctuations --- deepchecks/nlp/utils/text_properties.py | 23 +++++++++++++++---- .../nlp/usage_guides/nlp_properties.rst | 3 ++- .../under_annotated_segments_test.py | 2 +- .../property_drift_test.py | 4 ++-- tests/nlp/test_text_data.py | 6 ++--- tests/nlp/utils/test_properties.py | 21 ++++++++++++++--- 6 files changed, 44 insertions(+), 15 deletions(-) diff --git a/deepchecks/nlp/utils/text_properties.py b/deepchecks/nlp/utils/text_properties.py index e6ccfe2031..bff58b1c7b 100644 --- a/deepchecks/nlp/utils/text_properties.py +++ b/deepchecks/nlp/utils/text_properties.py @@ -31,7 +31,7 @@ from deepchecks.core.errors import DeepchecksValueError from deepchecks.nlp.utils.text import cut_string, hash_text, normalize_text, remove_punctuation from deepchecks.utils.function import run_available_kwargs -from deepchecks.utils.strings import format_list +from deepchecks.utils.strings import SPECIAL_CHARACTERS, format_list __all__ = ['calculate_builtin_properties', 'get_builtin_properties_types'] @@ -41,6 +41,10 @@ FASTTEXT_LANG_MODEL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin' DEFAULT_SENTENCE_SAMPLE_SIZE = 300 MAX_CHARS = 512 # Bert accepts max of 512 tokens, so without counting tokens we go for the lower bound. +# all SPECIAL_CHARACTERS - all string.punctuation except for <>@[]^_`{|}~ - all whitespace +NON_PUNCTUATION_SPECIAL_CHARS = frozenset(set(SPECIAL_CHARACTERS) - set(r"""!"#$%&'()*+,-./:;=?\@""") + - set(string.whitespace)) + textblob_cache = {} words_cache = {} sentences_cache = {} @@ -265,6 +269,11 @@ def average_word_length(text: str) -> float: def percentage_special_characters(text: str) -> float: """Return percentage of special characters (as float between 0 and 1).""" + return len([c for c in text if c in NON_PUNCTUATION_SPECIAL_CHARS]) / len(text) if len(text) != 0 else 0 + + +def percentage_punctuation(text: str) -> float: + """Return percentage of punctuation (as float between 0 and 1).""" return len([c for c in text if c in string.punctuation]) / len(text) if len(text) != 0 else 0 @@ -669,6 +678,7 @@ class TextProperty(TypedDict): {'name': 'Average Word Length', 'method': average_word_length, 'output_type': 'numeric'}, {'name': 'Max Word Length', 'method': max_word_length, 'output_type': 'numeric'}, {'name': '% Special Characters', 'method': percentage_special_characters, 'output_type': 'numeric'}, + {'name': '% Punctuation', 'method': percentage_punctuation, 'output_type': 'numeric'}, {'name': 'Language', 'method': language, 'output_type': 'categorical'}, {'name': 'Sentiment', 'method': sentiment, 'output_type': 'numeric'}, {'name': 'Subjectivity', 'method': subjectivity, 'output_type': 'numeric'}, @@ -711,7 +721,10 @@ class TextProperty(TypedDict): 'Text Length': 'Number of characters in the text', 'Average Word Length': 'Average number of characters in a word', 'Max Word Length': 'Maximum number of characters in a word', - '% Special Characters': 'Percentage of special characters in the text', + '% Special Characters': 'Percentage of special characters in the text. Special characters are non-alphanumeric ' + 'unicode characters, excluding whitespaces and any of !\"#$%&\'()*+,-./:;=?\\@.', + '% Punctuation': 'Percentage of punctuation characters in the text. 
Punctuation characters are any of ' + '!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~', 'Language': 'Language of the text, using the fasttext language detection model', 'Sentiment': 'Sentiment of the text, calculated using the TextBlob sentiment analysis model.' ' Ranging from -1 (negative) to 1 (positive)', @@ -829,14 +842,14 @@ def calculate_builtin_properties( include_properties : List[str], default None The properties to calculate. If None, all default properties will be calculated. Cannot be used together with ignore_properties parameter. Available properties are: - ['Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters', 'Language', + ['Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters', '% Punctuation', 'Language', 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Lexical Density', 'Unique Noun Count', 'Readability Score', 'Average Words Per Sentence', 'URLs Count', Unique URLs Count', 'Email Address Count', 'Unique Email Address Count', 'Unique Syllables Count', 'Reading Time', 'Sentences Count', 'Average Syllable Length'] List of default properties are: ['Text Length', 'Average Word Length', 'Max Word Length', - '% Special Characters', 'Language', 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', - 'Lexical Density', 'Unique Noun Count', 'Readability Score', 'Average Words Per Sentence'] + '% Special Characters', '% Punctuation', 'Language', 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', + 'Formality', 'Lexical Density', 'Unique Noun Count', 'Readability Score', 'Average Words Per Sentence'] To calculate all the default properties, the include_properties and ignore_properties parameters should be None. If you pass either include_properties or ignore_properties then only the properties specified in the list will be calculated or ignored. diff --git a/docs/source/nlp/usage_guides/nlp_properties.rst b/docs/source/nlp/usage_guides/nlp_properties.rst index 68d7e14824..0488d4cc0c 100644 --- a/docs/source/nlp/usage_guides/nlp_properties.rst +++ b/docs/source/nlp/usage_guides/nlp_properties.rst @@ -55,7 +55,8 @@ Property name Default Property Description Text Length Yes Number of characters in the text No Average Word Length Yes Average number of characters in a word No Max Word Length Yes Maximum number of characters in a word No -% Special Characters Yes Percentage of special characters in the text No +% Special Characters Yes Percentage of special characters in the text. Special characters are non-alphanumeric unicode characters, excluding whitespaces and any of !"#$%&'()*+,-./:;=?\@. No +% Punctuations Yes Percentage of punctuation characters in the text. Punctuation characters are any of '!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ No Language Yes Language of the text. Uses the langdetect library No Sentiment Yes Sentiment of the text. Uses the textblob library Yes Subjectivity Yes Subjectivity of the text. 
Uses the textblob library Yes diff --git a/tests/nlp/checks/data_integrity/under_annotated_segments_test.py b/tests/nlp/checks/data_integrity/under_annotated_segments_test.py index 4ff02be71e..6989f183ba 100644 --- a/tests/nlp/checks/data_integrity/under_annotated_segments_test.py +++ b/tests/nlp/checks/data_integrity/under_annotated_segments_test.py @@ -123,7 +123,7 @@ def test_token_classification_dataset(small_wikiann_train_test_text_data): )) assert_that(result.value['avg_score'], close_to(0.8, 0.001)) - assert_that(len(result.value['weak_segments_list']), equal_to(23)) + assert_that(len(result.value['weak_segments_list']), equal_to(25)) assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.2, 0.01)) diff --git a/tests/nlp/checks/train_test_validation/property_drift_test.py b/tests/nlp/checks/train_test_validation/property_drift_test.py index ef92d44ae4..f54f1aa600 100644 --- a/tests/nlp/checks/train_test_validation/property_drift_test.py +++ b/tests/nlp/checks/train_test_validation/property_drift_test.py @@ -80,7 +80,7 @@ def test_with_drift(self, tweet_emotion_train_test_textdata): "Method": "Kolmogorov-Smirnov", "Importance": None}), "% Special Characters": has_entries({ - "Drift score": close_to(0.23, 0.01), + "Drift score": close_to(0.13, 0.01), "Method": "Kolmogorov-Smirnov", "Importance": None}), "Sentiment": has_entries({ @@ -137,7 +137,7 @@ def test_with_drift(self, small_wikiann_train_test_text_data): assert_that(result.value, has_entries({ 'Max Word Length': has_entries({'Drift score': close_to(0.18, 0.01), 'Method': 'Kolmogorov-Smirnov'}), 'Average Word Length': has_entries({'Drift score': close_to(0.24, 0.01), 'Method': 'Kolmogorov-Smirnov'}), - '% Special Characters': has_entries({'Drift score': close_to(0.16, 0.01), 'Method': 'Kolmogorov-Smirnov'}), + '% Special Characters': has_entries({'Drift score': close_to(0.04, 0.01), 'Method': 'Kolmogorov-Smirnov'}), 'Text Length': has_entries({'Drift score': close_to(0.3, 0.01), 'Method': 'Kolmogorov-Smirnov'}), 'Subjectivity': has_entries({'Drift score': None, 'Method': None}), 'Sentiment': has_entries({'Drift score': None, 'Method': None}) diff --git a/tests/nlp/test_text_data.py b/tests/nlp/test_text_data.py index d441d4679c..f59fe3eed8 100644 --- a/tests/nlp/test_text_data.py +++ b/tests/nlp/test_text_data.py @@ -186,14 +186,14 @@ def test_properties(text_classification_dataset_mock): dataset.calculate_builtin_properties(include_long_calculation_properties=False) properties = dataset.properties assert_that(properties.shape[0], equal_to(3)) - assert_that(properties.shape[1], equal_to(10)) + assert_that(properties.shape[1], equal_to(11)) assert_that(properties.columns, contains_exactly( 'Text Length', 'Average Word Length', - 'Max Word Length', '% Special Characters', 'Language', 'Sentiment', + 'Max Word Length', '% Special Characters', '% Punctuation', 'Language', 'Sentiment', 'Subjectivity', 'Average Words Per Sentence', 'Readability Score', 'Lexical Density' )) assert_that(properties.iloc[0].values, contains_exactly( - 22, 3.6, 9, 0.0, 'en', 0.0, 0.0, 5.0, 100.24, 80.0 + 22, 3.6, 9, 0.0, 0.0, 'en', 0.0, 0.0, 5.0, 100.24, 80.0 )) diff --git a/tests/nlp/utils/test_properties.py b/tests/nlp/utils/test_properties.py index 3b2dbd71cc..6efd7afb86 100644 --- a/tests/nlp/utils/test_properties.py +++ b/tests/nlp/utils/test_properties.py @@ -121,7 +121,7 @@ def test_max_word_length(): def test_percentage_special_characters(): # Arrange - text = ['This is a, test sentence.', 'This is another !!! 
test sentence.', 'וואלק זה משפט בעברית אפילו יא ווראדי'] + text = ['This is a, test sentence.', 'This is another <|> test sentence.', 'וואלק זה משפט בעברית אפילו יא ווראדי'] # Act result = calculate_builtin_properties(text, include_properties=['% Special Characters'])[0] @@ -129,11 +129,26 @@ def test_percentage_special_characters(): result_empty_string = calculate_builtin_properties([''], include_properties=['% Special Characters'])[0] # Assert - assert_that(result['% Special Characters'], equal_to([2 / 25, 4 / 34, 0])) + assert_that(result['% Special Characters'], equal_to([0, 3 / 34, 0])) assert_that(result_none_text['% Special Characters'], equal_to([np.nan])) assert_that(result_empty_string['% Special Characters'], equal_to([0])) +def test_percentage_punctuation(): + # Arrange + text = ['This is a, test sentence.', 'This is another <|> test sentence.', 'וואלק זה משפט בעברית אפילו יא ווראדי'] + + # Act + result = calculate_builtin_properties(text, include_properties=['% Punctuation'])[0] + result_none_text = calculate_builtin_properties([None], include_properties=['% Punctuation'])[0] + result_empty_string = calculate_builtin_properties([''], include_properties=['% Punctuation'])[0] + + # Assert + assert_that(result['% Punctuation'], equal_to([2 / 25, 4 / 34, 0])) + assert_that(result_none_text['% Punctuation'], equal_to([np.nan])) + assert_that(result_empty_string['% Punctuation'], equal_to([0])) + + def test_calculate_lexical_density_property(tweet_emotion_train_test_textdata): # Arrange _, test = tweet_emotion_train_test_textdata @@ -367,7 +382,7 @@ def test_ignore_properties(): # Arrange test_text = ['This is simple sentence.'] expected_properties = ['Text Length', 'Average Word Length', 'Max Word Length', - '% Special Characters', 'Language', 'Sentiment', 'Subjectivity', + '% Special Characters', '% Punctuation', 'Language', 'Sentiment', 'Subjectivity', 'Lexical Density', 'Readability Score', 'Average Words Per Sentence'] # Also check capitalization doesn't matter: From 407652aaaabd7980742f04c8b2e42f61bf91f9ae Mon Sep 17 00:00:00 2001 From: Noam Bressler Date: Thu, 20 Jul 2023 14:17:19 +0300 Subject: [PATCH 16/23] Rename Readability Score to Reading Ease (#2639) --- deepchecks/nlp/utils/text_properties.py | 16 ++++++++-------- docs/source/nlp/usage_guides/nlp_properties.rst | 2 +- .../nlp/usage_guides/nlp_properties_extended.rst | 4 ++-- tests/nlp/test_text_data.py | 2 +- tests/nlp/utils/test_properties.py | 10 +++++----- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/deepchecks/nlp/utils/text_properties.py b/deepchecks/nlp/utils/text_properties.py index bff58b1c7b..c200c12229 100644 --- a/deepchecks/nlp/utils/text_properties.py +++ b/deepchecks/nlp/utils/text_properties.py @@ -520,7 +520,7 @@ def readability_score(text: str, cmudict_dict: dict = None) -> float: return np.nan if cmudict_dict is None: if not nltk_download('cmudict', quiet=True): - _warn_if_missing_nltk_dependencies('cmudict', 'Readability Score') + _warn_if_missing_nltk_dependencies('cmudict', 'Reading Ease') return np.nan cmudict_dict = corpus.cmudict.dict() text_sentences = _sample_for_property(text, mode='sentences', limit=DEFAULT_SENTENCE_SAMPLE_SIZE, @@ -683,7 +683,7 @@ class TextProperty(TypedDict): {'name': 'Sentiment', 'method': sentiment, 'output_type': 'numeric'}, {'name': 'Subjectivity', 'method': subjectivity, 'output_type': 'numeric'}, {'name': 'Average Words Per Sentence', 'method': average_words_per_sentence, 'output_type': 'numeric'}, - {'name': 'Readability Score', 'method': 
readability_score, 'output_type': 'numeric'}, + {'name': 'Reading Ease', 'method': readability_score, 'output_type': 'numeric'}, {'name': 'Lexical Density', 'method': lexical_density, 'output_type': 'numeric'}, {'name': 'Toxicity', 'method': toxicity, 'output_type': 'numeric'}, {'name': 'Fluency', 'method': fluency, 'output_type': 'numeric'}, @@ -711,11 +711,11 @@ class TextProperty(TypedDict): LARGE_SAMPLE_SIZE = 10_000 ENGLISH_ONLY_PROPERTIES = ( - 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Readability Score', + 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Reading Ease', 'Unique Noun Count', 'Unique Syllables Count', 'Sentences Count', 'Average Syllable Length' ) -CMUDICT_PROPERTIES = ('Average Syllable Length', 'Unique Syllables Count', 'Readability Score') +CMUDICT_PROPERTIES = ('Average Syllable Length', 'Unique Syllables Count', 'Reading Ease') TEXT_PROPERTIES_DESCRIPTION = { 'Text Length': 'Number of characters in the text', @@ -731,8 +731,8 @@ class TextProperty(TypedDict): 'Subjectivity': 'Subjectivity of the text, calculated using the TextBlob sentiment analysis model. Ranging from 0 ' '(objective) to 1 (subjective)', 'Average Words Per Sentence': 'Average number of words per sentence in the text', - 'Readability Score': 'How easy to read a text sample is, from 0 (hard to read) to 100 (easy).' - ' Based on Flesch reading-ease score', + 'Reading Ease': 'How easy to read a text sample is, typically ranges from around 0 (hard to read) to around ' + '100 (very easy). Based on Flesch reading-ease score', 'Lexical Density': 'Percentage of unique words in the text', 'Toxicity': 'A measure of how harmful or offensive a text sample is (0 to 1), uses the Detoxify library ' 'unitary/toxic-bert model', @@ -844,12 +844,12 @@ def calculate_builtin_properties( together with ignore_properties parameter. Available properties are: ['Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters', '% Punctuation', 'Language', 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Lexical Density', 'Unique Noun Count', - 'Readability Score', 'Average Words Per Sentence', 'URLs Count', Unique URLs Count', 'Email Address Count', + 'Reading Ease', 'Average Words Per Sentence', 'URLs Count', Unique URLs Count', 'Email Address Count', 'Unique Email Address Count', 'Unique Syllables Count', 'Reading Time', 'Sentences Count', 'Average Syllable Length'] List of default properties are: ['Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters', '% Punctuation', 'Language', 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', - 'Formality', 'Lexical Density', 'Unique Noun Count', 'Readability Score', 'Average Words Per Sentence'] + 'Formality', 'Lexical Density', 'Unique Noun Count', 'Reading Ease', 'Average Words Per Sentence'] To calculate all the default properties, the include_properties and ignore_properties parameters should be None. If you pass either include_properties or ignore_properties then only the properties specified in the list will be calculated or ignored. diff --git a/docs/source/nlp/usage_guides/nlp_properties.rst b/docs/source/nlp/usage_guides/nlp_properties.rst index 0488d4cc0c..55a7f8c7c4 100644 --- a/docs/source/nlp/usage_guides/nlp_properties.rst +++ b/docs/source/nlp/usage_guides/nlp_properties.rst @@ -65,7 +65,7 @@ Fluency* Yes Fluency of the text. Uses the Formality* Yes Formality of the text. 
Uses the s-nlp/roberta-base-formality-ranker model Yes Lexical Density Yes Percentage of unique words in the text, rounded up to 2 decimal digits Yes Unique Noun Count* Yes Number of unique noun words in the text Yes -Readability Score Yes A score calculated based on Flesch reading-ease per text sample. For more information: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease Yes +Reading Ease Yes A score calculated based on Flesch reading-ease per text sample. For more information: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease Yes Average Words Per Sentence Yes Average number of words per sentence in the text No URLs Count No Number of URLS per text sample. No Unique URLs Count No Number of unique URLS per text sample. No diff --git a/docs/source/nlp/usage_guides/nlp_properties_extended.rst b/docs/source/nlp/usage_guides/nlp_properties_extended.rst index 9d93039cb0..8d1e3b79d0 100644 --- a/docs/source/nlp/usage_guides/nlp_properties_extended.rst +++ b/docs/source/nlp/usage_guides/nlp_properties_extended.rst @@ -9,7 +9,7 @@ text characteristics such as the number of words in the text, or more complex pr text contains toxic language. * `Link Validity <#link-validity>`__ -* `Readability Score <#readability-score>`__ +* `Reading Ease <#reading-ease>`__ * `Toxicity <#toxicity>`__ * `Fluency <#fluency>`__ * `Formality <#formality>`__ @@ -25,7 +25,7 @@ without links, the property will always return 1 (all links valid). `Back to Property List <#properties>`__ -Readability Score +Reading Ease ----------------- A score calculated based on the diff --git a/tests/nlp/test_text_data.py b/tests/nlp/test_text_data.py index f59fe3eed8..f9d03aff09 100644 --- a/tests/nlp/test_text_data.py +++ b/tests/nlp/test_text_data.py @@ -190,7 +190,7 @@ def test_properties(text_classification_dataset_mock): assert_that(properties.columns, contains_exactly( 'Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters', '% Punctuation', 'Language', 'Sentiment', - 'Subjectivity', 'Average Words Per Sentence', 'Readability Score', 'Lexical Density' + 'Subjectivity', 'Average Words Per Sentence', 'Reading Ease', 'Lexical Density' )) assert_that(properties.iloc[0].values, contains_exactly( 22, 3.6, 9, 0.0, 0.0, 'en', 0.0, 0.0, 5.0, 100.24, 80.0 diff --git a/tests/nlp/utils/test_properties.py b/tests/nlp/utils/test_properties.py index 6efd7afb86..af4f61c1da 100644 --- a/tests/nlp/utils/test_properties.py +++ b/tests/nlp/utils/test_properties.py @@ -224,13 +224,13 @@ def test_calculate_readability_score_property(tweet_emotion_train_test_textdata) test_text = test.text # Act - result = calculate_builtin_properties(test_text, include_properties=['Readability Score'])[0] - result_none_text = calculate_builtin_properties([None], include_properties=['Readability Score'])[0] + result = calculate_builtin_properties(test_text, include_properties=['Reading Ease'])[0] + result_none_text = calculate_builtin_properties([None], include_properties=['Reading Ease'])[0] # Assert - assert_that(result['Readability Score'][0: 10], equal_to([96.577, 97.001, 80.306, 67.755, 77.103, 71.782, + assert_that(result['Reading Ease'][0: 10], equal_to([96.577, 97.001, 80.306, 67.755, 77.103, 71.782, np.nan, 75.5, 70.102, 95.564])) - assert_that(result_none_text['Readability Score'], equal_to([np.nan])) + assert_that(result_none_text['Reading Ease'], equal_to([np.nan])) def test_calculate_count_unique_urls(manual_text_data_for_properties): @@ -383,7 +383,7 
@@ def test_ignore_properties(): test_text = ['This is simple sentence.'] expected_properties = ['Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters', '% Punctuation', 'Language', 'Sentiment', 'Subjectivity', - 'Lexical Density', 'Readability Score', 'Average Words Per Sentence'] + 'Lexical Density', 'Reading Ease', 'Average Words Per Sentence'] # Also check capitalization doesn't matter: ignore_properties = ['Unique noun Count', 'toxicity', 'fluency', 'Formality'] From fa069e674375f7f4523609b2c97e588e6d1b0a3a Mon Sep 17 00:00:00 2001 From: Itay Gabbay Date: Sun, 23 Jul 2023 16:36:43 +0300 Subject: [PATCH 17/23] Fix --- deepchecks/nlp/utils/text_properties.py | 1 - 1 file changed, 1 deletion(-) diff --git a/deepchecks/nlp/utils/text_properties.py b/deepchecks/nlp/utils/text_properties.py index c200c12229..98d514723a 100644 --- a/deepchecks/nlp/utils/text_properties.py +++ b/deepchecks/nlp/utils/text_properties.py @@ -416,7 +416,6 @@ def predict_on_batch(text_batch: Sequence[str], classifier, except Exception: # pylint: disable=broad-except reduced_batch_size = max(reduced_batch_size // 2, 1) # Reduce the batch size by half - text_list_to_predict = [] # Clear the list of texts to predict for retry retry_count += 1 return [np.nan] * len(text_batch) # Prediction failed, return NaN values for the original batch size From bd951479bc81a3fbfbaf43a7e4f1b804c9e5ce0b Mon Sep 17 00:00:00 2001 From: Shay Tsadok Date: Tue, 25 Jul 2023 14:34:34 +0300 Subject: [PATCH 18/23] Add ability to cache text prop models (#2641) * Add ability to cache text prop models * isort fix * adding test to model caching * adding a comment * adding license * adding license to test * exclude certifi.2023.7.22 * disabling the test for now * add docstrings * docstring changes --- .github/workflows/build.yml | 2 +- deepchecks/nlp/utils/text_properties.py | 206 ++------------- .../nlp/utils/text_properties_models.py | 237 ++++++++++++++++++ tests/nlp/utils/test_properties.py | 13 +- 4 files changed, 261 insertions(+), 197 deletions(-) create mode 100644 deepchecks/nlp/utils/text_properties_models.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e9b6df7b6a..59dd48ff7f 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -128,7 +128,7 @@ jobs: with: requirements: 'requirements-all.txt' fail: 'Copyleft,Other,Error' - exclude: '(pyzmq.*23\.2\.1|debugpy.*1\.6\.7|certifi.*2023\.5\.7|tqdm.*4\.65\.0|webencodings.*0\.5\.1|torch.*1\.10\.2.*|torchvision.*0\.11\.3.*|terminado.*0\.15\.0.*|urllib3.*1\.26\.11.*|imageio.*2\.20\.0.*|jsonschema.*4\.8\.0.*|qudida.*0\.0\.4)' + exclude: '(pyzmq.*23\.2\.1|debugpy.*1\.6\.7|certifi.*2023\.7\.22|tqdm.*4\.65\.0|webencodings.*0\.5\.1|torch.*1\.10\.2.*|torchvision.*0\.11\.3.*|terminado.*0\.15\.0.*|urllib3.*1\.26\.11.*|imageio.*2\.20\.0.*|jsonschema.*4\.8\.0.*|qudida.*0\.0\.4)' # pyzmq is Revised BSD https://github.com/zeromq/pyzmq/blob/main/examples/LICENSE # debugpy is MIT https://github.com/microsoft/debugpy/blob/main/LICENSE # certifi is MPL-2.0 https://github.com/certifi/python-certifi/blob/master/LICENSE diff --git a/deepchecks/nlp/utils/text_properties.py b/deepchecks/nlp/utils/text_properties.py index 98d514723a..52f7708345 100644 --- a/deepchecks/nlp/utils/text_properties.py +++ b/deepchecks/nlp/utils/text_properties.py @@ -10,7 +10,6 @@ # """Module containing the text properties for the NLP module.""" import gc -import importlib import pathlib import re import string @@ -20,7 +19,6 @@ import numpy as np import pandas 
as pd -import requests import textblob from nltk import corpus from nltk import download as nltk_download @@ -30,6 +28,7 @@ from deepchecks.core.errors import DeepchecksValueError from deepchecks.nlp.utils.text import cut_string, hash_text, normalize_text, remove_punctuation +from deepchecks.nlp.utils.text_properties_models import get_cmudict_dict, get_fasttext_model, get_transformer_pipeline from deepchecks.utils.function import run_available_kwargs from deepchecks.utils.strings import SPECIAL_CHARACTERS, format_list @@ -37,8 +36,6 @@ from deepchecks.utils.validation import is_sequence_not_str -MODELS_STORAGE = pathlib.Path(__file__).absolute().parent / '.nlp-models' -FASTTEXT_LANG_MODEL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin' DEFAULT_SENTENCE_SAMPLE_SIZE = 300 MAX_CHARS = 512 # Bert accepts max of 512 tokens, so without counting tokens we go for the lower bound. # all SPECIAL_CHARACTERS - all string.punctuation except for <>@[]^_`{|}~ - all whitespace @@ -103,159 +100,12 @@ def _sample_for_property(text: str, mode: str = 'words', limit: int = 10000, ret return ' '.join(all_units) if not return_as_list else list(all_units) -def _import_optional_property_dependency( - module: str, - property_name: str, - package_name: Optional[str] = None, - error_template: Optional[str] = None -): - try: - lib = importlib.import_module(module) - except ImportError as error: - package_name = package_name or module.split('.', maxsplit=1)[0] - error_template = error_template or ( - 'property {property_name} requires the {package_name} python package. ' - 'To get it, run:\n' - '>> pip install {package_name}\n\n' - 'You may install dependencies for all text properties by running:\n' - '>> pip install deepchecks[nlp-properties]\n' - ) - raise ImportError(error_template.format( - property_name=property_name, - package_name=package_name - )) from error - else: - return lib - - def _warn_if_missing_nltk_dependencies(dependency: str, property_name: str): """Warn if NLTK dependency is missing.""" warnings.warn(f'NLTK {dependency} not found, {property_name} cannot be calculated.' ' Please check your internet connection.', UserWarning) -def get_create_model_storage(models_storage: Union[pathlib.Path, str, None] = None): - """Get the models storage directory and create it if needed.""" - if models_storage is None: - models_storage = MODELS_STORAGE - else: - if isinstance(models_storage, str): - models_storage = pathlib.Path(models_storage) - if not isinstance(models_storage, pathlib.Path): - raise ValueError( - f'Unexpected type of the "models_storage" parameter - {type(models_storage)}' - ) - if not models_storage.exists(): - models_storage.mkdir(parents=True) - if not models_storage.is_dir(): - raise ValueError('"model_storage" expected to be a directory') - - return models_storage - - -def get_transformer_model( - property_name: str, - model_name: str, - device: Optional[str] = None, - quantize_model: bool = False, - models_storage: Union[pathlib.Path, str, None] = None -): - """Get the transformer model and decide if to use optimum.onnxruntime. - - optimum.onnxruntime is used to optimize running times on CPU. 
- """ - models_storage = get_create_model_storage(models_storage) - - if device not in (None, 'cpu'): - transformers = _import_optional_property_dependency('transformers', property_name=property_name) - # TODO: quantize if 'quantize_model' is True - return transformers.AutoModelForSequenceClassification.from_pretrained( - model_name, - cache_dir=models_storage, - device_map=device - ) - - onnx = _import_optional_property_dependency( - 'optimum.onnxruntime', - property_name=property_name, - error_template=( - f'The device was set to {device} while computing the {property_name} property,' - 'in which case deepchecks resorts to accelerating the inference by using optimum,' - 'bit it is not installed. Either:\n' - '\t- Set the device according to your hardware;\n' - '\t- Install optimum by running "pip install optimum";\n' - '\t- Install all dependencies needed for text properties by running ' - '"pip install deepchecks[nlp-properties]";\n' - ) - ) - - if quantize_model is False: - model_path = models_storage / 'onnx' / model_name - - if model_path.exists(): - return onnx.ORTModelForSequenceClassification.from_pretrained(model_path).to(device or -1) - - model = onnx.ORTModelForSequenceClassification.from_pretrained( - model_name, - export=True, - cache_dir=models_storage, - ).to(device or -1) - # NOTE: - # 'optimum', after exporting/converting a model to the ONNX format, - # does not store it onto disk we need to save it now to not reconvert - # it each time - model.save_pretrained(model_path) - return model - - model_path = models_storage / 'onnx' / 'quantized' / model_name - - if model_path.exists(): - return onnx.ORTModelForSequenceClassification.from_pretrained(model_path).to(device or -1) - - not_quantized_model = get_transformer_model( - property_name, - model_name, - device, - quantize_model=False, - models_storage=models_storage - ) - - quantizer = onnx.ORTQuantizer.from_pretrained(not_quantized_model).to(device or -1) - - quantizer.quantize( - save_dir=model_path, - # TODO: make it possible to provide a config as a parameter - quantization_config=onnx.configuration.AutoQuantizationConfig.avx512_vnni( - is_static=False, - per_channel=False - ) - ) - return onnx.ORTModelForSequenceClassification.from_pretrained(model_path).to(device or -1) - - -def get_transformer_pipeline( - property_name: str, - model_name: str, - device: Optional[str] = None, - models_storage: Union[pathlib.Path, str, None] = None -): - """Return a transformers pipeline for the given model name.""" - transformers = _import_optional_property_dependency('transformers', property_name=property_name) - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, device_map=device) - model = get_transformer_model( - property_name=property_name, - model_name=model_name, - device=device, - models_storage=models_storage - ) - return transformers.pipeline( - 'text-classification', - model=model, - tokenizer=tokenizer, - device=device - ) - - def text_length(text: str) -> int: """Return text length.""" return len(text) @@ -283,36 +133,6 @@ def max_word_length(text: str) -> int: return max(len(w) for w in words) if words else 0 -def _get_fasttext_model(models_storage: Union[pathlib.Path, str, None] = None): - """Return fasttext model.""" - fasttext = _import_optional_property_dependency(module='fasttext', property_name='language') - - model_name = FASTTEXT_LANG_MODEL.rsplit('/', maxsplit=1)[-1] - model_path = get_create_model_storage(models_storage) - model_path = model_path / 'fasttext' - - if not model_path.exists(): - 
model_path.mkdir(parents=True) - - model_path = model_path / model_name - - # Save the model to a file - if not model_path.exists(): - response = requests.get(FASTTEXT_LANG_MODEL, timeout=240) - if response.status_code != 200: - raise RuntimeError('Failed to donwload fasttext model') - model_path.write_bytes(response.content) - - # This weird code is to suppress a warning from fasttext about a deprecated function - try: - fasttext.FastText.eprint = lambda *args, **kwargs: None - fasttext_model = fasttext.load_model(str(model_path)) - except Exception as exp: - raise exp - - return fasttext_model - - def language( text: str, lang_certainty_threshold: float = 0.8, @@ -324,7 +144,7 @@ def language( # Load the model if it wasn't received as a parameter. This is done to avoid loading the model # each time the function is called. if fasttext_model is None: - fasttext_model = _get_fasttext_model() + fasttext_model = get_fasttext_model() # Predictions are the first prediction (k=1), only if the probability is above the threshold prediction = fasttext_model.predict(text.replace('\n', ' '), k=1, threshold=lang_certainty_threshold)[0] @@ -830,7 +650,8 @@ def calculate_builtin_properties( ignore_non_english_samples_for_english_properties: bool = True, device: Optional[str] = None, models_storage: Union[pathlib.Path, str, None] = None, - batch_size: Optional[int] = 16 + batch_size: Optional[int] = 16, + cache_models: bool = False ) -> Tuple[Dict[str, List[float]], Dict[str, str]]: """Calculate properties on provided text samples. @@ -875,6 +696,8 @@ def calculate_builtin_properties( Also, if a folder already contains relevant resources they are not re-downloaded. batch_size : int, default 8 The batch size. + cache_models : bool, default False + cache the models being used in this function, to save load time in next execution Returns ------- @@ -902,7 +725,7 @@ def calculate_builtin_properties( # Prepare kwargs for properties that require outside resources: if 'fasttext_model' not in kwargs: - kwargs['fasttext_model'] = _get_fasttext_model(models_storage=models_storage) + kwargs['fasttext_model'] = get_fasttext_model(models_storage=models_storage, use_cache=cache_models) if 'cmudict_dict' not in kwargs: properties_requiring_cmudict = list(set(CMUDICT_PROPERTIES) & set(text_properties_names)) @@ -911,20 +734,22 @@ def calculate_builtin_properties( _warn_if_missing_nltk_dependencies('cmudict', format_list(properties_requiring_cmudict)) for prop in properties_requiring_cmudict: calculated_properties[prop] = [np.nan] * len(raw_text) - cmudict_dict = corpus.cmudict.dict() - kwargs['cmudict_dict'] = cmudict_dict + kwargs['cmudict_dict'] = get_cmudict_dict(use_cache=cache_models) if 'Toxicity' in text_properties_names and 'toxicity_classifier' not in kwargs: kwargs['toxicity_classifier'] = get_transformer_pipeline( - property_name='toxicity', model_name=TOXICITY_MODEL_NAME, device=device, models_storage=models_storage) + property_name='toxicity', model_name=TOXICITY_MODEL_NAME, device=device, + models_storage=models_storage, use_cache=cache_models) if 'Formality' in text_properties_names and 'formality_classifier' not in kwargs: kwargs['formality_classifier'] = get_transformer_pipeline( - property_name='formality', model_name=FORMALITY_MODEL_NAME, device=device, models_storage=models_storage) + property_name='formality', model_name=FORMALITY_MODEL_NAME, device=device, + models_storage=models_storage, use_cache=cache_models) if 'Fluency' in text_properties_names and 'fluency_classifier' not in kwargs: 
kwargs['fluency_classifier'] = get_transformer_pipeline( - property_name='fluency', model_name=FLUENCY_MODEL_NAME, device=device, models_storage=models_storage) + property_name='fluency', model_name=FLUENCY_MODEL_NAME, device=device, + models_storage=models_storage, use_cache=cache_models) is_language_property_requested = 'Language' in [prop['name'] for prop in text_properties] # Remove language property from the list of properties to calculate as it will be calculated separately: @@ -994,7 +819,8 @@ def calculate_builtin_properties( sentences_cache.clear() # Clean all remaining RAM: - gc.collect() + if not cache_models: + gc.collect() if not calculated_properties: raise RuntimeError('Failed to calculate any of the properties.') diff --git a/deepchecks/nlp/utils/text_properties_models.py b/deepchecks/nlp/utils/text_properties_models.py new file mode 100644 index 0000000000..994dbd1935 --- /dev/null +++ b/deepchecks/nlp/utils/text_properties_models.py @@ -0,0 +1,237 @@ +# ---------------------------------------------------------------------------- +# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com) +# +# This file is part of Deepchecks. +# Deepchecks is distributed under the terms of the GNU Affero General +# Public License (version 3 or later). +# You should have received a copy of the GNU Affero General Public License +# along with Deepchecks. If not, see . +# ---------------------------------------------------------------------------- +# +"""Module containing the text properties models for the NLP module.""" +import importlib +import pathlib +from functools import lru_cache +from typing import Optional, Union + +import requests +from nltk import corpus + +MODELS_STORAGE = pathlib.Path(__file__).absolute().parent / '.nlp-models' + + +def get_create_model_storage(models_storage: Union[pathlib.Path, str, None] = None): + """Get the models storage directory and create it if needed.""" + if models_storage is None: + models_storage = MODELS_STORAGE + else: + if isinstance(models_storage, str): + models_storage = pathlib.Path(models_storage) + if not isinstance(models_storage, pathlib.Path): + raise ValueError( + f'Unexpected type of the "models_storage" parameter - {type(models_storage)}' + ) + if not models_storage.exists(): + models_storage.mkdir(parents=True) + if not models_storage.is_dir(): + raise ValueError('"model_storage" expected to be a directory') + + return models_storage + + +def _get_transformer_model( + property_name: str, + model_name: str, + device: Optional[str] = None, + quantize_model: bool = False, + models_storage: Union[pathlib.Path, str, None] = None +): + """Get the transformer model and decide if to use optimum.onnxruntime. + + optimum.onnxruntime is used to optimize running times on CPU. + """ + models_storage = get_create_model_storage(models_storage) + + if device not in (None, 'cpu'): + transformers = import_optional_property_dependency('transformers', property_name=property_name) + # TODO: quantize if 'quantize_model' is True + return transformers.AutoModelForSequenceClassification.from_pretrained( + model_name, + cache_dir=models_storage, + device_map=device + ) + + onnx = import_optional_property_dependency( + 'optimum.onnxruntime', + property_name=property_name, + error_template=( + f'The device was set to {device} while computing the {property_name} property,' + 'in which case deepchecks resorts to accelerating the inference by using optimum,' + 'bit it is not installed. 
Either:\n' + '\t- Set the device according to your hardware;\n' + '\t- Install optimum by running "pip install optimum";\n' + '\t- Install all dependencies needed for text properties by running ' + '"pip install deepchecks[nlp-properties]";\n' + ) + ) + + if quantize_model is False: + model_path = models_storage / 'onnx' / model_name + + if model_path.exists(): + return onnx.ORTModelForSequenceClassification.from_pretrained(model_path).to(device or -1) + + model = onnx.ORTModelForSequenceClassification.from_pretrained( + model_name, + export=True, + cache_dir=models_storage, + ).to(device or -1) + # NOTE: + # 'optimum', after exporting/converting a model to the ONNX format, + # does not store it onto disk we need to save it now to not reconvert + # it each time + model.save_pretrained(model_path) + return model + + model_path = models_storage / 'onnx' / 'quantized' / model_name + + if model_path.exists(): + return onnx.ORTModelForSequenceClassification.from_pretrained(model_path).to(device or -1) + + not_quantized_model = _get_transformer_model( + property_name, + model_name, + device, + quantize_model=False, + models_storage=models_storage + ) + + quantizer = onnx.ORTQuantizer.from_pretrained(not_quantized_model).to(device or -1) + + quantizer.quantize( + save_dir=model_path, + # TODO: make it possible to provide a config as a parameter + quantization_config=onnx.configuration.AutoQuantizationConfig.avx512_vnni( + is_static=False, + per_channel=False + ) + ) + return onnx.ORTModelForSequenceClassification.from_pretrained(model_path).to(device or -1) + + +def import_optional_property_dependency( + module: str, + property_name: str, + package_name: Optional[str] = None, + error_template: Optional[str] = None +): + """Import additional modules in runtime.""" + try: + lib = importlib.import_module(module) + except ImportError as error: + package_name = package_name or module.split('.', maxsplit=1)[0] + error_template = error_template or ( + 'property {property_name} requires the {package_name} python package. 
' + 'To get it, run:\n' + '>> pip install {package_name}\n\n' + 'You may install dependencies for all text properties by running:\n' + '>> pip install deepchecks[nlp-properties]\n' + ) + raise ImportError(error_template.format( + property_name=property_name, + package_name=package_name + )) from error + else: + return lib + + +def get_transformer_pipeline( + property_name: str, + model_name: str, + device: Optional[str] = None, + models_storage: Union[pathlib.Path, str, None] = None, + use_cache=False +): + """Return a transformers pipeline for the given model name.""" + if use_cache: + return _get_transformer_pipeline(property_name, model_name, device, models_storage) + # __wrapped__ is simply the function without decoration, in our case - without caching + return _get_transformer_pipeline.__wrapped__(property_name, model_name, device, models_storage) + + +@lru_cache(maxsize=5) +def _get_transformer_pipeline( + property_name: str, + model_name: str, + device: Optional[str] = None, + models_storage: Union[pathlib.Path, str, None] = None +): + """Return a transformers pipeline for the given model name.""" + transformers = import_optional_property_dependency('transformers', property_name=property_name) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, device_map=device) + model = _get_transformer_model( + property_name=property_name, + model_name=model_name, + device=device, + models_storage=models_storage + ) + return transformers.pipeline( + 'text-classification', + model=model, + tokenizer=tokenizer, + device=device + ) + + +def get_cmudict_dict(use_cache=False): + """Return corpus as dict.""" + if use_cache: + return _get_cmudict_dict() + return _get_cmudict_dict.__wrapped__() + + +@lru_cache(maxsize=1) +def _get_cmudict_dict(): + cmudict_dict = corpus.cmudict.dict() + return cmudict_dict + + +FASTTEXT_LANG_MODEL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin' + + +def get_fasttext_model(models_storage: Union[pathlib.Path, str, None] = None, use_cache=False): + """Return fasttext model.""" + if use_cache: + return _get_fasttext_model(models_storage) + return _get_fasttext_model.__wrapped__(models_storage) + + +@lru_cache(maxsize=1) +def _get_fasttext_model(models_storage: Union[pathlib.Path, str, None] = None): + """Return fasttext model.""" + fasttext = import_optional_property_dependency(module='fasttext', property_name='language') + + model_name = FASTTEXT_LANG_MODEL.rsplit('/', maxsplit=1)[-1] + model_path = get_create_model_storage(models_storage) + model_path = model_path / 'fasttext' + + if not model_path.exists(): + model_path.mkdir(parents=True) + + model_path = model_path / model_name + + # Save the model to a file + if not model_path.exists(): + response = requests.get(FASTTEXT_LANG_MODEL, timeout=240) + if response.status_code != 200: + raise RuntimeError('Failed to donwload fasttext model') + model_path.write_bytes(response.content) + + # This weird code is to suppress a warning from fasttext about a deprecated function + try: + fasttext.FastText.eprint = lambda *args, **kwargs: None + fasttext_model = fasttext.load_model(str(model_path)) + except Exception as exp: + raise exp + + return fasttext_model diff --git a/tests/nlp/utils/test_properties.py b/tests/nlp/utils/test_properties.py index af4f61c1da..29057de6e8 100644 --- a/tests/nlp/utils/test_properties.py +++ b/tests/nlp/utils/test_properties.py @@ -18,8 +18,9 @@ from hamcrest import * from deepchecks.core.errors import DeepchecksValueError -from 
deepchecks.nlp.utils.text_properties import (MODELS_STORAGE, _sample_for_property, calculate_builtin_properties, - get_transformer_model, is_english) +from deepchecks.nlp.utils.text_properties import (_sample_for_property, calculate_builtin_properties, + is_english) +from deepchecks.nlp.utils.text_properties_models import MODELS_STORAGE, _get_transformer_model def mock_fn(*args, **kwargs): # pylint: disable=unused-argument @@ -419,7 +420,7 @@ def test_properties_models_download(): model_download_time = timeit.timeit( stmt='fn()', number=1, - globals={'fn': lambda: get_transformer_model( + globals={'fn': lambda: _get_transformer_model( property_name='', model_name=model_name )} @@ -431,7 +432,7 @@ def test_properties_models_download(): assert onnx_model_path.exists() and onnx_model_path.is_dir() # Act - get_transformer_model(property_name='', model_name=model_name, quantize_model=True) + _get_transformer_model(property_name='', model_name=model_name, quantize_model=True) # Assert assert quantized_model_path.exists() and quantized_model_path.is_dir() @@ -440,7 +441,7 @@ def test_properties_models_download(): model_creation_time = timeit.timeit( stmt='fn()', number=1, - globals={'fn': lambda: get_transformer_model( + globals={'fn': lambda: _get_transformer_model( property_name='', model_name=model_name, quantize_model=True @@ -462,7 +463,7 @@ def test_properties_models_download_into_provided_directory(): onnx_model_path = MODELS_STORAGE / 'onnx' / model_name # Act - get_transformer_model(property_name='', model_name=model_name, models_storage=directory) + _get_transformer_model(property_name='', model_name=model_name, models_storage=directory) # Assert assert MODELS_STORAGE.exists() and MODELS_STORAGE.is_dir() From de048941f4655fa4a1770efd242f82fc43c89d2b Mon Sep 17 00:00:00 2001 From: Noam Bressler Date: Tue, 25 Jul 2023 15:24:42 +0300 Subject: [PATCH 19/23] Fix mixup in dataset names and colors in feature drift (#2644) * Fix mixup in dataset names and colors in feature drift --- deepchecks/utils/abstracts/feature_drift.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepchecks/utils/abstracts/feature_drift.py b/deepchecks/utils/abstracts/feature_drift.py index 7e4b7bf154..cf9e6db1e3 100644 --- a/deepchecks/utils/abstracts/feature_drift.py +++ b/deepchecks/utils/abstracts/feature_drift.py @@ -81,7 +81,7 @@ def _calculate_feature_drift( ignore_na=self.ignore_na, min_samples=self.min_samples, with_display=with_display, - dataset_names=(test_dataframe_name, train_dataframe_name) + dataset_names=(train_dataframe_name, test_dataframe_name) ) if value == 'not_enough_samples': From deb16bc2b3f24fc7f70274556e968b2802056100 Mon Sep 17 00:00:00 2001 From: Noam Bressler Date: Wed, 26 Jul 2023 11:34:31 +0300 Subject: [PATCH 20/23] Fix the Simple Model Comp condition (#2647) * Fix the Simple Model Comp condition - handle scorers that are not per class --- .../model_evaluation/simple_model_comparison.py | 12 +++++++++--- .../simple_model_comparison_test.py | 17 +++++++++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/deepchecks/tabular/checks/model_evaluation/simple_model_comparison.py b/deepchecks/tabular/checks/model_evaluation/simple_model_comparison.py index b5e6438274..b92d23cd84 100644 --- a/deepchecks/tabular/checks/model_evaluation/simple_model_comparison.py +++ b/deepchecks/tabular/checks/model_evaluation/simple_model_comparison.py @@ -371,7 +371,8 @@ def add_condition_gain_greater_than(self, Used in classification models to limit condition only to 
given classes. average : bool , default: False Used in classification models to flag if to run condition on average of classes, or on - each class individually + each class individually. If any scorer that return a single value is used, this parameter + is ignored (will act as if average=True). """ name = f'Model performance gain over simple model is greater than {format_percent(min_allowed_gain)}' if classes: @@ -390,8 +391,13 @@ def condition(result: Dict, include_classes=None, average=False, max_gain=None, task_type = result['type'] scorers_perfect = result['scorers_perfect'] + # If the depth of the nested scores dict is 2, average is not relevant and is set to True + inner_dict = scores[list(scores.keys())[0]] + inner_inner_dict = inner_dict[list(inner_dict.keys())[0]] + force_average = isinstance(inner_inner_dict, Number) + passed_condition = True - if task_type in [TaskType.MULTICLASS, TaskType.BINARY] and not average: + if task_type in [TaskType.MULTICLASS, TaskType.BINARY] and not average and not force_average: passed_metrics = {} failed_classes = defaultdict(dict) perfect_metrics = [] @@ -433,7 +439,7 @@ def condition(result: Dict, include_classes=None, average=False, max_gain=None, passed_metrics = {} failed_metrics = {} perfect_metrics = [] - if task_type in [TaskType.MULTICLASS, TaskType.BINARY]: + if task_type in [TaskType.MULTICLASS, TaskType.BINARY] and not force_average: scores = average_scores(scores, include_classes) for metric, models_scores in scores.items(): # If origin model is perfect, skip the gain calculation diff --git a/tests/tabular/checks/model_evaluation/simple_model_comparison_test.py b/tests/tabular/checks/model_evaluation/simple_model_comparison_test.py index 017d89adc2..d8e8fdee08 100644 --- a/tests/tabular/checks/model_evaluation/simple_model_comparison_test.py +++ b/tests/tabular/checks/model_evaluation/simple_model_comparison_test.py @@ -194,6 +194,23 @@ def test_condition_pass_for_multiclass_avg(iris_split_dataset_and_model): )) +def test_condition_pass_for_custom_scorer(iris_dataset_single_class, iris_random_forest_single_class): + train_ds = iris_dataset_single_class + test_ds = iris_dataset_single_class + clf = iris_random_forest_single_class + # Arrange + check = SimpleModelComparison(scorers=['f1'], strategy='most_frequent').add_condition_gain_greater_than(0.43) + # Act X + result = check.run(train_ds, test_ds, clf) + # Assert + assert_that(result.conditions_results, has_items( + equal_condition_result( + is_pass=True, + details='Found metrics with perfect score, no gain is calculated: [\'f1\']', + name='Model performance gain over simple model is greater than 43%') + )) + + def test_condition_pass_for_multiclass_avg_with_classes(iris_split_dataset_and_model): train_ds, test_ds, clf = iris_split_dataset_and_model # Arrange From daad127a3a922e08420225baa04f84088dc2e011 Mon Sep 17 00:00:00 2001 From: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com> Date: Wed, 26 Jul 2023 12:46:10 +0300 Subject: [PATCH 21/23] Weak segment - return single segment per feature argument (#2645) --- .../under_annotated_segments.py | 30 +++++++++++++------ .../weak_segments_performance.py | 21 ++++++++++--- .../weak_segments_performance.py | 14 ++++++--- .../utils/abstracts/weak_segment_abstract.py | 25 ++++++++++++---- .../weak_segments_performance.py | 8 ++++- .../plot_under_annotated_metadata_segments.py | 7 +++-- .../plot_under_annotated_property_segments.py | 3 ++ .../plot_metadata_segments_performance.py | 7 +++-- .../plot_property_segments_performance.py 
| 4 ++- .../plot_weak_segments_performance.py | 3 ++ .../plot_multi_label_classification.py | 6 ++-- .../quickstarts/plot_text_classification.py | 4 +-- .../quickstarts/plot_token_classification.py | 4 +-- .../under_annotated_segments_test.py | 12 ++++---- .../weak_segment_performance_test.py | 10 +++---- .../weak_segments_performance_test.py | 6 ++-- 16 files changed, 115 insertions(+), 49 deletions(-) diff --git a/deepchecks/nlp/checks/data_integrity/under_annotated_segments.py b/deepchecks/nlp/checks/data_integrity/under_annotated_segments.py index 4b942657ad..b0b32e5b5a 100644 --- a/deepchecks/nlp/checks/data_integrity/under_annotated_segments.py +++ b/deepchecks/nlp/checks/data_integrity/under_annotated_segments.py @@ -9,7 +9,7 @@ # ---------------------------------------------------------------------------- # """Module of the under annotated segments check.""" -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import numpy as np import pandas as pd @@ -41,8 +41,8 @@ class UnderAnnotatedSegments(SingleDatasetCheck, WeakSegmentAbstract): def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], None], ignore_columns: Union[Hashable, List[Hashable], None], n_top_features: int, - segment_minimum_size_ratio: float, n_samples: int, n_to_show: int, - categorical_aggregation_threshold: float, **kwargs): + segment_minimum_size_ratio: float, n_samples: int, n_to_show: int, + categorical_aggregation_threshold: float, multiple_segments_per_feature: bool, **kwargs): super().__init__(**kwargs) self.segment_by = segment_by self.columns = columns @@ -53,6 +53,7 @@ def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], Non self.n_to_show = n_to_show self.categorical_aggregation_threshold = categorical_aggregation_threshold self.annotation_ratio_threshold = ANNOTATION_RATIO_THRESHOLD + self.multiple_segments_per_feature = multiple_segments_per_feature def run_logic(self, context: Context, dataset_kind) -> CheckResult: """Run check.""" @@ -81,11 +82,12 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult: avg_score = round(score_per_sample.mean(), 3) weak_segments = self._weak_segments_search(data=encoded_dataset.features_columns, score_per_sample=score_per_sample, - scorer_name='Annotation Ratio') + scorer_name='Annotation Ratio', + multiple_segments_per_feature=self.multiple_segments_per_feature) if len(weak_segments) == 0: display_msg = 'Check was unable to find under annotated segments. Try ' \ - f'supplying more {self.segment_by}.' + f'supplying more {self.segment_by}.' return CheckResult(value={'message': display_msg}, display=[display_msg]) check_result_value = self._generate_check_result_value(weak_segments, cat_features, avg_score) @@ -239,7 +241,7 @@ class UnderAnnotatedPropertySegments(UnderAnnotatedSegments): Properties to check, if none are given checks all properties except ignored ones. ignore_properties : Union[Hashable, List[Hashable]] , default: None Properties to ignore, if none given checks based on properties variable - n_top_properties : int , default: 10 + n_top_properties : Optional[int] , default: 10 Number of properties to use for segment search. Top properties are selected based on feature importance. segment_minimum_size_ratio: float , default: 0.05 Minimum size ratio for segments. Will only search for segments of @@ -250,16 +252,20 @@ class UnderAnnotatedPropertySegments(UnderAnnotatedSegments): number of segments with the weakest performance to show. 
categorical_aggregation_threshold : float , default: 0.05 In each categorical column, categories with frequency below threshold will be merged into "Other" category. + multiple_segments_per_property : bool , default: False + If True, will allow the same property to be a segmenting feature in multiple segments, + otherwise each property can appear in one segment at most. """ def __init__(self, properties: Union[Hashable, List[Hashable], None] = None, ignore_properties: Union[Hashable, List[Hashable], None] = None, - n_top_properties: int = 15, + n_top_properties: Optional[int] = 10, segment_minimum_size_ratio: float = 0.05, n_samples: int = 10_000, categorical_aggregation_threshold: float = 0.05, n_to_show: int = 3, + multiple_segments_per_property: bool = False, **kwargs): super().__init__(segment_by='properties', columns=properties, @@ -269,6 +275,7 @@ def __init__(self, n_samples=n_samples, n_to_show=n_to_show, categorical_aggregation_threshold=categorical_aggregation_threshold, + multiple_segments_per_feature=multiple_segments_per_property, **kwargs) @@ -290,7 +297,7 @@ class UnderAnnotatedMetaDataSegments(UnderAnnotatedSegments): Columns to check, if none are given checks all columns except ignored ones. ignore_columns : Union[Hashable, List[Hashable]] , default: None Columns to ignore, if none given checks based on columns variable - n_top_columns : int , default: 10 + n_top_columns : Optional[int] , default: 10 Number of features to use for segment search. Top columns are selected based on feature importance. segment_minimum_size_ratio: float , default: 0.05 Minimum size ratio for segments. Will only search for segments of @@ -301,16 +308,20 @@ class UnderAnnotatedMetaDataSegments(UnderAnnotatedSegments): number of segments with the weakest performance to show. categorical_aggregation_threshold : float , default: 0.05 In each categorical column, categories with frequency below threshold will be merged into "Other" category. + multiple_segments_per_column : bool , default: True + If True, will allow the same metadata column to be a segmenting column in multiple segments, + otherwise each metadata column can appear in one segment at most. 
""" def __init__(self, columns: Union[Hashable, List[Hashable], None] = None, ignore_columns: Union[Hashable, List[Hashable], None] = None, - n_top_columns: int = 10, + n_top_columns: Optional[int] = 10, segment_minimum_size_ratio: float = 0.05, n_samples: int = 10_000, categorical_aggregation_threshold: float = 0.05, n_to_show: int = 3, + multiple_segments_per_column: bool = True, **kwargs): super().__init__(segment_by='metadata', columns=columns, @@ -320,4 +331,5 @@ def __init__(self, n_samples=n_samples, n_to_show=n_to_show, categorical_aggregation_threshold=categorical_aggregation_threshold, + multiple_segments_per_feature=multiple_segments_per_column, **kwargs) diff --git a/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py b/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py index e9c77c96ff..46ff1f2327 100644 --- a/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py +++ b/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py @@ -37,7 +37,8 @@ def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], Non ignore_columns: Union[Hashable, List[Hashable], None], n_top_features: Optional[int], segment_minimum_size_ratio: float, alternative_scorer: Dict[str, Union[str, Callable]], score_per_sample: Union[np.ndarray, pd.Series, None], n_samples: int, - categorical_aggregation_threshold: float, n_to_show: int, **kwargs): + categorical_aggregation_threshold: float, n_to_show: int, + multiple_segments_per_feature: bool = False, **kwargs): super().__init__(**kwargs) self.segment_by = segment_by self.columns = columns @@ -49,6 +50,7 @@ def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], Non self.score_per_sample = score_per_sample self.alternative_scorer = alternative_scorer if alternative_scorer else None self.categorical_aggregation_threshold = categorical_aggregation_threshold + self.multiple_segments_per_feature = multiple_segments_per_feature def run_logic(self, context: Context, dataset_kind) -> CheckResult: """Run check.""" @@ -107,7 +109,8 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult: weak_segments = self._weak_segments_search(data=encoded_dataset.data, score_per_sample=score_per_sample, label_col=pd.Series(original_label, index=score_per_sample.index), feature_rank_for_search=np.asarray(encoded_dataset.features), - dummy_model=dummy_model, scorer=scorer) + dummy_model=dummy_model, scorer=scorer, + multiple_segments_per_feature=self.multiple_segments_per_feature) if len(weak_segments) == 0: display_msg = 'WeakSegmentsPerformance was unable to train an error model to find weak segments.'\ @@ -169,18 +172,22 @@ class PropertySegmentsPerformance(WeakSegmentsAbstractText): number of segments with the weakest performance to show. categorical_aggregation_threshold : float , default: 0.05 In each categorical column, categories with frequency below threshold will be merged into "Other" category. + multiple_segments_per_property : bool , default: False + If True, will allow the same property to be a segmenting feature in multiple segments, + otherwise each property can appear in one segment at most. 
""" def __init__(self, properties: Union[Hashable, List[Hashable], None] = None, ignore_properties: Union[Hashable, List[Hashable], None] = None, - n_top_properties: Optional[int] = 15, + n_top_properties: Optional[int] = 10, segment_minimum_size_ratio: float = 0.05, alternative_scorer: Dict[str, Union[str, Callable]] = None, score_per_sample: Union[np.ndarray, pd.Series, None] = None, n_samples: int = 5_000, categorical_aggregation_threshold: float = 0.05, n_to_show: int = 3, + multiple_segments_per_property: bool = False, **kwargs): super().__init__(segment_by='properties', columns=properties, @@ -192,6 +199,7 @@ def __init__(self, score_per_sample=score_per_sample, alternative_scorer=alternative_scorer, categorical_aggregation_threshold=categorical_aggregation_threshold, + multiple_segments_per_feature=multiple_segments_per_property, **kwargs) @@ -235,18 +243,22 @@ class MetadataSegmentsPerformance(WeakSegmentsAbstractText): number of segments with the weakest performance to show. categorical_aggregation_threshold : float , default: 0.05 In each categorical column, categories with frequency below threshold will be merged into "Other" category. + multiple_segments_column : bool , default: True + If True, will allow the same metadata column to be a segmenting column in multiple segments, + otherwise each metadata column can appear in one segment at most. """ def __init__(self, columns: Union[Hashable, List[Hashable], None] = None, ignore_columns: Union[Hashable, List[Hashable], None] = None, - n_top_columns: Optional[int] = 15, + n_top_columns: Optional[int] = 10, segment_minimum_size_ratio: float = 0.05, alternative_scorer: Dict[str, Union[str, Callable]] = None, score_per_sample: Union[np.ndarray, pd.Series, None] = None, n_samples: int = 5_000, categorical_aggregation_threshold: float = 0.05, n_to_show: int = 3, + multiple_segments_column: bool = True, **kwargs): super().__init__(segment_by='metadata', columns=columns, @@ -258,4 +270,5 @@ def __init__(self, score_per_sample=score_per_sample, alternative_scorer=alternative_scorer, categorical_aggregation_threshold=categorical_aggregation_threshold, + multiple_segments_per_feature=multiple_segments_column, **kwargs) diff --git a/deepchecks/tabular/checks/model_evaluation/weak_segments_performance.py b/deepchecks/tabular/checks/model_evaluation/weak_segments_performance.py index 3d6eaf147d..9e9c4ff757 100644 --- a/deepchecks/tabular/checks/model_evaluation/weak_segments_performance.py +++ b/deepchecks/tabular/checks/model_evaluation/weak_segments_performance.py @@ -10,7 +10,7 @@ # """Module of weak segments performance check.""" import warnings -from typing import TYPE_CHECKING, Callable, Dict, List, Union +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union import numpy as np import pandas as pd @@ -49,7 +49,7 @@ class WeakSegmentsPerformance(SingleDatasetCheck, WeakSegmentAbstract): Columns to check, if none are given checks all columns except ignored ones. ignore_columns : Union[Hashable, List[Hashable]] , default: None Columns to ignore, if none given checks based on columns variable - n_top_features : int , default: 5 + n_top_features : Optional[int] , default: 10 Number of features to use for segment search. Top columns are selected based on feature importance. segment_minimum_size_ratio: float , default: 0.05 Minimum size ratio for segments. 
Will only search for segments of @@ -73,13 +73,16 @@ class WeakSegmentsPerformance(SingleDatasetCheck, WeakSegmentAbstract): In each categorical column, categories with frequency below threshold will be merged into "Other" category. random_state : int, default: 42 random seed for all check internals. + multiple_segments_per_feature : bool , default: True + If True, will allow the same feature to be a segmenting feature in multiple segments, + otherwise each feature can appear in one segment at most. """ def __init__( self, columns: Union[Hashable, List[Hashable], None] = None, ignore_columns: Union[Hashable, List[Hashable], None] = None, - n_top_features: int = 5, + n_top_features: Optional[int] = 10, segment_minimum_size_ratio: float = 0.05, alternative_scorer: Dict[str, Union[str, Callable]] = None, loss_per_sample: Union[np.ndarray, pd.Series, None] = None, @@ -88,6 +91,7 @@ def __init__( categorical_aggregation_threshold: float = 0.05, n_to_show: int = 3, random_state: int = 42, + multiple_segments_per_feature: bool = True, **kwargs ): super().__init__(**kwargs) @@ -108,6 +112,7 @@ def __init__( self.loss_per_sample = loss_per_sample self.alternative_scorer = alternative_scorer self.categorical_aggregation_threshold = categorical_aggregation_threshold + self.multiple_segments_per_feature = multiple_segments_per_feature def run_logic(self, context: Context, dataset_kind) -> CheckResult: """Run check.""" @@ -160,7 +165,8 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult: weak_segments = self._weak_segments_search(data=encoded_dataset.data, score_per_sample=score_per_sample, label_col=dataset_subset.label_col, feature_rank_for_search=feature_rank, - dummy_model=dummy_model, scorer=scorer) + dummy_model=dummy_model, scorer=scorer, + multiple_segments_per_feature=self.multiple_segments_per_feature) if len(weak_segments) == 0: raise DeepchecksProcessError('WeakSegmentsPerformance was unable to train an error model to find weak ' diff --git a/deepchecks/utils/abstracts/weak_segment_abstract.py b/deepchecks/utils/abstracts/weak_segment_abstract.py index 2711715f77..80347d9120 100644 --- a/deepchecks/utils/abstracts/weak_segment_abstract.py +++ b/deepchecks/utils/abstracts/weak_segment_abstract.py @@ -170,8 +170,8 @@ def _create_heatmap_display(self, data: pd.DataFrame, def _weak_segments_search(self, data: pd.DataFrame, score_per_sample: pd.Series, label_col: Optional[pd.Series] = None, feature_rank_for_search: Optional[np.ndarray] = None, - dummy_model: Optional[_DummyModel] = None, - scorer: Optional[DeepcheckScorer] = None, scorer_name: Optional[str] = None) \ + dummy_model: Optional[_DummyModel] = None, scorer: Optional[DeepcheckScorer] = None, + scorer_name: Optional[str] = None, multiple_segments_per_feature: bool = False) \ -> pd.DataFrame: """Search for weak segments based on scorer.""" # Remove samples with NaN score per sample @@ -213,11 +213,24 @@ def _weak_segments_search(self, data: pd.DataFrame, score_per_sample: pd.Series, tuple(filters[feature2]), data_size, list(data_of_segment.index)] - # Drop duplicates without considering column 'Samples in Segment' - result_no_duplicates = weak_segments.drop(columns='Samples in Segment').drop_duplicates() - result_no_duplicates['Samples in Segment'] = weak_segments.loc[result_no_duplicates.index, 'Samples in Segment'] + # Sort and drop relevant columns + weak_segments = weak_segments.sort_values(score_title).reset_index(drop=True) + if multiple_segments_per_feature: + result = weak_segments.drop(columns='Samples in 
Segment').drop_duplicates() + result['Samples in Segment'] = weak_segments.loc[result.index, 'Samples in Segment'] + else: + used_features = set() + result = pd.DataFrame(columns=weak_segments.columns) + for _, row in weak_segments.iterrows(): + if row['Feature1'] in used_features or row['Feature2'] in used_features: + continue - return result_no_duplicates.sort_values(score_title).reset_index(drop=True) + result.loc[len(result)] = row + used_features.add(row['Feature1']) + if row['Feature2'] != '': + used_features.add(row['Feature2']) + + return result def _find_weak_segment(self, data: pd.DataFrame, features_for_segment: List[str], score_per_sample: pd.Series, label_col: Optional[pd.Series] = None, dummy_model: Optional[_DummyModel] = None, diff --git a/deepchecks/vision/checks/model_evaluation/weak_segments_performance.py b/deepchecks/vision/checks/model_evaluation/weak_segments_performance.py index 03a4a534bc..f5df02f72f 100644 --- a/deepchecks/vision/checks/model_evaluation/weak_segments_performance.py +++ b/deepchecks/vision/checks/model_evaluation/weak_segments_performance.py @@ -66,6 +66,9 @@ class WeakSegmentsPerformance(SingleDatasetCheck, WeakSegmentAbstract): number of segments with the weakest performance to show. categorical_aggregation_threshold : float , default: 0.05 For each categorical property, categories with frequency below threshold will be merged into "Other" category. + multiple_segments_per_property : bool , default: True + If True, will allow the same property to be a segmenting feature in multiple segments, + otherwise each property can appear in one segment at most. {additional_check_init_params:2*indent} """ @@ -76,6 +79,7 @@ def __init__( segment_minimum_size_ratio: float = 0.05, n_samples: Optional[int] = 10000, categorical_aggregation_threshold: float = 0.05, + multiple_segments_per_property: bool = True, **kwargs ): super().__init__(**kwargs) @@ -86,6 +90,7 @@ def __init__( self.n_to_show = n_to_show self.segment_minimum_size_ratio = segment_minimum_size_ratio self.categorical_aggregation_threshold = categorical_aggregation_threshold + self.multiple_segments_per_property = multiple_segments_per_property self._properties_results = None self._sample_scores = None self._scorer_name = None @@ -136,7 +141,8 @@ def compute(self, context: Context, dataset_kind: DatasetKind) -> CheckResult: weak_segments = self._weak_segments_search(data=encoded_dataset.features_columns, score_per_sample=score_per_sample_col, - scorer_name=self._scorer_name) + scorer_name=self._scorer_name, + multiple_segments_per_feature=self.multiple_segments_per_property) if len(weak_segments) == 0: raise DeepchecksProcessError('WeakSegmentsPerformance was unable to train an error model to find weak ' 'segments. Try increasing n_samples or supply additional properties.') diff --git a/docs/source/checks/nlp/data_integrity/plot_under_annotated_metadata_segments.py b/docs/source/checks/nlp/data_integrity/plot_under_annotated_metadata_segments.py index 79627d3df6..306c31a19c 100644 --- a/docs/source/checks/nlp/data_integrity/plot_under_annotated_metadata_segments.py +++ b/docs/source/checks/nlp/data_integrity/plot_under_annotated_metadata_segments.py @@ -63,11 +63,15 @@ # ``categorical_aggregation_threshold``: By default the check will combine rare categories into a single category called # "Other". This parameter determines the frequency threshold for categories to be mapped into to the "other" category. 
# +# ``multiple_segments_per_column``: If True, will allow the same metadata column to be a segmenting feature in multiple +# segments, otherwise each metadata column can appear in one segment at most. True by default. +# # see :class:`API reference ` for more details. from deepchecks.nlp.checks import UnderAnnotatedMetaDataSegments -check = UnderAnnotatedMetaDataSegments(segment_minimum_size_ratio=0.07) +check = UnderAnnotatedMetaDataSegments(segment_minimum_size_ratio=0.07, + multiple_segments_per_column=True) result = check.run(text_data) result.show() @@ -96,7 +100,6 @@ # Let's add a condition and re-run the check: -check = UnderAnnotatedMetaDataSegments(segment_minimum_size_ratio=0.07) check.add_condition_segments_annotation_ratio_greater_than(0.7) result = check.run(text_data) result.show(show_additional_outputs=False) diff --git a/docs/source/checks/nlp/data_integrity/plot_under_annotated_property_segments.py b/docs/source/checks/nlp/data_integrity/plot_under_annotated_property_segments.py index 14d80f9cb9..b0b5810995 100644 --- a/docs/source/checks/nlp/data_integrity/plot_under_annotated_property_segments.py +++ b/docs/source/checks/nlp/data_integrity/plot_under_annotated_property_segments.py @@ -64,6 +64,9 @@ # ``categorical_aggregation_threshold``: By default the check will combine rare categories into a single category called # "Other". This parameter determines the frequency threshold for categories to be mapped into to the "other" category. # +# ``multiple_segments_per_column``: If True, will allow the same property to be a segmenting feature in multiple +# segments, otherwise each property can appear in one segment at most. False by default. +# # see :class:`API reference ` for more details. from deepchecks.nlp.checks import UnderAnnotatedPropertySegments diff --git a/docs/source/checks/nlp/model_evaluation/plot_metadata_segments_performance.py b/docs/source/checks/nlp/model_evaluation/plot_metadata_segments_performance.py index 69fff89220..536f837662 100644 --- a/docs/source/checks/nlp/model_evaluation/plot_metadata_segments_performance.py +++ b/docs/source/checks/nlp/model_evaluation/plot_metadata_segments_performance.py @@ -73,6 +73,9 @@ # ``categorical_aggregation_threshold``: By default the check will combine rare categories into a single category called # "Other". This parameter determines the frequency threshold for categories to be mapped into to the "other" category. # +# ``multiple_segments_per_column``: If True, will allow the same metadata column to be a segmenting column in +# multiple segments, otherwise each metadata column can appear in one segment at most. True by default. +# # see :class:`API reference ` for more details. 
from deepchecks.nlp.checks import MetadataSegmentsPerformance @@ -80,7 +83,8 @@ scorer = {'f1': make_scorer(f1_score, average='micro')} check = MetadataSegmentsPerformance(alternative_scorer=scorer, - segment_minimum_size_ratio=0.03) + segment_minimum_size_ratio=0.03, + multiple_segments_per_column=True) result = check.run(test_dataset, probabilities=test_probas) result.show() @@ -106,7 +110,6 @@ # Let's add a condition and re-run the check: -check = MetadataSegmentsPerformance(alternative_scorer=scorer, segment_minimum_size_ratio=0.03) check.add_condition_segments_relative_performance_greater_than(0.1) result = check.run(test_dataset, probabilities=test_probas) result.show(show_additional_outputs=False) diff --git a/docs/source/checks/nlp/model_evaluation/plot_property_segments_performance.py b/docs/source/checks/nlp/model_evaluation/plot_property_segments_performance.py index 3a8ad17424..c4022be497 100644 --- a/docs/source/checks/nlp/model_evaluation/plot_property_segments_performance.py +++ b/docs/source/checks/nlp/model_evaluation/plot_property_segments_performance.py @@ -73,6 +73,9 @@ # ``categorical_aggregation_threshold``: By default the check will combine rare categories into a single category called # "Other". This parameter determines the frequency threshold for categories to be mapped into to the "other" category. # +# ``multiple_segments_per_column``: If True, will allow the same property to be a segmenting feature in +# multiple segments, otherwise each property can appear in one segment at most. False by default. +# # see :class:`API reference ` for more details. from deepchecks.nlp.checks import PropertySegmentsPerformance @@ -106,7 +109,6 @@ # Let's add a condition and re-run the check: -check = PropertySegmentsPerformance(alternative_scorer=scorer, segment_minimum_size_ratio=0.03) check.add_condition_segments_relative_performance_greater_than(0.1) result = check.run(test_dataset, probabilities=test_probas) result.show(show_additional_outputs=False) diff --git a/docs/source/checks/tabular/model_evaluation/plot_weak_segments_performance.py b/docs/source/checks/tabular/model_evaluation/plot_weak_segments_performance.py index d658b4740b..7e36b8b445 100644 --- a/docs/source/checks/tabular/model_evaluation/plot_weak_segments_performance.py +++ b/docs/source/checks/tabular/model_evaluation/plot_weak_segments_performance.py @@ -75,6 +75,9 @@ # ``categorical_aggregation_threshold``: By default the check will combine rare categories into a single category called # "Other". This parameter determines the frequency threshold for categories to be mapped into to the "other" category. # +# ``multiple_segments_per_column``: If True, will allow the same feature to be a segmenting feature in multiple +# segments, otherwise each feature can appear in one segment at most. True by default. +# # see :class:`API reference ` for more details. from deepchecks.tabular.checks import WeakSegmentsPerformance diff --git a/docs/source/nlp/tutorials/quickstarts/plot_multi_label_classification.py b/docs/source/nlp/tutorials/quickstarts/plot_multi_label_classification.py index 0d35199356..47192d948b 100644 --- a/docs/source/nlp/tutorials/quickstarts/plot_multi_label_classification.py +++ b/docs/source/nlp/tutorials/quickstarts/plot_multi_label_classification.py @@ -2,8 +2,8 @@ """ .. 
_nlp__multilabel_quickstart: -NLP Multi Label Classification Quickstart -***************************************** +Multi Label Classification Quickstart +************************************* Deepchecks NLP tests your models during model development/research and before deploying to production. Using our testing package reduces model failures and saves tests development time. In this quickstart guide, you will learn how @@ -238,7 +238,7 @@ # In the display we can see some distinct property based segments that our model under performs on. # # By reviewing the results we can see that our model is performing poorly on samples that have a low level of -# Subjectivity, by looking at the "Subjectivity vs Average Words Per Sentence" tab +# Subjectivity, by looking at the "Subjectivity vs Average Sentence Length" tab # We can see that the problem is even more severe on samples containing long sentences. # # In addition to the visual display, most checks also return detailed data describing the results. This data can be diff --git a/docs/source/nlp/tutorials/quickstarts/plot_text_classification.py b/docs/source/nlp/tutorials/quickstarts/plot_text_classification.py index 48448dc611..db6c405d93 100644 --- a/docs/source/nlp/tutorials/quickstarts/plot_text_classification.py +++ b/docs/source/nlp/tutorials/quickstarts/plot_text_classification.py @@ -2,8 +2,8 @@ """ .. _nlp__multiclass_quickstart: -Test NLP Classification Tasks - Quickstart -****************************************** +Text Classification Quickstart +****************************** Deepchecks NLP tests your models during model development/research and before deploying to production. Using our testing package reduces model failures and saves tests development time. In this quickstart guide, you will learn how diff --git a/docs/source/nlp/tutorials/quickstarts/plot_token_classification.py b/docs/source/nlp/tutorials/quickstarts/plot_token_classification.py index 56e149c528..f070af2d60 100644 --- a/docs/source/nlp/tutorials/quickstarts/plot_token_classification.py +++ b/docs/source/nlp/tutorials/quickstarts/plot_token_classification.py @@ -2,8 +2,8 @@ """ .. _nlp__token_classification_quickstart: -Test NLP Token Classification Tasks - Quickstart -************************************************ +Token Classification Quickstart +******************************* Deepchecks NLP tests your models during model development/research and before deploying to production. Using our testing package reduces model failures and saves tests development time. 
In this quickstart guide, you will learn how diff --git a/tests/nlp/checks/data_integrity/under_annotated_segments_test.py b/tests/nlp/checks/data_integrity/under_annotated_segments_test.py index 6989f183ba..0e8526f3bd 100644 --- a/tests/nlp/checks/data_integrity/under_annotated_segments_test.py +++ b/tests/nlp/checks/data_integrity/under_annotated_segments_test.py @@ -37,7 +37,7 @@ def test_tweet_emotion_properties(tweet_emotion_train_test_textdata): )) assert_that(result.value['avg_score'], close_to(0.5, 0.001)) - assert_that(len(result.value['weak_segments_list']), close_to(33, 1)) + assert_that(len(result.value['weak_segments_list']), close_to(6, 1)) assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.314, 0.01)) @@ -47,7 +47,8 @@ def test_tweet_emotion_metadata(tweet_emotion_train_test_textdata): test = test.copy() test._label = np.asarray(list(test._label[:round(len(test._label) / 2)]) + [None] * round(len(test._label) / 2), dtype=object) - check = UnderAnnotatedMetaDataSegments().add_condition_segments_relative_performance_greater_than() + check = UnderAnnotatedMetaDataSegments(multiple_segments_per_column=True).\ + add_condition_segments_relative_performance_greater_than() # Act result = check.run(test) condition_result = check.conditions_decision(result) @@ -77,7 +78,7 @@ def test_tweet_emotion_metadata_interesting_segment(tweet_emotion_train_test_tex test._label = label # Act - result = UnderAnnotatedMetaDataSegments().run(test) + result = UnderAnnotatedMetaDataSegments(multiple_segments_per_column=True).run(test) # Assert assert_that(result.value['avg_score'], close_to(0.844, 0.001)) @@ -123,7 +124,7 @@ def test_token_classification_dataset(small_wikiann_train_test_text_data): )) assert_that(result.value['avg_score'], close_to(0.8, 0.001)) - assert_that(len(result.value['weak_segments_list']), equal_to(25)) + assert_that(len(result.value['weak_segments_list']), equal_to(6)) assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.2, 0.01)) @@ -134,7 +135,8 @@ def test_multilabel_dataset(multilabel_mock_dataset_and_probabilities): assert_that(data.is_multi_label_classification(), equal_to(True)) data._label = np.asarray(list(data._label[:round(len(data._label) / 2)]) + [None] * round(len(data._label) / 2), dtype=object) - check = UnderAnnotatedMetaDataSegments().add_condition_segments_relative_performance_greater_than() + check = UnderAnnotatedMetaDataSegments(multiple_segments_per_column=True).\ + add_condition_segments_relative_performance_greater_than() # Act result = check.run(data) diff --git a/tests/nlp/checks/model_evaluation/weak_segment_performance_test.py b/tests/nlp/checks/model_evaluation/weak_segment_performance_test.py index c5cbcb3140..ac36c82302 100644 --- a/tests/nlp/checks/model_evaluation/weak_segment_performance_test.py +++ b/tests/nlp/checks/model_evaluation/weak_segment_performance_test.py @@ -45,7 +45,7 @@ def test_column_with_nones(tweet_emotion_train_test_textdata, tweet_emotion_trai test.set_metadata(metadata) # Act - result = MetadataSegmentsPerformance().run(test, probabilities=test_probas) + result = MetadataSegmentsPerformance(multiple_segments_column=True).run(test, probabilities=test_probas) # Assert assert_that(result.value['avg_score'], close_to(0.707, 0.01)) @@ -95,7 +95,7 @@ def test_tweet_emotion_properties(tweet_emotion_train_test_textdata, tweet_emoti )) assert_that(result.value['avg_score'], close_to(0.708, 0.001)) - assert_that(len(result.value['weak_segments_list']), close_to(33, 1)) + 
assert_that(len(result.value['weak_segments_list']), close_to(6, 1)) assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.525, 0.01)) @@ -149,7 +149,7 @@ def test_multilabel_dataset(multilabel_mock_dataset_and_probabilities): 'performance.')) assert_that(result.value['avg_score'], close_to(0.83, 0.001)) - assert_that(len(result.value['weak_segments_list']), is_in([5, 6])) # TODO: check why it's not always 5 + assert_that(len(result.value['weak_segments_list']), is_in([5, 6])) # assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.695, 0.01)) # TODO: @@ -168,7 +168,7 @@ def test_multilabel_just_dance(just_dance_train_test_textdata, just_dance_train_ # Assert assert_that(result.value['avg_score'], close_to(0.615, 0.001)) - assert_that(len(result.value['weak_segments_list']), is_in([79, 80])) # TODO: check why it's not always 80 + assert_that(len(result.value['weak_segments_list']), equal_to(5)) assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.401, 0.01)) @@ -182,7 +182,7 @@ def test_binary_classification(binary_mock_dataset_and_probabilities): # Assert assert_that(result.value['avg_score'], close_to(0.447, 0.001)) - assert_that(len(result.value['weak_segments_list']), equal_to(6)) + assert_that(len(result.value['weak_segments_list']), equal_to(5)) assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.34, 0.01)) diff --git a/tests/tabular/checks/model_evaluation/weak_segments_performance_test.py b/tests/tabular/checks/model_evaluation/weak_segments_performance_test.py index 9347232671..bdcbe39ce1 100644 --- a/tests/tabular/checks/model_evaluation/weak_segments_performance_test.py +++ b/tests/tabular/checks/model_evaluation/weak_segments_performance_test.py @@ -25,7 +25,7 @@ def test_segment_performance_diabetes(diabetes_split_dataset_and_model): _, val, model = diabetes_split_dataset_and_model # Act - result = WeakSegmentsPerformance().run(val, model) + result = WeakSegmentsPerformance(n_top_features=5).run(val, model) segments = result.value['weak_segments_list'] # Assert @@ -111,7 +111,7 @@ def test_regression_categorical_features_avocado(avocado_split_dataset_and_model _, val, model = avocado_split_dataset_and_model # Act - result = WeakSegmentsPerformance(random_state=42).run(val, model, feature_importance_timeout=0) + result = WeakSegmentsPerformance(random_state=42, n_top_features=5).run(val, model, feature_importance_timeout=0) segments = result.value['weak_segments_list'] # Assert @@ -140,7 +140,7 @@ def test_categorical_feat_target(adult_split_dataset_and_model): val.data['native-country'].iloc[0] = np.nan val.data['native-country'] = pd.Categorical(val.data['native-country']) val.data['income'] = pd.Categorical(val.data['income']) - check = WeakSegmentsPerformance() + check = WeakSegmentsPerformance(n_top_features=5) # Act result = check.run(val, model) From 79ad491712759e7513146bfe7504a58305599272 Mon Sep 17 00:00:00 2001 From: Noam Bressler Date: Wed, 26 Jul 2023 19:20:01 +0300 Subject: [PATCH 22/23] rename is english to english text (#2648) --- deepchecks/nlp/utils/text_properties.py | 6 +++--- tests/nlp/utils/test_properties.py | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/deepchecks/nlp/utils/text_properties.py b/deepchecks/nlp/utils/text_properties.py index 52f7708345..049603a2aa 100644 --- a/deepchecks/nlp/utils/text_properties.py +++ b/deepchecks/nlp/utils/text_properties.py @@ -153,7 +153,7 @@ def language( return language_code -def is_english( +def english_text( text: 
str,
         lang_certainty_threshold: float = 0.8,
         fasttext_model: Optional[Dict[object, Any]] = None,
@@ -512,7 +512,7 @@ class TextProperty(TypedDict):
 
 ALL_PROPERTIES: Tuple[TextProperty, ...] = \
     (
-        {'name': 'Is English', 'method': is_english, 'output_type': 'categorical'},
+        {'name': 'English Text', 'method': english_text, 'output_type': 'categorical'},
         {'name': 'URLs Count', 'method': urls_count, 'output_type': 'numeric'},
         {'name': 'Email Addresses Count', 'method': email_addresses_count, 'output_type': 'numeric'},
         {'name': 'Unique URLs Count', 'method': unique_urls_count, 'output_type': 'numeric'},
@@ -563,7 +563,7 @@ class TextProperty(TypedDict):
     'URLs Count': 'Number of URLS per text sample',
     'Email Addresses Count': 'Number of email addresses per text sample',
     'Unique URLs Count': 'Number of unique URLS per text sample',
-    'Is English': 'Whether the text is in English (1) or not (0)',
+    'English Text': 'Whether the text is in English (1) or not (0)',
     'Unique Email Addresses Count': 'Number of unique email addresses per text sample',
    'Unique Syllables Count': 'Number of unique syllables per text sample',
     'Reading Time': 'Time taken in seconds to read a text sample',
diff --git a/tests/nlp/utils/test_properties.py b/tests/nlp/utils/test_properties.py
index 29057de6e8..6c63842dd3 100644
--- a/tests/nlp/utils/test_properties.py
+++ b/tests/nlp/utils/test_properties.py
@@ -19,7 +19,7 @@
 from deepchecks.core.errors import DeepchecksValueError
 from deepchecks.nlp.utils.text_properties import (_sample_for_property, calculate_builtin_properties,
-                                                  is_english)
+                                                  english_text)
 from deepchecks.nlp.utils.text_properties_models import MODELS_STORAGE, _get_transformer_model
@@ -344,15 +344,15 @@ def test_calculate_average_syllable_count(tweet_emotion_train_test_textdata):
     assert_that(result_none_text['Average Syllable Length'], equal_to([np.nan]))
 
 
-def test_calcualte_is_english_property():
+def test_calcualte_english_text_property():
     data = ['This is a sentence in English.', 'Это предложение на русском языке.']
-    result = calculate_builtin_properties(data, include_properties=['Is English'])[0]
-    assert_that(result['Is English'], equal_to([True, False]))
+    result = calculate_builtin_properties(data, include_properties=['English Text'])[0]
+    assert_that(result['English Text'], equal_to([True, False]))
 
 
-def test_calcualte_is_english_property_without_language_precalculation():
+def test_calcualte_english_text_property_without_language_precalculation():
     data = ['This is a sentence in English.', 'Это предложение на русском языке.']
-    assert_that([is_english(data[0]), is_english(data[1])], equal_to([True, False]))
+    assert_that([english_text(data[0]), english_text(data[1])], equal_to([True, False]))
 
 
 def test_include_properties():

From fcd084eaf0802c4a19b3fa13187974282a321636 Mon Sep 17 00:00:00 2001
From: Noam Bressler
Date: Thu, 27 Jul 2023 09:47:05 +0300
Subject: [PATCH 23/23] Return nan embeddings for empty samples (#2649)

* Return nan embeddings for empty samples
* Handle missing properties in outlier condition
* handle zero length samples in special chars
* improve error message in conflicting labels
---
 .../data_integrity/conflicting_labels.py      |  2 ++
 .../data_integrity/special_characters.py      |  3 ++
 .../data_integrity/text_property_outliers.py  |  2 ++
 deepchecks/nlp/utils/text_embeddings.py       |  7 +++--
 .../data_integrity/special_characters_test.py | 16 +++++++++++
 .../text_property_outliers_test.py            | 28 ++++++++++++++++++-
 6 files changed, 55 insertions(+), 3 deletions(-)

diff --git
a/deepchecks/nlp/checks/data_integrity/conflicting_labels.py b/deepchecks/nlp/checks/data_integrity/conflicting_labels.py index 7e0277c5fe..e5a4051586 100644 --- a/deepchecks/nlp/checks/data_integrity/conflicting_labels.py +++ b/deepchecks/nlp/checks/data_integrity/conflicting_labels.py @@ -88,6 +88,8 @@ def _get_labels(self, dataset): labels = [tuple(np.where(row == 1)[0]) for row in dataset.label] elif dataset.task_type is TaskType.TEXT_CLASSIFICATION: labels = dataset.label + elif dataset.task_type is TaskType.OTHER: + raise DeepchecksValueError('Check is irrelevant when task type is not specified') else: raise DeepchecksValueError(f'Unknown task type - {dataset.task_type}') return labels diff --git a/deepchecks/nlp/checks/data_integrity/special_characters.py b/deepchecks/nlp/checks/data_integrity/special_characters.py index 6cb66c8f32..ea91b0bc88 100644 --- a/deepchecks/nlp/checks/data_integrity/special_characters.py +++ b/deepchecks/nlp/checks/data_integrity/special_characters.py @@ -102,6 +102,9 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult: continue if len(sample) > self.max_chars_to_review_per_sample: sample = random.sample(sample, self.max_chars_to_review_per_sample) + if len(sample) == 0: + percent_special_chars_in_sample[idx] = 0 + continue special_chars_in_sample = [char for char in sample if char in self.special_characters_deny_list] percent_special_chars_in_sample[idx] = len(special_chars_in_sample) / len(sample) for char in frozenset(special_chars_in_sample): diff --git a/deepchecks/nlp/checks/data_integrity/text_property_outliers.py b/deepchecks/nlp/checks/data_integrity/text_property_outliers.py index c5fdd81d97..7f48f080a5 100644 --- a/deepchecks/nlp/checks/data_integrity/text_property_outliers.py +++ b/deepchecks/nlp/checks/data_integrity/text_property_outliers.py @@ -216,6 +216,8 @@ def condition(result: t.Dict[str, t.Any]): for property_name, info in result.items(): if properties_to_ignore is not None and property_name in properties_to_ignore: continue + if isinstance(info, str): + continue if info['outlier_ratio'] > threshold: failed_properties.append(property_name) if info['outlier_ratio'] > worst_ratio: diff --git a/deepchecks/nlp/utils/text_embeddings.py b/deepchecks/nlp/utils/text_embeddings.py index c34690c3fa..9db8ecf258 100644 --- a/deepchecks/nlp/utils/text_embeddings.py +++ b/deepchecks/nlp/utils/text_embeddings.py @@ -186,8 +186,11 @@ def len_safe_get_embedding(list_of_texts, model_name=EMBEDDING_MODEL, max_tokens text_lens.append(chunk_lens[idx]) idx += 1 - text_embedding = np.average(text_embeddings, axis=0, weights=text_lens) - text_embedding = text_embedding / np.linalg.norm(text_embedding) # normalizes length to 1 + if sum(text_lens) == 0: + text_embedding = np.ones((EMBEDDING_DIM, )) * np.nan + else: + text_embedding = np.average(text_embeddings, axis=0, weights=text_lens) + text_embedding = text_embedding / np.linalg.norm(text_embedding) # normalizes length to 1 result_embeddings.append(text_embedding.tolist()) return result_embeddings diff --git a/tests/nlp/checks/data_integrity/special_characters_test.py b/tests/nlp/checks/data_integrity/special_characters_test.py index 0023f5454c..da99145aef 100644 --- a/tests/nlp/checks/data_integrity/special_characters_test.py +++ b/tests/nlp/checks/data_integrity/special_characters_test.py @@ -66,6 +66,22 @@ def test_check_on_clean_dataset(clean_dataset): )) # type: ignore +def test_check_on_dataset_with_emptt_sample(): + # Arrange + data = TextData(raw_text=['', 'aa']) + check = 
SpecialCharacters().add_condition_samples_ratio_w_special_characters_less_or_equal(0) + + # Act + result = check.run(dataset=data) + + # Assert + assert_that(result.value, has_entries({ + "samples_per_special_char": has_length(0), + "percent_of_samples_with_special_chars": equal_to(0), + 'percent_special_chars_per_sample': has_length(2), + })) + + def test_check_on_samples_with_special_characters(dataset_with_special_characters): # Arrange check = SpecialCharacters().add_condition_samples_ratio_w_special_characters_less_or_equal( diff --git a/tests/nlp/checks/data_integrity/text_property_outliers_test.py b/tests/nlp/checks/data_integrity/text_property_outliers_test.py index cb80eb9a8b..92caaa00a5 100644 --- a/tests/nlp/checks/data_integrity/text_property_outliers_test.py +++ b/tests/nlp/checks/data_integrity/text_property_outliers_test.py @@ -9,7 +9,7 @@ # ---------------------------------------------------------------------------- # """Test for the NLP TextPropertyOutliers check""" - +import numpy as np import pandas as pd from hamcrest import assert_that, close_to, equal_to @@ -76,6 +76,32 @@ def test_tweet_emotion_condition(tweet_emotion_train_test_textdata): ) +def test_tweet_emotion_condition_property_with_nans(tweet_emotion_train_test_textdata): + # Arrange + _, test = tweet_emotion_train_test_textdata + test = test.copy() + test._properties['Subjectivity'] = test._properties['Subjectivity'] * np.nan + check = TextPropertyOutliers().add_condition_outlier_ratio_less_or_equal() + # Act + result = check.run(test) + conditions_decisions = check.conditions_decision(result) + + # Assert + assert_that(len(result.value['Sentiment']['indices']), equal_to(65)) + assert_that(result.value['Sentiment']['lower_limit'], close_to(-0.90, 0.01)) + assert_that(result.value['Sentiment']['upper_limit'], close_to(0.92, 0.01)) + + assert_that( + conditions_decisions[0], + equal_condition_result( + is_pass=False, + name='Outlier ratio in all properties is less or equal than 5%', + details='Found 1 properties with outlier ratios above threshold.
'
+                    'Property with highest ratio is Toxicity with outlier ratio of 16.43%'
+        )  # type: ignore
+    )
+
+
 def test_not_enough_samples(tweet_emotion_train_test_textdata):
     # Arrange
     _, test = tweet_emotion_train_test_textdata