From 0d105c80ba9ed4f403f092d45227bfcbd314215e Mon Sep 17 00:00:00 2001
From: Yurii Romanyshyn <71635444+yromanyshyn@users.noreply.github.com>
Date: Sun, 16 Apr 2023 16:33:33 +0300
Subject: [PATCH] [DEE-440] added error msg to checks that do not support
 token/multi-label classification (#2445)

* added error msg to checks that do not support token/multi-label classification
* docs fix
* code style fixes
* test fixes
* tests fixes
* pred validation fix
* code style fixes
* code style fixes
* Update deepchecks/nlp/checks/data_integrity/property_label_correlation.py
Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com>
* Update deepchecks/nlp/checks/data_integrity/property_label_correlation.py
Co-authored-by: Nir Hutnik <92314933+nirhutnik@users.noreply.github.com>
* fixes
* fix
* docs style fixes
* code style fixes
* code style fixes
* Update deepchecks/nlp/context.py
Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com>
* fixes

---------

Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com>
Co-authored-by: Nir Hutnik <92314933+nirhutnik@users.noreply.github.com>
---
 deepchecks/core/errors.py                     |  2 -
 .../property_label_correlation.py             |  5 +-
 .../data_integrity/text_property_outliers.py  |  2 +-
 .../confusion_matrix_report.py                |  6 +-
 .../model_evaluation/prediction_drift.py      |  2 +
 .../single_dataset_performance.py             |  5 +-
 .../weak_segments_performance.py              |  4 ++
 .../train_test_validation/label_drift.py      |  3 +
 deepchecks/nlp/context.py                     | 62 ++++++++++++++-----
 .../nlp/metric_utils/token_classification.py  | 41 ++++++++----
 deepchecks/nlp/utils/data_inference.py        | 16 +++--
 deepchecks/utils/validation.py                |  8 ++-
 spelling-allowlist.txt                        |  3 +-
 .../single_dataset_performance_test.py        |  3 +-
 14 files changed, 115 insertions(+), 47 deletions(-)

diff --git a/deepchecks/core/errors.py b/deepchecks/core/errors.py
index 364b2a75f8..179a740c4f 100644
--- a/deepchecks/core/errors.py
+++ b/deepchecks/core/errors.py
@@ -40,8 +40,6 @@ class DeepchecksNotImplementedError(DeepchecksBaseError):
 class DeepchecksNotSupportedError(DeepchecksBaseError):
     """Exception class that represents an unsupported action in Deepchecks."""
 
-    pass
-
 
 class DeepchecksProcessError(DeepchecksBaseError):
     """Exception class that represents an issue with a process."""
diff --git a/deepchecks/nlp/checks/data_integrity/property_label_correlation.py b/deepchecks/nlp/checks/data_integrity/property_label_correlation.py
index 02bdf760c9..0562e83067 100644
--- a/deepchecks/nlp/checks/data_integrity/property_label_correlation.py
+++ b/deepchecks/nlp/checks/data_integrity/property_label_correlation.py
@@ -92,8 +92,11 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
         DeepchecksValueError
             If the object is not a Dataset instance with a label.
""" - text_data = context.get_data_by_kind(dataset_kind).sample(self.n_samples, random_state=context.random_state) + context.raise_if_token_classification_task(self) + context.raise_if_multi_label_task(self) + text_data = context.get_data_by_kind(dataset_kind) + text_data = text_data.sample(self.n_samples, random_state=context.random_state) label = pd.Series(text_data.label, name='label', index=text_data.get_original_text_indexes()) # Classification labels should be of type object (and not int, for example) diff --git a/deepchecks/nlp/checks/data_integrity/text_property_outliers.py b/deepchecks/nlp/checks/data_integrity/text_property_outliers.py index dae30f1ed6..f6b96ba30c 100644 --- a/deepchecks/nlp/checks/data_integrity/text_property_outliers.py +++ b/deepchecks/nlp/checks/data_integrity/text_property_outliers.py @@ -24,7 +24,7 @@ class TextPropertyOutliers(SingleDatasetCheck): - """Find outliers images with respect to the given properties. + """Find outliers with respect to the given properties. The check finds outliers in the text properties. For numeric properties, the check uses `IQR `_ to diff --git a/deepchecks/nlp/checks/model_evaluation/confusion_matrix_report.py b/deepchecks/nlp/checks/model_evaluation/confusion_matrix_report.py index e3808961ec..ffe7ee7352 100644 --- a/deepchecks/nlp/checks/model_evaluation/confusion_matrix_report.py +++ b/deepchecks/nlp/checks/model_evaluation/confusion_matrix_report.py @@ -54,7 +54,11 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult: DeepchecksValueError If the data is not a Dataset instance with a label """ - dataset = context.get_data_by_kind(dataset_kind).sample(self.n_samples, random_state=self.random_state) + context.raise_if_token_classification_task(self) + context.raise_if_multi_label_task(self) + + dataset = context.get_data_by_kind(dataset_kind) + dataset = dataset.sample(self.n_samples, random_state=self.random_state) y_true = np.asarray(dataset.label) y_pred = np.array(context.model.predict(dataset)).reshape(len(y_true), ) diff --git a/deepchecks/nlp/checks/model_evaluation/prediction_drift.py b/deepchecks/nlp/checks/model_evaluation/prediction_drift.py index 5b2b52f964..c14bc2bca1 100644 --- a/deepchecks/nlp/checks/model_evaluation/prediction_drift.py +++ b/deepchecks/nlp/checks/model_evaluation/prediction_drift.py @@ -145,6 +145,8 @@ def run_logic(self, context: Context) -> CheckResult: value: drift score. display: prediction distribution graph, comparing the train and test distributions. 
""" + context.raise_if_token_classification_task(self) + train_dataset = context.train.sample(self.n_samples, random_state=context.random_state) test_dataset = context.test.sample(self.n_samples, random_state=context.random_state) model = context.model diff --git a/deepchecks/nlp/checks/model_evaluation/single_dataset_performance.py b/deepchecks/nlp/checks/model_evaluation/single_dataset_performance.py index 661fc25a8c..6010910a49 100644 --- a/deepchecks/nlp/checks/model_evaluation/single_dataset_performance.py +++ b/deepchecks/nlp/checks/model_evaluation/single_dataset_performance.py @@ -10,7 +10,7 @@ # """Module containing the single dataset performance check.""" from numbers import Number -from typing import Callable, Dict, List, TypeVar, Union +from typing import Callable, Dict, List, Union import pandas as pd @@ -23,9 +23,6 @@ __all__ = ['SingleDatasetPerformance'] -SDP = TypeVar('SDP', bound='SingleDatasetPerformance') - - class SingleDatasetPerformance(SingleDatasetCheck, BaseSingleDatasetPerformance): """Summarize given model performance on a dataset based on selected scorers. diff --git a/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py b/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py index 6d0ca53a85..d60227ee93 100644 --- a/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py +++ b/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py @@ -51,6 +51,9 @@ def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], Non def run_logic(self, context: Context, dataset_kind) -> CheckResult: """Run check.""" + context.raise_if_token_classification_task(self) + context.raise_if_multi_label_task(self) + text_data = context.get_data_by_kind(dataset_kind) text_data = text_data.sample(self.n_samples, random_state=context.random_state) @@ -86,6 +89,7 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult: if features.shape[1] < 2: raise DeepchecksNotSupportedError('Check requires meta data to have at least two columns in order to run.') + # label is not used in the check, just here to avoid errors dataset = Dataset(features, label=pd.Series(text_data.label), cat_features=cat_features) encoded_dataset = self._target_encode_categorical_features_fill_na(dataset, list(np.unique(text_data.label))) diff --git a/deepchecks/nlp/checks/train_test_validation/label_drift.py b/deepchecks/nlp/checks/train_test_validation/label_drift.py index 9e23d20cde..b9f062b37e 100644 --- a/deepchecks/nlp/checks/train_test_validation/label_drift.py +++ b/deepchecks/nlp/checks/train_test_validation/label_drift.py @@ -113,6 +113,9 @@ def run_logic(self, context: Context) -> CheckResult: value: drift score. display: label distribution graph, comparing the train and test distributions. """ + context.raise_if_token_classification_task(self) + context.raise_if_multi_label_task(self) + train_dataset = context.train.sample(self.n_samples, random_state=context.random_state) test_dataset = context.test.sample(self.n_samples, random_state=context.random_state) diff --git a/deepchecks/nlp/context.py b/deepchecks/nlp/context.py index 99d54f8ab7..63566d32c5 100644 --- a/deepchecks/nlp/context.py +++ b/deepchecks/nlp/context.py @@ -44,7 +44,7 @@ TClassProba = t.Sequence[t.Sequence[float]] TTokenPred = t.Sequence[t.Sequence[t.Tuple[str, int, int, float]]] TTextPred = t.Union[TClassPred, TTokenPred] -TTextProba = t.Union[TClassProba] +TTextProba = t.Union[TClassProba] # TODO: incorrect, why union have only one type argument? 
 class _DummyModel(BasicModel):
@@ -216,18 +216,34 @@ def _validate_classification_prediction(dataset: TextData, prediction: TTextPred
     @staticmethod
     def _validate_token_classification_prediction(dataset: TextData, prediction: TTextPred):
         """Validate prediction for given token classification dataset."""
-        if not all(isinstance(pred, collections.abc.Sequence) for pred in prediction):
-            raise ValidationError(f'Check requires predictions for {dataset.name} to be a sequence '
-                                  f'of sequences')
-
-        for i in range(len(prediction)):  # TODO: Goes over all predictions, fix this
-            if not all(isinstance(pred, str) for pred in prediction[i]) \
-                    and not all(isinstance(pred, int) for pred in prediction[i]):
-                raise ValidationError(f'Check requires predictions for {dataset.name} to be a sequence '
-                                      f'of sequences of strings or integers')
-            if len(prediction[i]) != len(dataset.tokenized_text[i]):
-                raise ValidationError(f'Check requires predictions for {dataset.name} to have '
-                                      f'the same number of tokens as the input text')
+        if not is_sequence_not_str(prediction):
+            raise ValidationError(
+                f'Check requires predictions for {dataset.name} to be a sequence of sequences'
+            )
+
+        tokenized_text = dataset.tokenized_text
+
+        for idx, sample_predictions in enumerate(prediction):
+            if not is_sequence_not_str(sample_predictions):
+                raise ValidationError(
+                    f'Check requires predictions for {dataset.name} to be a sequence of sequences'
+                )
+
+            predictions_types_counter = collections.defaultdict(int)
+
+            for p in sample_predictions:
+                predictions_types_counter[type(p)] += 1
+
+            if predictions_types_counter[str] > 0 and predictions_types_counter[int] > 0:
+                raise ValidationError(
+                    f'Check requires predictions for {dataset.name} to be a sequence '
+                    'of sequences of strings or integers'
+                )
+            if len(sample_predictions) != len(tokenized_text[idx]):
+                raise ValidationError(
+                    f'Check requires predictions for {dataset.name} to have '
+                    'the same number of tokens as the input text'
+                )
 
     @staticmethod
     def _validate_proba(dataset: TextData, probabilities: TTextProba, n_classes: int,
@@ -430,6 +446,24 @@ def assert_properties(text_data):
                 'set_properties method to set your own properties with a pandas.DataFrame or use '
                 'TextData.calculate_default_properties to add the default deepchecks properties.')
 
+    def raise_if_token_classification_task(self, check=None):
+        """Raise an exception if it is a token classification task."""
+        check_name = type(check).__name__ if check else 'Check'
+        task_type_name = TaskType.TOKEN_CLASSIFICATION.value
+        if self.task_type is TaskType.TOKEN_CLASSIFICATION:
+            raise DeepchecksNotSupportedError(
+                f'"{check_name}" is not supported for "{task_type_name}" tasks'
+            )
+
+    def raise_if_multi_label_task(self, check=None):
+        """Raise an exception if it is a multi-label classification task."""
+        dataset = t.cast(TextData, self._train if self._train is not None else self._test)
+        check_name = type(check).__name__ if check else 'Check'
+        if dataset.is_multi_label_classification():
+            raise DeepchecksNotSupportedError(
+                f'"{check_name}" is not supported for multi-label classification tasks'
+            )
+
     def get_scorers(self,
                     scorers: t.Union[t.Mapping[str, t.Union[str, t.Callable]], t.List[str]] = None,
                     use_avg_defaults=True) -> t.List[DeepcheckScorer]:
@@ -454,11 +488,11 @@ def get_scorers(self,
             else:
                 scorers = scorers or get_default_scorers(TabularTaskType.BINARY, use_avg_defaults)
         elif self.task_type == TaskType.TOKEN_CLASSIFICATION:
-            scoring_dict = get_scorer_dict()
             if scorers is None:
                 scorers = 
get_default_token_scorers(use_avg_defaults) # Get string names of default scorers else: validate_scorers(scorers) # Validate that use supplied scorer names are OK + scoring_dict = get_scorer_dict() scorers = {name: scoring_dict[name] for name in scorers} else: raise DeepchecksValueError(f'Task type must be either {TaskType.TEXT_CLASSIFICATION} or ' diff --git a/deepchecks/nlp/metric_utils/token_classification.py b/deepchecks/nlp/metric_utils/token_classification.py index 20e8c5fe24..44cdfaa89a 100644 --- a/deepchecks/nlp/metric_utils/token_classification.py +++ b/deepchecks/nlp/metric_utils/token_classification.py @@ -21,15 +21,19 @@ __all__ = ['get_default_token_scorers', 'validate_scorers', 'get_scorer_dict'] DEFAULT_AVG_SCORER_NAMES = ('f1_macro', 'recall_macro', 'precision_macro') -DEFAULT_PER_CLASS_SCORER_NAMES = ('f1_per_class', 'f1_per_class', 'f1_per_class') +DEFAULT_PER_CLASS_SCORER_NAMES = tuple() - -if t.TYPE_CHECKING: - from deepchecks.nlp.context import TTokenPred # pylint: disable=unused-import # noqa: F401 +# see issue DEE-473 +# https://linear.app/deepchecks/issue/DEE-473/incorrectly-inferred-model-classes-for-token-classification-task +# +# DEFAULT_PER_CLASS_SCORER_NAMES = ('f1_per_class',) -def get_scorer_dict(suffix: bool = False, mode: t.Optional[str] = None, scheme: t.Optional[t.Type[Token]] = None, - ) -> t.Dict[str, t.Callable[[t.List[str], t.List[str]], float]]: +def get_scorer_dict( + suffix: bool = False, + mode: t.Optional[str] = None, + scheme: t.Optional[t.Type[Token]] = None, +) -> t.Dict[str, t.Callable[[t.List[str], t.List[str]], float]]: """Return a dict of scorers for token classification. Parameters: @@ -77,14 +81,25 @@ def validate_scorers(scorers: t.List[str]): if not isinstance(scorers, Sequence): raise DeepchecksValueError(f'Scorers must be a Sequence, got {type(scorers)}') - if not all(isinstance(name, str) for name in scorers): - # TODO: support custom scorers - raise DeepchecksValueError(f'Scorers must be a Sequence of strings, got {type(scorers[0])}') - if any(name not in scoring_dict for name in scorers): - raise DeepchecksValueError(f'Scorers must be a list of names of existing token classification metrics, which ' - f'is {scoring_dict.keys()}, got {scorers}') + + for name in scorers: + if not isinstance(name, str): + # TODO: support custom scorers + raise DeepchecksValueError( + f'Scorers must be a Sequence of strings, got {type(name)}' + ) + if name not in scoring_dict: + raise DeepchecksValueError( + 'Scorers must be a list of names of existing token classification metrics, ' + f'which is {scoring_dict.keys()}, got {scorers}' + ) def get_default_token_scorers(use_avg_defaults=True) -> t.List[str]: """Return the default scorers for token classification.""" - return DEFAULT_AVG_SCORER_NAMES if use_avg_defaults else DEFAULT_PER_CLASS_SCORER_NAMES + names = ( + DEFAULT_AVG_SCORER_NAMES + if use_avg_defaults + else DEFAULT_PER_CLASS_SCORER_NAMES + ) + return [f'token_{it}' for it in names] diff --git a/deepchecks/nlp/utils/data_inference.py b/deepchecks/nlp/utils/data_inference.py index 15f52be0c5..0cca06d0fe 100644 --- a/deepchecks/nlp/utils/data_inference.py +++ b/deepchecks/nlp/utils/data_inference.py @@ -23,12 +23,16 @@ __all__ = ['infer_observed_and_model_labels'] -def infer_observed_and_model_labels(train_dataset=None, test_dataset=None, model: BaseEstimator = None, - y_pred_train: np.array = None, # pylint: disable=unused-argument - y_pred_test: np.array = None, # pylint: disable=unused-argument - model_classes: list = None, - task_type: 
TaskType = None) -> \ - Tuple[List, List]: +# pylint: disable=unused-argument +def infer_observed_and_model_labels( + train_dataset=None, + test_dataset=None, + model: BaseEstimator = None, + y_pred_train: np.ndarray = None, + y_pred_test: np.ndarray = None, + model_classes: list = None, + task_type: TaskType = None +) -> Tuple[List, List]: """ Infer the observed labels from the given datasets and predictions. diff --git a/deepchecks/utils/validation.py b/deepchecks/utils/validation.py index 83fbab807f..529e489152 100644 --- a/deepchecks/utils/validation.py +++ b/deepchecks/utils/validation.py @@ -14,6 +14,7 @@ import numpy as np import pandas as pd +from typing_extensions import TypeGuard from deepchecks.core import errors from deepchecks.utils.typing import Hashable @@ -48,6 +49,9 @@ def ensure_hashable_or_mutable_sequence( )) -def is_sequence_not_str(value): +def is_sequence_not_str(value) -> TypeGuard[t.Sequence[t.Any]]: """Check if value is a non str sequence.""" - return isinstance(value, (t.Sequence, pd.Series, np.ndarray)) and not isinstance(value, str) + return ( + not isinstance(value, (bytes, str, bytearray)) + and isinstance(value, (t.Sequence, pd.Series, np.ndarray)) + ) diff --git a/spelling-allowlist.txt b/spelling-allowlist.txt index b4e22b03cf..d9d2c26b28 100644 --- a/spelling-allowlist.txt +++ b/spelling-allowlist.txt @@ -142,4 +142,5 @@ mergesort scikit NLP embeddings -ONNX \ No newline at end of file +ONNX +f1 \ No newline at end of file diff --git a/tests/nlp/checks/model_evaluation/single_dataset_performance_test.py b/tests/nlp/checks/model_evaluation/single_dataset_performance_test.py index 82fcb7cdf7..c012220cc5 100644 --- a/tests/nlp/checks/model_evaluation/single_dataset_performance_test.py +++ b/tests/nlp/checks/model_evaluation/single_dataset_performance_test.py @@ -137,8 +137,7 @@ def test_wikiann_data(wikiann): """Temp to test wikiann dataset loads correctly""" dataset = wikiann check = SingleDatasetPerformance(scorers=['token_f1_macro']) - result = check.run(dataset, predictions=dataset.label) - + result = check.run(dataset, predictions=list(dataset.label)) assert_that(result.value.values[0][-1], equal_to(1))
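
A minimal usage sketch of the behaviour this patch introduces (illustration only, not part of the diff; the dataset below is a hypothetical token classification TextData, and the import paths assume the public deepchecks.nlp API at the time of this PR):

    from deepchecks.core.errors import DeepchecksNotSupportedError
    from deepchecks.nlp import TextData
    from deepchecks.nlp.checks import ConfusionMatrixReport

    # Hypothetical token classification dataset: one IOB label per token.
    dataset = TextData(tokenized_text=[['Mary', 'lives', 'in', 'London']],
                       label=[['B-PER', 'O', 'O', 'B-LOC']],
                       task_type='token_classification')

    try:
        # ConfusionMatrixReport.run_logic now calls context.raise_if_token_classification_task(self),
        # so the check fails fast with a clear message instead of crashing later.
        ConfusionMatrixReport().run(dataset, predictions=[['B-PER', 'O', 'O', 'B-LOC']])
    except DeepchecksNotSupportedError as error:
        print(error)  # e.g. "ConfusionMatrixReport" is not supported for "token_classification" tasks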