From 0d105c80ba9ed4f403f092d45227bfcbd314215e Mon Sep 17 00:00:00 2001
From: Yurii Romanyshyn <71635444+yromanyshyn@users.noreply.github.com>
Date: Sun, 16 Apr 2023 16:33:33 +0300
Subject: [PATCH] [DEE-440] added error msg to checks that do not support
 token/multi-label classification (#2445)

* added error msg to checks that do not support token/multi-label classification
* docs fix
* code style fixes
* test fixes
* tests fixes
* pred validation fix
* code style fixes
* code style fixes
* Update deepchecks/nlp/checks/data_integrity/property_label_correlation.py
Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com>
* Update deepchecks/nlp/checks/data_integrity/property_label_correlation.py
Co-authored-by: Nir Hutnik <92314933+nirhutnik@users.noreply.github.com>
* fixes
* fix
* docs style fixes
* code style fixes
* code style fixes
* Update deepchecks/nlp/context.py
Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com>
* fixes

---------

Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com>
Co-authored-by: Nir Hutnik <92314933+nirhutnik@users.noreply.github.com>
---
 deepchecks/core/errors.py                     |  2 -
 .../property_label_correlation.py             |  5 +-
 .../data_integrity/text_property_outliers.py  |  2 +-
 .../confusion_matrix_report.py                |  6 +-
 .../model_evaluation/prediction_drift.py      |  2 +
 .../single_dataset_performance.py             |  5 +-
 .../weak_segments_performance.py              |  4 ++
 .../train_test_validation/label_drift.py      |  3 +
 deepchecks/nlp/context.py                     | 62 ++++++++++++++-----
 .../nlp/metric_utils/token_classification.py  | 41 ++++++++----
 deepchecks/nlp/utils/data_inference.py        | 16 +++--
 deepchecks/utils/validation.py                |  8 ++-
 spelling-allowlist.txt                        |  3 +-
 .../single_dataset_performance_test.py        |  3 +-
 14 files changed, 115 insertions(+), 47 deletions(-)

diff --git a/deepchecks/core/errors.py b/deepchecks/core/errors.py
index 364b2a75f8..179a740c4f 100644
--- a/deepchecks/core/errors.py
+++ b/deepchecks/core/errors.py
@@ -40,8 +40,6 @@ class DeepchecksNotImplementedError(DeepchecksBaseError):
 class DeepchecksNotSupportedError(DeepchecksBaseError):
     """Exception class that represents an unsupported action in Deepchecks."""
 
-    pass
-
 
 class DeepchecksProcessError(DeepchecksBaseError):
     """Exception class that represents an issue with a process."""
diff --git a/deepchecks/nlp/checks/data_integrity/property_label_correlation.py b/deepchecks/nlp/checks/data_integrity/property_label_correlation.py
index 02bdf760c9..0562e83067 100644
--- a/deepchecks/nlp/checks/data_integrity/property_label_correlation.py
+++ b/deepchecks/nlp/checks/data_integrity/property_label_correlation.py
@@ -92,8 +92,11 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
         DeepchecksValueError
             If the object is not a Dataset instance with a label.
""" - text_data = context.get_data_by_kind(dataset_kind).sample(self.n_samples, random_state=context.random_state) + context.raise_if_token_classification_task(self) + context.raise_if_multi_label_task(self) + text_data = context.get_data_by_kind(dataset_kind) + text_data = text_data.sample(self.n_samples, random_state=context.random_state) label = pd.Series(text_data.label, name='label', index=text_data.get_original_text_indexes()) # Classification labels should be of type object (and not int, for example) diff --git a/deepchecks/nlp/checks/data_integrity/text_property_outliers.py b/deepchecks/nlp/checks/data_integrity/text_property_outliers.py index dae30f1ed6..f6b96ba30c 100644 --- a/deepchecks/nlp/checks/data_integrity/text_property_outliers.py +++ b/deepchecks/nlp/checks/data_integrity/text_property_outliers.py @@ -24,7 +24,7 @@ class TextPropertyOutliers(SingleDatasetCheck): - """Find outliers images with respect to the given properties. + """Find outliers with respect to the given properties. The check finds outliers in the text properties. For numeric properties, the check uses `IQR `_ to diff --git a/deepchecks/nlp/checks/model_evaluation/confusion_matrix_report.py b/deepchecks/nlp/checks/model_evaluation/confusion_matrix_report.py index e3808961ec..ffe7ee7352 100644 --- a/deepchecks/nlp/checks/model_evaluation/confusion_matrix_report.py +++ b/deepchecks/nlp/checks/model_evaluation/confusion_matrix_report.py @@ -54,7 +54,11 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult: DeepchecksValueError If the data is not a Dataset instance with a label """ - dataset = context.get_data_by_kind(dataset_kind).sample(self.n_samples, random_state=self.random_state) + context.raise_if_token_classification_task(self) + context.raise_if_multi_label_task(self) + + dataset = context.get_data_by_kind(dataset_kind) + dataset = dataset.sample(self.n_samples, random_state=self.random_state) y_true = np.asarray(dataset.label) y_pred = np.array(context.model.predict(dataset)).reshape(len(y_true), ) diff --git a/deepchecks/nlp/checks/model_evaluation/prediction_drift.py b/deepchecks/nlp/checks/model_evaluation/prediction_drift.py index 5b2b52f964..c14bc2bca1 100644 --- a/deepchecks/nlp/checks/model_evaluation/prediction_drift.py +++ b/deepchecks/nlp/checks/model_evaluation/prediction_drift.py @@ -145,6 +145,8 @@ def run_logic(self, context: Context) -> CheckResult: value: drift score. display: prediction distribution graph, comparing the train and test distributions. 
""" + context.raise_if_token_classification_task(self) + train_dataset = context.train.sample(self.n_samples, random_state=context.random_state) test_dataset = context.test.sample(self.n_samples, random_state=context.random_state) model = context.model diff --git a/deepchecks/nlp/checks/model_evaluation/single_dataset_performance.py b/deepchecks/nlp/checks/model_evaluation/single_dataset_performance.py index 661fc25a8c..6010910a49 100644 --- a/deepchecks/nlp/checks/model_evaluation/single_dataset_performance.py +++ b/deepchecks/nlp/checks/model_evaluation/single_dataset_performance.py @@ -10,7 +10,7 @@ # """Module containing the single dataset performance check.""" from numbers import Number -from typing import Callable, Dict, List, TypeVar, Union +from typing import Callable, Dict, List, Union import pandas as pd @@ -23,9 +23,6 @@ __all__ = ['SingleDatasetPerformance'] -SDP = TypeVar('SDP', bound='SingleDatasetPerformance') - - class SingleDatasetPerformance(SingleDatasetCheck, BaseSingleDatasetPerformance): """Summarize given model performance on a dataset based on selected scorers. diff --git a/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py b/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py index 6d0ca53a85..d60227ee93 100644 --- a/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py +++ b/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py @@ -51,6 +51,9 @@ def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], Non def run_logic(self, context: Context, dataset_kind) -> CheckResult: """Run check.""" + context.raise_if_token_classification_task(self) + context.raise_if_multi_label_task(self) + text_data = context.get_data_by_kind(dataset_kind) text_data = text_data.sample(self.n_samples, random_state=context.random_state) @@ -86,6 +89,7 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult: if features.shape[1] < 2: raise DeepchecksNotSupportedError('Check requires meta data to have at least two columns in order to run.') + # label is not used in the check, just here to avoid errors dataset = Dataset(features, label=pd.Series(text_data.label), cat_features=cat_features) encoded_dataset = self._target_encode_categorical_features_fill_na(dataset, list(np.unique(text_data.label))) diff --git a/deepchecks/nlp/checks/train_test_validation/label_drift.py b/deepchecks/nlp/checks/train_test_validation/label_drift.py index 9e23d20cde..b9f062b37e 100644 --- a/deepchecks/nlp/checks/train_test_validation/label_drift.py +++ b/deepchecks/nlp/checks/train_test_validation/label_drift.py @@ -113,6 +113,9 @@ def run_logic(self, context: Context) -> CheckResult: value: drift score. display: label distribution graph, comparing the train and test distributions. """ + context.raise_if_token_classification_task(self) + context.raise_if_multi_label_task(self) + train_dataset = context.train.sample(self.n_samples, random_state=context.random_state) test_dataset = context.test.sample(self.n_samples, random_state=context.random_state) diff --git a/deepchecks/nlp/context.py b/deepchecks/nlp/context.py index 99d54f8ab7..63566d32c5 100644 --- a/deepchecks/nlp/context.py +++ b/deepchecks/nlp/context.py @@ -44,7 +44,7 @@ TClassProba = t.Sequence[t.Sequence[float]] TTokenPred = t.Sequence[t.Sequence[t.Tuple[str, int, int, float]]] TTextPred = t.Union[TClassPred, TTokenPred] -TTextProba = t.Union[TClassProba] +TTextProba = t.Union[TClassProba] # TODO: incorrect, why union have only one type argument? 
 class _DummyModel(BasicModel):
@@ -216,18 +216,34 @@ def _validate_classification_prediction(dataset: TextData, prediction: TTextPred
     @staticmethod
     def _validate_token_classification_prediction(dataset: TextData, prediction: TTextPred):
         """Validate prediction for given token classification dataset."""
-        if not all(isinstance(pred, collections.abc.Sequence) for pred in prediction):
-            raise ValidationError(f'Check requires predictions for {dataset.name} to be a sequence '
-                                  f'of sequences')
-
-        for i in range(len(prediction)):  # TODO: Goes over all predictions, fix this
-            if not all(isinstance(pred, str) for pred in prediction[i]) \
-                    and not all(isinstance(pred, int) for pred in prediction[i]):
-                raise ValidationError(f'Check requires predictions for {dataset.name} to be a sequence '
-                                      f'of sequences of strings or integers')
-            if len(prediction[i]) != len(dataset.tokenized_text[i]):
-                raise ValidationError(f'Check requires predictions for {dataset.name} to have '
-                                      f'the same number of tokens as the input text')
+        if not is_sequence_not_str(prediction):
+            raise ValidationError(
+                f'Check requires predictions for {dataset.name} to be a sequence of sequences'
+            )
+
+        tokenized_text = dataset.tokenized_text
+
+        for idx, sample_predictions in enumerate(prediction):
+            if not is_sequence_not_str(sample_predictions):
+                raise ValidationError(
+                    f'Check requires predictions for {dataset.name} to be a sequence of sequences'
+                )
+
+            predictions_types_counter = collections.defaultdict(int)
+
+            for p in sample_predictions:
+                predictions_types_counter[type(p)] += 1
+
+            if predictions_types_counter[str] > 0 and predictions_types_counter[int] > 0:
+                raise ValidationError(
+                    f'Check requires predictions for {dataset.name} to be a sequence '
+                    'of sequences of strings or integers'
+                )
+            if len(sample_predictions) != len(tokenized_text[idx]):
+                raise ValidationError(
+                    f'Check requires predictions for {dataset.name} to have '
+                    'the same number of tokens as the input text'
+                )
 
     @staticmethod
     def _validate_proba(dataset: TextData, probabilities: TTextProba, n_classes: int,
@@ -430,6 +446,24 @@ def assert_properties(text_data):
                 'set_properties method to set your own properties with a pandas.DataFrame or use '
                 'TextData.calculate_default_properties to add the default deepchecks properties.')
 
+    def raise_if_token_classification_task(self, check=None):
+        """Raise an exception if it is a token classification task."""
+        check_name = type(check).__name__ if check else 'Check'
+        task_type_name = TaskType.TOKEN_CLASSIFICATION.value
+        if self.task_type is TaskType.TOKEN_CLASSIFICATION:
+            raise DeepchecksNotSupportedError(
+                f'"{check_name}" is not supported for "{task_type_name}" tasks'
+            )
+
+    def raise_if_multi_label_task(self, check=None):
+        """Raise an exception if it is a multi-label classification task."""
+        dataset = t.cast(TextData, self._train if self._train is not None else self._test)
+        check_name = type(check).__name__ if check else 'Check'
+        if dataset.is_multi_label_classification():
+            raise DeepchecksNotSupportedError(
+                f'"{check_name}" is not supported for multi-label classification tasks'
+            )
+
     def get_scorers(self,
                     scorers: t.Union[t.Mapping[str, t.Union[str, t.Callable]], t.List[str]] = None,
                     use_avg_defaults=True) -> t.List[DeepcheckScorer]:
@@ -454,11 +488,11 @@ def get_scorers(self,
             else:
                 scorers = scorers or get_default_scorers(TabularTaskType.BINARY, use_avg_defaults)
         elif self.task_type == TaskType.TOKEN_CLASSIFICATION:
-            scoring_dict = get_scorer_dict()
             if scorers is None:
                 scorers = 
get_default_token_scorers(use_avg_defaults) # Get string names of default scorers else: validate_scorers(scorers) # Validate that use supplied scorer names are OK + scoring_dict = get_scorer_dict() scorers = {name: scoring_dict[name] for name in scorers} else: raise DeepchecksValueError(f'Task type must be either {TaskType.TEXT_CLASSIFICATION} or ' diff --git a/deepchecks/nlp/metric_utils/token_classification.py b/deepchecks/nlp/metric_utils/token_classification.py index 20e8c5fe24..44cdfaa89a 100644 --- a/deepchecks/nlp/metric_utils/token_classification.py +++ b/deepchecks/nlp/metric_utils/token_classification.py @@ -21,15 +21,19 @@ __all__ = ['get_default_token_scorers', 'validate_scorers', 'get_scorer_dict'] DEFAULT_AVG_SCORER_NAMES = ('f1_macro', 'recall_macro', 'precision_macro') -DEFAULT_PER_CLASS_SCORER_NAMES = ('f1_per_class', 'f1_per_class', 'f1_per_class') +DEFAULT_PER_CLASS_SCORER_NAMES = tuple() - -if t.TYPE_CHECKING: - from deepchecks.nlp.context import TTokenPred # pylint: disable=unused-import # noqa: F401 +# see issue DEE-473 +# https://linear.app/deepchecks/issue/DEE-473/incorrectly-inferred-model-classes-for-token-classification-task +# +# DEFAULT_PER_CLASS_SCORER_NAMES = ('f1_per_class',) -def get_scorer_dict(suffix: bool = False, mode: t.Optional[str] = None, scheme: t.Optional[t.Type[Token]] = None, - ) -> t.Dict[str, t.Callable[[t.List[str], t.List[str]], float]]: +def get_scorer_dict( + suffix: bool = False, + mode: t.Optional[str] = None, + scheme: t.Optional[t.Type[Token]] = None, +) -> t.Dict[str, t.Callable[[t.List[str], t.List[str]], float]]: """Return a dict of scorers for token classification. Parameters: @@ -77,14 +81,25 @@ def validate_scorers(scorers: t.List[str]): if not isinstance(scorers, Sequence): raise DeepchecksValueError(f'Scorers must be a Sequence, got {type(scorers)}') - if not all(isinstance(name, str) for name in scorers): - # TODO: support custom scorers - raise DeepchecksValueError(f'Scorers must be a Sequence of strings, got {type(scorers[0])}') - if any(name not in scoring_dict for name in scorers): - raise DeepchecksValueError(f'Scorers must be a list of names of existing token classification metrics, which ' - f'is {scoring_dict.keys()}, got {scorers}') + + for name in scorers: + if not isinstance(name, str): + # TODO: support custom scorers + raise DeepchecksValueError( + f'Scorers must be a Sequence of strings, got {type(name)}' + ) + if name not in scoring_dict: + raise DeepchecksValueError( + 'Scorers must be a list of names of existing token classification metrics, ' + f'which is {scoring_dict.keys()}, got {scorers}' + ) def get_default_token_scorers(use_avg_defaults=True) -> t.List[str]: """Return the default scorers for token classification.""" - return DEFAULT_AVG_SCORER_NAMES if use_avg_defaults else DEFAULT_PER_CLASS_SCORER_NAMES + names = ( + DEFAULT_AVG_SCORER_NAMES + if use_avg_defaults + else DEFAULT_PER_CLASS_SCORER_NAMES + ) + return [f'token_{it}' for it in names] diff --git a/deepchecks/nlp/utils/data_inference.py b/deepchecks/nlp/utils/data_inference.py index 15f52be0c5..0cca06d0fe 100644 --- a/deepchecks/nlp/utils/data_inference.py +++ b/deepchecks/nlp/utils/data_inference.py @@ -23,12 +23,16 @@ __all__ = ['infer_observed_and_model_labels'] -def infer_observed_and_model_labels(train_dataset=None, test_dataset=None, model: BaseEstimator = None, - y_pred_train: np.array = None, # pylint: disable=unused-argument - y_pred_test: np.array = None, # pylint: disable=unused-argument - model_classes: list = None, - task_type: 
TaskType = None) -> \ - Tuple[List, List]: +# pylint: disable=unused-argument +def infer_observed_and_model_labels( + train_dataset=None, + test_dataset=None, + model: BaseEstimator = None, + y_pred_train: np.ndarray = None, + y_pred_test: np.ndarray = None, + model_classes: list = None, + task_type: TaskType = None +) -> Tuple[List, List]: """ Infer the observed labels from the given datasets and predictions. diff --git a/deepchecks/utils/validation.py b/deepchecks/utils/validation.py index 83fbab807f..529e489152 100644 --- a/deepchecks/utils/validation.py +++ b/deepchecks/utils/validation.py @@ -14,6 +14,7 @@ import numpy as np import pandas as pd +from typing_extensions import TypeGuard from deepchecks.core import errors from deepchecks.utils.typing import Hashable @@ -48,6 +49,9 @@ def ensure_hashable_or_mutable_sequence( )) -def is_sequence_not_str(value): +def is_sequence_not_str(value) -> TypeGuard[t.Sequence[t.Any]]: """Check if value is a non str sequence.""" - return isinstance(value, (t.Sequence, pd.Series, np.ndarray)) and not isinstance(value, str) + return ( + not isinstance(value, (bytes, str, bytearray)) + and isinstance(value, (t.Sequence, pd.Series, np.ndarray)) + ) diff --git a/spelling-allowlist.txt b/spelling-allowlist.txt index b4e22b03cf..d9d2c26b28 100644 --- a/spelling-allowlist.txt +++ b/spelling-allowlist.txt @@ -142,4 +142,5 @@ mergesort scikit NLP embeddings -ONNX \ No newline at end of file +ONNX +f1 \ No newline at end of file diff --git a/tests/nlp/checks/model_evaluation/single_dataset_performance_test.py b/tests/nlp/checks/model_evaluation/single_dataset_performance_test.py index 82fcb7cdf7..c012220cc5 100644 --- a/tests/nlp/checks/model_evaluation/single_dataset_performance_test.py +++ b/tests/nlp/checks/model_evaluation/single_dataset_performance_test.py @@ -137,8 +137,7 @@ def test_wikiann_data(wikiann): """Temp to test wikiann dataset loads correctly""" dataset = wikiann check = SingleDatasetPerformance(scorers=['token_f1_macro']) - result = check.run(dataset, predictions=dataset.label) - + result = check.run(dataset, predictions=list(dataset.label)) assert_that(result.value.values[0][-1], equal_to(1))
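
A minimal usage sketch of the behaviour this patch introduces (illustration only, not part of the diff; the dataset below is a hypothetical token classification TextData, and the import paths assume the public deepchecks.nlp API at the time of this PR):

    from deepchecks.core.errors import DeepchecksNotSupportedError
    from deepchecks.nlp import TextData
    from deepchecks.nlp.checks import ConfusionMatrixReport

    # Hypothetical token classification dataset: one IOB label per token.
    dataset = TextData(tokenized_text=[['Mary', 'lives', 'in', 'London']],
                       label=[['B-PER', 'O', 'O', 'B-LOC']],
                       task_type='token_classification')

    try:
        # ConfusionMatrixReport.run_logic now calls context.raise_if_token_classification_task(self),
        # so the check fails fast with a clear message instead of crashing later.
        ConfusionMatrixReport().run(dataset, predictions=[['B-PER', 'O', 'O', 'B-LOC']])
    except DeepchecksNotSupportedError as error:
        print(error)  # e.g. "ConfusionMatrixReport" is not supported for "token_classification" tasks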