From 6b6c4cbe0b3770ba7ce398500698612759f5b12e Mon Sep 17 00:00:00 2001 From: Yurii Romanyshyn <71635444+yromanyshyn@users.noreply.github.com> Date: Thu, 11 May 2023 09:53:14 +0300 Subject: [PATCH] [DEE-456] nlp dummy model refactoring (#2511) * nlp dummy model refactoring --- deepchecks/nlp/context.py | 255 ++++++------------ deepchecks/nlp/input_validations.py | 219 ++++++++++++++- deepchecks/utils/typing.py | 4 +- .../under_annotated_segments_test.py | 2 +- .../model_evaluation/confusion_matrix_test.py | 6 +- .../single_dataset_performance_test.py | 12 +- tests/nlp/conftest.py | 2 +- tests/nlp/test_context.py | 250 +++++++++++------ tests/nlp/utils/test_embeddings.py | 1 + .../confusion_matrix_report_test.py | 2 +- tests/utils/metrics_test.py | 2 +- 11 files changed, 482 insertions(+), 273 deletions(-) diff --git a/deepchecks/nlp/context.py b/deepchecks/nlp/context.py index 9068d22eff..09f954f863 100644 --- a/deepchecks/nlp/context.py +++ b/deepchecks/nlp/context.py @@ -9,7 +9,6 @@ # ---------------------------------------------------------------------------- # """Module for base nlp context.""" -import collections import typing as t from operator import itemgetter @@ -17,8 +16,9 @@ from deepchecks.core.context import BaseContext from deepchecks.core.errors import (DatasetValidationError, DeepchecksNotSupportedError, DeepchecksValueError, - ModelValidationError, ValidationError) -from deepchecks.nlp.input_validations import compare_dataframes + ModelValidationError) +from deepchecks.nlp.input_validations import (_validate_multilabel, _validate_text_classification, + _validate_token_classification, compare_dataframes) from deepchecks.nlp.metric_utils.scorers import init_validate_scorers from deepchecks.nlp.metric_utils.token_classification import (get_default_token_scorers, get_scorer_dict, validate_scorers) @@ -27,10 +27,10 @@ from deepchecks.nlp.utils.data_inference import infer_observed_and_model_labels from deepchecks.tabular.metric_utils import DeepcheckScorer, get_default_scorers from deepchecks.tabular.utils.task_type import TaskType as TabularTaskType -from deepchecks.tabular.utils.validation import ensure_predictions_proba, ensure_predictions_shape from deepchecks.utils.docref import doclink from deepchecks.utils.logger import get_logger from deepchecks.utils.typing import BasicModel +from deepchecks.utils.validation import is_sequence_not_str __all__ = [ 'Context', @@ -39,13 +39,19 @@ 'TTokenPred' ] -from deepchecks.utils.validation import is_sequence_not_str -TClassPred = t.Union[t.Sequence[t.Union[str, int]], t.Sequence[t.Sequence[t.Union[str, int]]]] -TClassProba = t.Sequence[t.Sequence[float]] -TTokenPred = t.Sequence[t.Sequence[t.Tuple[str, int, int, float]]] +TClassPred = t.Union[ + t.Sequence[int], + t.Sequence[str], + t.Sequence[t.Sequence[int]] +] +TTokenPred = t.Union[ + t.Sequence[t.Sequence[int]], + t.Sequence[t.Sequence[str]], +] + TTextPred = t.Union[TClassPred, TTokenPred] -TTextProba = t.Union[TClassProba] # TODO: incorrect, why union have only one type argument? 
+TTextProba = t.Sequence[t.Sequence[float]] class _DummyModel(BasicModel): @@ -72,68 +78,81 @@ class _DummyModel(BasicModel): predictions: t.Dict[str, t.Dict[int, TTextPred]] proba: t.Dict[str, t.Dict[int, TTextProba]] - def __init__(self, - test: TextData, - y_pred_test: TTextPred, - y_proba_test: TTextProba, - train: t.Union[TextData, None] = None, - y_pred_train: TTextPred = None, - y_proba_train: TTextProba = None, - model_classes: list = None, - validate_data_on_predict: bool = True): + def __init__( + self, + *, + test: TextData, + y_pred_test: TTextPred, + y_proba_test: TTextProba, + model_classes: t.List[t.Any], + train: t.Optional[TextData] = None, + y_pred_train: t.Optional[TTextPred] = None, + y_proba_train: t.Optional[TTextProba] = None, + validate_data_on_predict: bool = True, + multilabel_proba_threshold: float = 0.5 + ): """Initialize dummy model.""" predictions = {} probas = {} - if ((y_proba_train is not None) or (y_proba_test is not None)) and \ - (train.task_type == TaskType.TOKEN_CLASSIFICATION): - raise DeepchecksNotSupportedError('For token classification probabilities are not supported') - if train is not None and test is not None: # check if datasets have same indexes - if set(train.get_original_text_indexes()) & set(test.get_original_text_indexes()): - train._original_text_index = np.asarray([f'train-{i}' for i in train.get_original_text_indexes()]) - test._original_text_index = np.asarray([f'test-{i}' for i in test.get_original_text_indexes()]) + train_index = train.get_original_text_indexes() + test_index = test.get_original_text_indexes() + if set(train_index) & set(test_index): + train._original_text_index = np.asarray([f'train-{i}' for i in train_index]) + test._original_text_index = np.asarray([f'test-{i}' for i in test_index]) # # This is commented out as currently text data indices are len(range(len(data))) # # TODO: Uncomment when text data indices are not len(range(len(data))) # get_logger().warning('train and test datasets have common index - adding "train"/"test"' # ' prefixes. 
To avoid that provide datasets with no common indexes ' # 'or pass the model object instead of the predictions.') - for dataset, y_pred, y_proba in zip([train, test], - [y_pred_train, y_pred_test], - [y_proba_train, y_proba_test]): - if dataset is not None: - if y_pred is not None: - self._validate_prediction(dataset, y_pred, len(model_classes)) - if y_proba is not None: - self._validate_proba(dataset, y_proba, len(model_classes)) - - if dataset.task_type == TaskType.TEXT_CLASSIFICATION: - if (y_pred is None) and (y_proba is not None): - if dataset.is_multi_label_classification(): - y_pred = (np.array(y_proba) > 0.5) # TODO: Replace with user-configurable threshold - else: - y_pred = np.argmax(np.array(y_proba), axis=-1) - y_pred = np.array(model_classes, dtype='str')[y_pred] - - if y_pred is not None: - if dataset.is_multi_label_classification(): - y_pred = np.array(y_pred) - else: - y_pred = np.array(y_pred, dtype='str') - if len(y_pred.shape) > 1 and y_pred.shape[1] == 1: - y_pred = y_pred[:, 0] - ensure_predictions_shape(y_pred, dataset.text) - - if y_proba is not None: - ensure_predictions_proba(y_proba, y_pred) - y_proba_dict = dict(zip(dataset.get_original_text_indexes(), y_proba)) - probas.update({dataset.name: y_proba_dict}) - - if y_pred is not None: - y_pred_dict = dict(zip(dataset.get_original_text_indexes(), y_pred)) - predictions.update({dataset.name: y_pred_dict}) + for dataset, y_pred, y_proba in ( + (train, y_pred_train, y_proba_train), + (test, y_pred_test, y_proba_test), + ): + if dataset is None: + continue + + if dataset.is_multi_label_classification(): + y_pred, y_proba = _validate_multilabel( + dataset=dataset, + predictions=y_pred, + probabilities=y_proba, + n_of_classes=len(model_classes) + ) + if y_pred is None and y_proba is not None: + y_pred = (np.array(y_proba) > multilabel_proba_threshold) + y_pred = y_pred.astype(int) + + elif dataset.task_type is TaskType.TEXT_CLASSIFICATION: + y_pred, y_proba = _validate_text_classification( + dataset=dataset, + predictions=y_pred, + probabilities=y_proba, + n_of_classes=len(model_classes) + ) + if y_pred is None and y_proba is not None: + y_pred = np.argmax(np.array(y_proba), axis=-1) + y_pred = np.array(model_classes, dtype='str')[y_pred] + + elif dataset.task_type is TaskType.TOKEN_CLASSIFICATION: + _validate_token_classification( + dataset=dataset, + predictions=y_pred, + probabilities=y_proba, + ) + + else: + raise ValueError(f'Unknown task type - {type(dataset.task_type)}') + + if y_pred is not None: + y_pred_dict = dict(zip(dataset.get_original_text_indexes(), y_pred)) + predictions.update({dataset.name: y_pred_dict}) + if y_proba is not None: + y_proba_dict = dict(zip(dataset.get_original_text_indexes(), y_proba)) + probas.update({dataset.name: y_proba_dict}) self.predictions = predictions self.probas = probas @@ -142,13 +161,16 @@ def __init__(self, if self.predictions: self.predict = self._predict - self._prediction_indices = \ - {name: set(data_preds.keys()) for name, data_preds in self.predictions.items()} - + self._prediction_indices = { + name: set(data_preds.keys()) + for name, data_preds in self.predictions.items() + } if self.probas: self.predict_proba = self._predict_proba - self._proba_indices = \ - {name: set(data_proba.keys()) for name, data_proba in self.probas.items()} + self._proba_indices = { + name: set(data_proba.keys()) + for name, data_proba in self.probas.items() + } def _predict(self, data: TextData) -> TTextPred: # TODO: Needs to receive list of strings, not TextData """Predict on given data 
by the data indexes.""" @@ -174,111 +196,6 @@ def fit(self, *args, **kwargs): """Just for python 3.6 (sklearn validates fit method).""" pass - @staticmethod - def _validate_prediction(dataset: TextData, prediction: TTextPred, n_classes: int): - """Validate prediction for given dataset.""" - if not (is_sequence_not_str(prediction) - or (isinstance(prediction, np.ndarray) and prediction.ndim == 1)): - raise ValidationError(f'Check requires predictions for {dataset.name} to be a sequence') - if len(prediction) != dataset.n_samples: - raise ValidationError(f'Check requires predictions for {dataset.name} to have ' - f'{dataset.n_samples} rows, same as dataset') - - if dataset.task_type == TaskType.TEXT_CLASSIFICATION: - _DummyModel._validate_classification_prediction(dataset, prediction, n_classes) - elif dataset.task_type == TaskType.TOKEN_CLASSIFICATION: - _DummyModel._validate_token_classification_prediction(dataset, prediction) - - @staticmethod - def _validate_classification_prediction(dataset: TextData, prediction: TTextPred, n_classes: int): - """Validate prediction for given text classification dataset.""" - classification_format_error = f'Check requires classification predictions for {dataset.name} to be ' \ - f'either a sequence that can be cast to a 1D numpy array of shape' \ - f' (n_samples,), or a sequence of sequences that can be cast to a 2D ' \ - f'numpy array of shape (n_samples, n_classes) for the multilabel case.' - - try: - prediction = np.array(prediction) - if dataset.is_multi_label_classification(): - prediction = prediction.astype(float) # Multilabel prediction is a binary matrix - else: - prediction = prediction.reshape((-1, 1)) # Multiclass (not multilabel) Prediction can be a string - if prediction.shape[0] != dataset.n_samples: - raise ValidationError(classification_format_error) - except ValueError as e: - raise ValidationError(classification_format_error) from e - pred_shape = prediction.shape - if dataset.is_multi_label_classification(): - if len(pred_shape) == 1 or pred_shape[1] != n_classes: - raise ValidationError(classification_format_error) - if not np.array_equal(prediction, prediction.astype(bool)): - raise ValidationError(f'Check requires classification predictions for {dataset.name} dataset ' - f'to be either 0 or 1') - - @staticmethod - def _validate_token_classification_prediction(dataset: TextData, prediction: TTextPred): - """Validate prediction for given token classification dataset.""" - if not is_sequence_not_str(prediction): - raise ValidationError( - f'Check requires predictions for {dataset.name} to be a sequence of sequences' - ) - - tokenized_text = dataset.tokenized_text - - for idx, sample_predictions in enumerate(prediction): - if not is_sequence_not_str(sample_predictions): - raise ValidationError( - f'Check requires predictions for {dataset.name} to be a sequence of sequences' - ) - - predictions_types_counter = collections.defaultdict(int) - - for p in sample_predictions: - predictions_types_counter[type(p)] += 1 - - if predictions_types_counter[str] > 0 and predictions_types_counter[int] > 0: - raise ValidationError( - f'Check requires predictions for {dataset.name} to be a sequence ' - 'of sequences of strings or integers' - ) - if len(sample_predictions) != len(tokenized_text[idx]): - raise ValidationError( - f'Check requires predictions for {dataset.name} to have ' - 'the same number of tokens as the input text' - ) - - @staticmethod - def _validate_proba(dataset: TextData, probabilities: TTextProba, n_classes: int, - eps: float = 
1e-3): - """Validate predicted probabilities for given dataset.""" - classification_format_error = f'Check requires classification probabilities for {dataset.name} to be a ' \ - f'sequence of sequences that can be cast to a 2D numpy array of shape' \ - f' (n_samples, n_classes)' - - if len(probabilities) != dataset.n_samples: - raise ValidationError(f'Check requires classification probabilities for {dataset.name} dataset ' - f'to have {dataset.n_samples} rows, same as dataset') - - if dataset.task_type == TaskType.TEXT_CLASSIFICATION: - try: - probabilities = np.array(probabilities, dtype='float') - except ValueError as e: - raise ValidationError(classification_format_error) from e - proba_shape = probabilities.shape - if len(proba_shape) != 2: - raise ValidationError(classification_format_error) - if proba_shape[1] != n_classes: - raise ValidationError(f'Check requires classification probabilities for {dataset.name} dataset ' - f'to have {n_classes} columns, same as the number of classes') - if dataset.is_multi_label_classification(): - if (probabilities > 1).any() or (probabilities < 0).any(): - raise ValidationError(f'Check requires classification probabilities for {dataset.name} ' - f'dataset to be between 0 and 1') - else: - if any(abs(probabilities.sum(axis=1) - 1) > eps): - raise ValidationError(f'Check requires classification probabilities for {dataset.name} ' - f'dataset to be probabilities and sum to 1 for each row') - class Context(BaseContext): """Contains all the data + properties the user has passed to a check/suite, and validates it seamlessly. diff --git a/deepchecks/nlp/input_validations.py b/deepchecks/nlp/input_validations.py index e3ef26250a..e79a2b521c 100644 --- a/deepchecks/nlp/input_validations.py +++ b/deepchecks/nlp/input_validations.py @@ -9,18 +9,22 @@ # ---------------------------------------------------------------------------- # """Module containing input validation functions.""" -from typing import Dict, List, NamedTuple, Optional, Sequence, Set, Tuple, cast +import collections +from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Set, Tuple, Type, cast import numpy as np import pandas as pd -from deepchecks.core.errors import DeepchecksValueError +from deepchecks.core.errors import DeepchecksValueError, ValidationError from deepchecks.nlp.task_type import TaskType, TTextLabel from deepchecks.utils.logger import get_logger from deepchecks.utils.metrics import is_label_none from deepchecks.utils.type_inference import infer_categorical_features from deepchecks.utils.validation import is_sequence_not_str +if TYPE_CHECKING: + from deepchecks.nlp.text_data import TextData + def validate_tokenized_text(tokenized_text: Optional[Sequence[Sequence[str]]]): """Validate tokenized text format.""" @@ -241,3 +245,214 @@ def compare_dataframes( difference = None return DataframesComparison(common, difference) + + +def _validate_text_classification( + *, + dataset: 'TextData', + predictions: Any = None, + probabilities: Any = None, + n_of_classes: Optional[int] = None, + eps: float = 1e-3 +) -> Tuple[ + Optional[np.ndarray], # predictions + Optional[np.ndarray], # probabilities +]: + if predictions is not None: + format_error_message = ( + f'Check requires predictions for the "{dataset.name}" dataset ' + 'to be of a type sequence[str] | sequence[int]' + ) + if not is_sequence_not_str(predictions): + raise ValidationError(format_error_message) + if len(predictions) != dataset.n_samples: + raise ValidationError( + f'Check requires predictions for 
the "{dataset.name}" dataset ' + f'to have {dataset.n_samples} rows, same as dataset' + ) + try: + predictions = np.array(predictions, dtype='object') + except ValueError as e: + raise ValidationError( + 'Failed to cast predictions to a numpy array. ' + f'{format_error_message}' + ) from e + else: + if predictions.ndim == 2 and predictions.shape[1] == 1: + predictions = predictions[:, 0] + if predictions.ndim != 1: + raise ValidationError(format_error_message) + + predictions = np.array([ + str(it) if it is not None else None + for it in predictions + ], dtype='object') + + if probabilities is not None: + format_error_message = ( + f'Check requires classification probabilities for the "{dataset.name}" ' + 'dataset to be of a type sequence[sequence[float]] that can be cast to ' + 'a 2D numpy array of shape (n_samples, n_classes)' + ) + if len(probabilities) != dataset.n_samples: + raise ValidationError( + f'Check requires classification probabilities for the "{dataset.name}" ' + f'dataset to have {dataset.n_samples} rows, same as dataset' + ) + try: + probabilities = np.array(probabilities, dtype='float') + except ValueError as e: + raise ValidationError( + 'Failed to cast probabilities to a numpy array. ' + f'{format_error_message}' + ) from e + else: + if len(probabilities.shape) != 2: + raise ValidationError(format_error_message) + if n_of_classes is not None and probabilities.shape[1] != n_of_classes: + raise ValidationError( + f'Check requires classification probabilities for the "{dataset.name}" dataset ' + f'to have {n_of_classes} columns, same as the number of classes' + ) + if any(abs(probabilities.sum(axis=1) - 1) > eps): + # TODO: better message + raise ValidationError( + f'Check requires classification probabilities for the "{dataset.name}" ' + f'dataset to be probabilities and sum to 1 for each row' + ) + + return predictions, probabilities + + +def _validate_multilabel( + *, + dataset: 'TextData', + predictions: Any = None, + probabilities: Any = None, + n_of_classes: Optional[int] = None, +) -> Tuple[ + Optional[np.ndarray], # predictions + Optional[np.ndarray], # probabilities +]: + if predictions is not None: + format_error_message = ( + 'Check requires multi-label classification predictions for ' + f'the "{dataset.name}" dataset to be of a type sequence[sequence[int]] ' + 'that can be cast to a 2D numpy array of a shape (n_samples, n_classes)' + ) + if not is_sequence_not_str(predictions): + raise ValidationError(format_error_message) + if len(predictions) != dataset.n_samples: + raise ValidationError( + 'Check requires multi-label classification predictions ' + f'for the "{dataset.name}" dataset to have {dataset.n_samples} rows, ' + 'same as dataset' + ) + try: + predictions = np.array(predictions).astype(float) + except ValueError as e: + raise ValidationError( + 'Failed to cast multi-label predictions to a numpy array. 
' + f'{format_error_message}' + ) from e + else: + if predictions.ndim != 2: + raise ValidationError(format_error_message) + if n_of_classes is not None and predictions.shape[1] != n_of_classes: + raise ValidationError( + 'Check requires multi-label classification predictions ' + f'for the "{dataset.name}" dataset to have {n_of_classes} columns, ' + 'same as the number of classes' + ) + if not np.array_equal(predictions, predictions.astype(bool)): + raise ValidationError( + 'Check requires multi-label classification predictions ' + f'for the "{dataset.name}" dataset to be either 0 or 1' + ) + if probabilities is not None: + format_error_message = ( + 'Check requires multi-label classification probabilities ' + f'for the "{dataset.name}" to be of a type sequence[sequences[float]] ' + 'that can be cast to a 2D numpy array of a shape (n_samples, n_classes). ' + 'Each label probability value must lay between 0 and 1' + ) + if len(probabilities) != dataset.n_samples: + raise ValidationError( + 'Check requires multi-label classification probabilities ' + f'for the "{dataset.name}" dataset to have {dataset.n_samples} rows, ' + 'same as dataset' + ) + try: + probabilities = np.array(probabilities, dtype='float') + except ValueError as e: + raise ValidationError( + 'Failed to cast multi-label probabilities to a numpy ' + f'array. {format_error_message}' + ) from e + else: + if probabilities.ndim != 2: + raise ValidationError(format_error_message) + if n_of_classes is not None and probabilities.shape[1] != n_of_classes: + raise ValidationError( + f'Check requires multi-label classification probabilities ' + f'for the "{dataset.name}" dataset to have {n_of_classes} columns, ' + 'same as the number of classes' + ) + if (probabilities > 1).any() or (probabilities < 0).any(): + # TODO: better message + raise ValidationError(format_error_message) + + return predictions, probabilities + + +def _validate_token_classification( + *, + dataset: 'TextData', + predictions: Any = None, + probabilities: Any = None, +): + if probabilities is not None: + raise ValidationError( + 'For token classification probabilities are not supported' + ) + + if predictions is not None: + format_error_message = ( + 'Check requires token-classification predictions for ' + f'the "{dataset.name}" dataset to be of a type ' + 'sequence[sequence[str]] or sequence[sequence[int]]' + ) + if not is_sequence_not_str(predictions): + raise ValidationError(format_error_message) + if len(predictions) != dataset.n_samples: + raise ValidationError( + 'Check requires token-classification predictions for ' + f'the "{dataset.name}" dataset to have {dataset.n_samples} rows, ' + 'same as dataset' + ) + + for idx, sample_predictions in enumerate(predictions): + if not is_sequence_not_str(sample_predictions): + raise ValidationError(format_error_message) + + predictions_types_counter = _count_types(sample_predictions) + criterias = (str in predictions_types_counter, int in predictions_types_counter) + + if all(criterias) or not any(criterias): + raise ValidationError(format_error_message) + + tokenized_text = dataset.tokenized_text + + if len(sample_predictions) != len(tokenized_text[idx]): + raise ValidationError( + 'Check requires token-classification predictions for ' + f'the "{dataset.name}" dataset to have the same number of tokens ' + 'as the input text' + ) + + +def _count_types(sequence: Sequence[Any]) -> Dict[Type, int]: + counter = collections.defaultdict(int) + for it in sequence: + counter[type(it)] += 1 + return counter diff --git 
a/deepchecks/utils/typing.py b/deepchecks/utils/typing.py index 4722f74817..abc7af3a08 100644 --- a/deepchecks/utils/typing.py +++ b/deepchecks/utils/typing.py @@ -8,10 +8,10 @@ # along with Deepchecks. If not, see . # ---------------------------------------------------------------------------- # +# pylint: disable=invalid-hash-returned,invalid-name,unnecessary-ellipsis """Type definitions.""" from typing import List -# pylint: disable=invalid-hash-returned,invalid-name from typing_extensions import Protocol, runtime_checkable __all__ = ['Hashable', 'BasicModel', 'ClassificationModel'] @@ -46,6 +46,7 @@ class BasicModel(Protocol): def predict(self, X) -> List[Hashable]: """Predict on given X.""" + ... @runtime_checkable @@ -54,3 +55,4 @@ class ClassificationModel(BasicModel, Protocol): def predict_proba(self, X) -> List[Hashable]: """Predict probabilities on given X.""" + ... diff --git a/tests/nlp/checks/data_integrity/under_annotated_segments_test.py b/tests/nlp/checks/data_integrity/under_annotated_segments_test.py index b214de5cdd..b2c803f799 100644 --- a/tests/nlp/checks/data_integrity/under_annotated_segments_test.py +++ b/tests/nlp/checks/data_integrity/under_annotated_segments_test.py @@ -10,7 +10,7 @@ # """Test for the NLP UnderAnnotatedSegments check""" import numpy as np -from hamcrest import assert_that, close_to, equal_to, has_items, calling, raises +from hamcrest import assert_that, calling, close_to, equal_to, has_items, raises from deepchecks.core.errors import DeepchecksProcessError from deepchecks.nlp.checks import UnderAnnotatedMetaDataSegments, UnderAnnotatedPropertySegments diff --git a/tests/nlp/checks/model_evaluation/confusion_matrix_test.py b/tests/nlp/checks/model_evaluation/confusion_matrix_test.py index efbac75288..7c2a3c77d0 100644 --- a/tests/nlp/checks/model_evaluation/confusion_matrix_test.py +++ b/tests/nlp/checks/model_evaluation/confusion_matrix_test.py @@ -26,7 +26,7 @@ def test_defaults(text_classification_dataset_mock): # Act result = check.run(text_classification_dataset_mock, predictions=['0', '1', '1']) - + confusion_matrix = result.value.to_numpy() # Assert @@ -58,7 +58,7 @@ def test_run_default_scorer_string_class_new_cats_in_model_classes(text_classifi # Act result = check.run(text_classification_string_class_dataset_mock, predictions=['wise', 'new', 'meh']) - + confusion_matrix = result.value.to_numpy() # Assert @@ -179,7 +179,7 @@ def test_condition_misclassified_samples_lower_than_fails(tweet_emotion_train_te x, y = max_misclassified_cell_idx max_misclassified_samples = confusion_matrix[x][y] max_misclassified_samples_ratio = max_misclassified_samples / len(test_ds) - + # Assert assert_that(result.conditions_results[0], equal_condition_result( is_pass=False, diff --git a/tests/nlp/checks/model_evaluation/single_dataset_performance_test.py b/tests/nlp/checks/model_evaluation/single_dataset_performance_test.py index 3ad5b8eeb1..33bf4bf17f 100644 --- a/tests/nlp/checks/model_evaluation/single_dataset_performance_test.py +++ b/tests/nlp/checks/model_evaluation/single_dataset_performance_test.py @@ -49,11 +49,13 @@ def test_run_with_scorer_proba_too_many_classes(text_classification_string_class # Act & Assert assert_that( - calling(check.run).with_args(text_classification_string_class_dataset_mock, - probabilities=[[0.1, 0.4, 0.5], [0.9, 0.05, 0.05], [0.9, 0.01, 0.09]], - ), - raises(ValidationError, 'Check requires classification probabilities for Train dataset to have 2 columns, ' - 'same as the number of classes') + calling(check.run).with_args( 
+ text_classification_string_class_dataset_mock, + probabilities=[[0.1, 0.4, 0.5], [0.9, 0.05, 0.05], [0.9, 0.01, 0.09]]), + raises( + ValidationError, + 'Check requires classification probabilities for the "Train" dataset to have 2 columns, ' + 'same as the number of classes') ) diff --git a/tests/nlp/conftest.py b/tests/nlp/conftest.py index bf3476991f..e6543ca4f8 100644 --- a/tests/nlp/conftest.py +++ b/tests/nlp/conftest.py @@ -144,8 +144,8 @@ def text_token_classification_dataset_mock(): def multilabel_mock_dataset_and_probabilities(tweet_emotion_train_test_textdata): """Mock dataset and probabilities for multilabel classification""" from sklearn.datasets import make_multilabel_classification - from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression + from sklearn.model_selection import train_test_split X, y = make_multilabel_classification(n_samples=3_000, n_features=10, n_classes=3, n_labels=2, random_state=42) diff --git a/tests/nlp/test_context.py b/tests/nlp/test_context.py index ab8cbf1802..795ed56726 100644 --- a/tests/nlp/test_context.py +++ b/tests/nlp/test_context.py @@ -26,44 +26,58 @@ def test_wrong_prediction_format(text_classification_dataset_mock): emtpy_suite = Suite('Empty Suite') # Act & Assert - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_classification_dataset_mock, - train_predictions=[0, 0, 1, 1]), - raises(ValidationError, 'Check requires predictions for Train to have 3 rows, same as dataset') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_classification_dataset_mock, + train_predictions=[0, 0, 1, 1]), + raises( + ValidationError, + 'Check requires predictions for the "Train" dataset ' + 'to have 3 rows, same as dataset') ) - - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_classification_dataset_mock, - train_predictions=[[0, 1], [1, 1], [0, 0]]), - raises(ValidationError, CLASSIFICATION_ERROR_FORMAT) + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_classification_dataset_mock, + train_predictions=[[0, 1], [1, 1], [0, 0]]), + raises( + ValidationError, + r'Check requires predictions for the "Train" dataset to ' + r'be of a type sequence\[str\] \| sequence\[int\]') ) - - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_classification_dataset_mock, - train_probabilities=[[0.3, 0.5, 0.2], [0.3, 0.5, 0.2]]), - raises(ValidationError, 'Check requires classification probabilities for Train dataset to have 3 rows,' - ' same as dataset') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_classification_dataset_mock, + train_probabilities=[[0.3, 0.5, 0.2], [0.3, 0.5, 0.2]]), + raises( + ValidationError, + 'Check requires classification probabilities for the "Train" ' + 'dataset to have 3 rows, same as dataset') ) - - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_classification_dataset_mock, - train_probabilities=[[1, 1, 1], [0, 0, 0], [0.5, 0.5, 0.5]]), - raises(ValidationError, 'Check requires classification probabilities for Train dataset to have 2 columns, ' - 'same as the number of classes') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_classification_dataset_mock, + train_probabilities=[[1, 1, 1], [0, 0, 0], [0.5, 0.5, 0.5]]), + raises( + ValidationError, + 'Check requires classification probabilities for the "Train" ' + 'dataset to have 2 columns, same as the number of classes') ) - - 
assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_classification_dataset_mock, - train_probabilities=[[1, 1], [0, 0], [0.5, 0.2]]), - raises(ValidationError, 'Check requires classification probabilities for Train dataset to be probabilities and' - ' sum to 1 for each row') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_classification_dataset_mock, + train_probabilities=[[1, 1], [0, 0], [0.5, 0.2]]), + raises( + ValidationError, + 'Check requires classification probabilities for the "Train" ' + 'dataset to be probabilities and sum to 1 for each row') ) # Run with no error emtpy_suite.run( train_dataset=text_classification_dataset_mock, train_predictions=[1, 1, 1], - train_probabilities=[[0.9, 0.1], [1, 0], [0.5, 0.5]]) + train_probabilities=[[0.9, 0.1], [1, 0], [0.5, 0.5]] + ) def test_wrong_multilabel_prediction_format(text_multilabel_classification_dataset_mock): @@ -71,49 +85,75 @@ def test_wrong_multilabel_prediction_format(text_multilabel_classification_datas emtpy_suite = Suite('Empty Suite') # Act & Assert - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_multilabel_classification_dataset_mock, - train_predictions=[0, 0, 1, 1]), - raises(ValidationError, 'Check requires predictions for Train to have 3 rows, same as dataset') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_multilabel_classification_dataset_mock, + train_predictions=[ + [0, 0, 0], + [0, 0, 0], + [0, 0, 0], + [0, 0, 0],]), + raises( + ValidationError, + 'Check requires multi-label classification predictions for ' + 'the "Train" dataset to have 3 rows, same as dataset') ) - - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_multilabel_classification_dataset_mock, - train_predictions=[0, 1, 1]), - raises(ValidationError, CLASSIFICATION_ERROR_FORMAT) + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_multilabel_classification_dataset_mock, + train_predictions=[0, 1, 1]), + raises( + ValidationError, + r'Check requires multi-label classification predictions for ' + r'the "Train" dataset to be of a type sequence\[sequence\[int\]\] that can ' + r'be cast to a 2D numpy array of a shape \(n_samples, n_classes\)') ) - - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_multilabel_classification_dataset_mock, - train_predictions=[[0], [0, 1], 1]), - raises(ValidationError, CLASSIFICATION_ERROR_FORMAT) + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_multilabel_classification_dataset_mock, + train_predictions=[[0], [0, 1], 1]), + raises( + ValidationError, + r'Check requires multi-label classification predictions for ' + r'the "Train" dataset to be of a type sequence\[sequence\[int\]\] that can ' + r'be cast to a 2D numpy array of a shape \(n_samples, n_classes\)') ) - - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_multilabel_classification_dataset_mock, - train_probabilities=[[0.3, 0.5, 0.2], [0.3, 0.5, 0.2]]), - raises(ValidationError, 'Check requires classification probabilities for Train dataset to have 3 rows,' - ' same as dataset') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_multilabel_classification_dataset_mock, + train_probabilities=[[0.3, 0.5, 0.2], [0.3, 0.5, 0.2]]), + raises( + ValidationError, + 'Check requires multi-label classification probabilities for ' + 'the "Train" dataset to have 3 rows, same as dataset') ) - - assert_that(calling(emtpy_suite.run).with_args( - 
train_dataset=text_multilabel_classification_dataset_mock, - train_probabilities=[[1, 1], [0, 0], [0.5, 0.5]]), - raises(ValidationError, 'heck requires classification probabilities for Train dataset to have 3 columns, ' - 'same as the number of classes') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_multilabel_classification_dataset_mock, + train_probabilities=[[1, 1], [0, 0], [0.5, 0.5]]), + raises( + ValidationError, + 'Check requires multi-label classification probabilities for ' + 'the "Train" dataset to have 3 columns, same as the number of classes') ) - - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_multilabel_classification_dataset_mock, - train_probabilities=[[1, 1.2, 1], [0, 0, 0.3], [0.5, 0.2, 0.9]]), - raises(ValidationError, 'Check requires classification probabilities for Train dataset to be between 0 and 1') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_multilabel_classification_dataset_mock, + train_probabilities=[[1, 1.2, 1], [0, 0, 0.3], [0.5, 0.2, 0.9]]), + raises( + ValidationError, + r'Check requires multi-label classification probabilities for the "Train" ' + r'to be of a type sequence\[sequences\[float\]\] that can be cast to a 2D numpy ' + r'array of a shape \(n_samples, n_classes\). Each label probability value ' + r'must lay between 0 and 1') ) # Run with no error emtpy_suite.run( train_dataset=text_multilabel_classification_dataset_mock, train_predictions=[[1, 1, 0], [0, 0, 1], [1, 1, 1]], - train_probabilities=[[0.9, 0.8, 0.3], [0.9, 0.8, 0.3], [0.9, 0.8, 0.3]]) + train_probabilities=[[0.9, 0.8, 0.3], [0.9, 0.8, 0.3], [0.9, 0.8, 0.3]] + ) def test_wrong_token_prediction_format(text_token_classification_dataset_mock): @@ -123,47 +163,69 @@ def test_wrong_token_prediction_format(text_token_classification_dataset_mock): # Act & Assert # Length of predictions does not match length of dataset: - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_token_classification_dataset_mock, - train_predictions=[[1, 2], [3, 4]] - ), - raises(ValidationError, 'Check requires predictions for Train to have 3 rows, same as dataset') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_token_classification_dataset_mock, + train_predictions=[[1, 2], [3, 4]]), + raises( + ValidationError, + 'Check requires token-classification predictions for the "Train" ' + 'dataset to have 3 rows, same as dataset') ) # Not a list: - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_token_classification_dataset_mock, - train_predictions='PER' - ), - raises(ValidationError, 'Check requires predictions for Train to be a sequence') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_token_classification_dataset_mock, + train_predictions='PER'), + raises( + ValidationError, + r'Check requires token-classification predictions for ' + r'the "Train" dataset to be of a type sequence\[sequence\[str\]\] or ' + r'sequence\[sequence\[int\]\]') ) # Not a list of lists: - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_token_classification_dataset_mock, - train_predictions=[3, 3, 3] - ), - raises(ValidationError, 'Check requires predictions for Train to be a sequence of sequences') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_token_classification_dataset_mock, + train_predictions=[3, 3, 3]), + raises( + ValidationError, + r'Check requires token-classification predictions for the ' + r'"Train" dataset to be of 
a type sequence\[sequence\[str\]\] or ' + r'sequence\[sequence\[int\]\]') ) # Mixed strings and integers: - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_token_classification_dataset_mock, - train_predictions=[['B-PER', 'O', 1, 'O', 'O'], ['B-PER', 'O', 'O', 'B-GEO', 'O', 'B-GEO'], - ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']] - ), - raises(ValidationError, - 'Check requires predictions for Train to be a sequence of sequences of strings or integers') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_token_classification_dataset_mock, + train_predictions=[ + ['B-PER', 'O', 1, 'O', 'O'], + ['B-PER', 'O', 'O', 'B-GEO', 'O', 'B-GEO'], + ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] + ]), + raises( + ValidationError, + r'Check requires token-classification predictions for ' + r'the "Train" dataset to be of a type sequence\[sequence\[str\]\] ' + r'or sequence\[sequence\[int\]\]') ) # Length of predictions does not match length of tokenized text: - assert_that(calling(emtpy_suite.run).with_args( - train_dataset=text_token_classification_dataset_mock, - train_predictions=[['B-PER'], ['B-PER', 'O', 'O', 'B-GEO', 'O', 'B-GEO'], - ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']] - ), - raises(ValidationError, - 'Check requires predictions for Train to have the same number of tokens as the input text') + assert_that( + calling(emtpy_suite.run).with_args( + train_dataset=text_token_classification_dataset_mock, + train_predictions=[ + ['B-PER'], + ['B-PER', 'O', 'O', 'B-GEO', 'O', 'B-GEO'], + ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] + ]), + raises( + ValidationError, + 'Check requires token-classification predictions for the "Train" dataset ' + 'to have the same number of tokens as the input text') ) @@ -193,3 +255,13 @@ def test_same_dataset(tweet_emotion_train_test_textdata): # Assert assert_that(result.value['Drift score'], close_to(0.0, 0.001)) + + +def test_check_execution_with_none_value_in_predictions_sequence(text_classification_dataset_mock): + # Arrange + check = SingleDatasetPerformance(scorers=['recall_macro']) + # Act + result = check.run(text_classification_dataset_mock, predictions=[1, None, 1]) + # Assert + assert_that(result.value['Value'], close_to(0.5, 0.001)) + diff --git a/tests/nlp/utils/test_embeddings.py b/tests/nlp/utils/test_embeddings.py index 2e7a0c2d0c..018fdfc675 100644 --- a/tests/nlp/utils/test_embeddings.py +++ b/tests/nlp/utils/test_embeddings.py @@ -12,6 +12,7 @@ import numpy as np from hamcrest import assert_that, equal_to + from deepchecks.nlp.utils.text_embeddings import calculate_default_embeddings diff --git a/tests/tabular/checks/model_evaluation/confusion_matrix_report_test.py b/tests/tabular/checks/model_evaluation/confusion_matrix_report_test.py index 6c34aecfb5..610406253f 100644 --- a/tests/tabular/checks/model_evaluation/confusion_matrix_report_test.py +++ b/tests/tabular/checks/model_evaluation/confusion_matrix_report_test.py @@ -174,7 +174,7 @@ def test_condition_misclassified_samples_lower_than_fails(iris_split_dataset_and x, y = max_misclassified_cell_idx max_misclassified_samples = confusion_matrix[x][y] max_misclassified_samples_ratio = max_misclassified_samples / len(test) - + assert_that(result.conditions_results[0], equal_condition_result( is_pass=False, name=f'Misclassified cell size lower than {format_percent(threshold)} of the total samples', diff --git a/tests/utils/metrics_test.py b/tests/utils/metrics_test.py index 4e7f99d5b4..ec548a05f5 100644 --- a/tests/utils/metrics_test.py +++ 
b/tests/utils/metrics_test.py @@ -11,7 +11,7 @@ """Test metrics utils""" import pandas as pd from hamcrest import assert_that, calling, close_to, has_entries, is_, raises -from sklearn.metrics import make_scorer, log_loss, mean_squared_error +from sklearn.metrics import log_loss, make_scorer, mean_squared_error from deepchecks.core.errors import DeepchecksValueError from deepchecks.tabular import Dataset