[DEE-456] nlp dummy model refactoring (#2511)
* nlp dummy model refactoring
yromanyshyn committed May 11, 2023
1 parent cfc9f9d commit 6b6c4cb
Showing 11 changed files with 482 additions and 273 deletions.
255 changes: 86 additions & 169 deletions deepchecks/nlp/context.py

Large diffs are not rendered by default.

219 changes: 217 additions & 2 deletions deepchecks/nlp/input_validations.py
@@ -9,18 +9,22 @@
# ----------------------------------------------------------------------------
#
"""Module containing input validation functions."""
from typing import Dict, List, NamedTuple, Optional, Sequence, Set, Tuple, cast
import collections
from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Set, Tuple, Type, cast

import numpy as np
import pandas as pd

from deepchecks.core.errors import DeepchecksValueError
from deepchecks.core.errors import DeepchecksValueError, ValidationError
from deepchecks.nlp.task_type import TaskType, TTextLabel
from deepchecks.utils.logger import get_logger
from deepchecks.utils.metrics import is_label_none
from deepchecks.utils.type_inference import infer_categorical_features
from deepchecks.utils.validation import is_sequence_not_str

if TYPE_CHECKING:
from deepchecks.nlp.text_data import TextData


def validate_tokenized_text(tokenized_text: Optional[Sequence[Sequence[str]]]):
"""Validate tokenized text format."""
@@ -241,3 +245,214 @@ def compare_dataframes(
difference = None

return DataframesComparison(common, difference)


def _validate_text_classification(
*,
dataset: 'TextData',
predictions: Any = None,
probabilities: Any = None,
n_of_classes: Optional[int] = None,
eps: float = 1e-3
) -> Tuple[
Optional[np.ndarray], # predictions
Optional[np.ndarray], # probabilities
]:
if predictions is not None:
format_error_message = (
f'Check requires predictions for the "{dataset.name}" dataset '
'to be of a type sequence[str] | sequence[int]'
)
if not is_sequence_not_str(predictions):
raise ValidationError(format_error_message)
if len(predictions) != dataset.n_samples:
raise ValidationError(
f'Check requires predictions for the "{dataset.name}" dataset '
f'to have {dataset.n_samples} rows, same as dataset'
)
try:
predictions = np.array(predictions, dtype='object')
except ValueError as e:
raise ValidationError(
'Failed to cast predictions to a numpy array. '
f'{format_error_message}'
) from e
else:
if predictions.ndim == 2 and predictions.shape[1] == 1:
predictions = predictions[:, 0]
if predictions.ndim != 1:
raise ValidationError(format_error_message)

predictions = np.array([
str(it) if it is not None else None
for it in predictions
], dtype='object')

if probabilities is not None:
format_error_message = (
f'Check requires classification probabilities for the "{dataset.name}" '
'dataset to be of a type sequence[sequence[float]] that can be cast to '
'a 2D numpy array of shape (n_samples, n_classes)'
)
if len(probabilities) != dataset.n_samples:
raise ValidationError(
f'Check requires classification probabilities for the "{dataset.name}" '
f'dataset to have {dataset.n_samples} rows, same as dataset'
)
try:
probabilities = np.array(probabilities, dtype='float')
except ValueError as e:
raise ValidationError(
'Failed to cast probabilities to a numpy array. '
f'{format_error_message}'
) from e
else:
if len(probabilities.shape) != 2:
raise ValidationError(format_error_message)
if n_of_classes is not None and probabilities.shape[1] != n_of_classes:
raise ValidationError(
f'Check requires classification probabilities for the "{dataset.name}" dataset '
f'to have {n_of_classes} columns, same as the number of classes'
)
if any(abs(probabilities.sum(axis=1) - 1) > eps):
# TODO: better message
raise ValidationError(
f'Check requires classification probabilities for the "{dataset.name}" '
f'dataset to be probabilities and sum to 1 for each row'
)

return predictions, probabilities
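
# Editor's note: a minimal usage sketch, not part of the diff. The validator only
# reads the dataset's `name` and `n_samples` attributes, so a hypothetical stand-in
# object is used here in place of a real TextData instance.
from types import SimpleNamespace

fake_dataset = SimpleNamespace(name='Train', n_samples=3)  # stand-in, not a real TextData

preds, probas = _validate_text_classification(
    dataset=fake_dataset,
    predictions=[0, 1, 1],                               # cast to an object array of strings
    probabilities=[[0.8, 0.2], [0.3, 0.7], [0.4, 0.6]],  # 2D, one row per sample, rows sum to 1
    n_of_classes=2,
)
# preds  -> array(['0', '1', '1'], dtype=object)
# probas -> float array of shape (3, 2); a row whose sum deviates from 1 by more
#           than eps (1e-3) would raise ValidationError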


def _validate_multilabel(
*,
dataset: 'TextData',
predictions: Any = None,
probabilities: Any = None,
n_of_classes: Optional[int] = None,
) -> Tuple[
Optional[np.ndarray], # predictions
Optional[np.ndarray], # probabilities
]:
if predictions is not None:
format_error_message = (
'Check requires multi-label classification predictions for '
f'the "{dataset.name}" dataset to be of a type sequence[sequence[int]] '
'that can be cast to a 2D numpy array of a shape (n_samples, n_classes)'
)
if not is_sequence_not_str(predictions):
raise ValidationError(format_error_message)
if len(predictions) != dataset.n_samples:
raise ValidationError(
'Check requires multi-label classification predictions '
f'for the "{dataset.name}" dataset to have {dataset.n_samples} rows, '
'same as dataset'
)
try:
predictions = np.array(predictions).astype(float)
except ValueError as e:
raise ValidationError(
'Failed to cast multi-label predictions to a numpy array. '
f'{format_error_message}'
) from e
else:
if predictions.ndim != 2:
raise ValidationError(format_error_message)
if n_of_classes is not None and predictions.shape[1] != n_of_classes:
raise ValidationError(
'Check requires multi-label classification predictions '
f'for the "{dataset.name}" dataset to have {n_of_classes} columns, '
'same as the number of classes'
)
if not np.array_equal(predictions, predictions.astype(bool)):
raise ValidationError(
'Check requires multi-label classification predictions '
f'for the "{dataset.name}" dataset to be either 0 or 1'
)
if probabilities is not None:
format_error_message = (
'Check requires multi-label classification probabilities '
f'for the "{dataset.name}" to be of a type sequence[sequences[float]] '
'that can be cast to a 2D numpy array of a shape (n_samples, n_classes). '
'Each label probability value must lay between 0 and 1'
)
if len(probabilities) != dataset.n_samples:
raise ValidationError(
'Check requires multi-label classification probabilities '
f'for the "{dataset.name}" dataset to have {dataset.n_samples} rows, '
'same as dataset'
)
try:
probabilities = np.array(probabilities, dtype='float')
except ValueError as e:
raise ValidationError(
'Failed to cast multi-label probabilities to a numpy '
f'array. {format_error_message}'
) from e
else:
if probabilities.ndim != 2:
raise ValidationError(format_error_message)
if n_of_classes is not None and probabilities.shape[1] != n_of_classes:
raise ValidationError(
f'Check requires multi-label classification probabilities '
f'for the "{dataset.name}" dataset to have {n_of_classes} columns, '
'same as the number of classes'
)
if (probabilities > 1).any() or (probabilities < 0).any():
# TODO: better message
raise ValidationError(format_error_message)

return predictions, probabilities
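
# Editor's note: a similar sketch for the multi-label path, not part of the diff.
# Both arrays are (n_samples, n_classes); predictions must be strictly 0/1, while
# probability rows only need to lie in [0, 1] and need not sum to 1.
from types import SimpleNamespace

fake_dataset = SimpleNamespace(name='Train', n_samples=2)  # stand-in, not a real TextData

preds, probas = _validate_multilabel(
    dataset=fake_dataset,
    predictions=[[1, 0, 1], [0, 0, 1]],                # one 0/1 indicator per class
    probabilities=[[0.9, 0.2, 0.7], [0.1, 0.4, 0.8]],  # per-label scores in [0, 1]
    n_of_classes=3,
)
# A value such as 2 in predictions, or 1.5 in probabilities, raises ValidationError.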


def _validate_token_classification(
*,
dataset: 'TextData',
predictions: Any = None,
probabilities: Any = None,
):
if probabilities is not None:
raise ValidationError(
'For token classification probabilities are not supported'
)

if predictions is not None:
format_error_message = (
'Check requires token-classification predictions for '
f'the "{dataset.name}" dataset to be of a type '
'sequence[sequence[str]] or sequence[sequence[int]]'
)
if not is_sequence_not_str(predictions):
raise ValidationError(format_error_message)
if len(predictions) != dataset.n_samples:
raise ValidationError(
'Check requires token-classification predictions for '
f'the "{dataset.name}" dataset to have {dataset.n_samples} rows, '
'same as dataset'
)

for idx, sample_predictions in enumerate(predictions):
if not is_sequence_not_str(sample_predictions):
raise ValidationError(format_error_message)

predictions_types_counter = _count_types(sample_predictions)
criterias = (str in predictions_types_counter, int in predictions_types_counter)

if all(criterias) or not any(criterias):
raise ValidationError(format_error_message)

tokenized_text = dataset.tokenized_text

if len(sample_predictions) != len(tokenized_text[idx]):
raise ValidationError(
'Check requires token-classification predictions for '
f'the "{dataset.name}" dataset to have the same number of tokens '
'as the input text'
)


def _count_types(sequence: Sequence[Any]) -> Dict[Type, int]:
counter = collections.defaultdict(int)
for it in sequence:
counter[type(it)] += 1
return counter
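
# Editor's note: a sketch for the token-classification path, not part of the diff.
# Predictions are checked per sample against dataset.tokenized_text: each sample must
# be a homogeneous sequence of either string tags or integer ids, one entry per token,
# and probabilities are rejected outright.
from types import SimpleNamespace

fake_dataset = SimpleNamespace(
    name='Train',
    n_samples=2,
    tokenized_text=[['Deepchecks', 'validates'], ['New', 'York', 'City']],  # stand-in data
)

_validate_token_classification(
    dataset=fake_dataset,
    predictions=[['B-ORG', 'O'], ['B-LOC', 'I-LOC', 'I-LOC']],  # one tag per token
)
# Mixing str and int tags within a sample, a length mismatch with the tokens,
# or passing probabilities at all raises ValidationError.
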
4 changes: 3 additions & 1 deletion deepchecks/utils/typing.py
@@ -8,10 +8,10 @@
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
# pylint: disable=invalid-hash-returned,invalid-name,unnecessary-ellipsis
"""Type definitions."""
from typing import List

# pylint: disable=invalid-hash-returned,invalid-name
from typing_extensions import Protocol, runtime_checkable

__all__ = ['Hashable', 'BasicModel', 'ClassificationModel']
@@ -46,6 +46,7 @@ class BasicModel(Protocol):

def predict(self, X) -> List[Hashable]:
"""Predict on given X."""
...


@runtime_checkable
@@ -54,3 +55,4 @@ class ClassificationModel(BasicModel, Protocol):

def predict_proba(self, X) -> List[Hashable]:
"""Predict probabilities on given X."""
...
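
# Editor's note: a sketch of how the runtime-checkable protocols above behave, not
# part of the diff. isinstance() only verifies that the named methods exist, so a
# plain class with predict and predict_proba qualifies without inheriting anything,
# assuming no other protocol members exist outside the hunks shown above.
from deepchecks.utils.typing import BasicModel, ClassificationModel

class DummyTextClassifier:
    """Hypothetical model used only to illustrate the structural check."""

    def predict(self, X):
        return ['0' for _ in X]

    def predict_proba(self, X):
        return [[0.7, 0.3] for _ in X]

model = DummyTextClassifier()
assert isinstance(model, BasicModel)           # has predict
assert isinstance(model, ClassificationModel)  # has predict and predict_proba
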
@@ -10,7 +10,7 @@
#
"""Test for the NLP UnderAnnotatedSegments check"""
import numpy as np
from hamcrest import assert_that, close_to, equal_to, has_items, calling, raises
from hamcrest import assert_that, calling, close_to, equal_to, has_items, raises

from deepchecks.core.errors import DeepchecksProcessError
from deepchecks.nlp.checks import UnderAnnotatedMetaDataSegments, UnderAnnotatedPropertySegments
6 changes: 3 additions & 3 deletions tests/nlp/checks/model_evaluation/confusion_matrix_test.py
@@ -26,7 +26,7 @@ def test_defaults(text_classification_dataset_mock):
# Act
result = check.run(text_classification_dataset_mock,
predictions=['0', '1', '1'])

confusion_matrix = result.value.to_numpy()

# Assert
@@ -58,7 +58,7 @@ def test_run_default_scorer_string_class_new_cats_in_model_classes(text_classifi
# Act
result = check.run(text_classification_string_class_dataset_mock,
predictions=['wise', 'new', 'meh'])

confusion_matrix = result.value.to_numpy()

# Assert
@@ -179,7 +179,7 @@ def test_condition_misclassified_samples_lower_than_fails(tweet_emotion_train_te
x, y = max_misclassified_cell_idx
max_misclassified_samples = confusion_matrix[x][y]
max_misclassified_samples_ratio = max_misclassified_samples / len(test_ds)

# Assert
assert_that(result.conditions_results[0], equal_condition_result(
is_pass=False,
@@ -49,11 +49,13 @@ def test_run_with_scorer_proba_too_many_classes(text_classification_string_class

# Act & Assert
assert_that(
calling(check.run).with_args(text_classification_string_class_dataset_mock,
probabilities=[[0.1, 0.4, 0.5], [0.9, 0.05, 0.05], [0.9, 0.01, 0.09]],
),
raises(ValidationError, 'Check requires classification probabilities for Train dataset to have 2 columns, '
'same as the number of classes')
calling(check.run).with_args(
text_classification_string_class_dataset_mock,
probabilities=[[0.1, 0.4, 0.5], [0.9, 0.05, 0.05], [0.9, 0.01, 0.09]]),
raises(
ValidationError,
'Check requires classification probabilities for the "Train" dataset to have 2 columns, '
'same as the number of classes')
)


2 changes: 1 addition & 1 deletion tests/nlp/conftest.py
@@ -144,8 +144,8 @@ def text_token_classification_dataset_mock():
def multilabel_mock_dataset_and_probabilities(tweet_emotion_train_test_textdata):
"""Mock dataset and probabilities for multilabel classification"""
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_multilabel_classification(n_samples=3_000, n_features=10, n_classes=3, n_labels=2,
random_state=42)
