[DEE-456] nlp dummy model refactoring (#2511)
* nlp dummy model refactoring
yromanyshyn committed May 11, 2023
1 parent cfc9f9d commit 6b6c4cb
Showing 11 changed files with 482 additions and 273 deletions.
255 changes: 86 additions & 169 deletions deepchecks/nlp/context.py

Large diffs are not rendered by default.

219 changes: 217 additions & 2 deletions deepchecks/nlp/input_validations.py
@@ -9,18 +9,22 @@
# ----------------------------------------------------------------------------
#
"""Module containing input validation functions."""
from typing import Dict, List, NamedTuple, Optional, Sequence, Set, Tuple, cast
import collections
from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Set, Tuple, Type, cast

import numpy as np
import pandas as pd

from deepchecks.core.errors import DeepchecksValueError
from deepchecks.core.errors import DeepchecksValueError, ValidationError
from deepchecks.nlp.task_type import TaskType, TTextLabel
from deepchecks.utils.logger import get_logger
from deepchecks.utils.metrics import is_label_none
from deepchecks.utils.type_inference import infer_categorical_features
from deepchecks.utils.validation import is_sequence_not_str

if TYPE_CHECKING:
from deepchecks.nlp.text_data import TextData


def validate_tokenized_text(tokenized_text: Optional[Sequence[Sequence[str]]]):
"""Validate tokenized text format."""
@@ -241,3 +245,214 @@ def compare_dataframes(
difference = None

return DataframesComparison(common, difference)


def _validate_text_classification(
*,
dataset: 'TextData',
predictions: Any = None,
probabilities: Any = None,
n_of_classes: Optional[int] = None,
eps: float = 1e-3
) -> Tuple[
Optional[np.ndarray], # predictions
Optional[np.ndarray], # probabilities
]:
if predictions is not None:
format_error_message = (
f'Check requires predictions for the "{dataset.name}" dataset '
'to be of a type sequence[str] | sequence[int]'
)
if not is_sequence_not_str(predictions):
raise ValidationError(format_error_message)
if len(predictions) != dataset.n_samples:
raise ValidationError(
f'Check requires predictions for the "{dataset.name}" dataset '
f'to have {dataset.n_samples} rows, same as dataset'
)
try:
predictions = np.array(predictions, dtype='object')
except ValueError as e:
raise ValidationError(
'Failed to cast predictions to a numpy array. '
f'{format_error_message}'
) from e
else:
if predictions.ndim == 2 and predictions.shape[1] == 1:
predictions = predictions[:, 0]
if predictions.ndim != 1:
raise ValidationError(format_error_message)

predictions = np.array([
str(it) if it is not None else None
for it in predictions
], dtype='object')

if probabilities is not None:
format_error_message = (
f'Check requires classification probabilities for the "{dataset.name}" '
'dataset to be of a type sequence[sequence[float]] that can be cast to '
'a 2D numpy array of shape (n_samples, n_classes)'
)
if len(probabilities) != dataset.n_samples:
raise ValidationError(
f'Check requires classification probabilities for the "{dataset.name}" '
f'dataset to have {dataset.n_samples} rows, same as dataset'
)
try:
probabilities = np.array(probabilities, dtype='float')
except ValueError as e:
raise ValidationError(
'Failed to cast probabilities to a numpy array. '
f'{format_error_message}'
) from e
else:
if len(probabilities.shape) != 2:
raise ValidationError(format_error_message)
if n_of_classes is not None and probabilities.shape[1] != n_of_classes:
raise ValidationError(
f'Check requires classification probabilities for the "{dataset.name}" dataset '
f'to have {n_of_classes} columns, same as the number of classes'
)
if any(abs(probabilities.sum(axis=1) - 1) > eps):
# TODO: better message
raise ValidationError(
f'Check requires classification probabilities for the "{dataset.name}" '
f'dataset to be probabilities and sum to 1 for each row'
)

return predictions, probabilities
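
# Editor's note: a minimal usage sketch, not part of the diff. The validator only
# reads the dataset's `name` and `n_samples` attributes, so a hypothetical stand-in
# object is used here in place of a real TextData instance.
from types import SimpleNamespace

fake_dataset = SimpleNamespace(name='Train', n_samples=3)  # stand-in, not a real TextData

preds, probas = _validate_text_classification(
    dataset=fake_dataset,
    predictions=[0, 1, 1],                               # cast to an object array of strings
    probabilities=[[0.8, 0.2], [0.3, 0.7], [0.4, 0.6]],  # 2D, one row per sample, rows sum to 1
    n_of_classes=2,
)
# preds  -> array(['0', '1', '1'], dtype=object)
# probas -> float array of shape (3, 2); a row whose sum deviates from 1 by more
#           than eps (1e-3) would raise ValidationError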


def _validate_multilabel(
*,
dataset: 'TextData',
predictions: Any = None,
probabilities: Any = None,
n_of_classes: Optional[int] = None,
) -> Tuple[
Optional[np.ndarray], # predictions
Optional[np.ndarray], # probabilities
]:
if predictions is not None:
format_error_message = (
'Check requires multi-label classification predictions for '
f'the "{dataset.name}" dataset to be of a type sequence[sequence[int]] '
'that can be cast to a 2D numpy array of a shape (n_samples, n_classes)'
)
if not is_sequence_not_str(predictions):
raise ValidationError(format_error_message)
if len(predictions) != dataset.n_samples:
raise ValidationError(
'Check requires multi-label classification predictions '
f'for the "{dataset.name}" dataset to have {dataset.n_samples} rows, '
'same as dataset'
)
try:
predictions = np.array(predictions).astype(float)
except ValueError as e:
raise ValidationError(
'Failed to cast multi-label predictions to a numpy array. '
f'{format_error_message}'
) from e
else:
if predictions.ndim != 2:
raise ValidationError(format_error_message)
if n_of_classes is not None and predictions.shape[1] != n_of_classes:
raise ValidationError(
'Check requires multi-label classification predictions '
f'for the "{dataset.name}" dataset to have {n_of_classes} columns, '
'same as the number of classes'
)
if not np.array_equal(predictions, predictions.astype(bool)):
raise ValidationError(
'Check requires multi-label classification predictions '
f'for the "{dataset.name}" dataset to be either 0 or 1'
)
if probabilities is not None:
format_error_message = (
'Check requires multi-label classification probabilities '
f'for the "{dataset.name}" to be of a type sequence[sequences[float]] '
'that can be cast to a 2D numpy array of a shape (n_samples, n_classes). '
'Each label probability value must lay between 0 and 1'
)
if len(probabilities) != dataset.n_samples:
raise ValidationError(
'Check requires multi-label classification probabilities '
f'for the "{dataset.name}" dataset to have {dataset.n_samples} rows, '
'same as dataset'
)
try:
probabilities = np.array(probabilities, dtype='float')
except ValueError as e:
raise ValidationError(
'Failed to cast multi-label probabilities to a numpy '
f'array. {format_error_message}'
) from e
else:
if probabilities.ndim != 2:
raise ValidationError(format_error_message)
if n_of_classes is not None and probabilities.shape[1] != n_of_classes:
raise ValidationError(
f'Check requires multi-label classification probabilities '
f'for the "{dataset.name}" dataset to have {n_of_classes} columns, '
'same as the number of classes'
)
if (probabilities > 1).any() or (probabilities < 0).any():
# TODO: better message
raise ValidationError(format_error_message)

return predictions, probabilities
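
# Editor's note: a similar sketch for the multi-label path, not part of the diff.
# Both arrays are (n_samples, n_classes); predictions must be strictly 0/1, while
# probability rows only need to lie in [0, 1] and need not sum to 1.
from types import SimpleNamespace

fake_dataset = SimpleNamespace(name='Train', n_samples=2)  # stand-in, not a real TextData

preds, probas = _validate_multilabel(
    dataset=fake_dataset,
    predictions=[[1, 0, 1], [0, 0, 1]],                # one 0/1 indicator per class
    probabilities=[[0.9, 0.2, 0.7], [0.1, 0.4, 0.8]],  # per-label scores in [0, 1]
    n_of_classes=3,
)
# A value such as 2 in predictions, or 1.5 in probabilities, raises ValidationError.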


def _validate_token_classification(
*,
dataset: 'TextData',
predictions: Any = None,
probabilities: Any = None,
):
if probabilities is not None:
raise ValidationError(
'For token classification probabilities are not supported'
)

if predictions is not None:
format_error_message = (
'Check requires token-classification predictions for '
f'the "{dataset.name}" dataset to be of a type '
'sequence[sequence[str]] or sequence[sequence[int]]'
)
if not is_sequence_not_str(predictions):
raise ValidationError(format_error_message)
if len(predictions) != dataset.n_samples:
raise ValidationError(
'Check requires token-classification predictions for '
f'the "{dataset.name}" dataset to have {dataset.n_samples} rows, '
'same as dataset'
)

for idx, sample_predictions in enumerate(predictions):
if not is_sequence_not_str(sample_predictions):
raise ValidationError(format_error_message)

predictions_types_counter = _count_types(sample_predictions)
criterias = (str in predictions_types_counter, int in predictions_types_counter)

if all(criterias) or not any(criterias):
raise ValidationError(format_error_message)

tokenized_text = dataset.tokenized_text

if len(sample_predictions) != len(tokenized_text[idx]):
raise ValidationError(
'Check requires token-classification predictions for '
f'the "{dataset.name}" dataset to have the same number of tokens '
'as the input text'
)


def _count_types(sequence: Sequence[Any]) -> Dict[Type, int]:
counter = collections.defaultdict(int)
for it in sequence:
counter[type(it)] += 1
return counter
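
# Editor's note: a sketch for the token-classification path, not part of the diff.
# Predictions are checked per sample against dataset.tokenized_text: each sample must
# be a homogeneous sequence of either string tags or integer ids, one entry per token,
# and probabilities are rejected outright.
from types import SimpleNamespace

fake_dataset = SimpleNamespace(
    name='Train',
    n_samples=2,
    tokenized_text=[['Deepchecks', 'validates'], ['New', 'York', 'City']],  # stand-in data
)

_validate_token_classification(
    dataset=fake_dataset,
    predictions=[['B-ORG', 'O'], ['B-LOC', 'I-LOC', 'I-LOC']],  # one tag per token
)
# Mixing str and int tags within a sample, a length mismatch with the tokens,
# or passing probabilities at all raises ValidationError.
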
4 changes: 3 additions & 1 deletion deepchecks/utils/typing.py
@@ -8,10 +8,10 @@
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
# pylint: disable=invalid-hash-returned,invalid-name,unnecessary-ellipsis
"""Type definitions."""
from typing import List

# pylint: disable=invalid-hash-returned,invalid-name
from typing_extensions import Protocol, runtime_checkable

__all__ = ['Hashable', 'BasicModel', 'ClassificationModel']
@@ -46,6 +46,7 @@ class BasicModel(Protocol):

def predict(self, X) -> List[Hashable]:
"""Predict on given X."""
...


@runtime_checkable
@@ -54,3 +55,4 @@ class ClassificationModel(BasicModel, Protocol):

def predict_proba(self, X) -> List[Hashable]:
"""Predict probabilities on given X."""
...
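
# Editor's note: a sketch of how the runtime-checkable protocols above behave, not
# part of the diff. isinstance() only verifies that the named methods exist, so a
# plain class with predict and predict_proba qualifies without inheriting anything,
# assuming no other protocol members exist outside the hunks shown above.
from deepchecks.utils.typing import BasicModel, ClassificationModel

class DummyTextClassifier:
    """Hypothetical model used only to illustrate the structural check."""

    def predict(self, X):
        return ['0' for _ in X]

    def predict_proba(self, X):
        return [[0.7, 0.3] for _ in X]

model = DummyTextClassifier()
assert isinstance(model, BasicModel)           # has predict
assert isinstance(model, ClassificationModel)  # has predict and predict_proba
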
@@ -10,7 +10,7 @@
#
"""Test for the NLP UnderAnnotatedSegments check"""
import numpy as np
from hamcrest import assert_that, close_to, equal_to, has_items, calling, raises
from hamcrest import assert_that, calling, close_to, equal_to, has_items, raises

from deepchecks.core.errors import DeepchecksProcessError
from deepchecks.nlp.checks import UnderAnnotatedMetaDataSegments, UnderAnnotatedPropertySegments
6 changes: 3 additions & 3 deletions tests/nlp/checks/model_evaluation/confusion_matrix_test.py
@@ -26,7 +26,7 @@ def test_defaults(text_classification_dataset_mock):
# Act
result = check.run(text_classification_dataset_mock,
predictions=['0', '1', '1'])

confusion_matrix = result.value.to_numpy()

# Assert
@@ -58,7 +58,7 @@ def test_run_default_scorer_string_class_new_cats_in_model_classes(text_classifi
# Act
result = check.run(text_classification_string_class_dataset_mock,
predictions=['wise', 'new', 'meh'])

confusion_matrix = result.value.to_numpy()

# Assert
@@ -179,7 +179,7 @@ def test_condition_misclassified_samples_lower_than_fails(tweet_emotion_train_te
x, y = max_misclassified_cell_idx
max_misclassified_samples = confusion_matrix[x][y]
max_misclassified_samples_ratio = max_misclassified_samples / len(test_ds)

# Assert
assert_that(result.conditions_results[0], equal_condition_result(
is_pass=False,
@@ -49,11 +49,13 @@ def test_run_with_scorer_proba_too_many_classes(text_classification_string_class

# Act & Assert
assert_that(
calling(check.run).with_args(text_classification_string_class_dataset_mock,
probabilities=[[0.1, 0.4, 0.5], [0.9, 0.05, 0.05], [0.9, 0.01, 0.09]],
),
raises(ValidationError, 'Check requires classification probabilities for Train dataset to have 2 columns, '
'same as the number of classes')
calling(check.run).with_args(
text_classification_string_class_dataset_mock,
probabilities=[[0.1, 0.4, 0.5], [0.9, 0.05, 0.05], [0.9, 0.01, 0.09]]),
raises(
ValidationError,
'Check requires classification probabilities for the "Train" dataset to have 2 columns, '
'same as the number of classes')
)


2 changes: 1 addition & 1 deletion tests/nlp/conftest.py
@@ -144,8 +144,8 @@ def text_token_classification_dataset_mock():
def multilabel_mock_dataset_and_probabilities(tweet_emotion_train_test_textdata):
"""Mock dataset and probabilities for multilabel classification"""
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_multilabel_classification(n_samples=3_000, n_features=10, n_classes=3, n_labels=2,
random_state=42)
