[DEE-456] nlp dummy model refactoring #2511

Merged
merged 14 commits on May 11, 2023
255 changes: 86 additions & 169 deletions deepchecks/nlp/context.py

Large diffs are not rendered by default.

219 changes: 217 additions & 2 deletions deepchecks/nlp/input_validations.py
@@ -9,18 +9,22 @@
# ----------------------------------------------------------------------------
#
"""Module containing input validation functions."""
from typing import Dict, List, NamedTuple, Optional, Sequence, Set, Tuple, cast
import collections
from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Set, Tuple, Type, cast

import numpy as np
import pandas as pd

from deepchecks.core.errors import DeepchecksValueError
from deepchecks.core.errors import DeepchecksValueError, ValidationError
from deepchecks.nlp.task_type import TaskType, TTextLabel
from deepchecks.utils.logger import get_logger
from deepchecks.utils.metrics import is_label_none
from deepchecks.utils.type_inference import infer_categorical_features
from deepchecks.utils.validation import is_sequence_not_str

if TYPE_CHECKING:
from deepchecks.nlp.text_data import TextData


def validate_tokenized_text(tokenized_text: Optional[Sequence[Sequence[str]]]):
"""Validate tokenized text format."""
@@ -241,3 +245,214 @@ def compare_dataframes(
difference = None

return DataframesComparison(common, difference)


def _validate_text_classification(
*,
dataset: 'TextData',
predictions: Any = None,
probabilities: Any = None,
n_of_classes: Optional[int] = None,
eps: float = 1e-3
) -> Tuple[
Optional[np.ndarray], # predictions
Optional[np.ndarray], # probabilities
]:
if predictions is not None:
format_error_message = (
f'Check requires predictions for the "{dataset.name}" dataset '
'to be of a type sequence[str] | sequence[int]'
)
if not is_sequence_not_str(predictions):
raise ValidationError(format_error_message)
if len(predictions) != dataset.n_samples:
raise ValidationError(
f'Check requires predictions for the "{dataset.name}" dataset '
f'to have {dataset.n_samples} rows, same as dataset'
)
try:
predictions = np.array(predictions, dtype='object')
except ValueError as e:
raise ValidationError(
'Failed to cast predictions to a numpy array. '
f'{format_error_message}'
) from e
else:
if predictions.ndim == 2 and predictions.shape[1] == 1:
predictions = predictions[:, 0]
if predictions.ndim != 1:
raise ValidationError(format_error_message)

predictions = np.array([
str(it) if it is not None else None
for it in predictions
], dtype='object')

if probabilities is not None:
format_error_message = (
f'Check requires classification probabilities for the "{dataset.name}" '
'dataset to be of a type sequence[sequence[float]] that can be cast to '
'a 2D numpy array of shape (n_samples, n_classes)'
)
if len(probabilities) != dataset.n_samples:
raise ValidationError(
f'Check requires classification probabilities for the "{dataset.name}" '
f'dataset to have {dataset.n_samples} rows, same as dataset'
)
try:
probabilities = np.array(probabilities, dtype='float')
except ValueError as e:
raise ValidationError(
'Failed to cast probabilities to a numpy array. '
f'{format_error_message}'
) from e
else:
if len(probabilities.shape) != 2:
raise ValidationError(format_error_message)
if n_of_classes is not None and probabilities.shape[1] != n_of_classes:
raise ValidationError(
f'Check requires classification probabilities for the "{dataset.name}" dataset '
f'to have {n_of_classes} columns, same as the number of classes'
)
if any(abs(probabilities.sum(axis=1) - 1) > eps):
# TODO: better message
raise ValidationError(
f'Check requires classification probabilities for the "{dataset.name}" '
f'dataset to be probabilities and sum to 1 for each row'
)

return predictions, probabilities
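
For illustration only (not part of the diff), a minimal sketch of what this validator accepts, assuming a TextData instance `text_data` with three samples and two classes; all names below are hypothetical:

    # Hypothetical usage sketch: flat label predictions plus an
    # (n_samples, n_classes) probability matrix whose rows sum to 1.
    preds = ['spam', 'ham', 'spam']
    probas = [[0.9, 0.1], [0.2, 0.8], [0.7, 0.3]]
    preds_arr, probas_arr = _validate_text_classification(
        dataset=text_data,
        predictions=preds,
        probabilities=probas,
        n_of_classes=2,
    )  # returns object-dtype string predictions and a float ndarray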


def _validate_multilabel(
*,
dataset: 'TextData',
predictions: Any = None,
probabilities: Any = None,
n_of_classes: Optional[int] = None,
) -> Tuple[
Optional[np.ndarray], # predictions
Optional[np.ndarray], # probabilities
]:
if predictions is not None:
format_error_message = (
'Check requires multi-label classification predictions for '
f'the "{dataset.name}" dataset to be of a type sequence[sequence[int]] '
'that can be cast to a 2D numpy array of a shape (n_samples, n_classes)'
)
if not is_sequence_not_str(predictions):
raise ValidationError(format_error_message)
if len(predictions) != dataset.n_samples:
raise ValidationError(
'Check requires multi-label classification predictions '
f'for the "{dataset.name}" dataset to have {dataset.n_samples} rows, '
'same as dataset'
)
try:
predictions = np.array(predictions).astype(float)
except ValueError as e:
raise ValidationError(
'Failed to cast multi-label predictions to a numpy array. '
f'{format_error_message}'
) from e
else:
if predictions.ndim != 2:
raise ValidationError(format_error_message)
if n_of_classes is not None and predictions.shape[1] != n_of_classes:
raise ValidationError(
'Check requires multi-label classification predictions '
f'for the "{dataset.name}" dataset to have {n_of_classes} columns, '
'same as the number of classes'
)
if not np.array_equal(predictions, predictions.astype(bool)):
raise ValidationError(
'Check requires multi-label classification predictions '
f'for the "{dataset.name}" dataset to be either 0 or 1'
)
if probabilities is not None:
        format_error_message = (
            'Check requires multi-label classification probabilities '
            f'for the "{dataset.name}" dataset to be of a type sequence[sequence[float]] '
            'that can be cast to a 2D numpy array of a shape (n_samples, n_classes). '
            'Each label probability value must lie between 0 and 1'
        )
if len(probabilities) != dataset.n_samples:
raise ValidationError(
'Check requires multi-label classification probabilities '
f'for the "{dataset.name}" dataset to have {dataset.n_samples} rows, '
'same as dataset'
)
try:
probabilities = np.array(probabilities, dtype='float')
except ValueError as e:
raise ValidationError(
'Failed to cast multi-label probabilities to a numpy '
f'array. {format_error_message}'
) from e
else:
if probabilities.ndim != 2:
raise ValidationError(format_error_message)
if n_of_classes is not None and probabilities.shape[1] != n_of_classes:
raise ValidationError(
f'Check requires multi-label classification probabilities '
f'for the "{dataset.name}" dataset to have {n_of_classes} columns, '
'same as the number of classes'
)
if (probabilities > 1).any() or (probabilities < 0).any():
# TODO: better message
raise ValidationError(format_error_message)

return predictions, probabilities
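
Again for illustration (hypothetical names, not part of the diff): unlike the single-label path, the multilabel probability matrix is only bounds-checked to [0, 1] and its rows need not sum to 1, while predictions must form a 0/1 indicator matrix:

    # Hypothetical usage sketch for two samples and three classes.
    preds = [[0, 1, 1], [1, 0, 0]]                 # 0/1 indicator matrix, shape (n_samples, n_classes)
    probas = [[0.1, 0.8, 0.7], [0.9, 0.2, 0.1]]    # per-label probabilities in [0, 1]
    preds_arr, probas_arr = _validate_multilabel(
        dataset=text_data,
        predictions=preds,
        probabilities=probas,
        n_of_classes=3,
    )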


def _validate_token_classification(
*,
dataset: 'TextData',
predictions: Any = None,
probabilities: Any = None,
):
if probabilities is not None:
        raise ValidationError(
            'For token classification, probabilities are not supported'
        )

if predictions is not None:
format_error_message = (
'Check requires token-classification predictions for '
f'the "{dataset.name}" dataset to be of a type '
'sequence[sequence[str]] or sequence[sequence[int]]'
)
if not is_sequence_not_str(predictions):
raise ValidationError(format_error_message)
if len(predictions) != dataset.n_samples:
raise ValidationError(
'Check requires token-classification predictions for '
f'the "{dataset.name}" dataset to have {dataset.n_samples} rows, '
'same as dataset'
)

for idx, sample_predictions in enumerate(predictions):
if not is_sequence_not_str(sample_predictions):
raise ValidationError(format_error_message)

predictions_types_counter = _count_types(sample_predictions)
criterias = (str in predictions_types_counter, int in predictions_types_counter)

if all(criterias) or not any(criterias):
raise ValidationError(format_error_message)

tokenized_text = dataset.tokenized_text

if len(sample_predictions) != len(tokenized_text[idx]):
raise ValidationError(
'Check requires token-classification predictions for '
f'the "{dataset.name}" dataset to have the same number of tokens '
'as the input text'
)
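
An illustrative call (hypothetical data, not part of the diff): per-sample tag sequences must line up one-to-one with `dataset.tokenized_text`, and each sample must use strings or integers exclusively, never a mix:

    # Hypothetical usage sketch: two samples with 3 and 2 tokens respectively.
    preds = [['B-PER', 'O', 'O'], ['O', 'B-LOC']]  # or [[1, 0, 0], [0, 2]] as label ids
    _validate_token_classification(dataset=text_data, predictions=preds)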


def _count_types(sequence: Sequence[Any]) -> Dict[Type, int]:
counter = collections.defaultdict(int)
for it in sequence:
counter[type(it)] += 1
return counter
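
For reference, `_count_types` is what powers the str-vs-int homogeneity check above; a quick REPL illustration:

    >>> _count_types(['B-PER', 'O', 7])
    defaultdict(<class 'int'>, {<class 'str'>: 2, <class 'int'>: 1})
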
4 changes: 3 additions & 1 deletion deepchecks/utils/typing.py
@@ -8,10 +8,10 @@
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
# pylint: disable=invalid-hash-returned,invalid-name,unnecessary-ellipsis
"""Type definitions."""
from typing import List

# pylint: disable=invalid-hash-returned,invalid-name
from typing_extensions import Protocol, runtime_checkable

__all__ = ['Hashable', 'BasicModel', 'ClassificationModel']
@@ -46,6 +46,7 @@ class BasicModel(Protocol):

def predict(self, X) -> List[Hashable]:
"""Predict on given X."""
...


@runtime_checkable
@@ -54,3 +55,4 @@ class ClassificationModel(BasicModel, Protocol):

def predict_proba(self, X) -> List[Hashable]:
"""Predict probabilities on given X."""
...
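
These `runtime_checkable` protocols let deepchecks duck-type user models: `isinstance` verifies only that the named methods exist, not their signatures. A minimal sketch (the `DummySentimentModel` class is illustrative, not from the PR):

    class DummySentimentModel:
        def predict(self, X):
            return ['positive' for _ in X]

        def predict_proba(self, X):
            return [[0.9, 0.1] for _ in X]

    model = DummySentimentModel()
    assert isinstance(model, BasicModel)           # structural check: has predict
    assert isinstance(model, ClassificationModel)  # has predict and predict_proba
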
@@ -10,7 +10,7 @@
#
"""Test for the NLP UnderAnnotatedSegments check"""
import numpy as np
from hamcrest import assert_that, close_to, equal_to, has_items, calling, raises
from hamcrest import assert_that, calling, close_to, equal_to, has_items, raises

from deepchecks.core.errors import DeepchecksProcessError
from deepchecks.nlp.checks import UnderAnnotatedMetaDataSegments, UnderAnnotatedPropertySegments
6 changes: 3 additions & 3 deletions tests/nlp/checks/model_evaluation/confusion_matrix_test.py
@@ -26,7 +26,7 @@ def test_defaults(text_classification_dataset_mock):
# Act
result = check.run(text_classification_dataset_mock,
predictions=['0', '1', '1'])

confusion_matrix = result.value.to_numpy()

# Assert
@@ -58,7 +58,7 @@ def test_run_default_scorer_string_class_new_cats_in_model_classes(text_classifi
# Act
result = check.run(text_classification_string_class_dataset_mock,
predictions=['wise', 'new', 'meh'])

confusion_matrix = result.value.to_numpy()

# Assert
@@ -179,7 +179,7 @@ def test_condition_misclassified_samples_lower_than_fails(tweet_emotion_train_te
x, y = max_misclassified_cell_idx
max_misclassified_samples = confusion_matrix[x][y]
max_misclassified_samples_ratio = max_misclassified_samples / len(test_ds)

# Assert
assert_that(result.conditions_results[0], equal_condition_result(
is_pass=False,
@@ -49,11 +49,13 @@ def test_run_with_scorer_proba_too_many_classes(text_classification_string_class

# Act & Assert
assert_that(
calling(check.run).with_args(text_classification_string_class_dataset_mock,
probabilities=[[0.1, 0.4, 0.5], [0.9, 0.05, 0.05], [0.9, 0.01, 0.09]],
),
raises(ValidationError, 'Check requires classification probabilities for Train dataset to have 2 columns, '
'same as the number of classes')
calling(check.run).with_args(
text_classification_string_class_dataset_mock,
probabilities=[[0.1, 0.4, 0.5], [0.9, 0.05, 0.05], [0.9, 0.01, 0.09]]),
raises(
ValidationError,
'Check requires classification probabilities for the "Train" dataset to have 2 columns, '
'same as the number of classes')
)


2 changes: 1 addition & 1 deletion tests/nlp/conftest.py
@@ -144,8 +144,8 @@ def text_token_classification_dataset_mock():
def multilabel_mock_dataset_and_probabilities(tweet_emotion_train_test_textdata):
"""Mock dataset and probabilities for multilabel classification"""
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_multilabel_classification(n_samples=3_000, n_features=10, n_classes=3, n_labels=2,
random_state=42)