[DEE-440] added error msg to checks that do not support token/multi-label classification (#2445)

* added error msg to checks that do not support token/multi-label classification

* docs fix

* code style fixes

* test fixes

* tests fixes

* pred validation fix

* code style fixes

* code style fixes

* Update deepchecks/nlp/checks/data_integrity/property_label_correlation.py

Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com>

* Update deepchecks/nlp/checks/data_integrity/property_label_correlation.py

Co-authored-by: Nir Hutnik <92314933+nirhutnik@users.noreply.github.com>

* fixes

* fix

* docs style fixes

* code style fixes

* code style fixes

* Update deepchecks/nlp/context.py

Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com>

* fixes

---------

Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com>
Co-authored-by: Nir Hutnik <92314933+nirhutnik@users.noreply.github.com>
3 people committed Apr 16, 2023
1 parent c03a2ae commit 0d105c8
Showing 14 changed files with 115 additions and 47 deletions.
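
A minimal sketch of the behavior this commit adds (the TextData construction and the choice of check below are illustrative assumptions, not taken from the diff): checks that cannot handle token classification or multi-label classification now fail fast with a DeepchecksNotSupportedError instead of producing undefined results.

from deepchecks.core.errors import DeepchecksNotSupportedError
from deepchecks.nlp import TextData
from deepchecks.nlp.checks import PropertyLabelCorrelation

# Hypothetical token-classification dataset; real construction may differ.
dataset = TextData(
    tokenized_text=[['Deepchecks', 'is', 'in', 'Tel', 'Aviv']],
    label=[['B-ORG', 'O', 'O', 'B-LOC', 'I-LOC']],
    task_type='token_classification',
)

try:
    PropertyLabelCorrelation().run(dataset)
except DeepchecksNotSupportedError as err:
    print(err)  # the message names the unsupported check and the task type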
2 changes: 0 additions & 2 deletions deepchecks/core/errors.py
@@ -40,8 +40,6 @@ class DeepchecksNotImplementedError(DeepchecksBaseError):
class DeepchecksNotSupportedError(DeepchecksBaseError):
"""Exception class that represents an unsupported action in Deepchecks."""

pass


class DeepchecksProcessError(DeepchecksBaseError):
"""Exception class that represents an issue with a process."""
@@ -92,8 +92,11 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
DeepchecksValueError
If the object is not a Dataset instance with a label.
"""
text_data = context.get_data_by_kind(dataset_kind).sample(self.n_samples, random_state=context.random_state)
context.raise_if_token_classification_task(self)
context.raise_if_multi_label_task(self)

text_data = context.get_data_by_kind(dataset_kind)
text_data = text_data.sample(self.n_samples, random_state=context.random_state)
label = pd.Series(text_data.label, name='label', index=text_data.get_original_text_indexes())

# Classification labels should be of type object (and not int, for example)
@@ -24,7 +24,7 @@


class TextPropertyOutliers(SingleDatasetCheck):
"""Find outliers images with respect to the given properties.
"""Find outliers with respect to the given properties.
The check finds outliers in the text properties.
For numeric properties, the check uses `IQR <https://en.wikipedia.org/wiki/Interquartile_range#Outliers>`_ to
@@ -54,7 +54,11 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
DeepchecksValueError
If the data is not a Dataset instance with a label
"""
dataset = context.get_data_by_kind(dataset_kind).sample(self.n_samples, random_state=self.random_state)
context.raise_if_token_classification_task(self)
context.raise_if_multi_label_task(self)

dataset = context.get_data_by_kind(dataset_kind)
dataset = dataset.sample(self.n_samples, random_state=self.random_state)
y_true = np.asarray(dataset.label)
y_pred = np.array(context.model.predict(dataset)).reshape(len(y_true), )

2 changes: 2 additions & 0 deletions deepchecks/nlp/checks/model_evaluation/prediction_drift.py
@@ -145,6 +145,8 @@ def run_logic(self, context: Context) -> CheckResult:
value: drift score.
display: prediction distribution graph, comparing the train and test distributions.
"""
context.raise_if_token_classification_task(self)

train_dataset = context.train.sample(self.n_samples, random_state=context.random_state)
test_dataset = context.test.sample(self.n_samples, random_state=context.random_state)
model = context.model
@@ -10,7 +10,7 @@
#
"""Module containing the single dataset performance check."""
from numbers import Number
from typing import Callable, Dict, List, TypeVar, Union
from typing import Callable, Dict, List, Union

import pandas as pd

@@ -23,9 +23,6 @@
__all__ = ['SingleDatasetPerformance']


SDP = TypeVar('SDP', bound='SingleDatasetPerformance')


class SingleDatasetPerformance(SingleDatasetCheck, BaseSingleDatasetPerformance):
"""Summarize given model performance on a dataset based on selected scorers.
@@ -51,6 +51,9 @@ def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], Non

def run_logic(self, context: Context, dataset_kind) -> CheckResult:
"""Run check."""
context.raise_if_token_classification_task(self)
context.raise_if_multi_label_task(self)

text_data = context.get_data_by_kind(dataset_kind)
text_data = text_data.sample(self.n_samples, random_state=context.random_state)

@@ -86,6 +89,7 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:

if features.shape[1] < 2:
raise DeepchecksNotSupportedError('Check requires meta data to have at least two columns in order to run.')

# label is not used in the check, just here to avoid errors
dataset = Dataset(features, label=pd.Series(text_data.label), cat_features=cat_features)
encoded_dataset = self._target_encode_categorical_features_fill_na(dataset, list(np.unique(text_data.label)))
3 changes: 3 additions & 0 deletions deepchecks/nlp/checks/train_test_validation/label_drift.py
@@ -113,6 +113,9 @@ def run_logic(self, context: Context) -> CheckResult:
value: drift score.
display: label distribution graph, comparing the train and test distributions.
"""
context.raise_if_token_classification_task(self)
context.raise_if_multi_label_task(self)

train_dataset = context.train.sample(self.n_samples, random_state=context.random_state)
test_dataset = context.test.sample(self.n_samples, random_state=context.random_state)

62 changes: 48 additions & 14 deletions deepchecks/nlp/context.py
@@ -44,7 +44,7 @@
TClassProba = t.Sequence[t.Sequence[float]]
TTokenPred = t.Sequence[t.Sequence[t.Tuple[str, int, int, float]]]
TTextPred = t.Union[TClassPred, TTokenPred]
TTextProba = t.Union[TClassProba]
TTextProba = t.Union[TClassProba]  # TODO: incorrect, why does the union have only one type argument?


class _DummyModel(BasicModel):
@@ -216,18 +216,34 @@ def _validate_classification_prediction(dataset: TextData, prediction: TTextPred
@staticmethod
def _validate_token_classification_prediction(dataset: TextData, prediction: TTextPred):
"""Validate prediction for given token classification dataset."""
if not all(isinstance(pred, collections.abc.Sequence) for pred in prediction):
raise ValidationError(f'Check requires predictions for {dataset.name} to be a sequence '
f'of sequences')

for i in range(len(prediction)): # TODO: Goes over all predictions, fix this
if not all(isinstance(pred, str) for pred in prediction[i]) \
and not all(isinstance(pred, int) for pred in prediction[i]):
raise ValidationError(f'Check requires predictions for {dataset.name} to be a sequence '
f'of sequences of strings or integers')
if len(prediction[i]) != len(dataset.tokenized_text[i]):
raise ValidationError(f'Check requires predictions for {dataset.name} to have '
f'the same number of tokens as the input text')
if not is_sequence_not_str(prediction):
raise ValidationError(
f'Check requires predictions for {dataset.name} to be a sequence of sequences'
)

tokenized_text = dataset.tokenized_text

for idx, sample_predictions in enumerate(prediction):
if not is_sequence_not_str(sample_predictions):
raise ValidationError(
f'Check requires predictions for {dataset.name} to be a sequence of sequences'
)

predictions_types_counter = collections.defaultdict(int)

for p in sample_predictions:
predictions_types_counter[type(p)] += 1

if predictions_types_counter[str] > 0 and predictions_types_counter[int] > 0:
raise ValidationError(
f'Check requires predictions for {dataset.name} to be a sequence '
'of sequences of strings or integers'
)
if len(sample_predictions) != len(tokenized_text[idx]):
raise ValidationError(
f'Check requires predictions for {dataset.name} to have '
'the same number of tokens as the input text'
)

@staticmethod
def _validate_proba(dataset: TextData, probabilities: TTextProba, n_classes: int,
@@ -430,6 +446,24 @@ def assert_properties(text_data):
'set_properties method to set your own properties with a pandas.DataFrame or use '
'TextData.calculate_default_properties to add the default deepchecks properties.')

def raise_if_token_classification_task(self, check=None):
"""Raise an exception if it is a token classification task."""
check_name = type(check).__name__ if check else 'Check'
task_type_name = TaskType.TOKEN_CLASSIFICATION.value
if self.task_type is TaskType.TOKEN_CLASSIFICATION:
raise DeepchecksNotSupportedError(
f'"{check_name}" is not supported for the "{task_type_name}" tasks'
)

def raise_if_multi_label_task(self, check=None):
"""Raise an exception if it is a multi-label classification task."""
dataset = t.cast(TextData, self._train if self._train is not None else self._test)
check_name = type(check).__name__ if check else 'Check'
if dataset.is_multi_label_classification():
raise DeepchecksNotSupportedError(
f'"{check_name}" is not supported for the multilable classification tasks'
)

def get_scorers(self,
scorers: t.Union[t.Mapping[str, t.Union[str, t.Callable]], t.List[str]] = None,
use_avg_defaults=True) -> t.List[DeepcheckScorer]:
@@ -454,11 +488,11 @@ def get_scorers(self,
else:
scorers = scorers or get_default_scorers(TabularTaskType.BINARY, use_avg_defaults)
elif self.task_type == TaskType.TOKEN_CLASSIFICATION:
scoring_dict = get_scorer_dict()
if scorers is None:
scorers = get_default_token_scorers(use_avg_defaults) # Get string names of default scorers
else:
validate_scorers(scorers) # Validate that use supplied scorer names are OK
scoring_dict = get_scorer_dict()
scorers = {name: scoring_dict[name] for name in scorers}
else:
raise DeepchecksValueError(f'Task type must be either {TaskType.TEXT_CLASSIFICATION} or '
41 changes: 28 additions & 13 deletions deepchecks/nlp/metric_utils/token_classification.py
@@ -21,15 +21,19 @@
__all__ = ['get_default_token_scorers', 'validate_scorers', 'get_scorer_dict']

DEFAULT_AVG_SCORER_NAMES = ('f1_macro', 'recall_macro', 'precision_macro')
DEFAULT_PER_CLASS_SCORER_NAMES = ('f1_per_class', 'f1_per_class', 'f1_per_class')
DEFAULT_PER_CLASS_SCORER_NAMES = tuple()


if t.TYPE_CHECKING:
from deepchecks.nlp.context import TTokenPred # pylint: disable=unused-import # noqa: F401
# see issue DEE-473
# https://linear.app/deepchecks/issue/DEE-473/incorrectly-inferred-model-classes-for-token-classification-task
#
# DEFAULT_PER_CLASS_SCORER_NAMES = ('f1_per_class',)


def get_scorer_dict(suffix: bool = False, mode: t.Optional[str] = None, scheme: t.Optional[t.Type[Token]] = None,
) -> t.Dict[str, t.Callable[[t.List[str], t.List[str]], float]]:
def get_scorer_dict(
suffix: bool = False,
mode: t.Optional[str] = None,
scheme: t.Optional[t.Type[Token]] = None,
) -> t.Dict[str, t.Callable[[t.List[str], t.List[str]], float]]:
"""Return a dict of scorers for token classification.
Parameters:
Expand Down Expand Up @@ -77,14 +81,25 @@ def validate_scorers(scorers: t.List[str]):

if not isinstance(scorers, Sequence):
raise DeepchecksValueError(f'Scorers must be a Sequence, got {type(scorers)}')
if not all(isinstance(name, str) for name in scorers):
# TODO: support custom scorers
raise DeepchecksValueError(f'Scorers must be a Sequence of strings, got {type(scorers[0])}')
if any(name not in scoring_dict for name in scorers):
raise DeepchecksValueError(f'Scorers must be a list of names of existing token classification metrics, which '
f'is {scoring_dict.keys()}, got {scorers}')

for name in scorers:
if not isinstance(name, str):
# TODO: support custom scorers
raise DeepchecksValueError(
f'Scorers must be a Sequence of strings, got {type(name)}'
)
if name not in scoring_dict:
raise DeepchecksValueError(
'Scorers must be a list of names of existing token classification metrics, '
f'which is {scoring_dict.keys()}, got {scorers}'
)


def get_default_token_scorers(use_avg_defaults=True) -> t.List[str]:
"""Return the default scorers for token classification."""
return DEFAULT_AVG_SCORER_NAMES if use_avg_defaults else DEFAULT_PER_CLASS_SCORER_NAMES
names = (
DEFAULT_AVG_SCORER_NAMES
if use_avg_defaults
else DEFAULT_PER_CLASS_SCORER_NAMES
)
return [f'token_{it}' for it in names]
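
For reference, a minimal sketch of what the reworked helper now yields (the printed values follow directly from the constants in this hunk; the import path matches the changed file):

from deepchecks.nlp.metric_utils.token_classification import get_default_token_scorers

print(get_default_token_scorers())
# ['token_f1_macro', 'token_recall_macro', 'token_precision_macro']

print(get_default_token_scorers(use_avg_defaults=False))
# [] - per-class defaults are disabled until DEE-473 is resolved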
16 changes: 10 additions & 6 deletions deepchecks/nlp/utils/data_inference.py
@@ -23,12 +23,16 @@
__all__ = ['infer_observed_and_model_labels']


def infer_observed_and_model_labels(train_dataset=None, test_dataset=None, model: BaseEstimator = None,
y_pred_train: np.array = None, # pylint: disable=unused-argument
y_pred_test: np.array = None, # pylint: disable=unused-argument
model_classes: list = None,
task_type: TaskType = None) -> \
Tuple[List, List]:
# pylint: disable=unused-argument
def infer_observed_and_model_labels(
train_dataset=None,
test_dataset=None,
model: BaseEstimator = None,
y_pred_train: np.ndarray = None,
y_pred_test: np.ndarray = None,
model_classes: list = None,
task_type: TaskType = None
) -> Tuple[List, List]:
"""
Infer the observed labels from the given datasets and predictions.
8 changes: 6 additions & 2 deletions deepchecks/utils/validation.py
@@ -14,6 +14,7 @@

import numpy as np
import pandas as pd
from typing_extensions import TypeGuard

from deepchecks.core import errors
from deepchecks.utils.typing import Hashable
@@ -48,6 +49,9 @@ def ensure_hashable_or_mutable_sequence(
))


def is_sequence_not_str(value):
def is_sequence_not_str(value) -> TypeGuard[t.Sequence[t.Any]]:
"""Check if value is a non str sequence."""
return isinstance(value, (t.Sequence, pd.Series, np.ndarray)) and not isinstance(value, str)
return (
not isinstance(value, (bytes, str, bytearray))
and isinstance(value, (t.Sequence, pd.Series, np.ndarray))
)
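
A short sketch of how the updated helper behaves (the results follow from the isinstance checks above; bytes and bytearray are now excluded alongside str):

import numpy as np
import pandas as pd

from deepchecks.utils.validation import is_sequence_not_str

print(is_sequence_not_str([1, 2, 3]))          # True - lists are non-str sequences
print(is_sequence_not_str(np.array([1, 2])))   # True - ndarrays are accepted explicitly
print(is_sequence_not_str(pd.Series([1, 2])))  # True
print(is_sequence_not_str('abc'))              # False - strings are excluded
print(is_sequence_not_str(b'abc'))             # False - bytes are now excluded as well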
3 changes: 2 additions & 1 deletion spelling-allowlist.txt
@@ -142,4 +142,5 @@ mergesort
scikit
NLP
embeddings
ONNX
ONNX
f1
@@ -137,8 +137,7 @@ def test_wikiann_data(wikiann):
"""Temp to test wikiann dataset loads correctly"""
dataset = wikiann
check = SingleDatasetPerformance(scorers=['token_f1_macro'])
result = check.run(dataset, predictions=dataset.label)

result = check.run(dataset, predictions=list(dataset.label))
assert_that(result.value.values[0][-1], equal_to(1))


