From 656d5fc64f48449ebedc84dc32330c0b34021803 Mon Sep 17 00:00:00 2001 From: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com> Date: Tue, 4 Apr 2023 13:55:55 +0300 Subject: [PATCH] refactor text data (#2432) --- .../property_label_correlation.py | 2 +- .../data_integrity/text_property_outliers.py | 2 +- .../weak_segments_performance.py | 4 +- deepchecks/nlp/context.py | 52 ++- .../datasets/classification/tweet_emotion.py | 80 +++- deepchecks/nlp/input_validations.py | 110 +++++ deepchecks/nlp/task_type.py | 8 +- deepchecks/nlp/text_data.py | 442 +++++++----------- deepchecks/nlp/utils/text_properties.py | 8 +- deepchecks/utils/validation.py | 13 +- .../quickstarts/plot_text_classification.py | 16 +- .../property_label_correlation_test.py | 5 +- .../model_evaluation/confusion_matrix_test.py | 8 +- .../model_evaluation/prediction_drift_test.py | 2 +- .../single_dataset_performance_test.py | 11 +- tests/nlp/conftest.py | 29 +- tests/nlp/test_text_data.py | 26 +- 17 files changed, 418 insertions(+), 400 deletions(-) create mode 100644 deepchecks/nlp/input_validations.py diff --git a/deepchecks/nlp/checks/data_integrity/property_label_correlation.py b/deepchecks/nlp/checks/data_integrity/property_label_correlation.py index f142aec72f..02bdf760c9 100644 --- a/deepchecks/nlp/checks/data_integrity/property_label_correlation.py +++ b/deepchecks/nlp/checks/data_integrity/property_label_correlation.py @@ -94,7 +94,7 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult: """ text_data = context.get_data_by_kind(dataset_kind).sample(self.n_samples, random_state=context.random_state) - label = pd.Series(text_data.label, name='label', index=text_data.index) + label = pd.Series(text_data.label, name='label', index=text_data.get_original_text_indexes()) # Classification labels should be of type object (and not int, for example) if context.task_type in [TaskType.TEXT_CLASSIFICATION, TaskType.TOKEN_CLASSIFICATION]: diff --git a/deepchecks/nlp/checks/data_integrity/text_property_outliers.py b/deepchecks/nlp/checks/data_integrity/text_property_outliers.py index 15e8538c20..c2105f9068 100644 --- a/deepchecks/nlp/checks/data_integrity/text_property_outliers.py +++ b/deepchecks/nlp/checks/data_integrity/text_property_outliers.py @@ -119,7 +119,7 @@ def run_logic(self, context: Context, dataset_kind: DatasetKind) -> CheckResult: text_outliers = np.concatenate([bottom_outliers, top_outliers]) result[name] = { - 'indices': [dataset.index[i] for i in text_outliers], + 'indices': [dataset.get_original_text_indexes()[i] for i in text_outliers], # For the upper and lower limits doesn't show values that are smaller/larger than the actual values # we have in the data 'lower_limit': max(lower_limit, min(values_arr)), diff --git a/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py b/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py index 0b8e1efe47..2f40cb8fe9 100644 --- a/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py +++ b/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py @@ -73,7 +73,7 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult: predictions = context.model.predict(text_data) if self.loss_per_sample is not None: - loss_per_sample = self.loss_per_sample[list(text_data.index)] + loss_per_sample = self.loss_per_sample[text_data.get_original_text_indexes()] proba_values = None elif not hasattr(context.model, 'predict_proba'): raise DeepchecksNotSupportedError('Predicted probabilities not supplied. 
The weak segment checks relies' @@ -87,7 +87,7 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult: if features.shape[1] < 2: raise DeepchecksNotSupportedError('Check requires meta data to have at least two columns in order to run.') # label is not used in the check, just here to avoid errors - dataset = Dataset(features, label=pd.Series(text_data.label, index=text_data.index), cat_features=cat_features) + dataset = Dataset(features, label=pd.Series(text_data.label), cat_features=cat_features) encoded_dataset = self._target_encode_categorical_features_fill_na(dataset, list(np.unique(text_data.label))) dummy_model = _DummyModel(test=encoded_dataset, y_pred_test=np.asarray(predictions), diff --git a/deepchecks/nlp/context.py b/deepchecks/nlp/context.py index 8a74ee13ba..99d54f8ab7 100644 --- a/deepchecks/nlp/context.py +++ b/deepchecks/nlp/context.py @@ -90,9 +90,9 @@ def __init__(self, if train is not None and test is not None: # check if datasets have same indexes - if set(train.index) & set(test.index): - train.reindex(list(map(lambda x: f'train-{x}', list(train.index)))) - test.reindex(list(map(lambda x: f'test-{x}', list(test.index)))) + if set(train.get_original_text_indexes()) & set(test.get_original_text_indexes()): + train._original_text_index = np.asarray([f'train-{i}' for i in train.get_original_text_indexes()]) + test._original_text_index = np.asarray([f'test-{i}' for i in test.get_original_text_indexes()]) get_logger().warning('train and test datasets have common index - adding "train"/"test"' ' prefixes. To avoid that provide datasets with no common indexes ' 'or pass the model object instead of the predictions.') @@ -108,26 +108,29 @@ def __init__(self, if dataset.task_type == TaskType.TEXT_CLASSIFICATION: if (y_pred is None) and (y_proba is not None): - if dataset.is_multilabel: + if dataset.is_multi_label_classification(): y_pred = (np.array(y_proba) > 0.5) # TODO: Replace with user-configurable threshold y_pred = [np.array(model_classes)[pred] for pred in y_pred] else: y_pred = np.argmax(np.array(y_proba), axis=-1) - y_pred = np.array(model_classes)[y_pred] + y_pred = np.array(model_classes, dtype='str')[y_pred] if y_pred is not None: - y_pred = np.array(y_pred) + if dataset.is_multi_label_classification(): + y_pred = np.array(y_pred) + else: + y_pred = np.array(y_pred, dtype='str') if len(y_pred.shape) > 1 and y_pred.shape[1] == 1: y_pred = y_pred[:, 0] ensure_predictions_shape(y_pred, dataset.text) if y_proba is not None: ensure_predictions_proba(y_proba, y_pred) - y_proba_dict = dict(zip(dataset.index, y_proba)) + y_proba_dict = dict(zip(dataset.get_original_text_indexes(), y_proba)) probas.update({dataset.name: y_proba_dict}) if y_pred is not None: - y_pred_dict = dict(zip(dataset.index, y_pred)) + y_pred_dict = dict(zip(dataset.get_original_text_indexes(), y_pred)) predictions.update({dataset.name: y_pred_dict}) self.predictions = predictions @@ -148,20 +151,22 @@ def __init__(self, def _predict(self, data: TextData) -> TTextPred: # TODO: Needs to receive list of strings, not TextData """Predict on given data by the data indexes.""" if self.validate_data_on_predict: - data_indices = set(np.random.choice(data.index, min(100, len(data.index)), replace=False)) + data_indices = set(np.random.choice(data.get_original_text_indexes(), min(100, len(data)), replace=False)) if not data_indices.issubset(self._prediction_indices[data.name]): raise DeepchecksValueError('Data that has not been seen before passed for inference with pre computed ' 'predictions.') - 
return list(itemgetter(*data.index)(self.predictions[data.name])) # pylint: disable=unsubscriptable-object + return list(itemgetter(*data.get_original_text_indexes())( + self.predictions[data.name])) # pylint: disable=unsubscriptable-object def _predict_proba(self, data: TextData) -> TTextProba: # TODO: Needs to receive list of strings, not TextData """Predict probabilities on given data by the data indexes.""" if self.validate_data_on_predict: - data_indices = set(np.random.choice(data.index, min(100, len(data.index)), replace=False)) + data_indices = set(np.random.choice(data.get_original_text_indexes(), min(100, len(data)), replace=False)) if not data_indices.issubset(self._proba_indices[data.name]): raise DeepchecksValueError('Data that has not been seen before passed for inference with pre computed ' 'probabilities.') - return list(itemgetter(*data.index)(self.probas[data.name])) # pylint: disable=unsubscriptable-object + return list(itemgetter(*data.get_original_text_indexes())( + self.probas[data.name])) # pylint: disable=unsubscriptable-object def fit(self, *args, **kwargs): """Just for python 3.6 (sklearn validates fit method).""" @@ -192,7 +197,7 @@ def _validate_classification_prediction(dataset: TextData, prediction: TTextPred try: prediction = np.array(prediction) - if dataset.is_multilabel: + if dataset.is_multi_label_classification(): prediction = prediction.astype(float) # Multilabel prediction is a binary matrix else: prediction = prediction.reshape((-1, 1)) # Multiclass (not multilabel) Prediction can be a string @@ -201,7 +206,7 @@ def _validate_classification_prediction(dataset: TextData, prediction: TTextPred except ValueError as e: raise ValidationError(classification_format_error) from e pred_shape = prediction.shape - if dataset.is_multilabel: + if dataset.is_multi_label_classification(): if len(pred_shape) == 1 or pred_shape[1] != n_classes: raise ValidationError(classification_format_error) if not np.array_equal(prediction, prediction.astype(bool)): @@ -247,7 +252,7 @@ def _validate_proba(dataset: TextData, probabilities: TTextProba, n_classes: int if proba_shape[1] != n_classes: raise ValidationError(f'Check requires classification probabilities for {dataset.name} dataset ' f'to have {n_classes} columns, same as the number of classes') - if dataset.is_multilabel: + if dataset.is_multi_label_classification(): if (probabilities > 1).any() or (probabilities < 0).any(): raise ValidationError(f'Check requires classification probabilities for {dataset.name} ' f'dataset to be between 0 and 1') @@ -308,18 +313,19 @@ def __init__( # If both dataset, validate they fit each other if train_dataset and test_dataset: if test_dataset.has_label() and train_dataset.has_label() and not \ - TextData.datasets_share_task_type(train_dataset, test_dataset): + train_dataset.validate_textdata_compatibility(test_dataset): raise DatasetValidationError('train_dataset and test_dataset must share the same label and task type') if test_dataset and not train_dataset: raise DatasetValidationError('Can\'t initialize context with only test_dataset. if you have single ' 'dataset, initialize it as train_dataset') - if model_classes and len(model_classes) == 0: - raise DeepchecksValueError('Received empty model_classes') - if model_classes and sorted(model_classes) != model_classes: - supported_models_link = doclink( - 'nlp-supported-predictions-format', - template='For more information please refer to the Supported Tasks guide {link}') - raise DeepchecksValueError(f'Received unsorted model_classes. 
{supported_models_link}') + if model_classes is not None: + if (not is_sequence_not_str(model_classes)) or len(model_classes) == 0: + raise DeepchecksValueError('model_classes must be a non-empty sequence') + if sorted(model_classes) != model_classes: + supported_models_link = doclink( + 'nlp-supported-predictions-format', + template='For more information please refer to the Supported Tasks guide {link}') + raise DeepchecksValueError(f'Received unsorted model_classes. {supported_models_link}') self._task_type = self.infer_task_type(train_dataset, test_dataset) diff --git a/deepchecks/nlp/datasets/classification/tweet_emotion.py b/deepchecks/nlp/datasets/classification/tweet_emotion.py index e5e55439e7..bf91dc204c 100644 --- a/deepchecks/nlp/datasets/classification/tweet_emotion.py +++ b/deepchecks/nlp/datasets/classification/tweet_emotion.py @@ -98,13 +98,7 @@ def load_data(data_format: str = 'TextData', as_train_test: bool = True, if data_format.lower() not in ['textdata', 'dataframe']: raise ValueError('data_format must be either "Dataset" or "Dataframe"') - os.makedirs(ASSETS_DIR, exist_ok=True) - if (ASSETS_DIR / 'tweet_emotion_data.csv').exists(): - dataset = pd.read_csv(ASSETS_DIR / 'tweet_emotion_data.csv', index_col=0) - else: - dataset = pd.read_csv(_FULL_DATA_URL, index_col=0) - dataset.to_csv(ASSETS_DIR / 'tweet_emotion_data.csv') - + dataset = _read_and_save('tweet_emotion_data.csv', _FULL_DATA_URL, to_numpy=False) if not as_train_test: dataset.drop(columns=['train_test_split'], inplace=True) if data_format.lower() == 'textdata': @@ -114,7 +108,7 @@ def load_data(data_format: str = 'TextData', as_train_test: bool = True, properties = None dataset = TextData(dataset.text, label=dataset[_target], task_type='text_classification', metadata=dataset.drop(columns=[_target, 'text']), - properties=properties, index=dataset.index) + properties=properties) return dataset else: # train has more sport and Customer Complains but less Terror and Optimism @@ -128,26 +122,39 @@ def load_data(data_format: str = 'TextData', as_train_test: bool = True, train_properties, test_properties = None, None train = TextData(train.text, label=train[_target], task_type='text_classification', - index=train.index, metadata=train.drop(columns=[_target, 'text']), + metadata=train.drop(columns=[_target, 'text']), properties=train_properties) test = TextData(test.text, label=test[_target], task_type='text_classification', - index=test.index, metadata=test.drop(columns=[_target, 'text']), + metadata=test.drop(columns=[_target, 'text']), properties=test_properties) return train, test -def load_embeddings() -> np.ndarray: +def load_embeddings(as_train_test: bool = False) -> np.ndarray: """Load and return the embeddings of the tweet_emotion dataset calculated by OpenAI. + Parameters + ---------- + as_train_test : bool, default: True + If True, the returned data is split into train and test exactly like the toy model + was trained. The first return value is the train data and the second is the test data. + Otherwise, returns a single object. + Returns ------- embeddings : np.ndarray Embeddings for the tweet_emotion dataset. 
""" - return pd.read_csv(_EMBEDDINGS_URL, index_col=0).to_numpy() + all_embeddings = _read_and_save('tweet_emotion_embeddings.csv', _EMBEDDINGS_URL) + if as_train_test: + train_indexes, test_indexes = _get_train_test_indexes() + return all_embeddings[train_indexes], all_embeddings[test_indexes] + else: + return all_embeddings -def load_precalculated_predictions(pred_format: str = 'predictions') -> np.array: +def load_precalculated_predictions(pred_format: str = 'predictions', + as_train_test: bool = False) -> np.array: """Load and return a precalculated predictions for the dataset. Parameters @@ -156,6 +163,10 @@ def load_precalculated_predictions(pred_format: str = 'predictions') -> np.array Represent the format of the returned value. Can be 'predictions' or 'probabilities'. 'predictions' will return the predicted class for each sample. 'probabilities' will return the predicted probabilities for each sample. + as_train_test : bool, default: True + If True, the returned data is split into train and test exactly like the toy model + was trained. The first return value is the train data and the second is the test data. + Otherwise, returns a single object. Returns ------- @@ -163,18 +174,41 @@ def load_precalculated_predictions(pred_format: str = 'predictions') -> np.array The prediction of the data elements in the dataset. """ + all_preds = _read_and_save('tweet_emotion_probabilities.csv', _PREDICTIONS_URL) + if pred_format == 'predictions': + all_preds = np.array([_LABEL_MAP[x] for x in np.argmax(all_preds, axis=1)]) + elif pred_format != 'probabilities': + raise ValueError('pred_format must be either "predictions" or "probabilities"') + + if as_train_test: + train_indexes, test_indexes = _get_train_test_indexes() + return all_preds[train_indexes], all_preds[test_indexes] + else: + return all_preds + + +def _read_and_save(file_name, url_to_file, to_numpy=True): + """Read a file from a url and save it to the assets directory.""" os.makedirs(ASSETS_DIR, exist_ok=True) - if (ASSETS_DIR / 'tweet_emotion_probabilities.csv').exists(): - preds = pd.read_csv(ASSETS_DIR / 'tweet_emotion_probabilities.csv', index_col=0) + if (ASSETS_DIR / file_name).exists(): + data = pd.read_csv(ASSETS_DIR / file_name, index_col=0) else: - preds = pd.read_csv(_PREDICTIONS_URL, index_col=0) - preds.to_csv(ASSETS_DIR / 'tweet_emotion_probabilities.csv') + data = pd.read_csv(url_to_file, index_col=0) + data.to_csv(ASSETS_DIR / file_name) - preds = preds.to_numpy() + if to_numpy: + data = data.to_numpy() + return data - if pred_format == 'predictions': - return np.array([_LABEL_MAP[x] for x in np.argmax(preds, axis=1)]) - elif pred_format == 'probabilities': - return preds + +def _get_train_test_indexes() -> t.Tuple[np.array, np.array]: + """Get the indexes of the train and test sets.""" + if (ASSETS_DIR / 'tweet_emotion_data.csv').exists(): + dataset = pd.read_csv(ASSETS_DIR / 'tweet_emotion_data.csv', index_col=0, + usecols=['Unnamed: 0', 'train_test_split']) else: - raise ValueError('pred_format must be either "predictions" or "probabilities"') + dataset = pd.read_csv(_FULL_DATA_URL, index_col=0, usecols=['Unnamed: 0', 'train_test_split']) + + train_indexes = dataset[dataset['train_test_split'] == 'Train'].index + test_indexes = dataset[dataset['train_test_split'] == 'Test'].index + return train_indexes, test_indexes diff --git a/deepchecks/nlp/input_validations.py b/deepchecks/nlp/input_validations.py new file mode 100644 index 0000000000..3e3f9f0e45 --- /dev/null +++ b/deepchecks/nlp/input_validations.py @@ -0,0 
+1,110 @@ +# ---------------------------------------------------------------------------- +# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com) +# +# This file is part of Deepchecks. +# Deepchecks is distributed under the terms of the GNU Affero General +# Public License (version 3 or later). +# You should have received a copy of the GNU Affero General Public License +# along with Deepchecks. If not, see . +# ---------------------------------------------------------------------------- +# +"""Module containing input validation functions.""" +from typing import Dict, Optional, Sequence + +import numpy as np +import pandas as pd + +from deepchecks.core.errors import DeepchecksValueError +from deepchecks.nlp.task_type import TaskType, TTextLabel +from deepchecks.utils.logger import get_logger +from deepchecks.utils.type_inference import infer_categorical_features +from deepchecks.utils.validation import is_sequence_not_str + + +def validate_tokenized_text(tokenized_text: Optional[Sequence[Sequence[str]]]): + """Validate tokenized text format.""" + error_string = 'tokenized_text must be a Sequence of Sequences of strings' + if not is_sequence_not_str(tokenized_text): + raise DeepchecksValueError(error_string) + if not all(is_sequence_not_str(x) for x in tokenized_text): + raise DeepchecksValueError(error_string) + if not all(isinstance(x, str) for tokens in tokenized_text for x in tokens): + raise DeepchecksValueError(error_string) + + +def validate_raw_text(raw_text: Optional[Sequence[str]]): + """Validate text format.""" + error_string = 'raw_text must be a Sequence of strings' + if not is_sequence_not_str(raw_text): + raise DeepchecksValueError(error_string) + if not all(isinstance(x, str) for x in raw_text): + raise DeepchecksValueError(error_string) + + +def validate_modify_label(labels: Optional[TTextLabel], task_type: TaskType, expected_size: int, + tokenized_text: Optional[Sequence[Sequence[str]]]) -> Optional[TTextLabel]: + """Validate and process label to accepted formats.""" + if labels is None or is_sequence_not_str(labels) and all(x is None for x in labels): + return None + + if not is_sequence_not_str(labels): + raise DeepchecksValueError('label must be a Sequence') + if not len(labels) == expected_size: + raise DeepchecksValueError(f'Label length ({len(labels)}) does not match expected length ({expected_size})') + + if task_type == TaskType.TEXT_CLASSIFICATION: + if all(is_sequence_not_str(x) for x in labels): # Multilabel + multilabel_error = 'multilabel was identified. It must be a Sequence of Sequences of 0 or 1.' + if not all(all(y in (0, 1) for y in x) for x in labels): + raise DeepchecksValueError(multilabel_error) + if any(len(labels[0]) != len(labels[i]) for i in range(len(labels))): + raise DeepchecksValueError('All multilabel entries must be of the same length, which is the number' + ' of possible classes.') + labels = [[int(x) for x in label_per_sample] for label_per_sample in labels] + elif not all(isinstance(x, (str, int)) for x in labels): # Classic classification + raise DeepchecksValueError('label must be a Sequence of strings or ints (multiclass classification) ' + 'or a Sequence of Sequences of strings or ints (multilabel classification)') + else: + labels = [str(x) for x in labels] + elif task_type == TaskType.TOKEN_CLASSIFICATION: + token_class_error = 'label must be a Sequence of Sequences of either strings or integers.' 
+ if not is_sequence_not_str(labels): + raise DeepchecksValueError(token_class_error) + + result = [] + for idx, (tokens, label) in enumerate(zip(tokenized_text, labels)): # TODO: Runs on all labels, very costly + if not is_sequence_not_str(label): + raise DeepchecksValueError(token_class_error + f' label at {idx} was of type {type(label)}') + if not len(tokens) == len(label): + raise DeepchecksValueError(f'label must be the same length as tokenized_text. ' + f'However, for sample index {idx} received token list of length ' + f'{len(tokens)} and label list of length {len(label)}') + result.append([str(x) for x in label]) + labels = np.asarray(result, dtype=object) + + return np.asarray(labels) + + +def validate_length_and_calculate_column_types(data_table: pd.DataFrame, data_table_name: str, expected_size: int, + column_types: Optional[Dict[str, str]] = None) -> \ + Optional[Dict[str, str]]: + """Validate length of data table and calculate column types.""" + if data_table is None: + return None + + if not isinstance(data_table, pd.DataFrame): + raise DeepchecksValueError(f'{data_table_name} type {type(data_table)} is not supported, must be a' + f' pandas DataFrame') + if len(data_table) != expected_size: + raise DeepchecksValueError(f'received metadata with {len(data_table)} rows, expected {expected_size}') + + if column_types is None: # TODO: Add tests + cat_features = infer_categorical_features(data_table) + column_types = {data_table.columns[i]: 'categorical' if data_table.columns[i] in cat_features else 'numeric' + for i in range(len(data_table.columns))} + get_logger().info('%s types were not provided, auto inferred types are: ', data_table_name) + get_logger().info(column_types) + elif sorted(list(column_types.keys())) != sorted(list(data_table.columns)): + raise DeepchecksValueError(f'{data_table_name} types keys must identical to {data_table_name} table columns') + + return column_types diff --git a/deepchecks/nlp/task_type.py b/deepchecks/nlp/task_type.py index d8ff426bd5..a85fc3c96b 100644 --- a/deepchecks/nlp/task_type.py +++ b/deepchecks/nlp/task_type.py @@ -9,9 +9,15 @@ # ---------------------------------------------------------------------------- # """The task type module containing the TaskType enum.""" +import typing as t from enum import Enum -__all__ = ['TaskType'] +__all__ = ['TaskType', 'TTokenLabel', 'TClassLabel', 'TTextLabel'] + +TSingleLabel = t.Union[int, str] +TClassLabel = t.Sequence[t.Union[TSingleLabel, t.Tuple[TSingleLabel]]] +TTokenLabel = t.Sequence[t.Sequence[t.Union[str, int]]] +TTextLabel = t.Union[TClassLabel, TTokenLabel] class TaskType(Enum): diff --git a/deepchecks/nlp/text_data.py b/deepchecks/nlp/text_data.py index f830817352..c6d4a28b2f 100644 --- a/deepchecks/nlp/text_data.py +++ b/deepchecks/nlp/text_data.py @@ -11,25 +11,22 @@ """The dataset module containing the tabular Dataset class and its functions.""" import typing as t import warnings -from operator import itemgetter +from numbers import Number import numpy as np import pandas as pd from deepchecks.core.errors import DeepchecksNotSupportedError, DeepchecksValueError -from deepchecks.nlp.task_type import TaskType +from deepchecks.nlp.input_validations import (validate_length_and_calculate_column_types, validate_modify_label, + validate_raw_text, validate_tokenized_text) +from deepchecks.nlp.task_type import TaskType, TTextLabel from deepchecks.nlp.utils.text_properties import calculate_default_properties from deepchecks.utils.logger import get_logger -from deepchecks.utils.type_inference 
import infer_categorical_features from deepchecks.utils.validation import is_sequence_not_str -__all__ = ['TextData', 'TTokenLabel', 'TClassLabel', 'TTextLabel'] +__all__ = ['TextData'] TDataset = t.TypeVar('TDataset', bound='TextData') -TSingleLabel = t.Tuple[int, str] -TClassLabel = t.Sequence[t.Union[TSingleLabel, t.Tuple[TSingleLabel]]] -TTokenLabel = t.Sequence[t.Sequence[t.Union[str, int]]] -TTextLabel = t.Union[TClassLabel, TTokenLabel] class TextData: @@ -65,10 +62,8 @@ class TextData: task_type : str, default: None The task type for the text data. Can be either 'text_classification' or 'token_classification'. Must be set if label is provided. - dataset_name : t.Optional[str] , default: None + name : t.Optional[str] , default: None The name of the dataset. If None, the dataset name will be defined when running it within a check. - index : t.Optional[t.Sequence[int]] , default: None - The index of the samples. If None, the index is set to np.arange(len(raw_text)). metadata : t.Optional[pd.DataFrame] , default: None Metadata for the samples. If None, no metadata is set. If a DataFrame is given, it must contain the same number of samples as the raw_text and identical index. @@ -76,199 +71,107 @@ class TextData: The text properties for the samples. If None, no properties are set. If 'auto', the properties are calculated using the default properties. If a DataFrame is given, it must contain the properties for each sample as the raw text and identical index. - device : str, default: None - The device to use to calculate the text properties. """ - _text: t.Sequence[str] + _text: np.ndarray _label: TTextLabel - index: t.Sequence[t.Any] - _task_type: t.Optional[TaskType] - _has_label: bool - _is_multilabel: bool = False + task_type: t.Optional[TaskType] + _tokenized_text: t.Optional[t.Sequence[t.Sequence[str]]] = None # Outer sequence is np array name: t.Optional[str] = None _metadata: t.Optional[pd.DataFrame] = None + _metadata_types: t.Optional[t.Dict[str, str]] = None _properties: t.Optional[t.Union[pd.DataFrame, str]] = None + _properties_types: t.Optional[t.Dict[str, str]] = None + _original_text_index: t.Optional[t.Sequence[int]] = None # Sequence is np array def __init__( self, raw_text: t.Optional[t.Sequence[str]] = None, tokenized_text: t.Optional[t.Sequence[t.Sequence[str]]] = None, label: t.Optional[TTextLabel] = None, - task_type: t.Optional[str] = None, - dataset_name: t.Optional[str] = None, - index: t.Optional[t.Sequence[t.Any]] = None, + task_type: str = 'other', + name: t.Optional[str] = None, metadata: t.Optional[pd.DataFrame] = None, - properties: t.Optional[t.Union[pd.DataFrame, str]] = None, - device: t.Optional[str] = None + properties: t.Optional[t.Union[pd.DataFrame]] = None, ): # Require explicitly setting task type if label is provided if task_type in [None, 'other']: if label is not None: - if isinstance(label, t.Sequence): - if pd.notnull(label).any(): - raise DeepchecksValueError('task_type must be set when label is provided') - else: - raise DeepchecksValueError('task_type must be set when label is provided') - + raise DeepchecksValueError('task_type must be set when label is provided') self._task_type = TaskType.OTHER elif task_type == 'text_classification': self._task_type = TaskType.TEXT_CLASSIFICATION elif task_type == 'token_classification': + if tokenized_text is None: + raise DeepchecksValueError('tokenized_text must be provided for token_classification task type') + validate_tokenized_text(tokenized_text) + modified = [[str(token) for token in 
tokens_per_sample] for tokens_per_sample in tokenized_text] + self._tokenized_text = np.asarray(modified, dtype=object) self._task_type = TaskType.TOKEN_CLASSIFICATION else: raise DeepchecksNotSupportedError(f'task_type {task_type} is not supported, must be one of ' f'text_classification, token_classification, other') - if raw_text is None and tokenized_text is None: - raise DeepchecksValueError('raw_text and tokenized_text cannot both be None') - elif raw_text is None: - self._validate_tokenized_text(tokenized_text) - self._tokenized_text = list(tokenized_text) - self._text = [' '.join(tokens) for tokens in tokenized_text] - elif tokenized_text is None: - self._validate_text(raw_text) - self._text = list(raw_text) - if self._task_type == TaskType.TOKEN_CLASSIFICATION: - self._tokenized_text = [sample.split() for sample in self._text] - else: - self._tokenized_text = None - else: - self._validate_text(raw_text) - self._validate_tokenized_text(tokenized_text) - self._text, self._tokenized_text = list(raw_text), list(tokenized_text) - if len(raw_text) != len(tokenized_text): - raise DeepchecksValueError('raw_text and tokenized_text must have the same length') - - if index is None: - self.index = list(range(len(raw_text))) - elif len(index) != len(raw_text): - raise DeepchecksValueError('index must be the same length as raw_text') + if raw_text is None: + if tokenized_text is None: + raise DeepchecksValueError('Either raw_text or tokenized_text must be provided') + self._text = np.asarray([' '.join(tokens) for tokens in tokenized_text]) # Revisit this decision else: - self.index = list(index) + validate_raw_text(raw_text) + self._text = np.asarray([str(x) for x in raw_text]) + if tokenized_text is not None and len(raw_text) != len(tokenized_text): + raise DeepchecksValueError('raw_text and tokenized_text sequences must have the same length') - self._validate_and_set_label(label) + self._label = validate_modify_label(label, self._task_type, len(self), tokenized_text) - if dataset_name is not None: - if not isinstance(dataset_name, str): - raise DeepchecksNotSupportedError(f'dataset_name type {type(dataset_name)} is not supported, must be a' - f' str') - self.name = dataset_name + if name is not None and not isinstance(name, str): + raise DeepchecksNotSupportedError(f'name must be a string, got {type(name)}') + self.name = name - if metadata is not None: - self.set_metadata(metadata) - else: - self._metadata = None - self._metadata_types = None - - if properties is not None: - if isinstance(properties, str) and properties == 'auto': - self.calculate_default_properties(device=device) - else: - self.set_properties(properties) - else: - self._properties = None - self._properties_types = None - - @staticmethod - def _validate_text(raw_text: t.Sequence[str]): - """Validate text format.""" - error_string = 'raw_text must be a Sequence of strings' - if not is_sequence_not_str(raw_text): - raise DeepchecksValueError(error_string) - if not all(isinstance(x, str) for x in raw_text): - raise DeepchecksValueError(error_string) - - @staticmethod - def _validate_tokenized_text(tokenized_text: t.Sequence[t.Sequence[str]]): - """Validate tokenized text format.""" - error_string = 'tokenized_text must be a Sequence of sequences of strings' - if not is_sequence_not_str(tokenized_text): - raise DeepchecksValueError(error_string) - if not all(is_sequence_not_str(x) for x in tokenized_text): - raise DeepchecksValueError(error_string) - if not all(isinstance(x, str) for tokens in tokenized_text for x in tokens): - 
raise DeepchecksValueError(error_string) - - def _validate_and_set_label(self, label: t.Optional[TTextLabel]): - """Validate and process label to accepted formats.""" - # If label is not set, create an empty label of nulls - if label is None: - self._has_label, self._label = False, [None] * len(self._text) - return - - # Check if label is n array of None, if so return - if isinstance(label, t.Sequence) and all(x is None for x in label): - self._has_label, self._label = False, label - return - - self._has_label = True - if not is_sequence_not_str(label): - raise DeepchecksValueError('label must be a Sequence') - - if not len(label) == len(self._text): - raise DeepchecksValueError('label must be the same length as raw_text') - - if self.task_type == TaskType.TEXT_CLASSIFICATION: - if all((isinstance(x, t.Sequence) and not isinstance(x, str)) for x in label): - self._is_multilabel = True - multilabel_error = 'multilabel was identified. It must be a Sequence of Sequences of 0 or 1.' - if not all(all(y in (0, 1) for y in x) for x in label): - raise DeepchecksValueError(multilabel_error) - if any(len(label[0]) != len(label[i]) for i in range(len(label))): - raise DeepchecksValueError('All multilabel entries must be of the same length, which is the number' - ' of classes.') - elif not all(isinstance(x, (str, int)) for x in label): - raise DeepchecksValueError('label must be a Sequence of strings or ints or a Sequence of Sequences' - 'of strings or ints (for multilabel classification)') - - if self.task_type == TaskType.TOKEN_CLASSIFICATION: - token_class_error = 'label must be a Sequence of Sequences of either strings or integers' - if not all(isinstance(x, t.Sequence) for x in label): - raise DeepchecksValueError(token_class_error) - - for i in range(len(label)): # TODO: Runs on all labels, very costly - if not (all(isinstance(x, str) for x in label[i]) or all(isinstance(x, int) for x in label[i])): - raise DeepchecksValueError(token_class_error) - if not len(label[i]) == len(self._tokenized_text[i]): - raise DeepchecksValueError(f'label must be the same length as tokenized_text. ' - f'However, for sample index {self.index[i]} of length ' - f'{len(self._tokenized_text[i])} received label of ' - f'length {len(label[i])}') - self._label = list(label) - - def reindex(self, index: t.Sequence[t.Any]): - """Reindex the TextData with a new index.""" - if not is_sequence_not_str(index): - raise DeepchecksValueError('index must be a Sequence') - if not len(index) == len(self.index): - raise DeepchecksValueError('new index must be the same length as original index') - self.index = list(index) - if self._metadata is not None: - self._metadata = self._metadata.reindex(index) - if self._properties is not None: - self._properties = self._properties.reindex(index) + self.set_metadata(metadata) + self.set_properties(properties) - def copy(self: TDataset, rows_to_use: t.Optional[t.Sequence[t.Any]] = None) -> TDataset: - """Create a copy of this Dataset with new data.""" + # Used for display purposes + self._original_text_index = np.arange(len(self)) + + def is_multi_label_classification(self) -> bool: + """Check if the dataset is multi-label.""" + if self.task_type == TaskType.TEXT_CLASSIFICATION and self._label is not None: + return is_sequence_not_str(self._label[0]) + return False + + def copy(self: TDataset, rows_to_use: t.Optional[t.Sequence[int]] = None) -> TDataset: + """Create a copy of this Dataset with new data. 
+ + Parameters + ---------- + rows_to_use : t.Optional[t.List[int]] , default: None + The rows to use in the new copy. If None, the new copy will contain all the rows. + """ cls = type(self) logger_state = get_logger().disabled get_logger().disabled = True # Make sure we won't get the warning for setting class in the non multilabel case if rows_to_use is None: new_copy = cls(raw_text=self._text, tokenized_text=self._tokenized_text, label=self._label, - task_type=self._task_type.value, - dataset_name=self.name, index=self.index, metadata=self.metadata, - properties=self._properties) + task_type=self._task_type.value, name=self.name) + metadata, properties = self._metadata, self._properties + index_kept = self._original_text_index else: - new_copy = cls( - raw_text=list(itemgetter(*rows_to_use)(self._text)), - tokenized_text=list( - itemgetter(*rows_to_use)(self._tokenized_text)) if self._tokenized_text else None, - label=list(itemgetter(*rows_to_use)(self._label)) if self._label else None, - index=list(itemgetter(*rows_to_use)(self.index)), - metadata=self._metadata.iloc[rows_to_use, :] if self._metadata is not None else None, - properties=self._properties.iloc[rows_to_use, :] if self._properties is not None else None, - task_type=self._task_type.value, dataset_name=self.name) + if not isinstance(rows_to_use, t.Sequence) or any(not isinstance(x, Number) for x in rows_to_use): + raise DeepchecksValueError('rows_to_use must be a list of integers') + rows_to_use = sorted(rows_to_use) + new_copy = cls(raw_text=self._text[rows_to_use], + tokenized_text=self._tokenized_text[ + rows_to_use] if self._tokenized_text is not None else None, + label=self._label[rows_to_use] if self.has_label() else None, + task_type=self._task_type.value, name=self.name) + metadata = self._metadata.iloc[rows_to_use, :] if self._metadata is not None else None + properties = self._properties.iloc[rows_to_use, :] if self._properties is not None else None + index_kept = self._original_text_index[rows_to_use] + + new_copy.set_metadata(metadata, self._metadata_types) + new_copy.set_properties(properties, self._properties_types) + new_copy._original_text_index = index_kept # pylint: disable=protected-access get_logger().disabled = logger_state return new_copy @@ -292,79 +195,78 @@ def sample(self: TDataset, n_samples: int, replace: bool = False, random_state: Dataset instance of the Dataset with sampled internal dataframe. """ - samples = self.index + samples = np.arange(len(self)) if drop_na_label and self.has_label(): samples = samples[pd.notnull(self._label)] n_samples = min(n_samples, len(samples)) np.random.seed(random_state) sample_idx = np.random.choice(range(len(samples)), n_samples, replace=replace) - return self.copy(rows_to_use=sample_idx) + return self.copy(rows_to_use=sorted(sample_idx)) - def get_raw_sample(self, index: t.Any) -> str: - """Get the raw text of a sample. - - Parameters - ---------- - index : int - Index of sample to get. - Returns - ------- - str - Raw text of sample. - """ - return self._text[self.index.index(index)] - - def get_tokenized_sample(self, index: t.Any) -> t.List[str]: - """Get the tokenized text of a sample. - - Parameters - ---------- - index : int - Index of sample to get. - Returns - ------- - List[str] - Tokenized text of sample. 
- """ - if self._tokenized_text is None: - raise DeepchecksValueError('TextData does not contain tokenized text') - return self._tokenized_text[self.index.index(index)] + def __len__(self) -> int: + """Return number of samples in the dataset.""" + return self.n_samples @property def n_samples(self) -> int: """Return number of samples in the dataset.""" - return len(self._text) + if self._text is not None: + return len(self._text) + elif self._label is not None: + return len(self._label) + else: + return 0 @property def metadata(self) -> pd.DataFrame: """Return the metadata of for the dataset.""" + if self._metadata is None: + raise DeepchecksValueError('Metadata does not exist, please set it first using the set_metadata method') return self._metadata @property def metadata_types(self) -> t.Dict[str, str]: """Return the metadata types of for the dataset.""" + if self._metadata_types is None: + raise DeepchecksValueError('Metadata does not exist, please set it first using the set_metadata method') return self._metadata_types def set_metadata(self, metadata: pd.DataFrame, metadata_types: t.Optional[t.Dict[str, str]] = None): - """Set the metadata of the dataset.""" + """Set metadata for the dataset. + + Parameters + ---------- + metadata : pd.DataFrame + Metadata of the provided textual samples. + metadata_types : t.Optional[t.Dict[str, str]] , default : None + The types of the metadata columns. Can be either 'numeric' or 'categorical'. + If not provided, will be inferred automatically. + """ if self._metadata is not None: warnings.warn('Metadata already exist, overwriting it', UserWarning) - if not isinstance(metadata, pd.DataFrame): - raise DeepchecksValueError(f'metadata type {type(metadata)} is not supported, must be a' - f' pandas DataFrame') - if self.index != list(metadata.index): - raise DeepchecksValueError('metadata index must be the same as the text data index') - self._metadata = metadata - - if metadata_types is None: # TODO: Add tests - cat_features = infer_categorical_features(metadata) - metadata_types = {metadata.columns[i]: 'categorical' if metadata.columns[i] in cat_features else 'numeric' - for i in range(len(metadata.columns))} - elif sorted(list(metadata_types.keys())) != sorted(list(metadata.columns)): - raise DeepchecksValueError('metadata_types keys must identical to metadata columns') - self._metadata_types = metadata_types + self._metadata_types = validate_length_and_calculate_column_types(metadata, 'Metadata', + len(self), metadata_types) + self._metadata = metadata.reset_index(drop=True) if isinstance(metadata, pd.DataFrame) else None + + def set_properties(self, properties: pd.DataFrame, properties_types: t.Optional[t.Dict[str, str]] = None): + """Set properties for the dataset. + + Parameters + ---------- + properties : pd.DataFrame + Properties of the provided textual samples. + properties_types : t.Optional[t.Dict[str, str]] , default : None + The types of the properties columns. Can be either 'numeric' or 'categorical'. + If not provided, will be inferred automatically. 
+ """ + if self._properties is not None: + warnings.warn('Properties already exist, overwriting it', UserWarning) + + self._properties_types = validate_length_and_calculate_column_types(properties, 'Properties', + len(self), properties_types) + self._properties = properties.reset_index(drop=True) if isinstance(properties, pd.DataFrame) else None def calculate_default_properties(self, include_properties: t.List[str] = None, ignore_properties: t.List[str] = None, @@ -392,50 +294,24 @@ def calculate_default_properties(self, include_properties: t.List[str] = None, properties, properties_types = calculate_default_properties( self.text, include_properties=include_properties, ignore_properties=ignore_properties, include_long_calculation_properties=include_long_calculation_properties, device=device) - self._properties = pd.DataFrame(properties, index=self.index) - self._properties_types = properties_types - - def set_properties(self, properties: pd.DataFrame, properties_types: t.Optional[t.Dict[str, str]] = None): - """Set the properties of the dataset.""" - if self._properties is not None: - warnings.warn('Properties already exist, overwriting them', UserWarning) - - if not isinstance(properties, pd.DataFrame): - raise DeepchecksValueError(f'properties type {type(properties)} is not supported, must be a' - f' pandas DataFrame') - if list(properties.index) != self.index: - raise DeepchecksValueError('properties index must be the same as the text data index') - self._properties = properties - - if properties_types is None: - # TODO: move infer_categorical_features to core - cat_features = infer_categorical_features(properties) - properties_types = { - properties.columns[i]: 'categorical' if properties.columns[i] in cat_features else 'numeric' - for i in range(len(properties.columns))} - elif sorted(list(properties_types.keys())) != sorted(list(properties.columns)): - raise DeepchecksValueError('properties_types keys must identical to properties columns') - + self._properties = pd.DataFrame(properties, index=self.get_original_text_indexes()) self._properties_types = properties_types @property def properties(self) -> pd.DataFrame: """Return the properties of the dataset.""" if self._properties is None: - raise DeepchecksNotSupportedError( - 'TextData does not contain properties, add them by using calculate_default_properties or ' - 'set_properties functions') + raise DeepchecksValueError('TextData does not contain properties, add them by using ' + 'calculate_default_properties or set_properties functions') return self._properties @property def properties_types(self) -> t.Dict[str, str]: """Return the property types of the dataset.""" + if self._properties is None: + raise DeepchecksValueError('Properties does not exist, please set it first using the set_properties method') return self._properties_types - def __len__(self): - """Return number of samples in the dataset.""" - return self.n_samples - @property def task_type(self) -> t.Optional[TaskType]: """Return the task type. @@ -467,6 +343,9 @@ def tokenized_text(self) -> t.Sequence[t.Sequence[str]]: t.Sequence[t.Sequence[str]] Sequence of tokenized text samples. 
""" + if self._tokenized_text is None: + raise DeepchecksValueError('Tokenized text is not set, provide it when initializing the TextData object ' + 'to run the requested functionalities') return self._tokenized_text @property @@ -477,28 +356,30 @@ def label(self) -> TTextLabel: ------- TTextLabel """ + if not self.has_label(): + raise DeepchecksValueError('Label is not set, provide it when initializing the TextData object ' + 'to run the requested functionalities') return self._label - @property - def is_multilabel(self) -> bool: - """Return True if label is multilabel. + def has_label(self) -> bool: + """Return True if label was set. Returns ------- bool - True if label is multilabel. + True if label was set. """ - return self._is_multilabel + return self._label is not None - def has_label(self) -> bool: - """Return True if label was set. + def get_original_text_indexes(self) -> t.Sequence[int]: + """Return the original indexes of the text samples. Returns ------- - bool - True if label was set. + t.Sequence[int] + Original indexes of the text samples. """ - return self._has_label + return self._original_text_index @classmethod def cast_to_dataset(cls, obj: t.Any) -> 'TextData': @@ -523,53 +404,30 @@ def cast_to_dataset(cls, obj: t.Any) -> 'TextData': raise DeepchecksValueError(f'{obj} is not a {cls.__name__} instance') return obj.copy() - @classmethod - def datasets_share_task_type(cls, *datasets: 'TextData') -> bool: + def validate_textdata_compatibility(self, other_text_data: 'TextData') -> bool: """Verify that all provided datasets share same label name and task types. Parameters ---------- - datasets : List[TextData] - list of TextData to validate + other_text_data : TextData + The other dataset TextData object to compare with. Returns ------- bool - True if all TextData share same label names and task types, otherwise False - - Raises - ------ - AssertionError - 'datasets' parameter is not a list; - 'datasets' contains less than one dataset; + True if provided dataset share same label name and task types. 
""" - assert len(datasets) > 1, "'datasets' must contains at least two items" - - task_type = datasets[0].task_type - - for ds in datasets[1:]: - if ds.task_type != task_type: - return False + assert other_text_data is not None + if self.task_type != other_text_data.task_type: + return False return True - def len_when_sampled(self, n_samples: t.Optional[int]): - """Return number of samples in the sampled dataframe this dataset is sampled with n_samples samples.""" - if n_samples is None: - return self.n_samples - return min(self.n_samples, n_samples) - - def is_sampled(self, n_samples: t.Optional[int]): - """Return True if the dataset number of samples will decrease when sampled with n_samples samples.""" - if n_samples is None: - return False - return self.n_samples > n_samples - def head(self, n_samples: int = 5) -> pd.DataFrame: """Return a copy of the dataset as a pandas Dataframe with the first n_samples samples.""" if n_samples > len(self): n_samples = len(self) - 1 - result = pd.DataFrame({'text': self.text[:n_samples]}, index=self.index[:n_samples]) + result = pd.DataFrame({'text': self.text[:n_samples]}, index=self.get_original_text_indexes()[:n_samples]) if self.has_label(): result['label'] = self.label[:n_samples] if self._tokenized_text is not None: @@ -577,3 +435,15 @@ def head(self, n_samples: int = 5) -> pd.DataFrame: if self._metadata is not None: result = result.join(self.metadata.loc[result.index]) return result + + def len_when_sampled(self, n_samples: t.Optional[int]): + """Return number of samples in the sampled dataframe this dataset is sampled with n_samples samples.""" + if n_samples is None: + return self.n_samples + return min(self.n_samples, n_samples) + + def is_sampled(self, n_samples: t.Optional[int]): + """Return True if the dataset number of samples will decrease when sampled with n_samples samples.""" + if n_samples is None: + return False + return self.n_samples > n_samples diff --git a/deepchecks/nlp/utils/text_properties.py b/deepchecks/nlp/utils/text_properties.py index e9088b26d7..ad6adb9b3f 100644 --- a/deepchecks/nlp/utils/text_properties.py +++ b/deepchecks/nlp/utils/text_properties.py @@ -154,7 +154,6 @@ def formality(raw_text: Sequence[str], device: Optional[int] = None) -> List[flo {'name': 'Formality', 'method': formality, 'output_type': 'numeric'} ] - LONG_RUN_PROPERTIES = ['Toxicity', 'Fluency', 'Formality'] ENGLISH_ONLY_PROPERTIES = ['Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality'] LARGE_SAMPLE_SIZE = 10_000 @@ -222,8 +221,8 @@ def calculate_default_properties(raw_text: Sequence[str], include_properties: Op else: # Check if the run may take a long time and warn heavy_properties = [prop for prop in default_text_properties if prop['name'] in LONG_RUN_PROPERTIES] if heavy_properties and len(raw_text) > LARGE_SAMPLE_SIZE: - h_property_names = [prop['name'] for prop in heavy_properties] - warning_message = f'Calculating the properties {h_property_names} on a large dataset may take a long time.'\ + h_prop_names = [prop['name'] for prop in heavy_properties] + warning_message = f'Calculating the properties {h_prop_names} on a large dataset may take a long time.' \ f' Consider using a smaller sample size or running this code on better hardware.' if device is None or device == 'cpu': warning_message += ' Consider using a GPU or a similar device to run these properties.' 
@@ -240,6 +239,7 @@ def calculate_default_properties(raw_text: Sequence[str], include_properties: Op if not calculated_properties: raise RuntimeError('Failed to calculate any of the properties.') - properties_types = {prop['name']: prop['output_type'] for prop in default_text_properties} # TODO: Add tests + properties_types = {prop['name']: prop['output_type'] for prop in default_text_properties + if prop['name'] in calculated_properties} # TODO: Add tests return calculated_properties, properties_types diff --git a/deepchecks/utils/validation.py b/deepchecks/utils/validation.py index 7d267c32d7..83fbab807f 100644 --- a/deepchecks/utils/validation.py +++ b/deepchecks/utils/validation.py @@ -10,8 +10,6 @@ # """Objects validation utilities.""" -# TODO: move tabular functionality to the tabular sub-package - import typing as t import numpy as np @@ -22,18 +20,17 @@ __all__ = [ 'ensure_hashable_or_mutable_sequence', - 'is_sequence_not_str' + 'is_sequence_not_str', ] - T = t.TypeVar('T', bound=Hashable) def ensure_hashable_or_mutable_sequence( - value: t.Union[T, t.MutableSequence[T]], - message: str = ( - 'Provided value is neither hashable nor mutable ' - 'sequence of hashable items. Got {type}') + value: t.Union[T, t.MutableSequence[T]], + message: str = ( + 'Provided value is neither hashable nor mutable ' + 'sequence of hashable items. Got {type}') ) -> t.List[T]: """Validate that provided value is either hashable or mutable sequence of hashable values.""" if isinstance(value, Hashable): diff --git a/docs/source/user-guide/nlp/quickstarts/plot_text_classification.py b/docs/source/user-guide/nlp/quickstarts/plot_text_classification.py index ccbac8259d..cd9b4a2660 100644 --- a/docs/source/user-guide/nlp/quickstarts/plot_text_classification.py +++ b/docs/source/user-guide/nlp/quickstarts/plot_text_classification.py @@ -67,9 +67,9 @@ # train = TextData(train.text, label=train['label'], task_type='text_classification', - index=train.index, metadata=train.drop(columns=['label', 'text'])) + metadata=train.drop(columns=['label', 'text'])) test = TextData(test.text, label=test['label'], task_type='text_classification', - index=test.index, metadata=test.drop(columns=['label', 'text'])) + metadata=test.drop(columns=['label', 'text'])) #%% # Building a Model @@ -86,13 +86,11 @@ from catboost import CatBoostClassifier # Load Embeddings and Split to Train and Test -embeddings = tweet_emotion.load_embeddings() -train_embeddings, test_embeddings = embeddings[train.index, :], embeddings[test.index, :] +train_embeddings, test_embeddings = tweet_emotion.load_embeddings(as_train_test=True) model = CatBoostClassifier(max_depth=2, n_estimators=50, random_state=42) -model.fit(embeddings[train.index, :], train.label, verbose=0) -print(roc_auc_score(test.label, - model.predict_proba(embeddings[test.index, :]), +model.fit(train_embeddings, train.label, verbose=0) +print(roc_auc_score(test.label, model.predict_proba(test_embeddings), multi_class="ovr", average="macro")) #%% @@ -120,8 +118,8 @@ # We'll also add a condition to the check, which will make it fail if the drift score is higher than 0.1. 
# Start by computing the predictions for the train and test data: -train_preds, train_probas = model.predict(embeddings[train.index, :]), model.predict_proba(embeddings[train.index, :]) -test_preds, test_probas = model.predict(embeddings[test.index, :]), model.predict_proba(embeddings[test.index, :]) +train_preds, train_probas = model.predict(train_embeddings), model.predict_proba(train_embeddings) +test_preds, test_probas = model.predict(test_embeddings), model.predict_proba(test_embeddings) # Run the check from deepchecks.nlp.checks import PredictionDrift diff --git a/tests/nlp/checks/data_integrity/property_label_correlation_test.py b/tests/nlp/checks/data_integrity/property_label_correlation_test.py index 52d4b5d198..13143d230e 100644 --- a/tests/nlp/checks/data_integrity/property_label_correlation_test.py +++ b/tests/nlp/checks/data_integrity/property_label_correlation_test.py @@ -17,13 +17,12 @@ from tests.base.utils import equal_condition_result -def test_tweet_emotion_properties(tweet_emotion_train_test_textdata): +def test_tweet_emotion_properties(tweet_emotion_train_test_textdata, tweet_emotion_train_test_probabilities): # Arrange _, test = tweet_emotion_train_test_textdata - test_probas = tweet_emotion.load_precalculated_predictions(pred_format='probabilities')[test.index] check = PropertyLabelCorrelation().add_condition_property_pps_less_than(0.1) # Act - result = check.run(test, probabilities=test_probas) + result = check.run(test, probabilities=tweet_emotion_train_test_probabilities[1]) condition_result = check.conditions_decision(result) # Assert diff --git a/tests/nlp/checks/model_evaluation/confusion_matrix_test.py b/tests/nlp/checks/model_evaluation/confusion_matrix_test.py index dcd876ad70..2137c29360 100644 --- a/tests/nlp/checks/model_evaluation/confusion_matrix_test.py +++ b/tests/nlp/checks/model_evaluation/confusion_matrix_test.py @@ -20,10 +20,10 @@ def test_defaults(text_classification_dataset_mock): # Act result = check.run(text_classification_dataset_mock, - predictions=[0, 1, 1]) + predictions=['0', '1', '1']) # Assert - assert_that(text_classification_dataset_mock.label, equal_to([0, 0, 1])) + assert_that(list(text_classification_dataset_mock.label), equal_to(['0', '0', '1'])) assert_that(result.value[0][0], close_to(1, 0.001)) assert_that(result.value.shape[0], close_to(2, 0.001)) @@ -37,7 +37,7 @@ def test_run_default_scorer_string_class(text_classification_string_class_datase predictions=['wise', 'wise', 'meh']) # Assert - assert_that(text_classification_string_class_dataset_mock.label, equal_to(['wise', 'meh', 'meh'])) + assert_that(list(text_classification_string_class_dataset_mock.label), equal_to(['wise', 'meh', 'meh'])) assert_that(result.value[0][0], close_to(1, 0.001)) assert_that(result.value.shape[0], close_to(2, 0.001)) @@ -51,7 +51,7 @@ def test_run_default_scorer_string_class_new_cats_in_model_classes(text_classifi predictions=['wise', 'new', 'meh']) # Assert - assert_that(text_classification_string_class_dataset_mock.label, equal_to(['wise', 'meh', 'meh'])) + assert_that(list(text_classification_string_class_dataset_mock.label), equal_to(['wise', 'meh', 'meh'])) assert_that(result.value[0][0], close_to(1, 0.001)) assert_that(result.value.shape[0], close_to(3, 0.001)) diff --git a/tests/nlp/checks/model_evaluation/prediction_drift_test.py b/tests/nlp/checks/model_evaluation/prediction_drift_test.py index c65a272177..4b9a19b614 100644 --- a/tests/nlp/checks/model_evaluation/prediction_drift_test.py +++ 
b/tests/nlp/checks/model_evaluation/prediction_drift_test.py @@ -60,7 +60,7 @@ def test_tweet_emotion_no_drift_no_label(tweet_emotion_train_test_textdata, twee # Arrange train, _ = tweet_emotion_train_test_textdata train = TextData(train.text, task_type='text_classification', metadata=train.metadata, - properties=train.properties, index=train.index) + properties=train.properties) train_preds, _ = tweet_emotion_train_test_predictions check = PredictionDrift().add_condition_drift_score_less_than() # Act diff --git a/tests/nlp/checks/model_evaluation/single_dataset_performance_test.py b/tests/nlp/checks/model_evaluation/single_dataset_performance_test.py index c138f62b33..b209a95b30 100644 --- a/tests/nlp/checks/model_evaluation/single_dataset_performance_test.py +++ b/tests/nlp/checks/model_evaluation/single_dataset_performance_test.py @@ -10,10 +10,9 @@ # """Test for the nlp SingleDatasetPerformance check""" -from hamcrest import assert_that, close_to, calling, raises, equal_to, has_items - from deepchecks.core.errors import DeepchecksValueError, ValidationError from deepchecks.nlp.checks.model_evaluation.single_dataset_performance import SingleDatasetPerformance +from hamcrest import assert_that, close_to, calling, raises, equal_to, has_items from tests.base.utils import equal_condition_result @@ -133,6 +132,7 @@ def test_run_with_scorer_multilabel_class_names(text_multilabel_classification_d assert_that(result.value.values[0][-1], close_to(1.0, 0.001)) assert_that(result.value.values[0][0], equal_to('a')) + def test_wikiann_data(wikiann): """Temp to test wikiann dataset loads correctly""" dataset = wikiann @@ -141,14 +141,15 @@ def test_wikiann_data(wikiann): assert_that(result.value.values[0][-1], equal_to(1)) + def test_run_with_scorer_token(text_token_classification_dataset_mock): # Arrange check = SingleDatasetPerformance(scorers=['token_f1_macro']) correct_predictions = [['B-PER', 'O', 'O', 'O', 'O'], ['B-PER', 'O', 'O', 'B-GEO', 'O', 'B-GEO'], - ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']] - almost_correct_predictions = [['B-PER', 'O', 'O', 'O', 'O'], ['B-PER', 'O', 'O', 'O', 'O', 'B-GEO'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']] + almost_correct_predictions = [['B-PER', 'O', 'O', 'O', 'O'], ['B-PER', 'O', 'O', 'O', 'O', 'B-GEO'], + ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']] # Act result = check.run(text_token_classification_dataset_mock, @@ -168,7 +169,6 @@ def test_run_with_scorer_token(text_token_classification_dataset_mock): def test_run_with_scorer_token_per_class(text_token_classification_dataset_mock): - # Arrange check = SingleDatasetPerformance(scorers=['token_recall_per_class']) @@ -188,6 +188,7 @@ def test_run_with_scorer_token_per_class(text_token_classification_dataset_mock) assert_that(result.value.values[2][-1], close_to(1., 0.001)) assert_that(result.value.values[2][0], equal_to('B-PER')) + def test_ignore_O_label_in_model_classes(text_token_classification_dataset_mock): # Arrange check = SingleDatasetPerformance(scorers=['token_recall_per_class']) diff --git a/tests/nlp/conftest.py b/tests/nlp/conftest.py index f8c4aed2a0..afd8fc69dc 100644 --- a/tests/nlp/conftest.py +++ b/tests/nlp/conftest.py @@ -37,22 +37,16 @@ def tweet_emotion_train_test_textdata(): @pytest.fixture(scope='session') -def tweet_emotion_train_test_predictions(tweet_emotion_train_test_textdata): +def tweet_emotion_train_test_predictions(): """Tweet emotion text classification dataset predictions""" - train_data, test_data = tweet_emotion_train_test_textdata - train_preds = 
tweet_emotion.load_precalculated_predictions(pred_format='predictions')[train_data.index] - test_preds = tweet_emotion.load_precalculated_predictions(pred_format='predictions')[test_data.index] - - return train_preds, test_preds + return tweet_emotion.load_precalculated_predictions(pred_format='predictions', as_train_test=True) @pytest.fixture(scope='session') -def tweet_emotion_train_test_probabilities(tweet_emotion_train_test_textdata): +def tweet_emotion_train_test_probabilities(): """Tweet emotion text classification dataset probabilities""" - train_data, test_data = tweet_emotion_train_test_textdata - train_probas = tweet_emotion.load_precalculated_predictions(pred_format='probabilities')[train_data.index] - test_probas = tweet_emotion.load_precalculated_predictions(pred_format='probabilities')[test_data.index] - return train_probas, test_probas + return tweet_emotion.load_precalculated_predictions(pred_format='probabilities', as_train_test=True) + @pytest.fixture(scope='session') @@ -95,7 +89,7 @@ def movie_reviews_data_positive(): download_nltk_resources() random.seed(42) pos_sentences = [' '.join(x) for x in movie_reviews.sents(categories='pos')] - pos_data = TextData(random.choices(pos_sentences, k=1000), dataset_name='Positive') + pos_data = TextData(random.choices(pos_sentences, k=1000), name='Positive') return pos_data @@ -105,15 +99,18 @@ def movie_reviews_data_negative(): download_nltk_resources() random.seed(42) neg_sentences = [' '.join(x) for x in movie_reviews.sents(categories='neg')] - neg_data = TextData(random.choices(neg_sentences, k=1000), dataset_name='Negative') + neg_data = TextData(random.choices(neg_sentences, k=1000), name='Negative') return neg_data +def _tokenize_raw_text(raw_text): + """Tokenize raw text""" + return [x.split() for x in raw_text] @pytest.fixture(scope='session') def text_token_classification_dataset_mock(): """Mock for a token classification dataset""" - return TextData(raw_text=['Mary had a little lamb', 'Mary lives in London and Paris', - 'How much wood can a wood chuck chuck?'], + return TextData(tokenized_text=_tokenize_raw_text(['Mary had a little lamb', 'Mary lives in London and Paris', + 'How much wood can a wood chuck chuck?']), label=[['B-PER', 'O', 'O', 'O', 'O'], ['B-PER', 'O', 'O', 'B-GEO', 'O', 'B-GEO'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']], task_type='token_classification') @@ -129,4 +126,4 @@ def wikiann(): ner_to_iob_dict = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'} ner_tags_translated = [[ner_to_iob_dict[ner_tag] for ner_tag in ner_tag_list.as_py()] for ner_tag_list in ner_tags] - return TextData(raw_text=data, label=ner_tags_translated, task_type='token_classification') + return TextData(tokenized_text=_tokenize_raw_text(data), label=ner_tags_translated, task_type='token_classification') diff --git a/tests/nlp/test_text_data.py b/tests/nlp/test_text_data.py index 726258522d..fe2cd1c4d3 100644 --- a/tests/nlp/test_text_data.py +++ b/tests/nlp/test_text_data.py @@ -43,7 +43,7 @@ def test_init_mismatched_task_type(): # Act & Assert assert_that( calling(TextData).with_args(raw_text=text, label=label, task_type='token_classification'), - raises(DeepchecksValueError, r'label must be a Sequence of Sequences of either strings or integers') + raises(DeepchecksValueError, r'tokenized_text must be provided for token_classification task type') ) # Arrange @@ -59,7 +59,7 @@ def test_init_mismatched_task_type(): def test_wrong_token_label_format(): # Arrange - text = ['a', 'b b b', 'c c c c'] + 
tokenized_text = [['a'] ,['b', 'b' ,'b'], ['c', 'c', 'c', 'c']] label_structure_error = r'label must be a Sequence of Sequences of either strings or integers' @@ -68,28 +68,28 @@ def test_wrong_token_label_format(): label = [['B-PER'], ['B-PER', 'B-GEO', 'B-GEO'], ['B-PER', 'B-GEO', 'B-GEO', 'B-GEO']] - _ = TextData(raw_text=text, label=label, task_type='token_classification') # Should pass + _ = TextData(tokenized_text=tokenized_text, label=label, task_type='token_classification') # Should pass # Not a list: label = 'PER' assert_that( - calling(TextData).with_args(raw_text=text, label=label, task_type='token_classification'), + calling(TextData).with_args(tokenized_text=tokenized_text, label=label, task_type='token_classification'), raises(DeepchecksValueError, 'label must be a Sequence') ) # Not a list of lists: label = [3, 3, 3] assert_that( - calling(TextData).with_args(raw_text=text, label=label, task_type='token_classification'), + calling(TextData).with_args(tokenized_text=tokenized_text, label=label, task_type='token_classification'), raises(DeepchecksValueError, label_structure_error) ) # Mixed strings and integers: label = [['B-PER'], - ['B-PER', 1, 'B-GEO'], + 1, ['B-PER', 'B-GEO', 'B-GEO', 'B-GEO']] assert_that( - calling(TextData).with_args(raw_text=text, label=label, task_type='token_classification'), + calling(TextData).with_args(tokenized_text=tokenized_text, label=label, task_type='token_classification'), raises(DeepchecksValueError, label_structure_error) ) @@ -98,9 +98,9 @@ def test_wrong_token_label_format(): ['B-PER', 'B-GEO', 'B-GEO'], ['B-PER', 'B-GEO', 'B-GEO']] assert_that( - calling(TextData).with_args(raw_text=text, label=label, task_type='token_classification'), - raises(DeepchecksValueError, r'label must be the same length as tokenized_text. ' - r'However, for sample index 2 of length 4 received label of length 3') + calling(TextData).with_args(tokenized_text=tokenized_text, label=label, task_type='token_classification'), + raises(DeepchecksValueError, r'label must be the same length as tokenized_text. However, for sample ' + r'index 2 received token list of length 4 and label list of length 3') ) @@ -115,7 +115,7 @@ def test_metadata_format(): assert_that( calling(TextData).with_args(raw_text=text, metadata=metadata, task_type='text_classification'), raises(DeepchecksValueError, - r"metadata type is not supported, must be a pandas DataFrame") + r"Metadata type is not supported, must be a pandas DataFrame") ) @@ -178,7 +178,7 @@ def test_set_metadata(text_classification_dataset_mock): dataset._metadata_types = None # pylint: disable=protected-access assert_that(calling(dataset.set_metadata).with_args(metadata, metadata_types={'first': 'numeric'}), - raises(DeepchecksValueError, 'metadata_types keys must identical to metadata columns')) + raises(DeepchecksValueError, 'Metadata types keys must identical to Metadata table columns')) def test_set_properties(text_classification_dataset_mock): @@ -204,4 +204,4 @@ def test_set_properties(text_classification_dataset_mock): dataset._properties_types = None # pylint: disable=protected-access assert_that(calling(dataset.set_properties).with_args(properties, properties_types={'text_length': 'numeric'}), - raises(DeepchecksValueError, 'properties_types keys must identical to properties columns')) + raises(DeepchecksValueError, 'Properties types keys must identical to Properties table columns'))
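Below is a minimal usage sketch (not part of the patch) illustrating the refactored TextData API that the changes above converge on: token-classification datasets are built from tokenized_text rather than raw_text, the dataset name is passed via name instead of dataset_name, original sample positions are read through get_original_text_indexes(), and the tweet_emotion loader can return predictions already split into train/test via as_train_test=True. The toy texts and labels below are illustrative only; the class, method and parameter names are taken from the diffs above.

# Illustrative sketch only -- toy texts/labels are made up, API names come from this patch.
from deepchecks.nlp import TextData
from deepchecks.nlp.datasets.classification import tweet_emotion

# Text classification: raw strings plus string labels; `name` replaces the
# old `dataset_name` argument.
clf_data = TextData(
    raw_text=['I love this movie', 'Boring and far too long', 'Great soundtrack'],
    label=['pos', 'neg', 'pos'],
    task_type='text_classification',
    name='toy_reviews',
)

# Token classification now requires pre-tokenized text; each label list must
# be the same length as the corresponding token list.
tok_data = TextData(
    tokenized_text=[['Mary', 'had', 'a', 'lamb'],
                    ['Mary', 'lives', 'in', 'London']],
    label=[['B-PER', 'O', 'O', 'O'],
           ['B-PER', 'O', 'O', 'B-GEO']],
    task_type='token_classification',
)

# Original sample positions are exposed through a getter instead of a
# mutable `index` attribute.
original_positions = clf_data.get_original_text_indexes()

# Pre-calculated tweet-emotion predictions can be loaded already split into
# train/test, removing the need to slice them by dataset.index.
train_preds, test_preds = tweet_emotion.load_precalculated_predictions(
    pred_format='predictions', as_train_test=True)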