
[DEE-510] nlp integrity checks #2471

Merged: 34 commits into main from yuriiromanyshyn/dee-510-implement-integrity-checks (merged Apr 27, 2023).

Changes from 31 commits:
- a899136 text duplicates (yromanyshyn, Apr 25, 2023)
- 2a9a897 added comments (yromanyshyn, Apr 25, 2023)
- 9bf9a11 train-test samples mix check (yromanyshyn, Apr 25, 2023)
- 984cda8 text duplicates fixes (yromanyshyn, Apr 25, 2023)
- a06a687 conflicting labels check (yromanyshyn, Apr 25, 2023)
- 05f795a special chars check; import order fix (yromanyshyn, Apr 26, 2023)
- 55ae6f6 added max_text_length_for_display parameter (yromanyshyn, Apr 26, 2023)
- e02a790 added docstring to the to_ordional_enumeration func (yromanyshyn, Apr 26, 2023)
- 98ef367 text duplicates tests (yromanyshyn, Apr 26, 2023)
- 88b289a conflicting labels tests; text duplicates tests refactoring; (yromanyshyn, Apr 26, 2023)
- 55dac2d tests for train-test samples mix check (yromanyshyn, Apr 26, 2023)
- fc40a8a fixed conflicting labels check; tests refactoring; special chars chec… (yromanyshyn, Apr 26, 2023)
- d49abb2 added/fixed documentation strings in checks modules (yromanyshyn, Apr 27, 2023)
- f721f08 code style fixes (yromanyshyn, Apr 27, 2023)
- 476ce2b code style fixes (yromanyshyn, Apr 27, 2023)
- 2638ec3 more tests for special chars checks (yromanyshyn, Apr 27, 2023)
- da37e5f code style fixes (yromanyshyn, Apr 27, 2023)
- 886c4ba typing fix (yromanyshyn, Apr 27, 2023)
- 379f4c5 Update deepchecks/nlp/checks/data_integrity/conflicting_labels.py (yromanyshyn, Apr 27, 2023)
- 3be5935 comments fixes (yromanyshyn, Apr 27, 2023)
- 42fe3e9 Merge branch 'yuriiromanyshyn/dee-510-implement-integrity-checks' of … (yromanyshyn, Apr 27, 2023)
- 09b0691 comments fixes (yromanyshyn, Apr 27, 2023)
- 59fad76 Update deepchecks/nlp/checks/data_integrity/special_characters.py (yromanyshyn, Apr 27, 2023)
- 98e1432 Update deepchecks/nlp/checks/train_test_validation/train_test_sample_… (yromanyshyn, Apr 27, 2023)
- b9481a9 Update deepchecks/nlp/checks/data_integrity/special_characters.py (yromanyshyn, Apr 27, 2023)
- 346f13a special chars check fix (yromanyshyn, Apr 27, 2023)
- 378d9b1 Merge branch 'yuriiromanyshyn/dee-510-implement-integrity-checks' of … (yromanyshyn, Apr 27, 2023)
- 6648edb added additional condition to the spec chars check (yromanyshyn, Apr 27, 2023)
- c06296c renamed "Instances" column to "Sample IDs"; code style fixes; (yromanyshyn, Apr 27, 2023)
- ed86595 conflicting labels check - small fix (yromanyshyn, Apr 27, 2023)
- 94b111f Merge branch 'main' into yuriiromanyshyn/dee-510-implement-integrity-… (yromanyshyn, Apr 27, 2023)
- 8f4aac1 Update deepchecks/nlp/checks/data_integrity/special_characters.py (yromanyshyn, Apr 27, 2023)
- a2a02f7 added integrity checks to default suites; (yromanyshyn, Apr 27, 2023)
- 1d7715c Merge branch 'yuriiromanyshyn/dee-510-implement-integrity-checks' of … (yromanyshyn, Apr 27, 2023)
95 changes: 58 additions & 37 deletions deepchecks/nlp/_shared_docs.py
@@ -11,59 +11,80 @@
"""Module with common docstrings for the nlp package."""
from deepchecks.utils.decorators import Substitution

_shared_docstrings = {}

_shared_docstrings['prediction_formats'] = """
Notes
-----
The accepted formats for providing model predictions and probabilities are detailed below

**Text Classification**

*Single Class Predictions*

- predictions - A sequence of class names or indices with one entry per sample, matching the set of classes
  present in the labels.
- probabilities - A sequence of sequences with each element containing the vector of class probabilities for
  each sample. Each such vector should have one probability per class according to the class (sorted) order, and
  the probabilities should sum to 1 for each sample.

*Multilabel Predictions*

- predictions - A sequence of sequences with each element containing a binary vector denoting the presence of
  the i-th class for the given sample. Each such vector should have one binary indicator per class according to
  the class (sorted) order. More than one class can be present for each sample.
- probabilities - A sequence of sequences with each element containing the vector of class probabilities for
  each sample. Each such vector should have one probability per class according to the class (sorted) order, and
  the probabilities should range from 0 to 1 for each sample, but are not required to sum to 1.

**Token Classification**

- predictions - A sequence of sequences, with the inner sequence containing tuples in the following
  format: (class_name, span_start, span_end, class_probability). span_start and span_end are the start and end
  character indices of the token within the text, as it was passed to the raw_text argument. Each upper level
  sequence contains a sequence of tokens for each sample.
- probabilities - No probabilities should be passed for Token Classification tasks. Passing probabilities will
  result in an error.

Examples
--------

**Text Classification**

*Single Class Predictions*

>>> predictions = ['class_1', 'class_1', 'class_2']
>>> probabilities = [[0.2, 0.8], [0.5, 0.5], [0.3, 0.7]]

*Multilabel Predictions*

>>> predictions = [[0, 0, 1], [0, 1, 1]]
>>> probabilities = [[0.2, 0.3, 0.8], [0.4, 0.9, 0.6]]

**Token Classification**

>>> predictions = [[('class_1', 0, 2, 0.8), ('class_2', 7, 10, 0.9)], [('class_2', 42, 54, 0.4)], []]
""".strip('\n')


_shared_docstrings['text_normalization_params'] = """
ignore_case: bool, default True
    ignore text case during samples comparison.
remove_punctuation: bool, default True
    ignore punctuation characters during samples comparison.
normalize_unicode: bool, default True
    normalize unicode characters before samples comparison.
remove_stopwords: bool, default True
    remove stopwords before samples comparison.
ignore_whitespace: bool, default False
    ignore whitespace characters during samples comparison.
""".strip('\n')


_shared_docstrings['max_text_length_for_display_param'] = """
max_text_length_for_display : int, default 30
    truncate text samples to given length before display.
""".strip('\n')

docstrings = Substitution(**_shared_docstrings)
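
For context on the templating used above: the Substitution decorator (imported from deepchecks.utils.decorators, whose implementation is not part of this diff) fills placeholders such as {text_normalization_params:1*indent} in class docstrings with these shared fragments. A minimal sketch of the idea, assuming plain str.format-style substitution and ignoring the 1*indent modifier that the real decorator handles:

# Hypothetical minimal re-implementation of the docstring-substitution pattern.
# The real deepchecks Substitution decorator is more featureful (e.g. it
# supports indentation modifiers such as 1*indent); this only shows the idea.
class Substitution:
    def __init__(self, **fragments):
        self.fragments = fragments

    def __call__(self, obj):
        # Fill the placeholders in the decorated object's docstring.
        if obj.__doc__:
            obj.__doc__ = obj.__doc__.format(**self.fragments)
        return obj

docstrings = Substitution(shared_note='A fragment reused across many docstrings.')

@docstrings
class Example:
    """Demo class.

    {shared_note}
    """

print(Example.__doc__)  # placeholder replaced at class-definition time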
11 changes: 8 additions & 3 deletions deepchecks/nlp/checks/__init__.py
@@ -10,16 +10,20 @@
#
"""Module importing all nlp checks."""

from deepchecks.nlp.checks.data_integrity import (ConflictingLabels, PropertyLabelCorrelation, SpecialCharacters,
                                                  TextDuplicates, TextPropertyOutliers)
from deepchecks.nlp.checks.model_evaluation import (ConfusionMatrixReport, MetadataSegmentsPerformance, PredictionDrift,
                                                    PropertySegmentsPerformance, SingleDatasetPerformance,
                                                    TrainTestPerformance)
from deepchecks.nlp.checks.train_test_validation import LabelDrift, PropertyDrift, TrainTestSamplesMix

__all__ = [
# Data Integrity
'PropertyLabelCorrelation',
'TextPropertyOutliers',
'TextDuplicates',
'ConflictingLabels',
'SpecialCharacters',

# Model Evaluation
'SingleDatasetPerformance',
@@ -31,5 +35,6 @@
# Train Test Validation
'PredictionDrift',
'LabelDrift',
'PropertyDrift',
'TrainTestSamplesMix'
]
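
With these exports in place, the new checks are importable directly from deepchecks.nlp.checks. A quick usage sketch; the TextData construction assumes the public deepchecks.nlp API at the time of this PR, so the argument names are illustrative:

from deepchecks.nlp import TextData
from deepchecks.nlp.checks import ConflictingLabels, TextDuplicates

# Two normalized-identical texts carrying different labels should be flagged.
dataset = TextData(
    raw_text=['Great product!', 'great product', 'terrible product'],
    label=['positive', 'negative', 'negative'],
    task_type='text_classification',
)

result = ConflictingLabels().run(dataset)
# The result-value keys below are taken from the check implementation in this PR.
print(result.value['percent_of_conflicting_samples'])

result = TextDuplicates().run(dataset)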
11 changes: 10 additions & 1 deletion deepchecks/nlp/checks/data_integrity/__init__.py
@@ -10,7 +10,16 @@
#
"""Module importing all nlp checks."""

from .conflicting_labels import ConflictingLabels
from .property_label_correlation import PropertyLabelCorrelation
from .special_characters import SpecialCharacters
from .text_duplicates import TextDuplicates
from .text_property_outliers import TextPropertyOutliers

__all__ = [
'PropertyLabelCorrelation',
'TextPropertyOutliers',
'TextDuplicates',
'ConflictingLabels',
'SpecialCharacters'
]
174 changes: 174 additions & 0 deletions deepchecks/nlp/checks/data_integrity/conflicting_labels.py
@@ -0,0 +1,174 @@
# ----------------------------------------------------------------------------
# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com)
#
# This file is part of Deepchecks.
# Deepchecks is distributed under the terms of the GNU Affero General
# Public License (version 3 or later).
# You should have received a copy of the GNU Affero General Public License
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""Module contains Conflicting Labels check."""
import typing as t

import pandas as pd

from deepchecks.core import CheckResult
from deepchecks.core.errors import DeepchecksValueError
from deepchecks.nlp import Context, SingleDatasetCheck
from deepchecks.nlp._shared_docs import docstrings
from deepchecks.nlp.task_type import TaskType
from deepchecks.nlp.text_data import TextData
from deepchecks.nlp.utils.text import hash_samples, normalize_samples
from deepchecks.utils.abstracts.conflicting_labels import ConflictingLabelsAbstract
from deepchecks.utils.other import to_ordional_enumeration
from deepchecks.utils.strings import format_list
from deepchecks.utils.strings import get_ellipsis as truncate_string

__all__ = ['ConflictingLabels']


@docstrings
class ConflictingLabels(SingleDatasetCheck, ConflictingLabelsAbstract):
"""Find identical samples which have different labels.

Parameters
----------
{text_normalization_params:1*indent}
n_to_show : int , default: 5
number of most common ambiguous samples to show.
n_samples : int , default: 10_000_000
number of samples to use for this check.
random_state : int, default: 42
random seed for all check internals.
{max_text_length_for_display_param:1*indent}
"""

def __init__(
self,
ignore_case: bool = True,
remove_punctuation: bool = True,
normalize_unicode: bool = True,
remove_stopwords: bool = True,
ignore_whitespace: bool = False,
n_to_show: int = 5,
n_samples: int = 10_000_000,
random_state: int = 42,
max_text_length_for_display: int = 30,
**kwargs
):
super().__init__(**kwargs)
self.ignore_case = ignore_case
self.remove_punctuation = remove_punctuation
self.normalize_unicode = normalize_unicode
self.remove_stopwords = remove_stopwords
self.ignore_whitespace = ignore_whitespace
self.n_to_show = n_to_show
self.n_samples = n_samples
self.random_state = random_state
self.max_text_length_for_display = max_text_length_for_display

@property
def _text_normalization_kwargs(self):
return {
'ignore_case': self.ignore_case,
'ignore_whitespace': self.ignore_whitespace,
'normalize_uni': self.normalize_unicode,
'remove_punct': self.remove_punctuation,
'remove_stops': self.remove_stopwords,
}

def _truncate_text(self, x: str) -> str:
return truncate_string(x, self.max_text_length_for_display)

def run_logic(self, context: Context, dataset_kind) -> CheckResult:
"""Run check."""
dataset = context.get_data_by_kind(dataset_kind).sample(self.n_samples, random_state=self.random_state)
dataset = t.cast(TextData, dataset)
samples = dataset.text
n_of_samples = len(samples)

if n_of_samples == 0:
raise DeepchecksValueError('Dataset cannot be empty')

samples_hashes = hash_samples(normalize_samples(
dataset.text,
**self._text_normalization_kwargs
))

if dataset.task_type is TaskType.TOKEN_CLASSIFICATION or dataset.is_multi_label_classification():
labels = [tuple(t.cast(t.Sequence[t.Any], it)) for it in dataset.label]
elif dataset.task_type is TaskType.TEXT_CLASSIFICATION:
labels = dataset.label
else:
raise DeepchecksValueError(f'Unknown task type - {dataset.task_type}')

df = pd.DataFrame({
'hash': samples_hashes,
'Sample ID': dataset.get_original_text_indexes(),
'Label': labels,
'Text': dataset.text,
})

by_hash = df.loc[:, ['hash', 'Label']].groupby(['hash'], dropna=False)
count_labels = lambda x: len(set(x.to_list()))
n_of_labels_per_sample = by_hash['Label'].aggregate(count_labels)

ambiguous_samples_hashes = n_of_labels_per_sample[n_of_labels_per_sample > 1]
ambiguous_samples_hashes = frozenset(ambiguous_samples_hashes.index.to_list())

ambiguous_samples = df[df['hash'].isin(ambiguous_samples_hashes)]
num_of_ambiguous_samples = ambiguous_samples['Text'].count()
percent_of_ambiguous_samples = num_of_ambiguous_samples / n_of_samples

result_df = ambiguous_samples.rename(columns={'hash': 'Duplicate'})
duplicates_enumeration = to_ordional_enumeration(result_df['Duplicate'].to_list())
result_df['Duplicate'] = result_df['Duplicate'].apply(lambda x: duplicates_enumeration[x])
result_df = result_df.set_index(['Duplicate', 'Sample ID', 'Label'])

result_value = {
'percent_of_conflicting_samples': percent_of_ambiguous_samples,
'conflicting_samples': result_df,
}

if context.with_display is False:
return CheckResult(value=result_value)

ambiguous_samples['Text'] = ambiguous_samples['Text'].apply(self._truncate_text)
by_hash = ambiguous_samples.groupby(['hash'], dropna=False)
observed_labels = by_hash['Label'].aggregate(lambda x: format_list(x.to_list()))
samples_ids = by_hash['Sample ID'].aggregate(lambda x: format_list(x.to_list(), max_string_length=200))
first_in_group = by_hash['Text'].first()

display_table = (
pd.DataFrame({
# TODO:
# for multi-label and token classification
# 'Observed Labels' column will not look very nice
# need another way to display observed labels
# for those task types
'Observed Labels': observed_labels,
'Sample IDs': samples_ids,
'Text': first_in_group
})
.reset_index(drop=True)
.set_index(['Observed Labels', 'Sample IDs'])
)
table_description = (
'Each row in the table shows an example of a data sample '
'and its observed conflicting labels as found in the dataset.'
)
table_note = (
f'Showing top {self.n_to_show} of {len(display_table)}'
if self.n_to_show <= len(display_table)
else ''
)
return CheckResult(
value=result_value,
display=[
table_description,
table_note,
# take the first n_to_show rows (one row per group of conflicting samples)
display_table.iloc[slice(0, self.n_to_show)]
]
)
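
The heart of run_logic above is a normalize-hash-group pattern: texts are normalized, hashed, grouped by hash, and any group observing more than one distinct label is reported. A self-contained sketch of that logic in plain pandas, with a trivial lowercase/strip normalizer standing in for deepchecks' normalize_samples and hash_samples helpers:

import pandas as pd

def find_conflicting_labels(texts, labels):
    """Flag groups of (normalized-)identical texts that carry different labels."""
    # Stand-in for deepchecks' normalize_samples + hash_samples helpers.
    keys = [t.lower().strip() for t in texts]
    df = pd.DataFrame({
        'hash': keys,
        'Label': labels,
        'Text': texts,
        'Sample ID': range(len(texts)),
    })

    # A group is ambiguous when it contains more than one distinct label.
    labels_per_group = df.groupby('hash')['Label'].nunique()
    ambiguous = set(labels_per_group[labels_per_group > 1].index)

    conflicting = df[df['hash'].isin(ambiguous)]
    return len(conflicting) / len(df), conflicting

percent, table = find_conflicting_labels(
    texts=['Great!', 'great!', 'meh'],
    labels=['positive', 'negative', 'negative'],
)
print(percent)  # 2 of 3 samples fall in conflicting groups -> ~0.67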