Noam/bugfix/token class bugfixes (#2551)
* improve display for token classification in embeddings drift

* address null properties in outliers

* fix issue of small % of unknown tokens not appearing

---------

Co-authored-by: Nir Hutnik <92314933+nirhutnik@users.noreply.github.com>
noamzbr and nirhutnik committed May 23, 2023
1 parent 5654063 commit 87de7d7
Showing 6 changed files with 107 additions and 22 deletions.
34 changes: 21 additions & 13 deletions deepchecks/nlp/checks/data_integrity/text_property_outliers.py
@@ -142,19 +142,27 @@ def run_logic(self, context: Context, dataset_kind: DatasetKind) -> CheckResult:
             else:
                 if len(display) < self.n_show_top:
                     dist = df_properties[property_name]
-                    lower_limit = info['lower_limit']
-                    upper_limit = info['upper_limit']
-
-                    fig = get_text_outliers_graph(
-                        dist=dist,
-                        data=dataset.text,
-                        lower_limit=lower_limit,
-                        upper_limit=upper_limit,
-                        dist_name=property_name,
-                        is_categorical=property_name in cat_properties
-                    )
-
-                    display.append(fig)
+                    if len(dist[~pd.isnull(dist)]) >= self.min_samples:
+                        lower_limit = info['lower_limit']
+                        upper_limit = info['upper_limit']
+
+                        fig = get_text_outliers_graph(
+                            dist=dist,
+                            data=dataset.text,
+                            lower_limit=lower_limit,
+                            upper_limit=upper_limit,
+                            dist_name=property_name,
+                            is_categorical=property_name in cat_properties
+                        )
+
+                        display.append(fig)
+                    else:
+                        no_outliers = pd.concat(
+                            [no_outliers, pd.Series(property_name, index=[
+                                f'Not enough non-null samples to compute'
+                                f' properties (min_samples={self.min_samples}).'
+                            ])]
+                        )
                 else:
                     no_outliers = pd.concat([no_outliers, pd.Series(property_name, index=[
                         f'Outliers found but not shown in graphs (n_show_top={self.n_show_top}).'])])
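The guard added in this hunk only draws the outliers graph when the property has enough non-null values. A minimal sketch of the same check on its own, with an illustrative series and threshold standing in for the property column and self.min_samples:

import pandas as pd

min_samples = 3  # stands in for self.min_samples
dist = pd.Series([0.1, None, 0.3, None])  # a property column containing nulls

non_null = dist[~pd.isnull(dist)]
if len(non_null) >= min_samples:
    print('enough non-null samples: plot the outliers graph')
else:
    print(f'skipped: {len(non_null)} non-null samples (min_samples={min_samples})')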
3 changes: 2 additions & 1 deletion deepchecks/nlp/checks/data_integrity/unknown_tokens.py
@@ -21,6 +21,7 @@
 from deepchecks.nlp import Context, SingleDatasetCheck
 from deepchecks.nlp._shared_docs import docstrings
 from deepchecks.nlp.text_data import TextData
+from deepchecks.utils.numbers import round_sig
 from deepchecks.utils.strings import format_list, format_percent
 from deepchecks.utils.strings import get_ellipsis as truncate_string

@@ -167,7 +168,7 @@ def create_pie_chart(self, all_unknown_words_counter, total_words):
         # Truncate labels for display
         labels = [truncate_string(label, self.max_text_length_for_display) for label in labels]
         # round percentages to 2 decimal places after the percent
-        percentages = [round(percent, 2) for percent in percentages]
+        percentages = [round_sig(percent, 2) for percent in percentages]

         # Create pie chart with hover text and custom hover template
         fig = go.Figure(data=[go.Pie(
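The switch from round to round_sig is what fixes the disappearing small percentages: fixed-decimal rounding collapses a tiny share of unknown tokens to zero, while significant-digit rounding preserves it. A quick illustration (the value is made up):

from deepchecks.utils.numbers import round_sig

percent = 0.0042  # a tiny share of unknown tokens
print(round(percent, 2))      # 0.0    -> the slice label used to vanish
print(round_sig(percent, 2))  # 0.0042 -> two significant digits survive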
20 changes: 18 additions & 2 deletions deepchecks/nlp/text_data.py
@@ -22,6 +22,7 @@
                                     validate_length_and_type_numpy_array, validate_modify_label,
                                     validate_raw_text, validate_tokenized_text)
 from deepchecks.nlp.task_type import TaskType, TTextLabel
+from deepchecks.nlp.utils.text import break_to_lines_and_trim
 from deepchecks.nlp.utils.text_embeddings import calculate_builtin_embeddings
 from deepchecks.nlp.utils.text_properties import calculate_builtin_properties
 from deepchecks.utils.logger import get_logger
@@ -294,7 +295,7 @@ def embeddings(self) -> pd.DataFrame:
         """Return the embeddings of the dataset."""
         return self._embeddings

-    def calculate_builtin_embeddings(self, model: str = 'miniLM', file_path: str = 'embeddings.csv'):
+    def calculate_builtin_embeddings(self, model: str = 'miniLM', file_path: str = 'embeddings.npy'):
         """Calculate the built-in embeddings of the dataset.

         Parameters
@@ -303,7 +304,7 @@ def calculate_builtin_embeddings(self, model: str = 'miniLM', file_path: str = '
             The model to use for calculating the embeddings. Possible values are:
             'miniLM': using the miniLM model in the sentence-transformers library.
             'open_ai': using the ADA model in the open_ai library. Requires an API key.
-        file_path : str, default: 'embeddings.csv'
+        file_path : str, default: 'embeddings.npy'
             The path to save the embeddings to.
         """
         if self._embeddings is not None:
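With the new default, calculated embeddings are persisted as a NumPy .npy file rather than a CSV. A hedged usage sketch (the dataset variable is illustrative):

# dataset is an existing deepchecks TextData instance
dataset.calculate_builtin_embeddings(model='miniLM')  # now saves to 'embeddings.npy' by default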
@@ -532,6 +533,21 @@ def label_for_display(self, model_classes: list = None) -> TTextLabel:
         else:
             return self.label

+    def label_for_print(self, model_classes: list = None) -> t.List[str]:
+        """Return the label defined in the dataset in a format that can be printed nicely.
+
+        Parameters
+        ----------
+        model_classes : list, default None
+            List of class names to use for multi-label display. Only used if the dataset is multi-label.
+
+        Returns
+        -------
+        List[str]
+        """
+        label_for_display = self.label_for_display(model_classes)
+        return [break_to_lines_and_trim(str(x)) for x in label_for_display]
+
     def has_label(self) -> bool:
         """Return True if label was set.
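The new label_for_print is a thin wrapper over label_for_display that breaks each label into short lines for use in plots. A minimal sketch of what it does (the labels below are illustrative, not from the commit):

from deepchecks.nlp.utils.text import break_to_lines_and_trim

labels_for_display = ['positive', 'negative']  # what label_for_display might return
print([break_to_lines_and_trim(str(x)) for x in labels_for_display])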
49 changes: 43 additions & 6 deletions deepchecks/nlp/utils/nlp_plot.py
@@ -9,14 +9,16 @@
 # ----------------------------------------------------------------------------
 #
 """A module containing utils for plotting distributions."""
-from typing import List, Sequence
+from collections import Counter
+from typing import Dict, List, Sequence

 import numpy as np
 import pandas as pd
 import plotly.express as px
 import plotly.graph_objs as go

 from deepchecks.nlp import TextData
+from deepchecks.nlp.task_type import TaskType
 from deepchecks.nlp.utils.text import break_to_lines_and_trim
 from deepchecks.utils.dataframes import un_numpy
 from deepchecks.utils.distribution.plot import get_density
@@ -122,6 +124,8 @@ def get_text_outliers_graph(dist: Sequence, data: Sequence[str], lower_limit: fl
         xaxis_layout = dict(type='category')

     else:
+        dist = dist[~pd.isnull(dist)]
+
         x_range = (
             dist.min(), dist.max()
         )
@@ -235,6 +239,24 @@ def get_text_outliers_graph(dist: Sequence, data: Sequence[str], lower_limit: fl
     return fig


+def count_token_classification_labels(labels) -> Dict:
+    """Count the number of labels of each kind in a token classification dataset.
+
+    Ignores the initial IOB prefix of these labels (B- and I- and such) if they exist.
+    """
+    labels = [label[2:] if label[:2] in ['B-', 'I-', 'O-'] else label for label in labels]
+    return dict(Counter(labels))
+
+
+def annotated_token_classification_text(token_text, iob_annotations) -> List[str]:
+    """Annotate a token classification dataset with IOB tags."""
+    annotated_samples = []
+    for sample, iob_sample in zip(token_text, iob_annotations):
+        annotated_samples.append(' '.join([f'<b>{word}</b>' if iob != 'O' else word for
+                                           word, iob in zip(sample, iob_sample)]))
+    return annotated_samples
+
+
 def two_datasets_scatter_plot(plot_title: str, plot_data: pd.DataFrame, train_dataset: TextData,
                               test_dataset: TextData, model_classes: list):
     """Plot a scatter plot of two datasets.
@@ -259,12 +281,27 @@ def two_datasets_scatter_plot(plot_title: str, plot_data: pd.DataFrame, train_da
         dataset_names = DEFAULT_DATASET_NAMES

     plot_data['Dataset'] = [dataset_names[0]] * len(train_dataset) + [dataset_names[1]] * len(test_dataset)
-    if train_dataset.has_label():
-        plot_data['Label'] = list(train_dataset.label_for_display(model_classes=model_classes)) + \
-            list(test_dataset.label_for_display(model_classes=model_classes))
+
+    if train_dataset.task_type == TaskType.TOKEN_CLASSIFICATION:
+        plot_data['Sample'] = np.concatenate([train_dataset.tokenized_text, test_dataset.tokenized_text])
+
+        if train_dataset.has_label():
+            plot_data['Label'] = list(train_dataset.label_for_display(model_classes=model_classes)) + \
+                list(test_dataset.label_for_display(model_classes=model_classes))
+            plot_data['Sample'] = annotated_token_classification_text(plot_data['Sample'], plot_data['Label'])
+            # Displayed labels are the counts of each label in the dataset:
+            plot_data['Label'] = [break_to_lines_and_trim(str(count_token_classification_labels(x)))
+                                  for x in plot_data['Label']]
+        else:
+            plot_data['Label'] = None
     else:
-        plot_data['Label'] = None
-    plot_data['Sample'] = np.concatenate([train_dataset.text, test_dataset.text])
+        if train_dataset.has_label():
+            plot_data['Label'] = list(train_dataset.label_for_print(model_classes=model_classes)) + \
+                list(test_dataset.label_for_print(model_classes=model_classes))
+        else:
+            plot_data['Label'] = None
+        plot_data['Sample'] = np.concatenate([train_dataset.text, test_dataset.text])

     plot_data['Sample'] = plot_data['Sample'].apply(break_to_lines_and_trim)

     fig = px.scatter(plot_data, x=axes[0], y=axes[1], color='Dataset', color_discrete_map=colors,
5 changes: 5 additions & 0 deletions deepchecks/nlp/utils/text.py
@@ -9,6 +9,7 @@
 # ----------------------------------------------------------------------------
 #
 """Module of text utils for NLP package."""
+import re
 import string
 import typing as t
 import unicodedata
@@ -55,6 +56,10 @@ def break_to_lines_and_trim(s, max_lines: int = 10, min_line_length: int = 50, m
                 s = s[j:].strip()
                 break
         else:  # if no delimiter was found, break in the middle of the line
+            # Check if breaking in the middle of an HTML tag
+            tag_start = re.search(r'<[^>]*$', s[:max_line_length])
+            if tag_start:
+                max_line_length = tag_start.start()
             lines.append(s[:max_line_length].strip() + '-')
             s = s[max_line_length:].strip()
     else:  # if the loop ended without breaking, and there is still text left, add an ellipsis
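The regex guard keeps break_to_lines_and_trim from cutting a line inside one of the <b>...</b> tags that annotated_token_classification_text inserts: the pattern matches a tag left unclosed at the break point. A small check of the pattern on its own (values illustrative):

import re

line = 'John lives in <b'  # a break that would land inside the '<b>' tag
tag_start = re.search(r'<[^>]*$', line)
if tag_start:
    line = line[:tag_start.start()]  # move the break to just before the tag
print(line)  # 'John lives in '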
18 changes: 18 additions & 0 deletions deepchecks/utils/numbers.py
@@ -0,0 +1,18 @@
+# ----------------------------------------------------------------------------
+# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com)
+#
+# This file is part of Deepchecks.
+# Deepchecks is distributed under the terms of the GNU Affero General
+# Public License (version 3 or later).
+# You should have received a copy of the GNU Affero General Public License
+# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
+# ----------------------------------------------------------------------------
+#
+"""General utilities for working with numbers."""
+
+import numpy as np
+
+
+def round_sig(x: float, sig: int = 2):
+    """Round a number to a given number of significant digits."""
+    return round(x, sig - int(np.floor(np.log10(abs(x)))) - 1)
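A few examples of the rounding behavior; note the function assumes a non-zero input, since np.log10(0) is undefined:

from deepchecks.utils.numbers import round_sig

print(round_sig(0.0042, 2))   # 0.0042
print(round_sig(123.456, 2))  # 120.0
# round_sig(0.0, 2) raises: np.log10(0) returns -inf, and int(-inf) fails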
