Noam/bugfix/token class bugfixes (#2551)
* improve display for token classification in embeddings drift

* address null properties in outliers

* fix issue of small % of unknown tokens not appearing

---------

Co-authored-by: Nir Hutnik <92314933+nirhutnik@users.noreply.github.com>
noamzbr and nirhutnik committed May 23, 2023
1 parent 5654063 commit 87de7d7
Showing 6 changed files with 107 additions and 22 deletions.
34 changes: 21 additions & 13 deletions deepchecks/nlp/checks/data_integrity/text_property_outliers.py
@@ -142,19 +142,27 @@ def run_logic(self, context: Context, dataset_kind: DatasetKind) -> CheckResult:
             else:
                 if len(display) < self.n_show_top:
                     dist = df_properties[property_name]
-                    lower_limit = info['lower_limit']
-                    upper_limit = info['upper_limit']
-
-                    fig = get_text_outliers_graph(
-                        dist=dist,
-                        data=dataset.text,
-                        lower_limit=lower_limit,
-                        upper_limit=upper_limit,
-                        dist_name=property_name,
-                        is_categorical=property_name in cat_properties
-                    )
-
-                    display.append(fig)
+                    if len(dist[~pd.isnull(dist)]) >= self.min_samples:
+                        lower_limit = info['lower_limit']
+                        upper_limit = info['upper_limit']
+
+                        fig = get_text_outliers_graph(
+                            dist=dist,
+                            data=dataset.text,
+                            lower_limit=lower_limit,
+                            upper_limit=upper_limit,
+                            dist_name=property_name,
+                            is_categorical=property_name in cat_properties
+                        )
+
+                        display.append(fig)
+                    else:
+                        no_outliers = pd.concat(
+                            [no_outliers, pd.Series(property_name, index=[
+                                f'Not enough non-null samples to compute'
+                                f' properties (min_samples={self.min_samples}).'
+                            ])]
+                        )
                 else:
                     no_outliers = pd.concat([no_outliers, pd.Series(property_name, index=[
                         f'Outliers found but not shown in graphs (n_show_top={self.n_show_top}).'])])
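The guard added in this hunk only draws the outliers graph when the property has enough non-null values. A minimal sketch of the same check on its own, with an illustrative series and threshold standing in for the property column and self.min_samples:

import pandas as pd

min_samples = 3  # stands in for self.min_samples
dist = pd.Series([0.1, None, 0.3, None])  # a property column containing nulls

non_null = dist[~pd.isnull(dist)]
if len(non_null) >= min_samples:
    print('enough non-null samples: plot the outliers graph')
else:
    print(f'skipped: {len(non_null)} non-null samples (min_samples={min_samples})')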
3 changes: 2 additions & 1 deletion deepchecks/nlp/checks/data_integrity/unknown_tokens.py
@@ -21,6 +21,7 @@
 from deepchecks.nlp import Context, SingleDatasetCheck
 from deepchecks.nlp._shared_docs import docstrings
 from deepchecks.nlp.text_data import TextData
+from deepchecks.utils.numbers import round_sig
 from deepchecks.utils.strings import format_list, format_percent
 from deepchecks.utils.strings import get_ellipsis as truncate_string

@@ -167,7 +168,7 @@ def create_pie_chart(self, all_unknown_words_counter, total_words):
         # Truncate labels for display
         labels = [truncate_string(label, self.max_text_length_for_display) for label in labels]
         # round percentages to 2 decimal places after the percent
-        percentages = [round(percent, 2) for percent in percentages]
+        percentages = [round_sig(percent, 2) for percent in percentages]

         # Create pie chart with hover text and custom hover template
         fig = go.Figure(data=[go.Pie(
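The switch from round to round_sig is what fixes the disappearing small percentages: fixed-decimal rounding collapses a tiny share of unknown tokens to zero, while significant-digit rounding preserves it. A quick illustration (the value is made up):

from deepchecks.utils.numbers import round_sig

percent = 0.0042  # a tiny share of unknown tokens
print(round(percent, 2))      # 0.0    -> the slice label used to vanish
print(round_sig(percent, 2))  # 0.0042 -> two significant digits survive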
20 changes: 18 additions & 2 deletions deepchecks/nlp/text_data.py
@@ -22,6 +22,7 @@
                                     validate_length_and_type_numpy_array, validate_modify_label,
                                     validate_raw_text, validate_tokenized_text)
 from deepchecks.nlp.task_type import TaskType, TTextLabel
+from deepchecks.nlp.utils.text import break_to_lines_and_trim
 from deepchecks.nlp.utils.text_embeddings import calculate_builtin_embeddings
 from deepchecks.nlp.utils.text_properties import calculate_builtin_properties
 from deepchecks.utils.logger import get_logger
@@ -294,7 +295,7 @@ def embeddings(self) -> pd.DataFrame:
         """Return the embeddings of the dataset."""
         return self._embeddings

-    def calculate_builtin_embeddings(self, model: str = 'miniLM', file_path: str = 'embeddings.csv'):
+    def calculate_builtin_embeddings(self, model: str = 'miniLM', file_path: str = 'embeddings.npy'):
         """Calculate the built-in embeddings of the dataset.

         Parameters
@@ -303,7 +304,7 @@ def calculate_builtin_embeddings(self, model: str = 'miniLM', file_path: str = '
             The model to use for calculating the embeddings. Possible values are:
             'miniLM': using the miniLM model in the sentence-transformers library.
             'open_ai': using the ADA model in the open_ai library. Requires an API key.
-        file_path : str, default: 'embeddings.csv'
+        file_path : str, default: 'embeddings.npy'
             The path to save the embeddings to.
         """
         if self._embeddings is not None:
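With the new default, calculated embeddings are persisted as a NumPy .npy file rather than a CSV. A hedged usage sketch (the dataset variable is illustrative):

# dataset is an existing deepchecks TextData instance
dataset.calculate_builtin_embeddings(model='miniLM')  # now saves to 'embeddings.npy' by default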
@@ -532,6 +533,21 @@ def label_for_display(self, model_classes: list = None) -> TTextLabel:
         else:
             return self.label

+    def label_for_print(self, model_classes: list = None) -> t.List[str]:
+        """Return the label defined in the dataset in a format that can be printed nicely.
+
+        Parameters
+        ----------
+        model_classes : list, default None
+            List of class names to use for multi-label display. Only used if the dataset is multi-label.
+
+        Returns
+        -------
+        List[str]
+        """
+        label_for_display = self.label_for_display(model_classes)
+        return [break_to_lines_and_trim(str(x)) for x in label_for_display]
+
     def has_label(self) -> bool:
         """Return True if label was set.
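The new label_for_print is a thin wrapper over label_for_display that breaks each label into short lines for use in plots. A minimal sketch of what it does (the labels below are illustrative, not from the commit):

from deepchecks.nlp.utils.text import break_to_lines_and_trim

labels_for_display = ['positive', 'negative']  # what label_for_display might return
print([break_to_lines_and_trim(str(x)) for x in labels_for_display])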
49 changes: 43 additions & 6 deletions deepchecks/nlp/utils/nlp_plot.py
@@ -9,14 +9,16 @@
 # ----------------------------------------------------------------------------
 #
 """A module containing utils for plotting distributions."""
-from typing import List, Sequence
+from collections import Counter
+from typing import Dict, List, Sequence

 import numpy as np
 import pandas as pd
 import plotly.express as px
 import plotly.graph_objs as go

 from deepchecks.nlp import TextData
+from deepchecks.nlp.task_type import TaskType
 from deepchecks.nlp.utils.text import break_to_lines_and_trim
 from deepchecks.utils.dataframes import un_numpy
 from deepchecks.utils.distribution.plot import get_density
@@ -122,6 +124,8 @@ def get_text_outliers_graph(dist: Sequence, data: Sequence[str], lower_limit: fl
         xaxis_layout = dict(type='category')

     else:
+        dist = dist[~pd.isnull(dist)]
+
         x_range = (
             dist.min(), dist.max()
         )
@@ -235,6 +239,24 @@ def get_text_outliers_graph(dist: Sequence, data: Sequence[str], lower_limit: fl
     return fig


+def count_token_classification_labels(labels) -> Dict:
+    """Count the number of labels of each kind in a token classification dataset.
+
+    Ignores the initial IOB prefix of these labels (B- and I- and such) if they exist.
+    """
+    labels = [label[2:] if label[:2] in ['B-', 'I-', 'O-'] else label for label in labels]
+    return dict(Counter(labels))
+
+
+def annotated_token_classification_text(token_text, iob_annotations) -> List[str]:
+    """Annotate a token classification dataset with IOB tags."""
+    annotated_samples = []
+    for sample, iob_sample in zip(token_text, iob_annotations):
+        annotated_samples.append(' '.join([f'<b>{word}</b>' if iob != 'O' else word for
+                                           word, iob in zip(sample, iob_sample)]))
+    return annotated_samples
+
+
 def two_datasets_scatter_plot(plot_title: str, plot_data: pd.DataFrame, train_dataset: TextData,
                               test_dataset: TextData, model_classes: list):
     """Plot a scatter plot of two datasets.
@@ -259,12 +281,27 @@ def two_datasets_scatter_plot(plot_title: str, plot_data: pd.DataFrame, train_da
         dataset_names = DEFAULT_DATASET_NAMES

     plot_data['Dataset'] = [dataset_names[0]] * len(train_dataset) + [dataset_names[1]] * len(test_dataset)
-    if train_dataset.has_label():
-        plot_data['Label'] = list(train_dataset.label_for_display(model_classes=model_classes)) + \
-            list(test_dataset.label_for_display(model_classes=model_classes))
+
+    if train_dataset.task_type == TaskType.TOKEN_CLASSIFICATION:
+        plot_data['Sample'] = np.concatenate([train_dataset.tokenized_text, test_dataset.tokenized_text])
+
+        if train_dataset.has_label():
+            plot_data['Label'] = list(train_dataset.label_for_display(model_classes=model_classes)) + \
+                list(test_dataset.label_for_display(model_classes=model_classes))
+            plot_data['Sample'] = annotated_token_classification_text(plot_data['Sample'], plot_data['Label'])
+            # Displayed labels are the counts of each label in the dataset:
+            plot_data['Label'] = [break_to_lines_and_trim(str(count_token_classification_labels(x)))
+                                  for x in plot_data['Label']]
+        else:
+            plot_data['Label'] = None
     else:
-        plot_data['Label'] = None
-    plot_data['Sample'] = np.concatenate([train_dataset.text, test_dataset.text])
+        if train_dataset.has_label():
+            plot_data['Label'] = list(train_dataset.label_for_print(model_classes=model_classes)) + \
+                list(test_dataset.label_for_print(model_classes=model_classes))
+        else:
+            plot_data['Label'] = None
+        plot_data['Sample'] = np.concatenate([train_dataset.text, test_dataset.text])

     plot_data['Sample'] = plot_data['Sample'].apply(break_to_lines_and_trim)

     fig = px.scatter(plot_data, x=axes[0], y=axes[1], color='Dataset', color_discrete_map=colors,
5 changes: 5 additions & 0 deletions deepchecks/nlp/utils/text.py
@@ -9,6 +9,7 @@
 # ----------------------------------------------------------------------------
 #
 """Module of text utils for NLP package."""
+import re
 import string
 import typing as t
 import unicodedata
@@ -55,6 +56,10 @@ def break_to_lines_and_trim(s, max_lines: int = 10, min_line_length: int = 50, m
                 s = s[j:].strip()
                 break
         else:  # if no delimiter was found, break in the middle of the line
+            # Check if breaking in the middle of an HTML tag
+            tag_start = re.search(r'<[^>]*$', s[:max_line_length])
+            if tag_start:
+                max_line_length = tag_start.start()
             lines.append(s[:max_line_length].strip() + '-')
             s = s[max_line_length:].strip()
     else:  # if the loop ended without breaking, and there is still text left, add an ellipsis
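The regex guard keeps break_to_lines_and_trim from cutting a line inside one of the <b>...</b> tags that annotated_token_classification_text inserts: the pattern matches a tag left unclosed at the break point. A small check of the pattern on its own (values illustrative):

import re

line = 'John lives in <b'  # a break that would land inside the '<b>' tag
tag_start = re.search(r'<[^>]*$', line)
if tag_start:
    line = line[:tag_start.start()]  # move the break to just before the tag
print(line)  # 'John lives in '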
18 changes: 18 additions & 0 deletions deepchecks/utils/numbers.py
@@ -0,0 +1,18 @@
+# ----------------------------------------------------------------------------
+# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com)
+#
+# This file is part of Deepchecks.
+# Deepchecks is distributed under the terms of the GNU Affero General
+# Public License (version 3 or later).
+# You should have received a copy of the GNU Affero General Public License
+# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
+# ----------------------------------------------------------------------------
+#
+"""General utilities for working with numbers."""
+
+import numpy as np
+
+
+def round_sig(x: float, sig: int = 2):
+    """Round a number to a given number of significant digits."""
+    return round(x, sig - int(np.floor(np.log10(abs(x)))) - 1)
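A few examples of the rounding behavior; note the function assumes a non-zero input, since np.log10(0) is undefined:

from deepchecks.utils.numbers import round_sig

print(round_sig(0.0042, 2))   # 0.0042
print(round_sig(123.456, 2))  # 120.0
# round_sig(0.0, 2) raises: np.log10(0) returns -inf, and int(-inf) fails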
