Fix the Property Drift check name and various other fixes (#2572)
* simple name fix for Property Drift

* gitignore fix

* avoid annoying transformers print

* mention the full suite in quickstart
noamzbr committed Jun 1, 2023
1 parent 70d6289 commit 96c67c0
Showing 7 changed files with 39 additions and 9 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -104,6 +104,8 @@ tweet_emotion_*.csv

# docs build files
docs/source/_build
docs/html
docs/doctrees

# build folders of sphinx gallery
docs/source/general/usage/customizations/auto_examples/
@@ -120,6 +122,8 @@ docs/source/vision/auto_tutorials
docs/source/user-guide/tabular/auto_quickstarts
docs/source/user-guide/vision/auto_quickstarts

docs/source/checks_gallery

# build artifacts from running docs (vision, nlp, wandb export)
docs/source/vision/tutorials/quickstarts/*.html

2 changes: 1 addition & 1 deletion deepchecks/analytics/anonymous_telemetry.py
@@ -44,7 +44,7 @@ def validate_latest_version():
is_on_latest = result.read().decode() == 'True'
if not is_on_latest:
get_logger().warning('You are using deepchecks version %s, however a newer version is available.'
'Deepchecks is frequently updated with major improvements. You should consider '
' Deepchecks is frequently updated with major improvements. You should consider '
'upgrading via the "python -m pip install --upgrade deepchecks" command.',
deepchecks.__version__)
except Exception: # pylint: disable=broad-except
28 changes: 24 additions & 4 deletions deepchecks/nlp/checks/data_integrity/unknown_tokens.py
@@ -9,7 +9,8 @@
# ----------------------------------------------------------------------------
#
"""Module contains the Unknown Tokens check."""
import os
import contextlib
import sys
import typing as t
import warnings
from collections import Counter
@@ -151,9 +152,14 @@ def find_unknown_words(self, samples, indices):
# Batch tokenization
# ------------------
# Needed to avoid warning when used after loading a hub dataset
os.environ['TOKENIZERS_PARALLELISM'] = 'true'
tokenized_samples = self.tokenizer(list(samples), return_offsets_mapping=True, is_split_into_words=False,
truncation=False)
# We divert anything printed to stdout (done by the Rust code within the HuggingFace tokenizer)
# through this filter, which drops any print containing the string 'huggingface/tokenizers'.
# This warning printout is triggered when running this check after loading a HuggingFace dataset,
# and is irrelevant to us because we're not forking the process.
# see: https://github.com/huggingface/transformers/issues/5486
with contextlib.redirect_stdout(PrintFilter(sys.stdout)):
tokenized_samples = self.tokenizer(list(samples), return_offsets_mapping=True,
is_split_into_words=False, truncation=False)

for idx, (tokens, offsets_mapping, sample) in zip(indices, zip(tokenized_samples['input_ids'],
tokenized_samples['offset_mapping'],
@@ -251,3 +257,17 @@ def condition(result):

return self.add_condition(f'Ratio of unknown words is less than {format_percent(ratio)}',
condition)


class PrintFilter:
"""Filter to avoid printing of tokenization warnings."""

def __init__(self, original_stdout):
self.original_stdout = original_stdout

def write(self, msg):
if 'huggingface/tokenizers' not in msg:
self.original_stdout.write(msg)

def flush(self):
self.original_stdout.flush()
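
As a quick illustration of how the class above is meant to be used, here is a minimal, hedged sketch of the same stdout-filtering pattern in isolation (the printed strings are made up for the example):

import contextlib
import sys

# Everything printed inside the block is routed through PrintFilter.write(),
# so text mentioning 'huggingface/tokenizers' never reaches the real stdout.
with contextlib.redirect_stdout(PrintFilter(sys.stdout)):
    print('regular output')                         # forwarded unchanged
    print('huggingface/tokenizers: some warning')   # silently dropped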
@@ -151,5 +151,5 @@ def run_logic(self, context: Context) -> CheckResult:
return CheckResult(
value=results,
display=displays,
header='Properties Drift'
header='Property Drift'
)
@@ -96,7 +96,9 @@
#
# Deepchecks comes with a set of pre-built suites that can be used to run a set of checks on your data, along
# with their default conditions and thresholds. You can read more about customizing and creating your own suites in the
# :ref:`Customizations Guide <general__customizations>`.
# :ref:`Customizations Guide <general__customizations>`. In this guide we'll be using three suites: the data integrity
# suite, the train-test validation suite, and the model evaluation suite. You can also run all the checks at once using
# the :mod:`full_suite <deepchecks.nlp.suites>`.
#
# Data Integrity
# --------------
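
As context for the full suite mentioned in the added lines above, here is a minimal, hedged sketch of running it (the tiny inline datasets and variable names are illustrative, not taken from the quickstart):

from deepchecks.nlp import TextData
from deepchecks.nlp.suites import full_suite

# Toy train/test splits; the real guide would pass its own TextData objects.
train_data = TextData(raw_text=['I love this', 'So sad today'],
                      label=['happiness', 'sadness'], task_type='text_classification')
test_data = TextData(raw_text=['Feeling great', 'This is awful'],
                     label=['happiness', 'anger'], task_type='text_classification')

suite = full_suite()
result = suite.run(train_data, test_data)  # runs all built-in checks on the two datasets
result.show()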
@@ -47,7 +47,8 @@
Load Data
---------
For the purpose of this guide, we'll use a small subset of the
`tweet emotion <https://github.com/cardiffnlp/tweeteval>`__ dataset:
`tweet emotion <https://github.com/cardiffnlp/tweeteval>`__ dataset. This dataset contains tweets and their
corresponding emotion: Anger, Happiness, Optimism, or Sadness.
"""

@@ -115,7 +116,9 @@
#
# Deepchecks comes with a set of pre-built suites that can be used to run a set of checks on your data, along
# with their default conditions and thresholds. You can read more about customizing and creating your own suites in the
# :ref:`Customizations Guide <general__customizations>`.
# :ref:`Customizations Guide <general__customizations>`. In this guide we'll be using three suites: the data integrity
# suite, the train-test validation suite, and the model evaluation suite. You can also run all the checks at once using
# the :mod:`full_suite <deepchecks.nlp.suites>`.
#
# Data Integrity
# --------------
1 change: 1 addition & 0 deletions spelling-allowlist.txt
@@ -155,3 +155,4 @@ misclassified
Uncomment
dimensionality
tokenization
huggingface
