From 96c67c0a443d22ef733a6ee152c1708464f28ee2 Mon Sep 17 00:00:00 2001 From: Noam Bressler Date: Thu, 1 Jun 2023 12:20:50 +0300 Subject: [PATCH] Fix the property drift check name and others various fixes (#2572) * simple name fix for Property Drift * gitignore fix * avoid annoying transformers print * mention the full suite in quickstart --- .gitignore | 4 +++ deepchecks/analytics/anonymous_telemetry.py | 2 +- .../checks/data_integrity/unknown_tokens.py | 28 ++++++++++++++++--- .../train_test_validation/property_drift.py | 2 +- .../plot_multi_label_classification.py | 4 ++- .../quickstarts/plot_text_classification.py | 7 +++-- spelling-allowlist.txt | 1 + 7 files changed, 39 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index a7717b136f..36f2ee5df4 100644 --- a/.gitignore +++ b/.gitignore @@ -104,6 +104,8 @@ tweet_emotion_*.csv # docs build files docs/source/_build +docs/html +docs/doctrees # build folders of sphinx gallery docs/source/general/usage/customizations/auto_examples/ @@ -120,6 +122,8 @@ docs/source/vision/auto_tutorials docs/source/user-guide/tabular/auto_quickstarts docs/source/user-guide/vision/auto_quickstarts +docs/source/checks_gallery + # build artificats from running docs (vision, nlp, wandb export) docs/source/vision/tutorials/quickstarts/*.html diff --git a/deepchecks/analytics/anonymous_telemetry.py b/deepchecks/analytics/anonymous_telemetry.py index d0af629491..f0e0a669b3 100644 --- a/deepchecks/analytics/anonymous_telemetry.py +++ b/deepchecks/analytics/anonymous_telemetry.py @@ -44,7 +44,7 @@ def validate_latest_version(): is_on_latest = result.read().decode() == 'True' if not is_on_latest: get_logger().warning('You are using deepchecks version %s, however a newer version is available.' - 'Deepchecks is frequently updated with major improvements. You should consider ' + ' Deepchecks is frequently updated with major improvements. You should consider ' 'upgrading via the "python -m pip install --upgrade deepchecks" command.', deepchecks.__version__) except Exception: # pylint: disable=broad-except diff --git a/deepchecks/nlp/checks/data_integrity/unknown_tokens.py b/deepchecks/nlp/checks/data_integrity/unknown_tokens.py index b7e625cad8..89caca41bd 100644 --- a/deepchecks/nlp/checks/data_integrity/unknown_tokens.py +++ b/deepchecks/nlp/checks/data_integrity/unknown_tokens.py @@ -9,7 +9,8 @@ # ---------------------------------------------------------------------------- # """Module contains the Unknown Tokens check.""" -import os +import contextlib +import sys import typing as t import warnings from collections import Counter @@ -151,9 +152,14 @@ def find_unknown_words(self, samples, indices): # Batch tokenization # ------------------ # Needed to avoid warning when used after loading a hub dataset - os.environ['TOKENIZERS_PARALLELISM '] = 'true' - tokenized_samples = self.tokenizer(list(samples), return_offsets_mapping=True, is_split_into_words=False, - truncation=False) + # We divert the printing to stdout (done by the rust code within the HuggingFace tokenizer) + # into this filter, that will filter out any print containing the str 'huggingface/tokenizers' + # This warning printout is activated when running this check after loading a HuggingFace dataset, + # and is irrelevant to us because we're not forking the process. + # see: https://github.com/huggingface/transformers/issues/5486 + with contextlib.redirect_stdout(PrintFilter(sys.stdout)): + tokenized_samples = self.tokenizer(list(samples), return_offsets_mapping=True, + is_split_into_words=False, truncation=False) for idx, (tokens, offsets_mapping, sample) in zip(indices, zip(tokenized_samples['input_ids'], tokenized_samples['offset_mapping'], @@ -251,3 +257,17 @@ def condition(result): return self.add_condition(f'Ratio of unknown words is less than {format_percent(ratio)}', condition) + + +class PrintFilter: + """Filter to avoid printing of tokenization warnings.""" + + def __init__(self, original_stdout): + self.original_stdout = original_stdout + + def write(self, msg): + if 'huggingface/tokenizers' not in msg: + self.original_stdout.write(msg) + + def flush(self): + self.original_stdout.flush() diff --git a/deepchecks/nlp/checks/train_test_validation/property_drift.py b/deepchecks/nlp/checks/train_test_validation/property_drift.py index 8c211c84ff..b0c599d17b 100644 --- a/deepchecks/nlp/checks/train_test_validation/property_drift.py +++ b/deepchecks/nlp/checks/train_test_validation/property_drift.py @@ -151,5 +151,5 @@ def run_logic(self, context: Context) -> CheckResult: return CheckResult( value=results, display=displays, - header='Properties Drift' + header='Property Drift' ) diff --git a/docs/source/nlp/tutorials/quickstarts/plot_multi_label_classification.py b/docs/source/nlp/tutorials/quickstarts/plot_multi_label_classification.py index 61692f41c6..7280905a4f 100644 --- a/docs/source/nlp/tutorials/quickstarts/plot_multi_label_classification.py +++ b/docs/source/nlp/tutorials/quickstarts/plot_multi_label_classification.py @@ -96,7 +96,9 @@ # # Deepchecks comes with a set of pre-built suites that can be used to run a set of checks on your data, alongside # with their default conditions and thresholds. You can read more about customizing and creating your own suites in the -# :ref:`Customizations Guide `. +# :ref:`Customizations Guide `. In this guide we'll be using 3 suites - the data integrity +# suite, the train test validation suite and the model evaluation suite. You can also run all the checks at once using +# the :mod:`full_suite `. # # Data Integrity # -------------- diff --git a/docs/source/nlp/tutorials/quickstarts/plot_text_classification.py b/docs/source/nlp/tutorials/quickstarts/plot_text_classification.py index 2098431fa1..ca61bdd4e1 100644 --- a/docs/source/nlp/tutorials/quickstarts/plot_text_classification.py +++ b/docs/source/nlp/tutorials/quickstarts/plot_text_classification.py @@ -47,7 +47,8 @@ Load Data --------- For the purpose of this guide, we'll use a small subset of the -`tweet emotion `__ dataset: +`tweet emotion `__ dataset. This dataset contains tweets and their +corresponding emotion - Anger, Happiness, Optimism, and Sadness. """ @@ -115,7 +116,9 @@ # # Deepchecks comes with a set of pre-built suites that can be used to run a set of checks on your data, alongside # with their default conditions and thresholds. You can read more about customizing and creating your own suites in the -# :ref:`Customizations Guide `. +# :ref:`Customizations Guide `. In this guide we'll be using 3 suites - the data integrity +# suite, the train test validation suite and the model evaluation suite. You can also run all the checks at once using +# the :mod:`full_suite `. # # Data Integrity # -------------- diff --git a/spelling-allowlist.txt b/spelling-allowlist.txt index 587c23697a..d6d21355ef 100644 --- a/spelling-allowlist.txt +++ b/spelling-allowlist.txt @@ -155,3 +155,4 @@ misclassified Uncomment dimensionality tokenization +huggingface