
Fix the property drift check name and other various fixes #2572

Merged
4 changes: 4 additions & 0 deletions .gitignore
@@ -104,6 +104,8 @@ tweet_emotion_*.csv

# docs build files
docs/source/_build
docs/html
docs/doctrees

# build folders of sphinx gallery
docs/source/general/usage/customizations/auto_examples/
@@ -120,6 +122,8 @@ docs/source/vision/auto_tutorials
docs/source/user-guide/tabular/auto_quickstarts
docs/source/user-guide/vision/auto_quickstarts

docs/source/checks_gallery

# build artifacts from running docs (vision, nlp, wandb export)
docs/source/vision/tutorials/quickstarts/*.html

2 changes: 1 addition & 1 deletion deepchecks/analytics/anonymous_telemetry.py
@@ -44,7 +44,7 @@ def validate_latest_version():
is_on_latest = result.read().decode() == 'True'
if not is_on_latest:
get_logger().warning('You are using deepchecks version %s, however a newer version is available.'
'Deepchecks is frequently updated with major improvements. You should consider '
' Deepchecks is frequently updated with major improvements. You should consider '
'upgrading via the "python -m pip install --upgrade deepchecks" command.',
deepchecks.__version__)
except Exception: # pylint: disable=broad-except
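The one-character telemetry fix above works around Python's implicit concatenation of adjacent string literals, which joins them with no separator. A minimal sketch of the pitfall (the version string is illustrative):

```python
# Adjacent string literals concatenate with no implicit separator,
# so omitting the space produces a run-on warning message.
msg_buggy = ('You are using deepchecks version %s, however a newer version is available.'
             'Deepchecks is frequently updated with major improvements.')
msg_fixed = ('You are using deepchecks version %s, however a newer version is available.'
             ' Deepchecks is frequently updated with major improvements.')

print(msg_buggy % '0.17.0')  # ...available.Deepchecks is frequently...
print(msg_fixed % '0.17.0')  # ...available. Deepchecks is frequently...
```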
28 changes: 24 additions & 4 deletions deepchecks/nlp/checks/data_integrity/unknown_tokens.py
@@ -9,7 +9,8 @@
# ----------------------------------------------------------------------------
#
"""Module contains the Unknown Tokens check."""
import os
import contextlib
import sys
import typing as t
import warnings
from collections import Counter
@@ -151,9 +152,14 @@ def find_unknown_words(self, samples, indices):
# Batch tokenization
# ------------------
# Needed to avoid warning when used after loading a hub dataset
os.environ['TOKENIZERS_PARALLELISM '] = 'true'
tokenized_samples = self.tokenizer(list(samples), return_offsets_mapping=True, is_split_into_words=False,
truncation=False)
# We redirect stdout (used by the Rust code within the HuggingFace tokenizer) through this
# filter, which drops any printout containing the string 'huggingface/tokenizers'.
# This warning printout is triggered when running this check after loading a HuggingFace dataset,
# and is irrelevant to us because we're not forking the process.
# see: https://github.com/huggingface/transformers/issues/5486
with contextlib.redirect_stdout(PrintFilter(sys.stdout)):
tokenized_samples = self.tokenizer(list(samples), return_offsets_mapping=True,
is_split_into_words=False, truncation=False)

for idx, (tokens, offsets_mapping, sample) in zip(indices, zip(tokenized_samples['input_ids'],
tokenized_samples['offset_mapping'],
@@ -251,3 +257,17 @@ def condition(result):

return self.add_condition(f'Ratio of unknown words is less than {format_percent(ratio)}',
condition)


class PrintFilter:
"""Filter to avoid printing of tokenization warnings."""

def __init__(self, original_stdout):
self.original_stdout = original_stdout

def write(self, msg):
if 'huggingface/tokenizers' not in msg:
self.original_stdout.write(msg)

def flush(self):
self.original_stdout.flush()
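The `PrintFilter` added above can be exercised on its own. Here is a minimal, self-contained sketch of the redirect-and-filter technique, with a `StringIO` standing in for the real stdout so the result can be inspected (the demo messages are illustrative). Note the filter drops any single `write` containing the blocked substring; `print`'s trailing newline is written separately and still passes through:

```python
import contextlib
import io


class PrintFilter:
    """File-like wrapper that drops writes containing a blocked substring."""

    def __init__(self, original_stdout, blocked='huggingface/tokenizers'):
        self.original_stdout = original_stdout
        self.blocked = blocked

    def write(self, msg):
        # Forward everything except messages mentioning the blocked marker.
        if self.blocked not in msg:
            self.original_stdout.write(msg)

    def flush(self):
        self.original_stdout.flush()


# Use a StringIO in place of sys.stdout so the filtered output can be checked.
sink = io.StringIO()
with contextlib.redirect_stdout(PrintFilter(sink)):
    print('tokenizing 1000 samples')
    print('huggingface/tokenizers: The current process just got forked...')
```

After the `with` block, `sink` contains the first message but not the suppressed warning.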
@@ -151,5 +151,5 @@ def run_logic(self, context: Context) -> CheckResult:
return CheckResult(
value=results,
display=displays,
header='Properties Drift'
header='Property Drift'
)
@@ -96,7 +96,9 @@
#
# Deepchecks comes with a set of pre-built suites that can be used to run a set of checks on your data, along
# with their default conditions and thresholds. You can read more about customizing and creating your own suites in the
# :ref:`Customizations Guide <general__customizations>`.
# :ref:`Customizations Guide <general__customizations>`. In this guide we'll be using three suites: the data integrity
# suite, the train-test validation suite, and the model evaluation suite. You can also run all the checks at once using
# the :mod:`full_suite <deepchecks.nlp.suites>`.
#
# Data Integrity
# --------------
@@ -47,7 +47,8 @@
Load Data
---------
For the purpose of this guide, we'll use a small subset of the
`tweet emotion <https://github.com/cardiffnlp/tweeteval>`__ dataset:
`tweet emotion <https://github.com/cardiffnlp/tweeteval>`__ dataset. This dataset contains tweets and their
corresponding emotion: Anger, Happiness, Optimism, and Sadness.

"""

@@ -115,7 +116,9 @@
#
# Deepchecks comes with a set of pre-built suites that can be used to run a set of checks on your data, along
# with their default conditions and thresholds. You can read more about customizing and creating your own suites in the
# :ref:`Customizations Guide <general__customizations>`.
# :ref:`Customizations Guide <general__customizations>`. In this guide we'll be using three suites: the data integrity
# suite, the train-test validation suite, and the model evaluation suite. You can also run all the checks at once using
# the :mod:`full_suite <deepchecks.nlp.suites>`.
#
# Data Integrity
# --------------
1 change: 1 addition & 0 deletions spelling-allowlist.txt
@@ -155,3 +155,4 @@ misclassified
Uncomment
dimensionality
tokenization
huggingface