From 96c67c0a443d22ef733a6ee152c1708464f28ee2 Mon Sep 17 00:00:00 2001
From: Noam Bressler <noamzbr@gmail.com>
Date: Thu, 1 Jun 2023 12:20:50 +0300
Subject: [PATCH] Fix the property drift check name and other various fixes
 (#2572)

* simple name fix for Property Drift

* gitignore fix

* avoid annoying transformers print

* mention the full suite in quickstart
---
 .gitignore                                    |  4 +++
 deepchecks/analytics/anonymous_telemetry.py   |  2 +-
 .../checks/data_integrity/unknown_tokens.py   | 28 ++++++++++++++++---
 .../train_test_validation/property_drift.py   |  2 +-
 .../plot_multi_label_classification.py        |  4 ++-
 .../quickstarts/plot_text_classification.py   |  7 +++--
 spelling-allowlist.txt                        |  1 +
 7 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/.gitignore b/.gitignore
index a7717b136f..36f2ee5df4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -104,6 +104,8 @@ tweet_emotion_*.csv
 
 # docs build files
 docs/source/_build
+docs/html
+docs/doctrees
 
 # build folders of sphinx gallery
 docs/source/general/usage/customizations/auto_examples/
@@ -120,6 +122,8 @@ docs/source/vision/auto_tutorials
 docs/source/user-guide/tabular/auto_quickstarts
 docs/source/user-guide/vision/auto_quickstarts
 
+docs/source/checks_gallery
+
 # build artificats from running docs (vision, nlp, wandb export)
 docs/source/vision/tutorials/quickstarts/*.html
 
diff --git a/deepchecks/analytics/anonymous_telemetry.py b/deepchecks/analytics/anonymous_telemetry.py
index d0af629491..f0e0a669b3 100644
--- a/deepchecks/analytics/anonymous_telemetry.py
+++ b/deepchecks/analytics/anonymous_telemetry.py
@@ -44,7 +44,7 @@ def validate_latest_version():
             is_on_latest = result.read().decode() == 'True'
             if not is_on_latest:
                 get_logger().warning('You are using deepchecks version %s, however a newer version is available.'
-                                     'Deepchecks is frequently updated with major improvements. You should consider '
+                                     ' Deepchecks is frequently updated with major improvements. You should consider '
                                      'upgrading via the "python -m pip install --upgrade deepchecks" command.',
                                      deepchecks.__version__)
         except Exception:  # pylint: disable=broad-except
diff --git a/deepchecks/nlp/checks/data_integrity/unknown_tokens.py b/deepchecks/nlp/checks/data_integrity/unknown_tokens.py
index b7e625cad8..89caca41bd 100644
--- a/deepchecks/nlp/checks/data_integrity/unknown_tokens.py
+++ b/deepchecks/nlp/checks/data_integrity/unknown_tokens.py
@@ -9,7 +9,8 @@
 # ----------------------------------------------------------------------------
 #
 """Module contains the Unknown Tokens check."""
-import os
+import contextlib
+import sys
 import typing as t
 import warnings
 from collections import Counter
@@ -151,9 +152,14 @@ def find_unknown_words(self, samples, indices):
             # Batch tokenization
             # ------------------
             # Needed to avoid warning when used after loading a hub dataset
-            os.environ['TOKENIZERS_PARALLELISM '] = 'true'
-            tokenized_samples = self.tokenizer(list(samples), return_offsets_mapping=True, is_split_into_words=False,
-                                               truncation=False)
+            # We divert anything printed to stdout (done by the rust code within the HuggingFace tokenizer)
+            # into this filter, which drops any print containing the string 'huggingface/tokenizers'.
+            # This warning printout is triggered when running this check after loading a HuggingFace dataset,
+            # and is irrelevant to us because we're not forking the process.
+            # See: https://github.com/huggingface/transformers/issues/5486
+            with contextlib.redirect_stdout(PrintFilter(sys.stdout)):
+                tokenized_samples = self.tokenizer(list(samples), return_offsets_mapping=True,
+                                                   is_split_into_words=False, truncation=False)
 
             for idx, (tokens, offsets_mapping, sample) in zip(indices, zip(tokenized_samples['input_ids'],
                                                                            tokenized_samples['offset_mapping'],
@@ -251,3 +257,17 @@ def condition(result):
 
         return self.add_condition(f'Ratio of unknown words is less than {format_percent(ratio)}',
                                   condition)
+
+
+class PrintFilter:
+    """Stdout wrapper that drops writes containing 'huggingface/tokenizers' and forwards everything else."""
+
+    def __init__(self, original_stdout):
+        self.original_stdout = original_stdout  # stream that receives every message that is not filtered out
+
+    def write(self, msg):
+        if 'huggingface/tokenizers' not in msg:  # substring match: a matching message is dropped entirely
+            self.original_stdout.write(msg)
+
+    def flush(self):
+        self.original_stdout.flush()  # nothing is buffered here, so just flush the wrapped stream
diff --git a/deepchecks/nlp/checks/train_test_validation/property_drift.py b/deepchecks/nlp/checks/train_test_validation/property_drift.py
index 8c211c84ff..b0c599d17b 100644
--- a/deepchecks/nlp/checks/train_test_validation/property_drift.py
+++ b/deepchecks/nlp/checks/train_test_validation/property_drift.py
@@ -151,5 +151,5 @@ def run_logic(self, context: Context) -> CheckResult:
         return CheckResult(
             value=results,
             display=displays,
-            header='Properties Drift'
+            header='Property Drift'
         )
diff --git a/docs/source/nlp/tutorials/quickstarts/plot_multi_label_classification.py b/docs/source/nlp/tutorials/quickstarts/plot_multi_label_classification.py
index 61692f41c6..7280905a4f 100644
--- a/docs/source/nlp/tutorials/quickstarts/plot_multi_label_classification.py
+++ b/docs/source/nlp/tutorials/quickstarts/plot_multi_label_classification.py
@@ -96,7 +96,9 @@
 #
 # Deepchecks comes with a set of pre-built suites that can be used to run a set of checks on your data, alongside
 # with their default conditions and thresholds. You can read more about customizing and creating your own suites in the
-# :ref:`Customizations Guide <general__customizations>`.
+# :ref:`Customizations Guide <general__customizations>`. In this guide we'll be using 3 suites - the data integrity
+# suite, the train test validation suite and the model evaluation suite. You can also run all the checks at once using
+# the :mod:`full_suite <deepchecks.nlp.suites>`.
 #
 # Data Integrity
 # --------------
diff --git a/docs/source/nlp/tutorials/quickstarts/plot_text_classification.py b/docs/source/nlp/tutorials/quickstarts/plot_text_classification.py
index 2098431fa1..ca61bdd4e1 100644
--- a/docs/source/nlp/tutorials/quickstarts/plot_text_classification.py
+++ b/docs/source/nlp/tutorials/quickstarts/plot_text_classification.py
@@ -47,7 +47,8 @@
 Load Data
 ---------
 For the purpose of this guide, we'll use a small subset of the
-`tweet emotion <https://github.com/cardiffnlp/tweeteval>`__ dataset:
+`tweet emotion <https://github.com/cardiffnlp/tweeteval>`__ dataset. This dataset contains tweets and their
+corresponding emotions - Anger, Happiness, Optimism, and Sadness.
 
 """
 
@@ -115,7 +116,9 @@
 #
 # Deepchecks comes with a set of pre-built suites that can be used to run a set of checks on your data, alongside
 # with their default conditions and thresholds. You can read more about customizing and creating your own suites in the
-# :ref:`Customizations Guide <general__customizations>`.
+# :ref:`Customizations Guide <general__customizations>`. In this guide we'll be using 3 suites - the data integrity
+# suite, the train test validation suite and the model evaluation suite. You can also run all the checks at once using
+# the :mod:`full_suite <deepchecks.nlp.suites>`.
 #
 # Data Integrity
 # --------------
diff --git a/spelling-allowlist.txt b/spelling-allowlist.txt
index 587c23697a..d6d21355ef 100644
--- a/spelling-allowlist.txt
+++ b/spelling-allowlist.txt
@@ -155,3 +155,4 @@ misclassified
 Uncomment
 dimensionality
 tokenization
+huggingface