Skip to content

Commit

Permalink
Noam/dee 538 better handle download of nltk (#2493)
Browse files Browse the repository at this point in the history
* quiet download and warnings for nltk stuff

* docs fix
  • Loading branch information
noamzbr committed May 4, 2023
1 parent bcbfa68 commit 294ed16
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 8 deletions.
5 changes: 4 additions & 1 deletion deepchecks/nlp/checks/data_integrity/unknown_tokens.py
Expand Up @@ -10,6 +10,7 @@
#
"""Module contains the Unknown Tokens check."""
import typing as t
import warnings
from collections import Counter

import nltk
Expand Down Expand Up @@ -113,9 +114,11 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
def find_unknown_words(self, samples, indices):
"""Find words with unknown tokens in samples."""
# Choose tokenizer based on availability of nltk
if nltk.download('punkt'):
if nltk.download('punkt', quiet=True):
tokenize = nltk.word_tokenize
else:
warnings.warn('nltk punkt is not available, using str.split instead to identify individual words. '
'Please check your internet connection.')
tokenize = str.split

# Tokenize samples and count unknown words
Expand Down
18 changes: 12 additions & 6 deletions deepchecks/nlp/utils/text.py
Expand Up @@ -12,6 +12,7 @@
import string
import typing as t
import unicodedata
import warnings

import nltk
from nltk.corpus import stopwords
Expand Down Expand Up @@ -62,10 +63,6 @@ def break_to_lines_and_trim(s, max_lines: int = 10, min_line_length: int = 50, m
return '<br>'.join(lines)


nltk.download('stopwords')
nltk.download('punkt')


# Translation table deleting every ASCII punctuation character; built once at
# import time instead of on every call (str.maketrans scans the whole
# punctuation string each time it runs).
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text: str) -> str:
    """Remove punctuation characters from a string.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        A copy of ``text`` with all ``string.punctuation`` characters removed.
    """
    return text.translate(_PUNCTUATION_TABLE)
Expand All @@ -78,8 +75,17 @@ def normalize_unicode(text: str) -> str:

def remove_stopwords(text: str) -> str:
    """Remove English stop words from a string.

    Required nltk resources are downloaded quietly on first use. If a
    resource cannot be fetched (e.g. no internet connection), the function
    degrades gracefully with a warning instead of raising.

    Parameters
    ----------
    text : str
        The text to filter.

    Returns
    -------
    str
        The text with stop words removed, remaining words joined by single
        spaces. Returned unchanged if the stopwords corpus is unavailable.
    """
    if nltk.download('stopwords', quiet=True):
        stop_words = set(stopwords.words('english'))
    else:
        # Without the corpus we cannot tell which words are stop words,
        # so warn and return the input untouched rather than fail.
        warnings.warn('nltk stopwords not found, stopwords won\'t be ignored when considering text duplicates.'
                      ' Please check your internet connection.')
        return text
    if nltk.download('punkt', quiet=True):
        tokenize = word_tokenize
    else:
        # Warn on this fallback too, consistent with the stopwords branch
        # above and with the same fallback in unknown_tokens.py (str.split
        # is a cruder tokenizer: punctuation stays attached to words).
        warnings.warn('nltk punkt is not available, using str.split instead to identify individual words.'
                      ' Please check your internet connection.')
        tokenize = str.split
    words = tokenize(text)
    return ' '.join(word for word in words if word.lower() not in stop_words)


Expand Down
2 changes: 1 addition & 1 deletion docs/source/nlp/usage_guides/text_data_object.rst
Expand Up @@ -4,7 +4,7 @@
The TextData Object
===================

The :class:`TextData <deepchecks.nlp.text_data.TextData>` is a container for your textual data, labels, and relevant
The :class:`TextData <deepchecks.nlp.TextData>` is a container for your textual data, labels, and relevant
metadata for NLP tasks and is a basic building block in the ``deepchecks.nlp`` subpackage.
In order to use any functionality of the ``deepchecks.nlp`` subpackage, you need to first create a ``TextData`` object.
The ``TextData`` object enables easy access to metadata, embeddings and properties relevant for training and validating ML
Expand Down

0 comments on commit 294ed16

Please sign in to comment.