Commit
Fixes #2524 Refactoring of the calculate_default_properties function and adding new text properties (#2536)

* Refactoring of the calculate_default_properties function and adding new properties
---------

Co-authored-by: Noam Bressler <noamzbr@gmail.com>
hjain5164 and noamzbr committed May 21, 2023
1 parent a886103 commit a1f921e
Showing 12 changed files with 436 additions and 105 deletions.
12 changes: 11 additions & 1 deletion .gitignore
@@ -138,4 +138,14 @@ benchmarks/results
deepchecks/nlp/utils/.nlp-models

# embedding files
tests/nlp/utils/embeddings.csv
tests/nlp/utils/embeddings.csv
embeddings.csv.npy
embeddings.npy
deepchecks/nlp/datasets/assets/tweet_emotion/tweet_emotion_embeddings.npy

# nlp datasets
deepchecks/nlp/datasets/assets/just_dance_comment_analysis

# nlp test properties
metadata.csv
test_properties.csv
12 changes: 6 additions & 6 deletions deepchecks/nlp/text_data.py
@@ -23,7 +23,7 @@
validate_raw_text, validate_tokenized_text)
from deepchecks.nlp.task_type import TaskType, TTextLabel
from deepchecks.nlp.utils.text_embeddings import calculate_default_embeddings
from deepchecks.nlp.utils.text_properties import calculate_default_properties
from deepchecks.nlp.utils.text_properties import calculate_builtin_properties
from deepchecks.utils.logger import get_logger
from deepchecks.utils.metrics import is_label_none
from deepchecks.utils.validation import is_sequence_not_str
@@ -85,7 +85,7 @@ class TextData:
properties. If None, no properties are set.
The number of rows in the properties DataFrame must be equal to the number of samples in the dataset, and the
order of the rows must be the same as the order of the samples in the dataset.
In order to calculate the default properties, use the `TextData.calculate_default_properties` function after
In order to calculate the default properties, use the `TextData.calculate_builtin_properties` function after
the creation of the TextData object.
For more on properties, see the `NLP Properties Guide
<https://docs.deepchecks.com/stable/nlp/usage_guides/nlp_properties.html>`_.
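
For context, a minimal usage sketch of the flow this docstring describes; the sample corpus and the task_type value are illustrative assumptions, not part of the diff:

from deepchecks.nlp import TextData

# Illustrative two-sample corpus; task_type string assumes the text-classification task.
texts = [
    'Great product, would buy again!',
    'Terrible support experience.',
]
dataset = TextData(raw_text=texts, task_type='text_classification')
# Computes the default deepchecks properties and stores them on the object.
dataset.calculate_builtin_properties()
print(dataset.properties.head())
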
@@ -372,7 +372,7 @@ def set_metadata(
self._metadata = metadata.reset_index(drop=True)
self._cat_metadata = column_types.categorical_columns

def calculate_default_properties(
def calculate_builtin_properties(
self,
include_properties: t.Optional[t.List[str]] = None,
ignore_properties: t.Optional[t.List[str]] = None,
@@ -398,7 +398,7 @@ def calculate_default_properties(
if self._properties is not None:
warnings.warn('Properties already exist, overwriting them', UserWarning)

properties, properties_types = calculate_default_properties(
properties, properties_types = calculate_builtin_properties(
list(self.text),
include_properties=include_properties,
ignore_properties=ignore_properties,
@@ -442,7 +442,7 @@ def save_properties(self, path: str):
if self._properties is None:
raise DeepchecksNotSupportedError(
'TextData does not contain properties, add them by using '
'"calculate_default_properties" or "set_properties" functions'
'"calculate_builtin_properties" or "set_properties" functions'
)

self._properties.to_csv(path, index=False)
@@ -454,7 +454,7 @@ def properties(self) -> pd.DataFrame:
raise DeepchecksNotSupportedError(
'Functionality requires properties, but the TextData object had none. To use this functionality, '
'use the set_properties method to set your own properties with a pandas.DataFrame or use '
'TextData.calculate_default_properties to add the default deepchecks properties.'
'TextData.calculate_builtin_properties to add the default deepchecks properties.'
)
return self._properties

4 changes: 2 additions & 2 deletions deepchecks/nlp/utils/__init__.py
@@ -12,10 +12,10 @@

from deepchecks.nlp.utils.llm_utils import call_open_ai_completion_api
from deepchecks.nlp.utils.text_embeddings import calculate_default_embeddings
from deepchecks.nlp.utils.text_properties import calculate_default_properties
from deepchecks.nlp.utils.text_properties import calculate_builtin_properties

__all__ = [
'calculate_default_properties',
'calculate_builtin_properties',
'calculate_default_embeddings',
'call_open_ai_completion_api'
]
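
After this rename, the helper is re-exported under its new name; a minimal import sketch (the sample text is illustrative, and the tuple unpacking mirrors the call in text_data.py above):

from deepchecks.nlp.utils import calculate_builtin_properties

# Returns the calculated properties together with their output types.
properties, property_types = calculate_builtin_properties(['Deepchecks validates NLP data.'])
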
160 changes: 145 additions & 15 deletions deepchecks/nlp/utils/text_properties.py
@@ -11,6 +11,7 @@
"""Module containing the text properties for the NLP module."""
import importlib
import pathlib
import re
import string
import warnings
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
@@ -28,7 +29,7 @@
from deepchecks.utils.function import run_available_kwargs
from deepchecks.utils.ipython import create_progress_bar

__all__ = ['calculate_default_properties']
__all__ = ['calculate_builtin_properties']


MODELS_STORAGE = pathlib.Path(__file__).absolute().parent / '.nlp-models'
@@ -411,10 +412,10 @@ def readability_score(raw_text: Sequence[str]) -> List[float]:
for text in raw_text:
if not pd.isna(text):
sentence_count = len(sent_tokenize(text))
text = remove_punctuation(text)
text = remove_punctuation(text.lower())
words = word_tokenize(text)
word_count = len(words)
syllable_count = sum([len(cmudict_dict[word.lower()]) for word in words if word.lower() in cmudict_dict])
syllable_count = sum([len(cmudict_dict[word]) for word in words if word in cmudict_dict])
if word_count != 0 and sentence_count != 0 and syllable_count != 0:
avg_syllables_per_word = syllable_count / word_count
avg_words_per_sentence = word_count / sentence_count
@@ -448,6 +449,113 @@ def average_sentence_length(raw_text: Sequence[str]) -> List[float]:
return result


def count_unique_urls(raw_text: Sequence[str]) -> List[int]:
"""Return a list of integers denoting the number of unique URLs per text sample."""
url_pattern = r'https?:\/\/(?:[-\w.]|(?:%[\da-fA-F]{2}))+'
return [len(set(re.findall(url_pattern, text))) if not pd.isna(text) else 0 for text in raw_text]


def count_urls(raw_text: Sequence[str]) -> List[int]:
"""Return a list of integers denoting the number of URLs per text sample."""
url_pattern = r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'
return [len(re.findall(url_pattern, text)) if not pd.isna(text) else 0 for text in raw_text]


def count_unique_email_addresses(raw_text: Sequence[str]) -> List[int]:
"""Return a list of integers denoting the number of unique email addresses per text sample."""
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
return [len(set(re.findall(email_pattern, text))) if not pd.isna(text) else 0 for text in raw_text]


def count_email_addresses(raw_text: Sequence[str]) -> List[int]:
"""Return a list of integers denoting the number of email addresses per text sample."""
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
return [len(re.findall(email_pattern, text)) if not pd.isna(text) else 0 for text in raw_text]
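
For illustration, the total counter counts every regex match while the unique counter de-duplicates them via set(); a standalone sketch with an assumed sample text, run against the functions above:

from deepchecks.nlp.utils.text_properties import count_urls, count_unique_urls

sample = ['Visit https://deepchecks.com or https://deepchecks.com for the docs']
count_urls(sample)         # -> [2]: both matches are counted
count_unique_urls(sample)  # -> [1]: duplicate URLs are collapsed
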


def count_unique_syllables(raw_text: Sequence[str]) -> List[int]:
"""Return a list of integers denoting the number of unique syllables per text sample."""
if not nltk_download('punkt', quiet=True):
warnings.warn('nltk punkt not found, the number of unique syllables cannot be calculated.'
' Please check your internet connection.', UserWarning)
return [np.nan] * len(raw_text)
if not nltk_download('cmudict', quiet=True):
warnings.warn('nltk cmudict not found, the number of unique syllables cannot be calculated.'
' Please check your internet connection.', UserWarning)
return [np.nan] * len(raw_text)
result = []
cmudict_dict = corpus.cmudict.dict()
for text in raw_text:
if not pd.isna(text):
text = remove_punctuation(text.lower())
words = word_tokenize(text)
syllables = {word: True for word in words if word in cmudict_dict}
result.append(len(syllables))
else:
result.append(np.nan)
return result


def reading_time(raw_text: Sequence[str]) -> List[float]:
"""Return a list of floats denoting the time in seconds needed to read each text sample.
The formula is based on Demberg & Keller, 2008, where it is assumed that
reading a character takes 14.69 milliseconds on average.
"""
ms_per_char = 14.69
result = []
for text in raw_text:
if not pd.isna(text):
words = text.split()
nchars = map(len, words)
rt_per_word = map(lambda nchar: nchar * ms_per_char, nchars)
ms_reading_time = sum(list(rt_per_word))
result.append(round(ms_reading_time/1000, 2))
else:
result.append(0.00)
return result
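
As a quick sanity check of the constant (an illustrative call, not part of the commit): 'hello world' contains 10 non-whitespace characters, so the estimate is 10 * 14.69 ms = 146.9 ms, which rounds to 0.15 seconds.

from deepchecks.nlp.utils.text_properties import reading_time

reading_time(['hello world'])  # -> [0.15]: 10 characters * 14.69 ms / 1000, rounded to 2 decimals
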


def sentence_length(raw_text: Sequence[str]) -> List[int]:
"""Return a list of integers denoting the number of sentences per text sample."""
if not nltk_download('punkt', quiet=True):
warnings.warn('nltk punkt not found, the number of sentences cannot be calculated.'
' Please check your internet connection.', UserWarning)
return [np.nan] * len(raw_text)
result = []
for text in raw_text:
if not pd.isna(text):
sentence_count = len(sent_tokenize(text))
result.append(sentence_count)
else:
result.append(np.nan)
return result


def average_syllable_length(raw_text: Sequence[str]) -> List[float]:
"""Return a list of floats denoting the average number of syllables per sentence per text sample."""
if not nltk_download('punkt', quiet=True):
warnings.warn('nltk punkt not found, average syllable length cannot be calculated.'
' Please check your internet connection.', UserWarning)
return [np.nan] * len(raw_text)
if not nltk_download('cmudict', quiet=True):
warnings.warn('nltk cmudict not found, average syllable length cannot be calculated.'
' Please check your internet connection.', UserWarning)
return [np.nan] * len(raw_text)
cmudict_dict = corpus.cmudict.dict()
result = []
for text in raw_text:
if not pd.isna(text):
sentence_count = len(sent_tokenize(text))
text = remove_punctuation(text.lower())
words = word_tokenize(text)
syllable_count = sum([len(cmudict_dict[word]) for word in words if word in cmudict_dict])
result.append(round(syllable_count/sentence_count, 2))
else:
result.append(np.nan)
return result


class TextProperty(TypedDict):
name: str
method: Callable[..., Sequence[Any]]
@@ -472,13 +580,24 @@ class TextProperty(TypedDict):
)


ALL_PROPERTIES: Tuple[TextProperty, ...] = (
{'name': 'Count URLs', 'method': count_urls, 'output_type': 'numeric'},
{'name': 'Count Email Address', 'method': count_email_addresses, 'output_type': 'numeric'},
{'name': 'Count Unique URLs', 'method': count_unique_urls, 'output_type': 'numeric'},
{'name': 'Count Unique Email Address', 'method': count_unique_email_addresses, 'output_type': 'numeric'},
{'name': 'Count Unique Syllables', 'method': count_unique_syllables, 'output_type': 'numeric'},
{'name': 'Reading Time', 'method': reading_time, 'output_type': 'numeric'},
{'name': 'Sentence Length', 'method': sentence_length, 'output_type': 'numeric'},
{'name': 'Average Syllable Length', 'method': average_syllable_length, 'output_type': 'numeric'},
) + DEFAULT_PROPERTIES
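
Note that, per the updated _select_properties logic below, the new counters live in ALL_PROPERTIES but not in DEFAULT_PROPERTIES, so they are only computed when named explicitly. A minimal sketch with an illustrative sample text:

from deepchecks.nlp.utils.text_properties import calculate_builtin_properties

sample = ['Write to jane.doe@example.com or see https://deepchecks.com']
# The new properties must be requested via include_properties; they are not calculated by default.
properties, property_types = calculate_builtin_properties(
    sample,
    include_properties=['Count URLs', 'Count Email Address', 'Reading Time'],
)
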


LONG_RUN_PROPERTIES = ('Toxicity', 'Fluency', 'Formality', 'Unique Noun Count')
LARGE_SAMPLE_SIZE = 10_000

ENGLISH_ONLY_PROPERTIES = (
'Sentiment', 'Subjectivity', 'Toxicity',
'Fluency', 'Formality', 'Readability Score',
'Unique Noun Count'
'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Readability Score',
'Unique Noun Count', 'Count Unique Syllables', 'Sentence Length', 'Average Syllable Length'
)


@@ -491,15 +610,18 @@ def _select_properties(
device: Optional[str] = None,
) -> Sequence[TextProperty]:
"""Select properties."""
properties = DEFAULT_PROPERTIES
all_properties = ALL_PROPERTIES
default_properties = DEFAULT_PROPERTIES

if include_properties is not None and ignore_properties is not None:
raise ValueError('Cannot use include_properties and ignore_properties parameters together.')

if include_properties is not None:
properties = [prop for prop in properties if prop['name'] in include_properties]
properties = [prop for prop in all_properties if prop['name'] in include_properties]
elif ignore_properties is not None:
properties = [prop for prop in properties if prop['name'] not in ignore_properties]
properties = [prop for prop in default_properties if prop['name'] not in ignore_properties]
else:
properties = default_properties

if not include_long_calculation_properties:
return [
@@ -528,7 +650,7 @@ def _select_properties(
return properties


def calculate_default_properties(
def calculate_builtin_properties(
raw_text: Sequence[str],
include_properties: Optional[List[str]] = None,
ignore_properties: Optional[List[str]] = None,
@@ -543,17 +665,25 @@ def calculate_default_properties(
raw_text : Sequence[str]
The text to calculate the properties for.
include_properties : List[str], default None
The properties to calculate. If None, all default properties will be calculated. Cannot be used together
with ignore_properties parameter. Available properties are:
The properties to calculate. If None, all default properties will be calculated. Cannot be used
together with ignore_properties parameter. Available properties are:
['Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters', 'Language',
'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Lexical Density', 'Unique Noun Count',
'Readability Score', 'Average Sentence Length']
'Readability Score', 'Average Sentence Length', 'Count URLs', 'Count Unique URLs', 'Count Email Address',
'Count Unique Email Address', 'Count Unique Syllables', 'Reading Time', 'Sentence Length',
'Average Syllable Length']
List of default properties are: ['Text Length', 'Average Word Length', 'Max Word Length',
'% Special Characters', 'Language', 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality',
'Lexical Density', 'Unique Noun Count', 'Readability Score', 'Average Sentence Length']
To calculate all the default properties, the include_properties and ignore_properties parameters should
be None. If you pass either include_properties or ignore_properties, then only the properties specified
in the list will be calculated or ignored.
Note that the properties ['Toxicity', 'Fluency', 'Formality', 'Language', 'Unique Noun Count'] may
take a long time to calculate. If include_long_calculation_properties is False, these properties will be
ignored, even if they are in the include_properties parameter.
ignore_properties : List[str], default None
The properties to ignore. If None, no properties will be ignored. Cannot be used together with
properties parameter.
The properties to ignore from the list of default properties. If None, no properties will be ignored and
all the default properties will be calculated. Cannot be used together with include_properties parameter.
include_long_calculation_properties : bool, default False
Whether to include properties that may take a long time to calculate. If False, these properties will be
ignored, even if they are in the include_properties parameter.
Expand Down
@@ -54,8 +54,8 @@
train_dataset, test_dataset = load_data()

# # Calculate properties, commented out because it takes a short while to run
# train_dataset.calculate_default_properties(include_long_calculation_properties=True)
# test_dataset.calculate_default_properties(include_long_calculation_properties=True)
# train_dataset.calculate_builtin_properties(include_long_calculation_properties=True)
# test_dataset.calculate_builtin_properties(include_long_calculation_properties=True)

#%%
# Run the check
