Noam/address null properties (#2585)

* fix to loading the large just dance dataset * Note that null (non-English) properties will be ignored. * Improve descriptions * Add english only column * avoid long properties in tests - it also causes problems with torch 1.10.2, which is the current version --------- Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com>
deepchecks · Jun 7, 2023 · b62a8bd · b62a8bd
1 parent 59464b8
commit b62a8bd
Show file tree

Hide file tree

Showing 6 changed files with 61 additions and 42 deletions.
diff --git a/deepchecks/nlp/datasets/classification/just_dance_comment_analysis.py b/deepchecks/nlp/datasets/classification/just_dance_comment_analysis.py
@@ -193,7 +193,8 @@ def load_data(data_format: str = 'TextData', as_train_test: bool = True, use_ful
             include_properties, include_embeddings = False, False
 
     if use_full_size:
-        data = read_and_save_data(ASSETS_DIR, 'just_dance_data.csv', _FULL_DATA_URL, to_numpy=False)
+        data = read_and_save_data(ASSETS_DIR, 'just_dance_data.csv', _FULL_DATA_URL, to_numpy=False,
+                                  include_index=False)
     else:
         data = read_and_save_data(ASSETS_DIR, 'just_dance_shorted_data.csv', _SHORT_DATA_URL, to_numpy=False)
     data[_TIME_COL] = pd.to_datetime(data[_TIME_COL])

diff --git a/deepchecks/nlp/utils/text_properties.py b/deepchecks/nlp/utils/text_properties.py
@@ -658,15 +658,15 @@ class TextProperty(TypedDict):
     'Average Word Length': 'Average number of characters in a word',
     'Max Word Length': 'Maximum number of characters in a word',
     '% Special Characters': 'Percentage of special characters in the text',
-    'Language': 'Language of the text',
-    'Sentiment': 'Sentiment of the text',
-    'Subjectivity': 'Subjectivity of the text',
+    'Language': 'Language of the text, using the fasttext language detection model',
+    'Sentiment': 'Sentiment of the text, calculated using the TextBlob sentiment analysis model',
+    'Subjectivity': 'Subjectivity of the text, calculated using the TextBlob sentiment analysis model',
     'Average Words Per Sentence': 'Average number of words per sentence in the text',
     'Readability Score': 'A score calculated based on Flesch reading-ease per text sample',
     'Lexical Density': 'Percentage of unique words in the text',
-    'Toxicity': 'Toxicity of the text',
-    'Fluency': 'Fluency of the text',
-    'Formality': 'Formality of the text',
+    'Toxicity': 'Toxicity score using unitary/toxic-bert HuggingFace model',
+    'Fluency': 'Fluency score using prithivida/parrot_fluency_model HuggingFace model',
+    'Formality': 'Formality score using s-nlp/roberta-base-formality-ranker HuggingFace model',
     'Unique Noun Count': 'Number of unique noun words in the text',
     'URLs Count': 'Number of URLS per text sample',
     'Email Addresses Count': 'Number of email addresses per text sample',
@@ -721,7 +721,8 @@ def _select_properties(
     else:
         properties = default_properties
 
-    if not include_long_calculation_properties:
+    # include_long_calculation_properties is only applicable when include_properties is None
+    if not include_long_calculation_properties and include_properties is None:
         return [
             prop for prop in properties
             if prop['name'] not in LONG_RUN_PROPERTIES
@@ -784,7 +785,7 @@ def calculate_builtin_properties(
         all the default properties will be calculated. Cannot be used together with include_properties parameter.
     include_long_calculation_properties : bool, default False
         Whether to include properties that may take a long time to calculate. If False, these properties will be
-        ignored, even if they are in the include_properties parameter.
+        ignored, unless they are specified in the include_properties parameter explicitly.
     device : int, default None
         The device to use for the calculation. If None, the default device will be used.
     models_storage : Union[str, pathlib.Path, None], default None

diff --git a/docs/source/checks/nlp/data_integrity/plot_text_property_outliers.py b/docs/source/checks/nlp/data_integrity/plot_text_property_outliers.py
@@ -45,9 +45,15 @@
 
 Which Text Properties Are Used?
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-By default the checks use the built-in text properties, and it's also possible to replace the default properties
-with custom ones. For the list of the built-in text properties and explanation about custom properties refer to
-:ref:`NLP properties <nlp__properties_guide>`.
+By default the check uses the properties that were calculated for the train and test datasets, which by default are
+the built-in text properties. It's also possible to replace the default properties with custom ones. For the list
+of the built-in text properties and explanation about custom properties refer to :ref:`NLP properties
+<nlp__properties_guide>`.
+
+.. note::
+
+    If a property was not calculated for a sample (for example, if it applies only to English samples and the sample
+    is in another language), it will contain a nan value and will be ignored when calculating the outliers.
 
 """
 

diff --git a/docs/source/checks/nlp/train_test_validation/plot_property_drift.py b/docs/source/checks/nlp/train_test_validation/plot_property_drift.py
@@ -41,9 +41,15 @@
 Which NLP Properties Are Used?
 -------------------------------
 
-By default the checks use the built-in text properties, and it's also possible to replace the default properties
-with custom ones. For the list of the built-in text properties and explanation about custom properties refer to
-:ref:`NLP properties <nlp__properties_guide>`.
+By default the check uses the properties that were calculated for the train and test datasets, which by default are
+the built-in text properties. It's also possible to replace the default properties with custom ones. For the list
+of the built-in text properties and explanation about custom properties refer to :ref:`NLP properties
+<nlp__properties_guide>`.
+
+.. note::
+
+    If a property was not calculated for a sample (for example, if it applies only to English samples and the sample
+    is in another language), it will contain a nan value and will be ignored when calculating the drift.
 
 Prepare data
 =============

diff --git a/docs/source/nlp/usage_guides/nlp_properties.rst b/docs/source/nlp/usage_guides/nlp_properties.rst
@@ -49,32 +49,32 @@ There are two types of built-in properties:
 
 The built-in text properties are:
 
-==============================  ================  ==========
-Property name                   Default Property  Description
-==============================  ================  ==========
-Text Length                     Yes               Number of characters in the text
-Average Word Length             Yes               Average number of characters in a word
-Max Word Length                 Yes               Maximum number of characters in a word
-% Special Characters            Yes               Percentage of special characters in the text
-Language                        Yes               Language of the text. Uses the langdetect library
-Sentiment                       Yes               Sentiment of the text. Uses the textblob library
-Subjectivity                    Yes               Subjectivity of the text. Uses the textblob library
-Toxicity*                       Yes               Toxicity of the text. Uses the unitary/toxic-bert model
-Fluency*                        Yes               Fluency of the text. Uses the prithivida/parrot_fluency_model model
-Formality*                      Yes               Formality of the text. Uses the s-nlp/roberta-base-formality-ranker model
-Lexical Density                 Yes               Percentage of unique words in the text, rounded up to 2 decimal digits
-Unique Noun Count*              Yes               Number of unique noun words in the text
-Readability Score               Yes               A score calculated based on Flesch reading-ease per text sample. For more information: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease
-Average Words Per Sentence      Yes               Average number of words per sentence in the text
-URLs Count                      No                Number of URLS per text sample.
-Unique URLs Count               No                Number of unique URLS per text sample.
-Email Addresses Count           No                Number of email addresses per text sample.
-Unique Email Addresses Count    No                Number of unique email addresses per text sample.
-Unique Syllables Count          No                Number of unique syllables per text sample.
-Reading Time                    No                Time taken in seconds to read a text sample.
-Sentences Count                 No                Number of sentences per text sample.
-Average Syllable Length         No                Average number of syllables per sentence per text sample.
-==============================  ================  ==========
+==============================  ================  ====================================================================================================================================================================================  ===============
+Property name                   Default Property  Description                                                                                                                                                                           English Only
+==============================  ================  ====================================================================================================================================================================================  ===============
+Text Length                     Yes               Number of characters in the text                                                                                                                                                      No
+Average Word Length             Yes               Average number of characters in a word                                                                                                                                                No
+Max Word Length                 Yes               Maximum number of characters in a word                                                                                                                                                No
+% Special Characters            Yes               Percentage of special characters in the text                                                                                                                                          No
+Language                        Yes               Language of the text. Uses the langdetect library                                                                                                                                     No
+Sentiment                       Yes               Sentiment of the text. Uses the textblob library                                                                                                                                      Yes
+Subjectivity                    Yes               Subjectivity of the text. Uses the textblob library                                                                                                                                   Yes
+Toxicity*                       Yes               Toxicity of the text. Uses the unitary/toxic-bert model                                                                                                                               Yes
+Fluency*                        Yes               Fluency of the text. Uses the prithivida/parrot_fluency_model model                                                                                                                   Yes
+Formality*                      Yes               Formality of the text. Uses the s-nlp/roberta-base-formality-ranker model                                                                                                             Yes
+Lexical Density                 Yes               Percentage of unique words in the text, rounded up to 2 decimal digits                                                                                                                Yes
+Unique Noun Count*              Yes               Number of unique noun words in the text                                                                                                                                               Yes
+Readability Score               Yes               A score calculated based on Flesch reading-ease per text sample. For more information: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease     Yes
+Average Words Per Sentence      Yes               Average number of words per sentence in the text                                                                                                                                      No
+URLs Count                      No                Number of URLS per text sample.                                                                                                                                                       No
+Unique URLs Count               No                Number of unique URLS per text sample.                                                                                                                                                No
+Email Addresses Count           No                Number of email addresses per text sample.                                                                                                                                            No
+Unique Email Addresses Count    No                Number of unique email addresses per text sample.                                                                                                                                     No
+Unique Syllables Count          No                Number of unique syllables per text sample.                                                                                                                                           No
+Reading Time                    No                Time taken in seconds to read a text sample.                                                                                                                                          No
+Sentences Count                 No                Number of sentences per text sample.                                                                                                                                                  No
+Average Syllable Length         No                Average number of syllables per sentence per text sample.                                                                                                                             No
+==============================  ================  ====================================================================================================================================================================================  ===============
 
 *These properties are not calculated by default, as they may take a long time to calculate. To use them, pass
 ``include_long_calculation_properties=True`` to the :class:`TextData.calculate_properties <deepchecks.nlp.TextData>` method.
@@ -115,6 +115,11 @@ In the following example, we will calculate the default properties in order to u
 
 Note that any use of the ``TextData.calculate_builtin_properties`` method will override the existing properties.
 
+.. admonition:: Non-English Samples
+
+    If a property was not calculated for a sample (for example, if it applies only to English samples and the sample
+    is in another language), it will contain a nan value and will be ignored when running the different checks.
+
 Including or Ignoring Properties
 #################################
 

diff --git a/tests/nlp/checks/train_test_validation/property_drift_test.py b/tests/nlp/checks/train_test_validation/property_drift_test.py
@@ -171,7 +171,7 @@ def test_with_drift(self, dummy_multilabel_textdata_train_test):
         # Arrange
         train, test = dummy_multilabel_textdata_train_test
         default_properties = ['Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters',
-                              'Language', 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', ]
+                              'Language', 'Sentiment', 'Subjectivity']
         train.calculate_builtin_properties(include_properties=default_properties)
         test.calculate_builtin_properties(include_properties=default_properties)
         check = PropertyDrift(min_samples=20).add_condition_drift_score_less_than(max_allowed_numeric_score=0.3,