diff --git a/deepchecks/nlp/datasets/classification/just_dance_comment_analysis.py b/deepchecks/nlp/datasets/classification/just_dance_comment_analysis.py index e0086c4a9a..c059b3fddc 100644 --- a/deepchecks/nlp/datasets/classification/just_dance_comment_analysis.py +++ b/deepchecks/nlp/datasets/classification/just_dance_comment_analysis.py @@ -193,7 +193,8 @@ def load_data(data_format: str = 'TextData', as_train_test: bool = True, use_ful include_properties, include_embeddings = False, False if use_full_size: - data = read_and_save_data(ASSETS_DIR, 'just_dance_data.csv', _FULL_DATA_URL, to_numpy=False) + data = read_and_save_data(ASSETS_DIR, 'just_dance_data.csv', _FULL_DATA_URL, to_numpy=False, + include_index=False) else: data = read_and_save_data(ASSETS_DIR, 'just_dance_shorted_data.csv', _SHORT_DATA_URL, to_numpy=False) data[_TIME_COL] = pd.to_datetime(data[_TIME_COL]) diff --git a/deepchecks/nlp/utils/text_properties.py b/deepchecks/nlp/utils/text_properties.py index f5f9c91cd0..171120ee2c 100644 --- a/deepchecks/nlp/utils/text_properties.py +++ b/deepchecks/nlp/utils/text_properties.py @@ -658,15 +658,15 @@ class TextProperty(TypedDict): 'Average Word Length': 'Average number of characters in a word', 'Max Word Length': 'Maximum number of characters in a word', '% Special Characters': 'Percentage of special characters in the text', - 'Language': 'Language of the text', - 'Sentiment': 'Sentiment of the text', - 'Subjectivity': 'Subjectivity of the text', + 'Language': 'Language of the text, using the fasttext language detection model', + 'Sentiment': 'Sentiment of the text, calculated using the TextBlob sentiment analysis model', + 'Subjectivity': 'Subjectivity of the text, calculated using the TextBlob sentiment analysis model', 'Average Words Per Sentence': 'Average number of words per sentence in the text', 'Readability Score': 'A score calculated based on Flesch reading-ease per text sample', 'Lexical Density': 'Percentage of unique words in the text', - 
'Toxicity': 'Toxicity of the text', - 'Fluency': 'Fluency of the text', - 'Formality': 'Formality of the text', + 'Toxicity': 'Toxicity score using unitary/toxic-bert HuggingFace model', + 'Fluency': 'Fluency score using prithivida/parrot_fluency_model HuggingFace model', + 'Formality': 'Formality score using s-nlp/roberta-base-formality-ranker HuggingFace model', 'Unique Noun Count': 'Number of unique noun words in the text', 'URLs Count': 'Number of URLS per text sample', 'Email Addresses Count': 'Number of email addresses per text sample', @@ -721,7 +721,8 @@ def _select_properties( else: properties = default_properties - if not include_long_calculation_properties: + # include_long_calculation_properties is only applicable when include_properties is None + if not include_long_calculation_properties and include_properties is None: return [ prop for prop in properties if prop['name'] not in LONG_RUN_PROPERTIES @@ -784,7 +785,7 @@ def calculate_builtin_properties( all the default properties will be calculated. Cannot be used together with include_properties parameter. include_long_calculation_properties : bool, default False Whether to include properties that may take a long time to calculate. If False, these properties will be - ignored, even if they are in the include_properties parameter. + ignored, unless they are specified in the include_properties parameter explicitly. device : int, default None The device to use for the calculation. If None, the default device will be used. models_storage : Union[str, pathlib.Path, None], default None diff --git a/docs/source/checks/nlp/data_integrity/plot_text_property_outliers.py b/docs/source/checks/nlp/data_integrity/plot_text_property_outliers.py index 0972d37b01..e01221ecef 100644 --- a/docs/source/checks/nlp/data_integrity/plot_text_property_outliers.py +++ b/docs/source/checks/nlp/data_integrity/plot_text_property_outliers.py @@ -45,9 +45,15 @@ Which Text Properties Are Used? 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -By default the checks use the built-in text properties, and it's also possible to replace the default properties -with custom ones. For the list of the built-in text properties and explanation about custom properties refer to -:ref:`NLP properties `. +By default the checks use the properties that were calculated for the train and test datasets, which by default are +the built-in text properties. It's also possible to replace the default properties with custom ones. For the list +of the built-in text properties and explanation about custom properties refer to :ref:`NLP properties +`. + +.. note:: + + If a property was not calculated for a sample (for example, if it applies only to English samples and the sample + is in another language), it will contain a nan value and will be ignored when calculating the outliers. """ diff --git a/docs/source/checks/nlp/train_test_validation/plot_property_drift.py b/docs/source/checks/nlp/train_test_validation/plot_property_drift.py index d10775f1d1..255ae6bec5 100644 --- a/docs/source/checks/nlp/train_test_validation/plot_property_drift.py +++ b/docs/source/checks/nlp/train_test_validation/plot_property_drift.py @@ -41,9 +41,15 @@ Which NLP Properties Are Used? ------------------------------- -By default the checks use the built-in text properties, and it's also possible to replace the default properties -with custom ones. For the list of the built-in text properties and explanation about custom properties refer to -:ref:`NLP properties `. +By default the checks use the properties that were calculated for the train and test datasets, which by default are +the built-in text properties. It's also possible to replace the default properties with custom ones. For the list +of the built-in text properties and explanation about custom properties refer to :ref:`NLP properties +`. + +.. 
note:: + + If a property was not calculated for a sample (for example, if it applies only to English samples and the sample + is in another language), it will contain a nan value and will be ignored when calculating the drift. Prepare data ============= diff --git a/docs/source/nlp/usage_guides/nlp_properties.rst b/docs/source/nlp/usage_guides/nlp_properties.rst index b4411d9813..68d7e14824 100644 --- a/docs/source/nlp/usage_guides/nlp_properties.rst +++ b/docs/source/nlp/usage_guides/nlp_properties.rst @@ -49,32 +49,32 @@ There are two types of built-in properties: The built-in image properties are: -============================== ================ ========== -Property name Default Property Description -============================== ================ ========== -Text Length Yes Number of characters in the text -Average Word Length Yes Average number of characters in a word -Max Word Length Yes Maximum number of characters in a word -% Special Characters Yes Percentage of special characters in the text -Language Yes Language of the text. Uses the langdetect library -Sentiment Yes Sentiment of the text. Uses the textblob library -Subjectivity Yes Subjectivity of the text. Uses the textblob library -Toxicity* Yes Toxicity of the text. Uses the unitary/toxic-bert model -Fluency* Yes Fluency of the text. Uses the prithivida/parrot_fluency_model model -Formality* Yes Formality of the text. Uses the s-nlp/roberta-base-formality-ranker model -Lexical Density Yes Percentage of unique words in the text, rounded up to 2 decimal digits -Unique Noun Count* Yes Number of unique noun words in the text -Readability Score Yes A score calculated based on Flesch reading-ease per text sample. For more information: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease -Average Words Per Sentence Yes Average number of words per sentence in the text -URLs Count No Number of URLS per text sample. 
-Unique URLs Count No Number of unique URLS per text sample. -Email Addresses Count No Number of email addresses per text sample. -Unique Email Addresses Count No Number of unique email addresses per text sample. -Unique Syllables Count No Number of unique syllables per text sample. -Reading Time No Time taken in seconds to read a text sample. -Sentences Count No Number of sentences per text sample. -Average Syllable Length No Average number of syllables per sentence per text sample. -============================== ================ ========== +============================== ================ ==================================================================================================================================================================================== =============== +Property name Default Property Description English Only +============================== ================ ==================================================================================================================================================================================== =============== +Text Length Yes Number of characters in the text No +Average Word Length Yes Average number of characters in a word No +Max Word Length Yes Maximum number of characters in a word No +% Special Characters Yes Percentage of special characters in the text No +Language Yes Language of the text. Uses the fasttext language detection model No +Sentiment Yes Sentiment of the text. Uses the textblob library Yes +Subjectivity Yes Subjectivity of the text. Uses the textblob library Yes +Toxicity* Yes Toxicity of the text. Uses the unitary/toxic-bert model Yes +Fluency* Yes Fluency of the text. Uses the prithivida/parrot_fluency_model model Yes +Formality* Yes Formality of the text. 
Uses the s-nlp/roberta-base-formality-ranker model Yes +Lexical Density Yes Percentage of unique words in the text, rounded up to 2 decimal digits Yes +Unique Noun Count* Yes Number of unique noun words in the text Yes +Readability Score Yes A score calculated based on Flesch reading-ease per text sample. For more information: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease Yes +Average Words Per Sentence Yes Average number of words per sentence in the text No +URLs Count No Number of URLS per text sample. No +Unique URLs Count No Number of unique URLS per text sample. No +Email Addresses Count No Number of email addresses per text sample. No +Unique Email Addresses Count No Number of unique email addresses per text sample. No +Unique Syllables Count No Number of unique syllables per text sample. No +Reading Time No Time taken in seconds to read a text sample. No +Sentences Count No Number of sentences per text sample. No +Average Syllable Length No Average number of syllables per sentence per text sample. No +============================== ================ ==================================================================================================================================================================================== =============== *These properties are not calculated by default, as they may take a long time to calculate. To use them, pass ``include_long_calculation_properties=True`` to the :class:`TextData.calculate_properties ` method. @@ -115,6 +115,11 @@ In the following example, we will calculate the default properties in order to u Note that any use of the ``TextData.calculate_builtin_properties`` method will override the existing properties. +.. 
admonition:: Non-English Samples + + If a property was not calculated for a sample (for example, if it applies only to English samples and the sample + is in another language), it will contain a nan value and will be ignored when calculating the different checks. + Including or Ignoring Properties ################################# diff --git a/tests/nlp/checks/train_test_validation/property_drift_test.py b/tests/nlp/checks/train_test_validation/property_drift_test.py index 9c64318177..ef92d44ae4 100644 --- a/tests/nlp/checks/train_test_validation/property_drift_test.py +++ b/tests/nlp/checks/train_test_validation/property_drift_test.py @@ -171,7 +171,7 @@ def test_with_drift(self, dummy_multilabel_textdata_train_test): # Arrange train, test = dummy_multilabel_textdata_train_test default_properties = ['Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters', - 'Language', 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', ] + 'Language', 'Sentiment', 'Subjectivity'] train.calculate_builtin_properties(include_properties=default_properties) test.calculate_builtin_properties(include_properties=default_properties) check = PropertyDrift(min_samples=20).add_condition_drift_score_less_than(max_allowed_numeric_score=0.3,