Fixes #2454 Added Readability score and average sentence length text property (#2512)

* Added Flesch reading ease and average sentence length text property

---------

Co-authored-by: Harsh Jain <harsh.jain@springworks.in>
Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com>
3 people committed May 11, 2023
1 parent fe3f69e commit 3ebf01c
Showing 6 changed files with 111 additions and 14 deletions.
75 changes: 69 additions & 6 deletions deepchecks/nlp/utils/text_properties.py
@@ -19,8 +19,11 @@
import pandas as pd
import requests
import textblob
from nltk import corpus
from nltk import download as nltk_download
from nltk import sent_tokenize, word_tokenize

from deepchecks.nlp.utils.text import remove_punctuation
from deepchecks.utils.function import run_available_kwargs

__all__ = ['calculate_default_properties']
@@ -304,7 +307,7 @@ def lexical_density(raw_text: Sequence[str]) -> List[str]:
"""
if not nltk_download('punkt', quiet=True):
warnings.warn('nltk punkt not found, lexical density cannot be calculated.'
' Please check your internet connection.')
' Please check your internet connection.', UserWarning)
return [np.nan] * len(raw_text)
result = []
for text in raw_text:
@@ -323,7 +326,7 @@ def unique_noun_count(raw_text: Sequence[str]) -> List[str]:
"""Return a list of integers of number of unique noun words in the text."""
if not nltk_download('averaged_perceptron_tagger', quiet=True):
warnings.warn('nltk averaged_perceptron_tagger not found, unique noun count cannot be calculated.'
' Please check your internet connection.')
' Please check your internet connection.', UserWarning)
return [np.nan] * len(raw_text)
result = []
for text in raw_text:
@@ -335,6 +338,63 @@ def unique_noun_count(raw_text: Sequence[str]) -> List[str]:
    return result


def readability_score(raw_text: Sequence[str]) -> List[float]:
    """Return a list of floats of Flesch Reading-Ease score per text sample.

    In the Flesch reading-ease test, higher scores indicate material that is easier to read
    whereas lower numbers mark texts that are more difficult to read. For more information:
    https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease
    """
    if not nltk_download('punkt', quiet=True):
        warnings.warn('nltk punkt not found, readability score cannot be calculated.'
                      ' Please check your internet connection.', UserWarning)
        return [np.nan] * len(raw_text)
    if not nltk_download('cmudict', quiet=True):
        warnings.warn('nltk cmudict not found, readability score cannot be calculated.'
                      ' Please check your internet connection.', UserWarning)
        return [np.nan] * len(raw_text)
    result = []
    cmudict_dict = corpus.cmudict.dict()
    for text in raw_text:
        if not pd.isna(text):
            sentence_count = len(sent_tokenize(text))
            text = remove_punctuation(text)
            words = word_tokenize(text)
            word_count = len(words)
            # Syllable estimate: counts the cmudict pronunciation entries of each recognized word
            syllable_count = sum([len(cmudict_dict[word.lower()]) for word in words if word.lower() in cmudict_dict])
            if word_count != 0 and sentence_count != 0 and syllable_count != 0:
                avg_syllables_per_word = syllable_count / word_count
                avg_words_per_sentence = word_count / sentence_count
                flesch_reading_ease = 206.835 - (1.015 * avg_words_per_sentence) - (84.6 * avg_syllables_per_word)
                result.append(round(flesch_reading_ease, 3))
            else:
                result.append(np.nan)
        else:
            result.append(np.nan)
    return result
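
# Illustration (hypothetical, not part of this commit): the Flesch reading-ease
# formula above in isolation, assuming the word, sentence and syllable counts
# are already known. For "The cat sat on the mat." -- 1 sentence, 6 words,
# 6 syllables -- the constants combine as follows:
def _flesch_example():
    word_count, sentence_count, syllable_count = 6, 1, 6
    score = 206.835 - 1.015 * (word_count / sentence_count) - 84.6 * (syllable_count / word_count)
    return round(score, 3)  # 116.145 -- short, monosyllabic text scores as very easy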


def average_sentence_length(raw_text: Sequence[str]) -> List[float]:
    """Return a list of floats denoting the average sentence length per text sample."""
    if not nltk_download('punkt', quiet=True):
        warnings.warn('nltk punkt not found, average sentence length cannot be calculated.'
                      ' Please check your internet connection.', UserWarning)
        return [np.nan] * len(raw_text)
    result = []
    for text in raw_text:
        if not pd.isna(text):
            sentences = [remove_punctuation(sent) for sent in sent_tokenize(text)]
            total_words = sum([len(word_tokenize(sentence)) for sentence in sentences])
            if len(sentences) != 0:
                asl = total_words / len(sentences)
                result.append(round(asl, 0))
            else:
                result.append(np.nan)
        else:
            result.append(np.nan)
    return result
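
# Illustration (hypothetical, not part of this commit): with nltk's punkt
# tokenizer, "Hello world. This is a test." splits into 2 sentences containing
# 2 and 4 words respectively, so the function above returns round(6 / 2, 0) == 3.0.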


DEFAULT_PROPERTIES = (
    {'name': 'Text Length', 'method': text_length, 'output_type': 'numeric'},
    {'name': 'Average Word Length', 'method': average_word_length, 'output_type': 'numeric'},
@@ -348,6 +408,8 @@ def unique_noun_count(raw_text: Sequence[str]) -> List[str]:
    {'name': 'Formality', 'method': formality, 'output_type': 'numeric'},
    {'name': 'Lexical Density', 'method': lexical_density, 'output_type': 'numeric'},
    {'name': 'Unique Noun Count', 'method': unique_noun_count, 'output_type': 'numeric'},
    {'name': 'Readability Score', 'method': readability_score, 'output_type': 'numeric'},
    {'name': 'Average Sentence Length', 'method': average_sentence_length, 'output_type': 'numeric'},
)
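
# Usage sketch (hypothetical, not part of this commit), mirroring the tests in
# this PR -- the new properties are requested through the public entry point:
#
#     from deepchecks.nlp.utils.text_properties import calculate_default_properties
#
#     props = calculate_default_properties(
#         ['Deepchecks now reports readability. Scores are easy to read off.'],
#         include_properties=['Readability Score', 'Average Sentence Length'],
#     )[0]
#     print(props['Readability Score'], props['Average Sentence Length'])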

LONG_RUN_PROPERTIES = ['Toxicity', 'Fluency', 'Formality', 'Unique Noun Count']
@@ -395,10 +457,11 @@ def calculate_default_properties(
        The properties to calculate. If None, all default properties will be calculated. Cannot be used together
        with ignore_properties parameter. Available properties are:
        ['Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters', 'Language',
-       'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Lexical Density', 'Unique Noun Count']
-       Note that the properties ['Toxicity', 'Fluency', 'Formality', 'Language'] may take a long time to calculate. If
-       include_long_calculation_properties is False, these properties will be ignored, even if they are in the
-       include_properties parameter.
+       'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Lexical Density', 'Unique Noun Count',
+       'Readability Score', 'Average Sentence Length']
+       Note that the properties ['Toxicity', 'Fluency', 'Formality', 'Language', 'Unique Noun Count'] may
+       take a long time to calculate. If include_long_calculation_properties is False, these properties will be
+       ignored, even if they are in the include_properties parameter.
    ignore_properties : List[str], default None
        The properties to ignore. If None, no properties will be ignored. Cannot be used together with
        properties parameter.
2 changes: 2 additions & 0 deletions docs/source/nlp/usage_guides/nlp_properties.rst
@@ -59,6 +59,8 @@ Fluency* Fluency of the text. Uses the prithivida/parrot_
Formality*                     Formality of the text. Uses the s-nlp/roberta-base-formality-ranker model
Lexical Density                Percentage of unique words in the text, rounded up to 2 decimal digits
Unique Noun Count*             Number of unique noun words in the text
Readability Score              A score calculated based on Flesch reading-ease per text sample. For more information: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease
Average Sentence Length        Average number of words per sentence in the text
============================== ==========

*These properties are not calculated by default, as they may take a long time to calculate. To use them, pass
@@ -113,15 +113,15 @@ def test_token_classification_dataset(small_wikiann_train_test_text_data):
    # Assert
    assert_that(condition_result, has_items(
        equal_condition_result(is_pass=False,
-                              details='Found a segment with annotation ratio of 0.375 in comparison to an '
+                              details='Found a segment with annotation ratio of 0.2 in comparison to an '
                                       'average score of 0.8 in sampled data.',
                               name='The relative performance of weakest segment is greater than 80% of average model '
                                    'performance.')
    ))

    assert_that(result.value['avg_score'], close_to(0.8, 0.001))
-   assert_that(len(result.value['weak_segments_list']), equal_to(15))
-   assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.375, 0.01))
+   assert_that(len(result.value['weak_segments_list']), equal_to(25))
+   assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.2, 0.01))


def test_multilabel_dataset(multilabel_mock_dataset_and_probabilities):
5 changes: 3 additions & 2 deletions tests/nlp/checks/train_test_validation/property_drift_test.py
@@ -171,8 +171,9 @@ def test_without_drift(self, dummy_multilabel_textdata_train_test):
    def test_with_drift(self, dummy_multilabel_textdata_train_test):
        # Arrange
        train, test = dummy_multilabel_textdata_train_test
-       train.calculate_default_properties(ignore_properties=['Lexical Density','Unique Noun Count'])
-       test.calculate_default_properties(ignore_properties=['Lexical Density','Unique Noun Count'])
+       properties_to_ignore = ['Lexical Density','Unique Noun Count', 'Average Sentence Length', 'Readability Score']
+       train.calculate_default_properties(ignore_properties=properties_to_ignore)
+       test.calculate_default_properties(ignore_properties=properties_to_ignore)
        check = PropertyDrift(min_samples=20).add_condition_drift_score_less_than(max_allowed_numeric_score=0.3,
                                                                                  max_allowed_categorical_score=0.3)
        # Act
7 changes: 4 additions & 3 deletions tests/nlp/test_text_data.py
@@ -157,11 +157,12 @@ def test_properties(text_classification_dataset_mock):
    dataset.calculate_default_properties(ignore_properties=['topic'] + LONG_RUN_PROPERTIES)
    properties = dataset.properties
    assert_that(properties.shape[0], equal_to(3))
-   assert_that(properties.shape[1], equal_to(7))
+   assert_that(properties.shape[1], equal_to(9))
    assert_that(properties.columns,
                contains_exactly('Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters',
-                                'Sentiment', 'Subjectivity', 'Lexical Density'))
-   assert_that(properties.iloc[0].values, contains_exactly(22, 3.6, 9, 0.0, 0.0, 0.0, 80.0 ))
+                                'Sentiment', 'Subjectivity', 'Lexical Density', 'Readability Score',
+                                'Average Sentence Length'))
+   assert_that(properties.iloc[0].values, contains_exactly(22, 3.6, 9, 0.0, 0.0, 0.0, 80.0, 100.24, 5))


def test_embeddings():
30 changes: 30 additions & 0 deletions tests/nlp/utils/test_properties.py
@@ -76,6 +76,36 @@ def test_calculate_unique_noun_count_property(tweet_emotion_train_test_textdata)
    assert_that(result_none_text['Unique Noun Count'], equal_to([np.nan]))


def test_calculate_average_sentence_length_property(tweet_emotion_train_test_textdata):

    # Arrange
    _, test = tweet_emotion_train_test_textdata
    test_text = test.text

    # Act
    result = calculate_default_properties(test_text, include_properties=['Average Sentence Length'])[0]
    result_none_text = calculate_default_properties([None], include_properties=['Average Sentence Length'])[0]

    # Assert
    assert_that(result['Average Sentence Length'][0: 10], equal_to([6, 7, 11, 12, 8, 19, 3, 9, 12, 7]))
    assert_that(result_none_text['Average Sentence Length'], equal_to([np.nan]))


def test_calculate_readability_score_property(tweet_emotion_train_test_textdata):

    # Arrange
    _, test = tweet_emotion_train_test_textdata
    test_text = test.text

    # Act
    result = calculate_default_properties(test_text, include_properties=['Readability Score'])[0]
    result_none_text = calculate_default_properties([None], include_properties=['Readability Score'])[0]

    # Assert
    assert_that(result['Readability Score'][0: 10], equal_to([102.045, 97.001, 80.306, 67.755, 77.103, 71.782, 90.99, 75.5, 70.102, 95.564]))
    assert_that(result_none_text['Readability Score'], equal_to([np.nan]))


@pytest.mark.skipif(
    'TEST_NLP_PROPERTIES_MODELS_DOWNLOAD' not in os.environ,
    reason='The test takes too long to run, provide env var if you want to run it.'
