Fixes #2454 Added Readability score and average sentence length text property (#2512)

* Added Flesch reading ease and average sentence length text property

---------

Co-authored-by: Harsh Jain <harsh.jain@springworks.in>
Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com>
3 people committed May 11, 2023
1 parent fe3f69e commit 3ebf01c
Showing 6 changed files with 111 additions and 14 deletions.
75 changes: 69 additions & 6 deletions deepchecks/nlp/utils/text_properties.py
@@ -19,8 +19,11 @@
import pandas as pd
import requests
import textblob
from nltk import corpus
from nltk import download as nltk_download
from nltk import sent_tokenize, word_tokenize

from deepchecks.nlp.utils.text import remove_punctuation
from deepchecks.utils.function import run_available_kwargs

__all__ = ['calculate_default_properties']
@@ -304,7 +307,7 @@ def lexical_density(raw_text: Sequence[str]) -> List[str]:
"""
if not nltk_download('punkt', quiet=True):
warnings.warn('nltk punkt not found, lexical density cannot be calculated.'
' Please check your internet connection.')
' Please check your internet connection.', UserWarning)
return [np.nan] * len(raw_text)
result = []
for text in raw_text:
@@ -323,7 +326,7 @@ def unique_noun_count(raw_text: Sequence[str]) -> List[str]:
"""Return a list of integers of number of unique noun words in the text."""
if not nltk_download('averaged_perceptron_tagger', quiet=True):
warnings.warn('nltk averaged_perceptron_tagger not found, unique noun count cannot be calculated.'
' Please check your internet connection.')
' Please check your internet connection.', UserWarning)
return [np.nan] * len(raw_text)
result = []
for text in raw_text:
@@ -335,6 +338,63 @@ def unique_noun_count(raw_text: Sequence[str]) -> List[str]:
    return result


def readability_score(raw_text: Sequence[str]) -> List[float]:
    """Return a list of floats of Flesch Reading-Ease score per text sample.

    In the Flesch reading-ease test, higher scores indicate material that is easier to read
    whereas lower numbers mark texts that are more difficult to read. For more information:
    https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease
    """
    if not nltk_download('punkt', quiet=True):
        warnings.warn('nltk punkt not found, readability score cannot be calculated.'
                      ' Please check your internet connection.', UserWarning)
        return [np.nan] * len(raw_text)
    if not nltk_download('cmudict', quiet=True):
        warnings.warn('nltk cmudict not found, readability score cannot be calculated.'
                      ' Please check your internet connection.', UserWarning)
        return [np.nan] * len(raw_text)
    result = []
    cmudict_dict = corpus.cmudict.dict()
    for text in raw_text:
        if not pd.isna(text):
            sentence_count = len(sent_tokenize(text))
            text = remove_punctuation(text)
            words = word_tokenize(text)
            word_count = len(words)
            # Syllable estimate: counts the cmudict pronunciation entries of each recognized word
            syllable_count = sum([len(cmudict_dict[word.lower()]) for word in words if word.lower() in cmudict_dict])
            if word_count != 0 and sentence_count != 0 and syllable_count != 0:
                avg_syllables_per_word = syllable_count / word_count
                avg_words_per_sentence = word_count / sentence_count
                flesch_reading_ease = 206.835 - (1.015 * avg_words_per_sentence) - (84.6 * avg_syllables_per_word)
                result.append(round(flesch_reading_ease, 3))
            else:
                result.append(np.nan)
        else:
            result.append(np.nan)
    return result
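
# Illustration (hypothetical, not part of this commit): the Flesch reading-ease
# formula above in isolation, assuming the word, sentence and syllable counts
# are already known. For "The cat sat on the mat." -- 1 sentence, 6 words,
# 6 syllables -- the constants combine as follows:
def _flesch_example():
    word_count, sentence_count, syllable_count = 6, 1, 6
    score = 206.835 - 1.015 * (word_count / sentence_count) - 84.6 * (syllable_count / word_count)
    return round(score, 3)  # 116.145 -- short, monosyllabic text scores as very easy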


def average_sentence_length(raw_text: Sequence[str]) -> List[float]:
    """Return a list of floats denoting the average sentence length per text sample."""
    if not nltk_download('punkt', quiet=True):
        warnings.warn('nltk punkt not found, average sentence length cannot be calculated.'
                      ' Please check your internet connection.', UserWarning)
        return [np.nan] * len(raw_text)
    result = []
    for text in raw_text:
        if not pd.isna(text):
            sentences = [remove_punctuation(sent) for sent in sent_tokenize(text)]
            total_words = sum([len(word_tokenize(sentence)) for sentence in sentences])
            if len(sentences) != 0:
                asl = total_words / len(sentences)
                result.append(round(asl, 0))
            else:
                result.append(np.nan)
        else:
            result.append(np.nan)
    return result
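
# Illustration (hypothetical, not part of this commit): with nltk's punkt
# tokenizer, "Hello world. This is a test." splits into 2 sentences containing
# 2 and 4 words respectively, so the function above returns round(6 / 2, 0) == 3.0.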


DEFAULT_PROPERTIES = (
    {'name': 'Text Length', 'method': text_length, 'output_type': 'numeric'},
    {'name': 'Average Word Length', 'method': average_word_length, 'output_type': 'numeric'},
@@ -348,6 +408,8 @@ def unique_noun_count(raw_text: Sequence[str]) -> List[str]:
    {'name': 'Formality', 'method': formality, 'output_type': 'numeric'},
    {'name': 'Lexical Density', 'method': lexical_density, 'output_type': 'numeric'},
    {'name': 'Unique Noun Count', 'method': unique_noun_count, 'output_type': 'numeric'},
    {'name': 'Readability Score', 'method': readability_score, 'output_type': 'numeric'},
    {'name': 'Average Sentence Length', 'method': average_sentence_length, 'output_type': 'numeric'},
)
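
# Usage sketch (hypothetical, not part of this commit), mirroring the tests in
# this PR -- the new properties are requested through the public entry point:
#
#     from deepchecks.nlp.utils.text_properties import calculate_default_properties
#
#     props = calculate_default_properties(
#         ['Deepchecks now reports readability. Scores are easy to read off.'],
#         include_properties=['Readability Score', 'Average Sentence Length'],
#     )[0]
#     print(props['Readability Score'], props['Average Sentence Length'])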

LONG_RUN_PROPERTIES = ['Toxicity', 'Fluency', 'Formality', 'Unique Noun Count']
@@ -395,10 +457,11 @@ def calculate_default_properties(
        The properties to calculate. If None, all default properties will be calculated. Cannot be used together
        with ignore_properties parameter. Available properties are:
        ['Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters', 'Language',
-       'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Lexical Density', 'Unique Noun Count']
-       Note that the properties ['Toxicity', 'Fluency', 'Formality', 'Language'] may take a long time to calculate. If
-       include_long_calculation_properties is False, these properties will be ignored, even if they are in the
-       include_properties parameter.
+       'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Lexical Density', 'Unique Noun Count',
+       'Readability Score', 'Average Sentence Length']
+       Note that the properties ['Toxicity', 'Fluency', 'Formality', 'Language', 'Unique Noun Count'] may
+       take a long time to calculate. If include_long_calculation_properties is False, these properties will be
+       ignored, even if they are in the include_properties parameter.
    ignore_properties : List[str], default None
        The properties to ignore. If None, no properties will be ignored. Cannot be used together with
        properties parameter.
2 changes: 2 additions & 0 deletions docs/source/nlp/usage_guides/nlp_properties.rst
@@ -59,6 +59,8 @@ Fluency* Fluency of the text. Uses the prithivida/parrot_
Formality*                     Formality of the text. Uses the s-nlp/roberta-base-formality-ranker model
Lexical Density                Percentage of unique words in the text, rounded up to 2 decimal digits
Unique Noun Count*             Number of unique noun words in the text
Readability Score              A score calculated based on Flesch reading-ease per text sample. For more information: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease
Average Sentence Length        Average number of words per sentence in the text
============================== ==========

*These properties are not calculated by default, as they may take a long time to calculate. To use them, pass
@@ -113,15 +113,15 @@ def test_token_classification_dataset(small_wikiann_train_test_text_data):
    # Assert
    assert_that(condition_result, has_items(
        equal_condition_result(is_pass=False,
-                              details='Found a segment with annotation ratio of 0.375 in comparison to an '
+                              details='Found a segment with annotation ratio of 0.2 in comparison to an '
                                       'average score of 0.8 in sampled data.',
                               name='The relative performance of weakest segment is greater than 80% of average model '
                                    'performance.')
    ))

    assert_that(result.value['avg_score'], close_to(0.8, 0.001))
-   assert_that(len(result.value['weak_segments_list']), equal_to(15))
-   assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.375, 0.01))
+   assert_that(len(result.value['weak_segments_list']), equal_to(25))
+   assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.2, 0.01))


def test_multilabel_dataset(multilabel_mock_dataset_and_probabilities):
5 changes: 3 additions & 2 deletions tests/nlp/checks/train_test_validation/property_drift_test.py
@@ -171,8 +171,9 @@ def test_without_drift(self, dummy_multilabel_textdata_train_test):
    def test_with_drift(self, dummy_multilabel_textdata_train_test):
        # Arrange
        train, test = dummy_multilabel_textdata_train_test
-       train.calculate_default_properties(ignore_properties=['Lexical Density','Unique Noun Count'])
-       test.calculate_default_properties(ignore_properties=['Lexical Density','Unique Noun Count'])
+       properties_to_ignore = ['Lexical Density','Unique Noun Count', 'Average Sentence Length', 'Readability Score']
+       train.calculate_default_properties(ignore_properties=properties_to_ignore)
+       test.calculate_default_properties(ignore_properties=properties_to_ignore)
        check = PropertyDrift(min_samples=20).add_condition_drift_score_less_than(max_allowed_numeric_score=0.3,
                                                                                  max_allowed_categorical_score=0.3)
        # Act
7 changes: 4 additions & 3 deletions tests/nlp/test_text_data.py
@@ -157,11 +157,12 @@ def test_properties(text_classification_dataset_mock):
    dataset.calculate_default_properties(ignore_properties=['topic'] + LONG_RUN_PROPERTIES)
    properties = dataset.properties
    assert_that(properties.shape[0], equal_to(3))
-   assert_that(properties.shape[1], equal_to(7))
+   assert_that(properties.shape[1], equal_to(9))
    assert_that(properties.columns,
                contains_exactly('Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters',
-                                'Sentiment', 'Subjectivity', 'Lexical Density'))
-   assert_that(properties.iloc[0].values, contains_exactly(22, 3.6, 9, 0.0, 0.0, 0.0, 80.0 ))
+                                'Sentiment', 'Subjectivity', 'Lexical Density', 'Readability Score',
+                                'Average Sentence Length'))
+   assert_that(properties.iloc[0].values, contains_exactly(22, 3.6, 9, 0.0, 0.0, 0.0, 80.0, 100.24, 5))


def test_embeddings():
30 changes: 30 additions & 0 deletions tests/nlp/utils/test_properties.py
@@ -76,6 +76,36 @@ def test_calculate_unique_noun_count_property(tweet_emotion_train_test_textdata)
    assert_that(result_none_text['Unique Noun Count'], equal_to([np.nan]))


def test_calculate_average_sentence_length_property(tweet_emotion_train_test_textdata):

    # Arrange
    _, test = tweet_emotion_train_test_textdata
    test_text = test.text

    # Act
    result = calculate_default_properties(test_text, include_properties=['Average Sentence Length'])[0]
    result_none_text = calculate_default_properties([None], include_properties=['Average Sentence Length'])[0]

    # Assert
    assert_that(result['Average Sentence Length'][0: 10], equal_to([6, 7, 11, 12, 8, 19, 3, 9, 12, 7]))
    assert_that(result_none_text['Average Sentence Length'], equal_to([np.nan]))


def test_calculate_readability_score_property(tweet_emotion_train_test_textdata):

    # Arrange
    _, test = tweet_emotion_train_test_textdata
    test_text = test.text

    # Act
    result = calculate_default_properties(test_text, include_properties=['Readability Score'])[0]
    result_none_text = calculate_default_properties([None], include_properties=['Readability Score'])[0]

    # Assert
    assert_that(result['Readability Score'][0: 10], equal_to([102.045, 97.001, 80.306, 67.755, 77.103, 71.782, 90.99, 75.5, 70.102, 95.564]))
    assert_that(result_none_text['Readability Score'], equal_to([np.nan]))


@pytest.mark.skipif(
    'TEST_NLP_PROPERTIES_MODELS_DOWNLOAD' not in os.environ,
    reason='The test takes too long to run, provide env var if you want to run it.'
