Commit
Fixes #2524 Refactoring of the calculate_default_properties function and adding new text properties (#2536)

* Refactoring of the calculate_default_properties function and adding new properties
---------

Co-authored-by: Noam Bressler <noamzbr@gmail.com>
hjain5164 and noamzbr committed May 21, 2023
1 parent a886103 commit a1f921e
Showing 12 changed files with 436 additions and 105 deletions.
12 changes: 11 additions & 1 deletion .gitignore
@@ -138,4 +138,14 @@ benchmarks/results
deepchecks/nlp/utils/.nlp-models

# embedding files
tests/nlp/utils/embeddings.csv
tests/nlp/utils/embeddings.csv
embeddings.csv.npy
embeddings.npy
deepchecks/nlp/datasets/assets/tweet_emotion/tweet_emotion_embeddings.npy

# nlp datasets
deepchecks/nlp/datasets/assets/just_dance_comment_analysis

# nlp test properties
metadata.csv
test_properties.csv
12 changes: 6 additions & 6 deletions deepchecks/nlp/text_data.py
@@ -23,7 +23,7 @@
validate_raw_text, validate_tokenized_text)
from deepchecks.nlp.task_type import TaskType, TTextLabel
from deepchecks.nlp.utils.text_embeddings import calculate_default_embeddings
from deepchecks.nlp.utils.text_properties import calculate_default_properties
from deepchecks.nlp.utils.text_properties import calculate_builtin_properties
from deepchecks.utils.logger import get_logger
from deepchecks.utils.metrics import is_label_none
from deepchecks.utils.validation import is_sequence_not_str
@@ -85,7 +85,7 @@ class TextData:
properties. If None, no properties are set.
The number of rows in the properties DataFrame must be equal to the number of samples in the dataset, and the
order of the rows must be the same as the order of the samples in the dataset.
In order to calculate the default properties, use the `TextData.calculate_default_properties` function after
In order to calculate the default properties, use the `TextData.calculate_builtin_properties` function after
the creation of the TextData object.
For more on properties, see the `NLP Properties Guide
<https://docs.deepchecks.com/stable/nlp/usage_guides/nlp_properties.html>`_.
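
For context, a minimal usage sketch of the flow this docstring describes; the sample corpus and the task_type value are illustrative assumptions, not part of the diff:

from deepchecks.nlp import TextData

# Illustrative two-sample corpus; task_type string assumes the text-classification task.
texts = [
    'Great product, would buy again!',
    'Terrible support experience.',
]
dataset = TextData(raw_text=texts, task_type='text_classification')
# Computes the default deepchecks properties and stores them on the object.
dataset.calculate_builtin_properties()
print(dataset.properties.head())
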
@@ -372,7 +372,7 @@ def set_metadata(
self._metadata = metadata.reset_index(drop=True)
self._cat_metadata = column_types.categorical_columns

def calculate_default_properties(
def calculate_builtin_properties(
self,
include_properties: t.Optional[t.List[str]] = None,
ignore_properties: t.Optional[t.List[str]] = None,
@@ -398,7 +398,7 @@ def calculate_default_properties(
if self._properties is not None:
warnings.warn('Properties already exist, overwriting them', UserWarning)

properties, properties_types = calculate_default_properties(
properties, properties_types = calculate_builtin_properties(
list(self.text),
include_properties=include_properties,
ignore_properties=ignore_properties,
@@ -442,7 +442,7 @@ def save_properties(self, path: str):
if self._properties is None:
raise DeepchecksNotSupportedError(
'TextData does not contain properties, add them by using '
'"calculate_default_properties" or "set_properties" functions'
'"calculate_builtin_properties" or "set_properties" functions'
)

self._properties.to_csv(path, index=False)
@@ -454,7 +454,7 @@ def properties(self) -> pd.DataFrame:
raise DeepchecksNotSupportedError(
'Functionality requires properties, but the TextData object had none. To use this functionality, '
'use the set_properties method to set your own properties with a pandas.DataFrame or use '
'TextData.calculate_default_properties to add the default deepchecks properties.'
'TextData.calculate_builtin_properties to add the default deepchecks properties.'
)
return self._properties

4 changes: 2 additions & 2 deletions deepchecks/nlp/utils/__init__.py
@@ -12,10 +12,10 @@

from deepchecks.nlp.utils.llm_utils import call_open_ai_completion_api
from deepchecks.nlp.utils.text_embeddings import calculate_default_embeddings
from deepchecks.nlp.utils.text_properties import calculate_default_properties
from deepchecks.nlp.utils.text_properties import calculate_builtin_properties

__all__ = [
'calculate_default_properties',
'calculate_builtin_properties',
'calculate_default_embeddings',
'call_open_ai_completion_api'
]
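
After this rename, the helper is re-exported under its new name; a minimal import sketch (the sample text is illustrative, and the tuple unpacking mirrors the call in text_data.py above):

from deepchecks.nlp.utils import calculate_builtin_properties

# Returns the calculated properties together with their output types.
properties, property_types = calculate_builtin_properties(['Deepchecks validates NLP data.'])
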
160 changes: 145 additions & 15 deletions deepchecks/nlp/utils/text_properties.py
@@ -11,6 +11,7 @@
"""Module containing the text properties for the NLP module."""
import importlib
import pathlib
import re
import string
import warnings
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
@@ -28,7 +29,7 @@
from deepchecks.utils.function import run_available_kwargs
from deepchecks.utils.ipython import create_progress_bar

__all__ = ['calculate_default_properties']
__all__ = ['calculate_builtin_properties']


MODELS_STORAGE = pathlib.Path(__file__).absolute().parent / '.nlp-models'
@@ -411,10 +412,10 @@ def readability_score(raw_text: Sequence[str]) -> List[float]:
for text in raw_text:
if not pd.isna(text):
sentence_count = len(sent_tokenize(text))
text = remove_punctuation(text)
text = remove_punctuation(text.lower())
words = word_tokenize(text)
word_count = len(words)
syllable_count = sum([len(cmudict_dict[word.lower()]) for word in words if word.lower() in cmudict_dict])
syllable_count = sum([len(cmudict_dict[word]) for word in words if word in cmudict_dict])
if word_count != 0 and sentence_count != 0 and syllable_count != 0:
avg_syllables_per_word = syllable_count / word_count
avg_words_per_sentence = word_count / sentence_count
@@ -448,6 +449,113 @@ def average_sentence_length(raw_text: Sequence[str]) -> List[float]:
return result


def count_unique_urls(raw_text: Sequence[str]) -> List[int]:
"""Return a list of integers denoting the number of unique URLs per text sample."""
url_pattern = r'https?:\/\/(?:[-\w.]|(?:%[\da-fA-F]{2}))+'
return [len(set(re.findall(url_pattern, text))) if not pd.isna(text) else 0 for text in raw_text]


def count_urls(raw_text: Sequence[str]) -> List[int]:
"""Return a list of integers denoting the number of URLs per text sample."""
url_pattern = r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'
return [len(re.findall(url_pattern, text)) if not pd.isna(text) else 0 for text in raw_text]


def count_unique_email_addresses(raw_text: Sequence[str]) -> List[int]:
"""Return a list of integers denoting the number of unique email addresses per text sample."""
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
return [len(set(re.findall(email_pattern, text))) if not pd.isna(text) else 0 for text in raw_text]


def count_email_addresses(raw_text: Sequence[str]) -> List[int]:
"""Return a list of integers denoting the number of email addresses per text sample."""
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
return [len(re.findall(email_pattern, text)) if not pd.isna(text) else 0 for text in raw_text]
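
For illustration, the total counter counts every regex match while the unique counter de-duplicates them via set(); a standalone sketch with an assumed sample text, run against the functions above:

from deepchecks.nlp.utils.text_properties import count_urls, count_unique_urls

sample = ['Visit https://deepchecks.com or https://deepchecks.com for the docs']
count_urls(sample)         # -> [2]: both matches are counted
count_unique_urls(sample)  # -> [1]: duplicate URLs are collapsed
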


def count_unique_syllables(raw_text: Sequence[str]) -> List[int]:
"""Return a list of integers denoting the number of unique syllables per text sample."""
if not nltk_download('punkt', quiet=True):
warnings.warn('nltk punkt not found, the number of unique syllables cannot be calculated.'
' Please check your internet connection.', UserWarning)
return [np.nan] * len(raw_text)
if not nltk_download('cmudict', quiet=True):
warnings.warn('nltk cmudict not found, the number of unique syllables cannot be calculated.'
' Please check your internet connection.', UserWarning)
return [np.nan] * len(raw_text)
result = []
cmudict_dict = corpus.cmudict.dict()
for text in raw_text:
if not pd.isna(text):
text = remove_punctuation(text.lower())
words = word_tokenize(text)
syllables = {word: True for word in words if word in cmudict_dict}
result.append(len(syllables))
else:
result.append(np.nan)
return result


def reading_time(raw_text: Sequence[str]) -> List[float]:
"""Return a list of floats denoting the time in seconds needed to read each text sample.
The formula is based on Demberg & Keller, 2008, where it is assumed that
reading a character takes 14.69 milliseconds on average.
"""
ms_per_char = 14.69
result = []
for text in raw_text:
if not pd.isna(text):
words = text.split()
nchars = map(len, words)
rt_per_word = map(lambda nchar: nchar * ms_per_char, nchars)
ms_reading_time = sum(list(rt_per_word))
result.append(round(ms_reading_time/1000, 2))
else:
result.append(0.00)
return result
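
As a quick sanity check of the constant (an illustrative call, not part of the commit): 'hello world' contains 10 non-whitespace characters, so the estimate is 10 * 14.69 ms = 146.9 ms, which rounds to 0.15 seconds.

from deepchecks.nlp.utils.text_properties import reading_time

reading_time(['hello world'])  # -> [0.15]: 10 characters * 14.69 ms / 1000, rounded to 2 decimals
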


def sentence_length(raw_text: Sequence[str]) -> List[int]:
"""Return a list of integers denoting the number of sentences per text sample."""
if not nltk_download('punkt', quiet=True):
warnings.warn('nltk punkt not found, the number of sentences cannot be calculated.'
' Please check your internet connection.', UserWarning)
return [np.nan] * len(raw_text)
result = []
for text in raw_text:
if not pd.isna(text):
sentence_count = len(sent_tokenize(text))
result.append(sentence_count)
else:
result.append(np.nan)
return result


def average_syllable_length(raw_text: Sequence[str]) -> List[float]:
"""Return a list of floats denoting the average number of syllables per sentence per text sample."""
if not nltk_download('punkt', quiet=True):
warnings.warn('nltk punkt not found, average syllable length cannot be calculated.'
' Please check your internet connection.', UserWarning)
return [np.nan] * len(raw_text)
if not nltk_download('cmudict', quiet=True):
warnings.warn('nltk cmudict not found, average syllable length cannot be calculated.'
' Please check your internet connection.', UserWarning)
return [np.nan] * len(raw_text)
cmudict_dict = corpus.cmudict.dict()
result = []
for text in raw_text:
if not pd.isna(text):
sentence_count = len(sent_tokenize(text))
text = remove_punctuation(text.lower())
words = word_tokenize(text)
syllable_count = sum([len(cmudict_dict[word]) for word in words if word in cmudict_dict])
result.append(round(syllable_count/sentence_count, 2))
else:
result.append(np.nan)
return result


class TextProperty(TypedDict):
name: str
method: Callable[..., Sequence[Any]]
@@ -472,13 +580,24 @@ class TextProperty(TypedDict):
)


ALL_PROPERTIES: Tuple[TextProperty, ...] = (
{'name': 'Count URLs', 'method': count_urls, 'output_type': 'numeric'},
{'name': 'Count Email Address', 'method': count_email_addresses, 'output_type': 'numeric'},
{'name': 'Count Unique URLs', 'method': count_unique_urls, 'output_type': 'numeric'},
{'name': 'Count Unique Email Address', 'method': count_unique_email_addresses, 'output_type': 'numeric'},
{'name': 'Count Unique Syllables', 'method': count_unique_syllables, 'output_type': 'numeric'},
{'name': 'Reading Time', 'method': reading_time, 'output_type': 'numeric'},
{'name': 'Sentence Length', 'method': sentence_length, 'output_type': 'numeric'},
{'name': 'Average Syllable Length', 'method': average_syllable_length, 'output_type': 'numeric'},
) + DEFAULT_PROPERTIES
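
Note that, per the updated _select_properties logic below, the new counters live in ALL_PROPERTIES but not in DEFAULT_PROPERTIES, so they are only computed when named explicitly. A minimal sketch with an illustrative sample text:

from deepchecks.nlp.utils.text_properties import calculate_builtin_properties

sample = ['Write to jane.doe@example.com or see https://deepchecks.com']
# The new properties must be requested via include_properties; they are not calculated by default.
properties, property_types = calculate_builtin_properties(
    sample,
    include_properties=['Count URLs', 'Count Email Address', 'Reading Time'],
)
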


LONG_RUN_PROPERTIES = ('Toxicity', 'Fluency', 'Formality', 'Unique Noun Count')
LARGE_SAMPLE_SIZE = 10_000

ENGLISH_ONLY_PROPERTIES = (
'Sentiment', 'Subjectivity', 'Toxicity',
'Fluency', 'Formality', 'Readability Score',
'Unique Noun Count'
'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Readability Score',
'Unique Noun Count', 'Count Unique Syllables', 'Sentence Length', 'Average Syllable Length'
)


@@ -491,15 +610,18 @@ def _select_properties(
device: Optional[str] = None,
) -> Sequence[TextProperty]:
"""Select properties."""
properties = DEFAULT_PROPERTIES
all_properties = ALL_PROPERTIES
default_properties = DEFAULT_PROPERTIES

if include_properties is not None and ignore_properties is not None:
raise ValueError('Cannot use include_properties and ignore_properties parameters together.')

if include_properties is not None:
properties = [prop for prop in properties if prop['name'] in include_properties]
properties = [prop for prop in all_properties if prop['name'] in include_properties]
elif ignore_properties is not None:
properties = [prop for prop in properties if prop['name'] not in ignore_properties]
properties = [prop for prop in default_properties if prop['name'] not in ignore_properties]
else:
properties = default_properties

if not include_long_calculation_properties:
return [
@@ -528,7 +650,7 @@ def _select_properties(
return properties


def calculate_default_properties(
def calculate_builtin_properties(
raw_text: Sequence[str],
include_properties: Optional[List[str]] = None,
ignore_properties: Optional[List[str]] = None,
@@ -543,17 +665,25 @@ def calculate_default_properties(
raw_text : Sequence[str]
The text to calculate the properties for.
include_properties : List[str], default None
The properties to calculate. If None, all default properties will be calculated. Cannot be used together
with ignore_properties parameter. Available properties are:
The properties to calculate. If None, all default properties will be calculated. Cannot be used
together with ignore_properties parameter. Available properties are:
['Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters', 'Language',
'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Lexical Density', 'Unique Noun Count',
'Readability Score', 'Average Sentence Length']
'Readability Score', 'Average Sentence Length', 'Count URLs', 'Count Unique URLs', 'Count Email Address',
'Count Unique Email Address', 'Count Unique Syllables', 'Reading Time', 'Sentence Length',
'Average Syllable Length']
List of default properties are: ['Text Length', 'Average Word Length', 'Max Word Length',
'% Special Characters', 'Language', 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality',
'Lexical Density', 'Unique Noun Count', 'Readability Score', 'Average Sentence Length']
To calculate all the default properties, the include_properties and ignore_properties parameters should
be None. If you pass either include_properties or ignore_properties, then only the properties specified
in the list will be calculated or ignored.
Note that the properties ['Toxicity', 'Fluency', 'Formality', 'Language', 'Unique Noun Count'] may
take a long time to calculate. If include_long_calculation_properties is False, these properties will be
ignored, even if they are in the include_properties parameter.
ignore_properties : List[str], default None
The properties to ignore. If None, no properties will be ignored. Cannot be used together with
properties parameter.
The properties to ignore from the list of default properties. If None, no properties will be ignored and
all the default properties will be calculated. Cannot be used together with include_properties parameter.
include_long_calculation_properties : bool, default False
Whether to include properties that may take a long time to calculate. If False, these properties will be
ignored, even if they are in the include_properties parameter.
Expand Down
@@ -54,8 +54,8 @@
train_dataset, test_dataset = load_data()

# # Calculate properties, commented out because it takes a short while to run
# train_dataset.calculate_default_properties(include_long_calculation_properties=True)
# test_dataset.calculate_default_properties(include_long_calculation_properties=True)
# train_dataset.calculate_builtin_properties(include_long_calculation_properties=True)
# test_dataset.calculate_builtin_properties(include_long_calculation_properties=True)

#%%
# Run the check
