Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

⚡ code optimization for cleaning special chars from string #2698

Merged
merged 6 commits into from
Dec 6, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
28 changes: 22 additions & 6 deletions deepchecks/nlp/utils/text_embeddings.py
Expand Up @@ -9,6 +9,7 @@
# ----------------------------------------------------------------------------
#
"""Utils module for calculating embeddings for text."""
import re
import sys
import warnings
from itertools import islice
Expand All @@ -22,6 +23,10 @@
# Context-length limit for the embedding model -- presumably a token count; TODO confirm against usage.
EMBEDDING_CTX_LENGTH = 8191
# Name of the encoding used with the embedding model -- presumably a tokenizer encoding name; TODO confirm.
EMBEDDING_ENCODING = 'cl100k_base'

# Any single ASCII punctuation/symbol character that text cleaning strips out.
# The class includes a literal backslash (written ``\\``): a bare ``\;`` would only be an escaped
# semicolon and would leave backslashes in the text.
PATTERN_SPECIAL_CHARS = re.compile(r"[!@#$%^&*()_+{}|:\"<>?~`\-=\[\]\\;',.\/]")
# Any single whitespace character (regex ``\s``: space, tab, newline, carriage return, ...).
PATTERN_SPACE_CHARS = re.compile(r'\s')
# The literal HTML line-break tag ``<br />``.
# NOTE: '<', '>' and '/' are also matched by PATTERN_SPECIAL_CHARS, so this pattern can only match
# text that has not yet had its special characters stripped.
PATTERN_BR_CHARS = re.compile(r'<br />')


def batched(iterable, n):
"""Batch data into tuples of length n. The last batch may be shorter."""
Expand Down Expand Up @@ -205,10 +210,21 @@ def len_safe_get_embedding(list_of_texts, model_name=EMBEDDING_MODEL, max_tokens
return embeddings


def _clean_special_chars(text: str) -> str:
    """
    Remove special characters and replace whitespace and ``<br />`` tags with spaces.

    Parameters
    ----------
    text : str
        The input text to clean.

    Returns
    -------
    str
        The cleaned text: each ``<br />`` tag and each whitespace character is replaced by a single
        space, and every character matched by ``PATTERN_SPECIAL_CHARS`` is removed.
    """
    # Replace '<br />' first: '<', '>' and '/' are themselves special characters, so stripping them
    # beforehand would reduce the tag to a stray 'br ' that the tag pattern can no longer match.
    text = PATTERN_BR_CHARS.sub(' ', text)
    text = PATTERN_SPECIAL_CHARS.sub('', text)
    text = PATTERN_SPACE_CHARS.sub(' ', text)
    return text