Skip to content

Commit

Permalink
⚡ code optimization for cleaning special chars from string
Browse files Browse the repository at this point in the history
  • Loading branch information
manijhariya committed Nov 22, 2023
1 parent cf4a7ea commit f80a669
Showing 1 changed file with 25 additions and 7 deletions.
32 changes: 25 additions & 7 deletions deepchecks/nlp/utils/text_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#
"""Utils module for calculating embeddings for text."""
import sys
import re
import warnings
from itertools import islice
from typing import Optional
Expand All @@ -22,6 +23,10 @@
EMBEDDING_CTX_LENGTH = 8191
EMBEDDING_ENCODING = 'cl100k_base'

# Punctuation/symbol characters that are stripped from text before embedding.
# Written as a raw string so the regex escapes (\-, \[, \], ...) are not parsed as
# invalid Python string escapes. The class also contains a literal backslash (\\),
# matching the set of characters the previous str.replace-based implementation removed.
PATTERN_SPECIAL_CHARS = re.compile(r"[!@#$%^&*()_+{}|:\"<>?~`\-=\[\]\\;',./]")
# Any single whitespace character (newline, carriage return, tab, space, ...).
PATTERN_SPACE_CHARS = re.compile(r"\s")
# Literal HTML line-break tag.
PATTERN_BR_CHARS = re.compile("<br />")


def batched(iterable, n):
"""Batch data into tuples of length n. The last batch may be shorter."""
Expand Down Expand Up @@ -204,11 +209,24 @@ def len_safe_get_embedding(list_of_texts, model_name=EMBEDDING_MODEL, max_tokens
np.save(file_path, embeddings)
return embeddings


def _clean_special_chars(text):
special_chars = r'!@#$%^&*()_+{}|:"<>?~`-=[]\;\',./'
for char in special_chars:
text = text.replace(char, '')
text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
text = text.replace('<br />', ' ')

def _clean_special_chars(text: str) -> str:
    """Strip special characters and normalise whitespace in a text.

    ``<br />`` tags are replaced with a space, the characters matched by
    ``PATTERN_SPECIAL_CHARS`` are removed, and every whitespace character is replaced by
    a space (runs of whitespace are not collapsed).

    Parameters
    ----------
    text : str
        The input text that needs to be cleaned.

    Returns
    -------
    str
        Cleaned text string.
    """
    # The line-break tag has to be substituted first: '<', '/' and '>' are matched by
    # PATTERN_SPECIAL_CHARS, so stripping those beforehand would turn '<br />' into 'br '
    # and PATTERN_BR_CHARS could never match.
    text = PATTERN_BR_CHARS.sub(" ", text)
    text = PATTERN_SPECIAL_CHARS.sub("", text)
    text = PATTERN_SPACE_CHARS.sub(" ", text)
    return text

0 comments on commit f80a669

Please sign in to comment.