In [12]:
import re
import string
import unidecode
import contractions
from bs4 import BeautifulSoup

# Parenthesised enumerators built from digits and the letters i / v in any
# case, e.g. "(1)", "( 12 )", "(iv)", "(V)".
ENUM_REGEX = re.compile(r"\(\s*(i|v|[0-9])+\s*\)", re.IGNORECASE)
# Parenthesised single lowercase letter a-e, e.g. "(a)" or "( c )".
ENUM_ALPHA_LOWER_REGEX = re.compile(r"\(\s*[a-e]\s*\)")
# Removes only the bracket characters { } [ ], keeping whatever they enclose.
# (Was r"(\{\[\]\})", which matched nothing but the literal sequence "{[]}".)
BRACKETS_REGEX = re.compile(r"([{}\[\]])")
# Removes brackets along with their content: "[...]", "(...)" and "/.../".
# Non-greedy, and "." does not cross newlines. Raw string so that "\[" and
# "\(" reach the regex engine without invalid-escape warnings.
BRACKET_WITH_CONTENT_REGEX = re.compile(r'\[.*?\]|\(.*?\)|/.*?/')
# Numeric citation markers such as "[3]" or "[ 12 ]".
REF_REGEX = re.compile(r"\[\s*[0-9]+\s*\]")
# A comma followed by "etc." (including the comma), e.g. ", etc.".
ETC_REGEX = re.compile(r"\,\s*etc\s*\.")
# "e.g." enclosed by commas: ", e.g.,".
EG_REGEX = re.compile(r"\,\s*e\.g\.\s*\,")
# "i.e." enclosed by commas: ", i.e.,".
IE_REGEX = re.compile(r"\,\s*i\.e\.\s*\,")
# Stray symbol characters: \ / { } [ ] + * & ^ ~
CHARS_REGEX = re.compile(r"(\\|\/|\{|\}|\[|\]|\+|\*|\&|\^|\~)")
# A run of digits at the very start of the string (no leading whitespace allowed).
LEADING_DIGITS = re.compile(r'^\d+')

class Preprocess:
    """
    Normalise one sentence at a time.

    ``run`` strips HTML markup (BeautifulSoup), passes the text through
    ``unidecode`` and ``contractions.fix``, removes enumerators, numeric
    references, bracketed spans and a few abbreviations using the module-level
    regexes, deletes every character in ``string.punctuation`` and collapses
    runs of whitespace. The class keeps no per-instance state.
    """

    # Deletion table for every character in ``string.punctuation``; built once
    # at class creation instead of on every ``__remove_punctuation`` call.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def run(self, text: str) -> str:
        """
        Apply the full cleaning pipeline to ``text``.

        Args:
            text: A single sentence, possibly containing HTML fragments.

        Returns:
            The cleaned sentence with words separated by single spaces and no
            leading or trailing whitespace.
        """
        text = self.__remove_html_tags(text)
        text = self.__remove_accented_chars(text)
        text = self.__expand_contractions(text)
        text = self._remove_extra_whitespace(text)
        # Spell out the percent sign before punctuation is stripped, so that
        # "100%" survives as "100 percent".
        text = text.replace('%', ' percent')
        text = ENUM_REGEX.sub('', text)
        text = ENUM_ALPHA_LOWER_REGEX.sub('', text)
        text = REF_REGEX.sub('', text)
        text = BRACKET_WITH_CONTENT_REGEX.sub('', text)
        text = ETC_REGEX.sub('', text)
        text = EG_REGEX.sub(', and', text)
        text = IE_REGEX.sub(', and', text)
        text = CHARS_REGEX.sub('', text)
        # LEADING_DIGITS is anchored at the first character. The removals
        # above can leave whitespace in front (e.g. "(i) 1. Foo" -> " 1. Foo"),
        # so strip it first; otherwise the leading number would be kept.
        text = LEADING_DIGITS.sub('', text.lstrip())
        text = self.__remove_punctuation(text)
        return self._remove_extra_whitespace(text)

    def __remove_html_tags(self, text: str) -> str:
        """Parse ``text`` with ``html.parser`` and return its text content, pieces joined by a space."""
        soup = BeautifulSoup(text, "html.parser")
        return soup.get_text(separator=" ")

    def __remove_accented_chars(self, text: str) -> str:
        """Return ``unidecode.unidecode(text)``."""
        return unidecode.unidecode(text)

    def __expand_contractions(self, text: str) -> str:
        """Return ``contractions.fix(text)`` (e.g. "shouldn't" becomes "should not")."""
        return contractions.fix(text)

    def __remove_punctuation(self, text: str) -> str:
        """Delete every character listed in ``string.punctuation``; nothing is inserted in its place."""
        return text.translate(self._PUNCTUATION_TABLE)

    def _remove_extra_whitespace(self, text: str) -> str:
        """Collapse each run of whitespace to one space and trim both ends."""
        return " ".join(text.split())

In [13]:
# Smoke test: the sample has leading whitespace, a "1." list number, a
# possessive, a contraction, a parenthesised aside, a "%" sign, a stray
# "</p>" tag and a trailing ";". The cell displays the cleaned sentence.
s = "    1. Project's failue shouldn't be (abc)  is 100% of them </p>;"
Preprocess().run(s)

'Projects failue should not be is 100 percent of them'