In [1]:
import re
from unicodedata import normalize

from bltk.langtools.banglachars import (vowels,
                                        vowel_signs,
                                        consonants,
                                        digits,
                                        operators,
                                        punctuations,
                                        others)

In [None]:

# Matches one or more consecutive Bangla digits.
BANGLA_DIGIT_REGEX = re.compile(r'[০-৯]+')

# Host name, domain name and TLD parts are taken from URL_REGEX below.
# NOTE(review): inside these raw strings ``\\u00a1-\\uffff`` is read by ``re``
# as a literal backslash, ordinary characters and a ``1-\`` range, not as the
# code-point range U+00A1..U+FFFF. The patterns are kept byte-for-byte here;
# confirm the intended behaviour before changing them.
EMAIL_REGEX = re.compile(
    r"(?:^|(?<=[^\w@.)]))([\w+-](\.(?!\.))?)*?[\w+-](@|[(<{\[]at[)>}\]])(?:(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)(?:\.(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)*(?:\.(?:[a-z\\u00a1-\\uffff]{2,}))",
    flags=re.IGNORECASE | re.UNICODE,
)

# source: https://gist.github.com/dperini/729294
# @jfilter: I guess it was changed
URL_REGEX = re.compile(
    r"(?:^|(?<![\w\/\.]))"
    # protocol identifier; alternative form: r"(?:(?:https?|ftp)://)"
    r"(?:(?:https?:\/\/|ftp:\/\/|www\d{0,3}\.))"
    # user:pass authentication
    r"(?:\S+(?::\S*)?@)?"
    r"(?:"
    # IP address exclusion: private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets: excludes loopback network 0.0.0.0,
    # reserved space >= 224.0.0.0, and network & broadcast addresses
    # (first & last IP address of each class)
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    r"|"
    # host name
    r"(?:(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)"
    # domain name
    r"(?:\.(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)*"
    # TLD identifier
    r"(?:\.(?:[a-z\\u00a1-\\uffff]{2,}))"
    r"|"
    r"(?:(localhost))"
    r")"
    # port number
    r"(?::\d{2,5})?"
    # resource path
    r"(?:\/[^\)\]\}\s]*)?",
    flags=re.UNICODE | re.IGNORECASE,
    # r"(?:$|(?![\w?!+&\/\)]))",
    # @jfilter: I removed the line above from the regex because I don't understand what it is used for, maybe it was useful?
    # But I made sure that it does not include ), ] and } in the URL.
)

# Typographic quote characters that get normalised to plain ASCII quotes.
# (The list literal previously ended with "}" instead of "]", a SyntaxError.)
strange_double_quotes = ["«", "‹", "»", "›", "„", "“", "‟", "”", "❝", "❞", "❮", "❯", "〝", "〞", "〟", "＂"]
strange_single_quotes = ["‘", "‛", "’", "❛", "❜", "`", "´", "‘", "’"]

# Alternations over the characters above; none of them is a regex metacharacter.
DOUBLE_QUOTE_REGEX = re.compile("|".join(strange_double_quotes))
SINGLE_QUOTE_REGEX = re.compile("|".join(strange_single_quotes))

In [None]:
def fix_bad_unicode(text, normalization="NFC"):
    """
    Hand ``text`` to ``fix_text`` and return whatever it produces.

    Args:
        text (str): raw text to repair
        normalization (str): forwarded unchanged as the ``normalization``
            keyword of ``fix_text``; defaults to ``"NFC"``.

    NOTE(review): ``fix_text`` is not defined or imported in the visible
    notebook cells; presumably it is ``ftfy.fix_text`` -- TODO confirm.
    """
    repaired = fix_text(text, normalization=normalization)
    return repaired

def fix_strange_quotes(text):
    """
    Normalise typographic quotes to plain ASCII quotes.

    Every match of ``SINGLE_QUOTE_REGEX`` becomes ``'`` and every match of
    ``DOUBLE_QUOTE_REGEX`` becomes ``"``.

    Args:
        text (str): raw text to preprocess

    Returns:
        str: ``text`` with the listed quote characters replaced.
    """
    # The regexes are module-level names in this notebook; there is no
    # ``constants`` namespace here, so they are referenced directly.
    text = SINGLE_QUOTE_REGEX.sub("'", text)
    text = DOUBLE_QUOTE_REGEX.sub('"', text)
    return text

def replace_urls(text, replace_with=""):
    """
    Replace all URLs in ``text`` str with ``replace_with`` str.

    Args:
        text (str): raw text to preprocess
        replace_with (str): replacement for each ``URL_REGEX`` match;
            defaults to an empty string, i.e. the URL is removed.

    Returns:
        str: ``text`` with every ``URL_REGEX`` match substituted.
    """
    # ``URL_REGEX`` is a module-level name in this notebook; no ``constants``
    # namespace exists here, so it is referenced directly.
    return URL_REGEX.sub(replace_with, text)

def replace_emails(text, replace_with=""):
    """
    Replace all emails in ``text`` str with ``replace_with`` str.

    Args:
        text (str): raw text to preprocess
        replace_with (str): replacement for each ``EMAIL_REGEX`` match;
            defaults to an empty string, i.e. the address is removed.

    Returns:
        str: ``text`` with every ``EMAIL_REGEX`` match substituted.
    """
    # ``EMAIL_REGEX`` is a module-level name in this notebook; no ``constants``
    # namespace exists here, so it is referenced directly.
    return EMAIL_REGEX.sub(replace_with, text)

def remove_substrings(text, to_replace, replace_with=""):
    """
    Strip (or substitute) a set of substrings in ``text``.

    Args:
        text (str): raw text to preprocess
        to_replace (iterable or str): substring(s) to look for; a single
            str is treated as a one-element collection
        replace_with (str): text put in place of every occurrence;
            the default empty string simply deletes the substrings.

    Returns:
        str: the text after all replacements were applied in iteration order.
    """
    targets = [to_replace] if isinstance(to_replace, str) else to_replace

    cleaned = text
    for target in targets:
        cleaned = cleaned.replace(target, replace_with)
    return cleaned

def remove_emoji(text):
    """
    Delete every entry of ``UNICODE_EMOJI["en"]`` from ``text``.

    Args:
        text (str): raw text to preprocess

    NOTE(review): ``UNICODE_EMOJI`` is not defined or imported in the visible
    notebook cells; presumably it comes from the ``emoji`` package -- TODO
    confirm.
    """
    emoji_entries = UNICODE_EMOJI["en"]
    return remove_substrings(text, emoji_entries)

def remove_number_or_digit(text, replace_with=""):
    """
    Replace every run of Bangla digits in ``text`` with ``replace_with``.

    Args:
        text (str): raw text to preprocess
        replace_with (str): replacement for each ``BANGLA_DIGIT_REGEX``
            match; defaults to an empty string, i.e. the digits are removed.

    Returns:
        str: ``text`` with every digit run substituted.
    """
    # ``BANGLA_DIGIT_REGEX`` is a compiled module-level pattern in this
    # notebook (there is no ``constants`` namespace), so call its own ``sub``.
    return BANGLA_DIGIT_REGEX.sub(replace_with, text)

def remove_punctuations(text, replace_with=""):
    """
    Replace every punctuation mark in ``text`` with ``replace_with``.

    Args:
        text (str): raw text to preprocess
        replace_with (str): replacement for each punctuation mark;
            defaults to an empty string, i.e. the mark is removed.

    Returns:
        str: ``text`` after each mark was replaced in turn.
    """
    # ``punctuations`` is the name imported from bltk in the first cell; the
    # previous ``corpus.punctuations`` had no ``corpus`` defined in this notebook.
    # NOTE(review): assumes ``punctuations`` iterates over str -- TODO confirm.
    for mark in punctuations:
        # The debug print of every mark was removed: it wrote to stdout on each call.
        text = text.replace(mark, replace_with)

    return text

class CleanText:
    """
    Configurable text-cleaning pipeline; an instance is called with a text.

    Args:
        fix_unicode (bool): run ``fix_bad_unicode`` on the text.
        unicode_norm (bool): apply ``normalize`` with ``unicode_norm_form``.
        unicode_norm_form (str): form name passed to ``normalize``.
        remove_url (bool): replace URLs with ``replace_with_url``.
        remove_email (bool): replace e-mails with ``replace_with_email``.
        remove_number (bool): replace digit runs with ``replace_with_number``.
        remove_digits (bool): replace digit runs with ``replace_with_digit``.
        remove_emoji (bool): delete emoji via ``remove_emoji``.
        remove_punct (bool): replace punctuation with ``replace_with_punct``.
        replace_with_url (str): token used for URLs.
        replace_with_email (str): token used for e-mail addresses.
        replace_with_number (str): token used when ``remove_number`` is set.
        replace_with_digit (str): token used when ``remove_digits`` is set.
        replace_with_punct (str): token used for punctuation marks.
    """

    def __init__(
        self,
        fix_unicode=True,
        unicode_norm=True,
        unicode_norm_form="NFKC",
        remove_url=False,
        remove_email=False,
        remove_number=False,
        remove_digits=False,
        remove_emoji=False,
        remove_punct=False,
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_number="<NUMBER>",
        replace_with_digit="<DIGIT>",
        replace_with_punct = "<PUNC>"
        ):
        # Switches selecting which cleaning steps run.
        self.fix_unicode = fix_unicode
        self.unicode_norm = unicode_norm
        self.unicode_norm_form = unicode_norm_form
        self.remove_url = remove_url
        self.remove_email = remove_email
        self.remove_number = remove_number
        self.remove_digits = remove_digits
        self.remove_emoji = remove_emoji
        self.remove_punct = remove_punct

        # Replacement tokens handed to the individual steps.
        self.replace_with_url = replace_with_url
        self.replace_with_email = replace_with_email
        self.replace_with_number = replace_with_number
        self.replace_with_digit = replace_with_digit
        self.replace_with_punct = replace_with_punct

    def __call__(self, text: str) -> str:
        """
        Run the enabled cleaning steps on ``text`` and return the result.

        ``None`` is treated as an empty string and any other value is passed
        through ``str()``. Quote normalisation always runs; every other step
        is controlled by the flags given to the constructor.
        """
        if text is None:
            text = ""
        text = str(text)
        text = fix_strange_quotes(text)

        if self.fix_unicode:
            text = fix_bad_unicode(text)
        if self.unicode_norm:
            text = normalize(self.unicode_norm_form, text)
        if self.remove_url:
            text = replace_urls(text, replace_with=self.replace_with_url)
        if self.remove_email:
            text = replace_emails(text, replace_with=self.replace_with_email)
        if self.remove_emoji:
            text = remove_emoji(text)
        if self.remove_digits:
            text = remove_number_or_digit(text, replace_with=self.replace_with_digit)
        if self.remove_number:
            # Uses the same digit regex as ``remove_digits``; when both flags
            # are set the digits have already been replaced above.
            text = remove_number_or_digit(text, replace_with=self.replace_with_number)
        if self.remove_punct:
            # Punctuation goes last: stripping ".", ":" or "-" earlier would
            # break the URL and e-mail patterns before they get a chance to match.
            # NOTE(review): a custom replacement token containing punctuation
            # is now itself subject to this step -- confirm that is acceptable.
            text = remove_punctuations(text, replace_with=self.replace_with_punct)

        return text

In [4]:
class BanglaCharacter:
    """
    Lookup tables of characters accepted as Bangla text, plus a validity check.

    Each category is stored twice: once as literal glyphs and once as
    ``\\uXXXX`` escapes of the same characters.
    """

    def __init__(self):
        self.vowels = ["অ", "আ", "ই", "ঈ", "উ", "ঊ", "ঋ", "ঌ", "এ", "ঐ", "ও", "ঔ"]
        self.vowels_unicode = ["\u0985", "\u0986", "\u0987", "\u0988", "\u0989", "\u098A", "\u098B", "\u098C", "\u098F", "\u0990", "\u0993", "\u0994"]
        self.vowel_signs = ["া", "ি", "ী", "ু", "ূ", "ৃ", "ৄ", "ে", "ৈ", "ো", "ৌ"]
        self.vowel_signs_unicode = ["\u09BE", "\u09Bf", "\u09C0", "\u09C1", "\u09C2", "\u09C3", "\u09C4", "\u09C7", "\u09C8", "\u09CB", "\u09CC"]
        self.digits = ["০", "১", "২", "৩", "৪", "৫", "৬", "৭", "৮", "৯"]
        self.digit_unicode = ["\u09E6", "\u09E7", "\u09E8", "\u09E9", "\u09EA", "\u09EB", "\u09EC", "\u09ED", "\u09EE", "\u09EF"]
        self.consonants = ["ক", "খ", "গ", "ঘ", "ঙ", "চ", "ছ", "জ", "ঝ", "ঞ", 
                           "ট", "ঠ", "ড", "ঢ", "ণ", "ত", "থ", "দ", "ধ", "ন", 
                           "প", "ফ", "ব", "ভ", "ম", "য", "র", "ল", "শ", "ষ", 
                           "স", "হ", "ড়", "ঢ়", "য়", "ৎ", "ং", "ঃ", "ঁ"]
        self.consonant_unicode = ["\u0995", "\u0996", "\u0997", "\u0998", "\u0999", "\u099A", "\u099B","\u099C",
                            "\u099D", "\u099E", "\u099F", "\u09A0", "\u09A1", "\u09A2", "\u09A3", "\u09A4",
                            "\u09A5", "\u09A6", "\u09A7", "\u09A8", "\u09AA", "\u09AB", "\u09AC", "\u09AD",
                            "\u09AE", "\u09AF", "\u09B0", "\u09B2", "\u09B6", "\u09B7", "\u09B8", "\u09B9",
                            "\u09DC", "\u09DD", "\u09DF", "\u09CE", "\u0982", "\u0983", "\u0981"]
        self.punctuations = ["।", ",", ";", ":", "?", "!", "'", ".", "\"", "-", "[", "]", "{", "}", "(", ")", '–', "—", "―", "~"]
        self.punctuation_unicode = ["\u0964", "\u002C", "\u003B", "\u003A", "\u003F", "\u0021",
                            "\u0027", "\u002E", "\u0022", "\u002D", "\u005B", "\u005D",
                            "\u007B", "\u007D", "\u0028", "\u0029", "\u2013", "\u2014",
                            "\u2015", "\u007E"]
        self.operators = ["=", "+", "-", "*", "/", "%", "<", ">", "×", "÷"]
        self.unicode_operators = ["\u003D", "\u002B", "\u002D", "\u002A", "\u002F", "\u0025", "\u003C",
                            "\u003E", "\u00D7", "\u00F7"]
        # "\u09BC" is the nukta sign. Letters such as U+09DF appear in text as
        # base letter + nukta, so the bare sign must be accepted as well;
        # without it, iterating such a word code point by code point hits a
        # rejected character.
        self.others = ["৳", "৺", '্', "ঀ", "ঽ", "#", "$", "\u09BC"]
        self.unicode_others = ["\u09F3", "\u09FA", "\u09CD", "\u0980", "\u09BD", "\u0023", "\u0024", "\u09BC"]

    def is_valid_character(self, char: str) -> bool:
        """
        Tell whether ``char`` belongs to any of the character tables.

        Args:
            char (str): a single character (an empty string yields ``False``).

        Returns:
            bool: ``True`` when ``char`` is listed in one of the tables.

        Raises:
            ValueError: if ``char`` is longer than one character.
        """
        if len(char) > 1:
            raise ValueError("Input character must be a single character")

        # Both the glyph lists and their escape-sequence twins are consulted,
        # so a single-code-point letter listed only in escaped form (e.g.
        # U+09DF) is accepted however the glyph literal happens to be encoded.
        tables = (
            self.vowels, self.vowels_unicode,
            self.vowel_signs, self.vowel_signs_unicode,
            self.digits, self.digit_unicode,
            self.consonants, self.consonant_unicode,
            self.punctuations, self.punctuation_unicode,
            self.operators, self.unicode_operators,
            self.others, self.unicode_others,
        )
        return any(char in table for table in tables)
        

In [5]:
bc = BanglaCharacter()

In [10]:
bc.is_valid_character("ড")

True

In [3]:
import bangla_text_cleaner

In [1]:
from bangla_text_cleaner.bangla_text_cleaner import BanglaTextCleaner

cleaner = BanglaTextCleaner()
cleaned_text = cleaner.clean("""হবিগঞ্জের বাহুবল থানার সদ্য Bangla বিদায়ী ভারপ্রাপ্ত কর্মকর্তা (ওসি) রাকিবুল ইসলাম খানের বিরুদ্ধে থানা ও বাসভবনে সরকারি অর্থে স্থাপিত দুটি শীতাতপনিয়ন্ত্রণ যন্ত্র (এসি) খুলে নেওয়ার অভিযোগ পাওয়া গেছে।
ওসির এ কর্মকাণ্ডে ক্ষোভ প্রকাশ করেছেন হবিগঞ্জ-১ (নবীগঞ্জ-বাহুবল) আসনের সংসদ সদস্য গাজী মোহাম্মদ শাহ নওয়াজ। তিনি প্রথম আলোকে বলেন, এসি দুটি সরকারি বরাদ্দ থেকে থানায় ওসির কক্ষে ও তাঁর বাসভবনে স্থাপন করা হয়। এটি কাউকে ব্যক্তিগতভাবে দেওয়া হয়নি। তিনি এ এসি খুলে নেওয়ার অধিকার রাখেন না।""")

In [2]:
cleaned_text

'হবিগঞ্জের বাহুবল থানার সদ্য Bangla বিদায়ী ভারপ্রাপ্ত কর্মকর্তা (ওসি) রাকিবুল ইসলাম খানের বিরুদ্ধে থানা ও বাসভবনে সরকারি অর্থে স্থাপিত দুটি শীতাতপনিয়ন্ত্রণ যন্ত্র (এসি) খুলে নেওয়ার অভিযোগ পাওয়া গেছে।\nওসির এ কর্মকাণ্ডে ক্ষোভ প্রকাশ করেছেন হবিগঞ্জ-১ (নবীগঞ্জ-বাহুবল) আসনের সংসদ সদস্য গাজী মোহাম্মদ শাহ নওয়াজ। তিনি প্রথম আলোকে বলেন, এসি দুটি সরকারি বরাদ্দ থেকে থানায় ওসির কক্ষে ও তাঁর বাসভবনে স্থাপন করা হয়। এটি কাউকে ব্যক্তিগতভাবে দেওয়া হয়নি। তিনি এ এসি খুলে নেওয়ার অধিকার রাখেন না।'

In [3]:
# Replace foreign words in the cleaned text and display the result.
# NOTE(review): in the recorded output below, besides "Bangla", several native
# words are also turned into <FOREIGN>; the character-by-character check
# further down shows the nukta sign being reported as invalid, which
# presumably triggers this -- TODO confirm against replace_foreign_words.
cleaned_text = cleaner.replace_foreign_words(cleaned_text, keep_special_tokens=False)
cleaned_text

'হবিগঞ্জের বাহুবল থানার সদ্য <FOREIGN> <FOREIGN> ভারপ্রাপ্ত কর্মকর্তা (ওসি) রাকিবুল ইসলাম খানের বিরুদ্ধে থানা ও বাসভবনে সরকারি অর্থে স্থাপিত দুটি <FOREIGN> যন্ত্র (এসি) খুলে <FOREIGN> অভিযোগ <FOREIGN> গেছে। ওসির এ কর্মকাণ্ডে ক্ষোভ প্রকাশ করেছেন হবিগঞ্জ-১ (নবীগঞ্জ-বাহুবল) আসনের সংসদ সদস্য গাজী মোহাম্মদ শাহ <FOREIGN> তিনি প্রথম আলোকে বলেন, এসি দুটি সরকারি বরাদ্দ থেকে <FOREIGN> ওসির কক্ষে ও তাঁর বাসভবনে স্থাপন করা <FOREIGN> এটি কাউকে ব্যক্তিগতভাবে <FOREIGN> <FOREIGN> তিনি এ এসি খুলে <FOREIGN> অধিকার রাখেন না।'

In [6]:
cleaner.replace_foreign_word("শীতাতপনিয়ন্ত্রণ", keep_special_tokens=False)

'<FOREIGN>'

In [5]:
for item in "পাওয়া":
    print("item: ", item, "is valid: ", cleaner.bangla_character.is_valid_character(item))

item:  প is valid:  True
item:  া is valid:  True
item:  ও is valid:  True
item:  য is valid:  True
item:  ় is valid:  False
item:  া is valid:  True


In [9]:
" ়".encode("utf-8").decode("utf-8")

' ়'

In [10]:
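# Goal: get the Unicode code point of the nukta sign " ়".
# NOTE: a UTF-8 encode/decode round trip returns the string unchanged, so the
# expression below only echoes it; [hex(ord(ch)) for ch in ...] would show the
# actual code points.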
" ়".encode("utf-8").decode("utf-8")

' ়'