# Bangla Text Representation using BOW

In [11]:
import re

def normalize_bangla_text(text):
    # Remove digits
    text = re.sub(r'[০-৯]', '', text) 
    
    # Step 1: Remove invisible characters
    text = text.replace('\u200d', '')  # Zero-width joiner
    text = text.replace('\u200c', '')  # Zero-width non-joiner
    text = text.replace('\u00a0', ' ') # Non-breaking space to regular space
    
    # Step 2: Normalize visually similar characters
    text = re.sub(r'[য়]', 'য়', text)   # Normalize 'য়' to 'য়'
    text = re.sub(r'[র‍]', 'র', text)   # Remove ZWJ from 'র‍' if used wrongly
    text = re.sub(r'[ৎ]', 'ত্', text)   # Rare cases where 'ৎ' needs to be decomposed
    text = re.sub(r'[ড়]', 'র়', text)   # Normalize dotted র
    text = re.sub(r'[ঢ়]', 'ঢ়', text)   # Normalize dotted ঢ
    text = re.sub(r'[ঙ‍]', 'ঙ', text)   # Remove ZWJ after ঙ if it exists

    # Step 3: Normalize vowel signs and nukta forms
    text = re.sub(r'[\u09c7\u09c8]', '\u09c7', text)  # Normalize e-kar and ai-kar variants
    text = re.sub(r'[\u09cb\u09cc]', '\u09cb', text)  # Normalize o-kar and au-kar variants
  
    # Optional: remove duplicate diacritics (common from faulty OCR or typing)
    text = re.sub(r'([ািীুূেৈোৌ])\1+', r'\1', text)   # Collapse repeated vowel signs
    
    return text

In [12]:
from bnlp import NLTKTokenizer
import re

def tokenize_bangla_text(text):
    tokenizer = NLTKTokenizer()
    words = tokenizer.word_tokenize(text)
    # Keep only Bangla words (remove punctuation and non-Bangla characters)
    words = [word for word in words if re.match(r'^[\u0980-\u09FF]+$', word)]
    return words

In [14]:
u_documents = ["কুকুর মানুষকে কামড়ায়", "মানুষ কুকুরকে কামড়ায়", "কুকুর মাংস খায়।", "মানুষ খাবার খায়।"]

# Normalize
documents = [normalize_bangla_text(doc) for doc in u_documents]

processed_docs = [doc.lower().replace("।", "") for doc in documents]
print(processed_docs)

['কুকুর মানুষকে কামর়ায়', 'মানুষ কুকুরকে কামর়ায়', 'কুকুর মাংস খায়', 'মানুষ খাবার খায়']


In [15]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(tokenizer=tokenize_bangla_text)

# Build a BOW Representation for the corpus
bow_rep = count_vect.fit(processed_docs)

bow_rep

In [16]:
# Look at the vocabulary mapping
print("Our vocabulary: ", count_vect.vocabulary_)

Our vocabulary:  {'কুকুর': 1, 'মানুষকে': 7, 'কামর়ায়': 0, 'মানুষ': 6, 'কুকুরকে': 2, 'মাংস': 5, 'খায়': 4, 'খাবার': 3}
