In [1]:
import regex as re

# Basic

In [2]:
# Character policy (feel free to tune per your UX):
# - Thai stretch vowels/marks: cap at 2 (a doubled vowel is kept, longer runs are cut)
# - Tone marks & diacritics: cap at 1
# - General letters & digits: cap at 3 (optional)
# - Emoji & punctuation: cap at 3
# - Whitespace: collapse to 1
#
# The Thai characters are written as \u escapes so each set holds the intended
# single code points no matter how this file is saved, exported or displayed.

# Common stretch vowels/marks:
# SARA A, SARA AA, SARA I, SARA II, SARA UE, SARA UEE, SARA U, SARA UU.
THAI_STRETCH = set("\u0e30\u0e32\u0e34\u0e35\u0e36\u0e37\u0e38\u0e39")
# Tone marks MAI EK, MAI THO, MAI TRI, MAI CHATTAWA, plus THANTHAKHAT and NIKHAHIT.
THAI_TONE_DIACRITIC = set("\u0e48\u0e49\u0e4a\u0e4b\u0e4c\u0e4d")
PUNCT_OR_EMOJI = r"[\p{P}\p{S}]"  # punctuation/symbols (includes many emoji)
DIGIT = r"\p{Nd}"  # decimal digits


In [3]:
def limit_run(text: str, max_len: int) -> str:
    """Shorten every run of one repeated grapheme cluster to ``max_len`` copies.

    Args:
        text: Input string.
        max_len: Number of copies kept from each over-long run.

    Returns:
        ``text`` with each run longer than ``max_len`` replaced by exactly
        ``max_len`` copies of the repeated cluster; shorter runs are untouched.
    """
    # \X is one user-perceived character. The backreference must repeat at
    # least max_len more times, so only runs of max_len + 1 or more match.
    over_long_run = rf"(\X)\1{{{max_len},}}"

    def _keep_max(match):
        return match.group(1) * max_len

    return re.sub(over_long_run, _keep_max, text)

def limit_specific_sets(text: str) -> str:
    """Apply the fixed, hard-coded repetition caps to ``text``.

    Steps, in order:
      1. every whitespace run becomes a single space;
      2. each character of THAI_TONE_DIACRITIC keeps at most one copy per run;
      3. each character of THAI_STRETCH keeps at most two copies per run;
      4. runs of one punctuation/symbol code point keep at most three;
      5. runs of one decimal digit keep at most three (so 55555 -> 555);
      6. ``limit_run`` caps any remaining grapheme-cluster run at three;
      7. leading and trailing whitespace is stripped.

    Returns:
        The normalized string.
    """
    # \s also matches newlines and tabs, so the result is a single line.
    result = re.sub(r"\s+", " ", text)

    # Per-character caps, given as (characters, copies kept per run).
    for charset, keep in ((THAI_TONE_DIACRITIC, 1), (THAI_STRETCH, 2)):
        for ch in charset:
            result = re.sub(re.escape(ch) + "{" + str(keep + 1) + ",}", ch * keep, result)

    # Class-wide caps: the backreference only matches the *same* code point
    # again, so mixed runs such as "!?!?" are left alone.
    for char_class in (PUNCT_OR_EMOJI, DIGIT):
        result = re.sub(rf"({char_class})\1{{3,}}", r"\1\1\1", result)

    # Grapheme-level catch-all for everything not covered above, then trim.
    return limit_run(result, max_len=3).strip()

In [4]:
# --- Examples ---
examples = [
    "‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤",
    "‡∏Ç‡∏≠‡∏ö‡∏Ñ‡∏∏‡∏ì‡∏à‡πâ‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤    ‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤",
    "‡∏î‡∏µ‡∏°‡∏≤‡∏Å‡∏Å‡∏Å‡∏Å‡∏Å!!!! ‡πÄ‡∏¢‡∏µ‡πà‡∏¢‡∏°‡∏°‡∏°‡∏°‡∏°‡∏° ü§©ü§©ü§©ü§©ü§©",
    "55555555 ‡∏™‡∏∏‡∏î‡πÜ‡πÜ‡πÜ‡πÜ‡πÜ‡πÜ",
]

In [5]:
# Print the capped form of every sample, one per line.
for s in examples:
    print(limit_specific_sets(text=s))

‡∏≤‡∏≤
‡∏Ç‡∏≠‡∏ö‡∏Ñ‡∏∏‡∏ì‡∏à‡πâ‡∏≤‡∏≤ ‡∏≤‡∏≤
‡∏î‡∏µ‡∏°‡∏≤‡∏Å‡∏Å‡∏Å!!! ‡πÄ‡∏¢‡∏µ‡πà‡∏¢‡∏°‡∏°‡∏° ü§©ü§©ü§©
555 ‡∏™‡∏∏‡∏î‡πÜ‡πÜ‡πÜ


# Define caps

In [6]:
# Longest run kept for each character class. normalize_text looks these up by
# key on every call, so edit the values here to retune the behavior.
CAPS = dict(
    tone_diacritic=1,  # tone marks and other diacritics
    stretch_vowel=1,   # stretchable (long) vowels
    letter_digit=3,    # general letters and digits
    punct_emoji=1,     # symbols, punctuation, emoji
    whitespace=1,      # 1 collapses every whitespace run to a single space
)


In [7]:
# Character classes for the configurable pipeline.
# normalize_text passes THAI_TONE_DIACRITIC and THAI_STRETCH to limit_runs, with
# the caps taken from CAPS (tone_diacritic, stretch_vowel, letter_digit,
# punct_emoji, whitespace) rather than hard-coded here.
#
# The Thai characters are written as \u escapes so each set holds the intended
# single code points no matter how this file is saved, exported or displayed.

# Common stretch vowels/marks:
# SARA A, SARA AA, SARA I, SARA II, SARA UE, SARA UEE, SARA U, SARA UU.
THAI_STRETCH = set("\u0e30\u0e32\u0e34\u0e35\u0e36\u0e37\u0e38\u0e39")
# Tone marks MAI EK, MAI THO, MAI TRI, MAI CHATTAWA, plus THANTHAKHAT and NIKHAHIT.
THAI_TONE_DIACRITIC = set("\u0e48\u0e49\u0e4a\u0e4b\u0e4c\u0e4d")
PUNCT_OR_EMOJI = r"[\p{P}\p{S}]"  # punctuation/symbols (includes many emoji)
DIGIT = r"\p{Nd}"  # decimal digits

In [8]:
def limit_runs(text: str, chset: set[str], max_len: int) -> str:
    """Limit consecutive occurrences of any character in chset to max_len.

    Args:
        text: Input string.
        chset: Characters to cap. Each element is expected to be a single
            character, because the repetition quantifier binds to the last
            character of the escaped element only.
        max_len: Longest run to keep; 0 removes the characters entirely.

    Returns:
        ``text`` with every run longer than ``max_len`` cut to ``max_len``.

    Raises:
        ValueError: If ``max_len`` is negative.
    """
    if max_len < 0:
        raise ValueError(f"max_len must be >= 0, got {max_len}")

    for ch in chset:
        capped = ch * max_len
        # Use a callable replacement so `capped` is inserted verbatim. A plain
        # string is parsed as a replacement template, which mangles or rejects
        # characters such as a backslash.
        text = re.sub(
            re.escape(ch) + "{" + str(max_len + 1) + ",}",
            lambda _match: capped,
            text,
        )
    return text

def normalize_text(text: str) -> str:
    """Cap repeated characters in ``text`` according to ``CAPS``.

    ``CAPS`` is read on every call. Steps, in order:
      1. whitespace runs are limited to ``CAPS["whitespace"]`` spaces (a cap
         of 1 turns every whitespace run into a single space);
      2. tone marks/diacritics are limited to ``CAPS["tone_diacritic"]``;
      3. Thai stretch vowels are limited to ``CAPS["stretch_vowel"]``;
      4. punctuation, symbols and emoji are limited to ``CAPS["punct_emoji"]``;
      5. letters and digits are limited to ``CAPS["letter_digit"]``;
      6. the result is stripped.

    Returns:
        The normalized string.
    """
    # 1) Whitespace
    ws_cap = CAPS["whitespace"]
    if ws_cap == 1:
        text = re.sub(r"\s+", " ", text)
    else:
        # Only runs longer than the cap are rewritten (as plain spaces).
        text = re.sub(r"\s{" + str(ws_cap + 1) + r",}", " " * ws_cap, text)

    # 2) Tone marks & diacritics
    text = limit_runs(text, THAI_TONE_DIACRITIC, CAPS["tone_diacritic"])

    # 3) Thai stretch vowels
    text = limit_runs(text, THAI_STRETCH, CAPS["stretch_vowel"])

    # 4) Punctuation & emoji
    punct_cap = CAPS["punct_emoji"]
    # 4a) Runs of one repeated code point ("!!!!", single-code-point emoji).
    text = re.sub(
        r"(" + PUNCT_OR_EMOJI + r")\1{" + str(punct_cap) + r",}",
        r"\1" * punct_cap,
        text,
    )
    # 4b) Runs of one repeated grapheme cluster that starts with punctuation or
    # a symbol. Emoji built from several code points (variation selector, skin
    # tone, ZWJ sequence) never repeat a single code point, so 4a misses them.
    text = re.sub(
        r"((?=" + PUNCT_OR_EMOJI + r")\X)\1{" + str(punct_cap) + r",}",
        lambda m: m.group(1) * punct_cap,
        text,
    )

    # 5) Letters & digits (generic, per code point)
    letter_cap = CAPS["letter_digit"]
    text = re.sub(
        r"([\p{L}\p{N}])\1{" + str(letter_cap) + r",}",
        r"\1" * letter_cap,
        text,
    )

    # 6) Trim
    return text.strip()

In [9]:
# -------------------------------
# TEST EXAMPLES
# -------------------------------
examples = [
    "‡∏Ç‡∏≠‡∏ö‡∏Ñ‡∏∏‡∏ì‡∏à‡πâ‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤    ‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤",
    "‡∏î‡∏µ‡∏°‡∏≤‡∏Å‡∏Å‡∏Å‡∏Å‡∏Å!!!! ‡πÄ‡∏¢‡∏µ‡πà‡∏¢‡∏°‡∏°‡∏°‡∏°‡∏°‡∏° ü§©ü§©ü§©ü§©ü§©",
    "55555555 ‡∏™‡∏∏‡∏î‡πÜ‡πÜ‡πÜ‡πÜ‡πÜ‡πÜ",
    "‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤",
]
# Show each sample next to its normalized form, followed by an 80-dash rule.
for s in examples:
    print("Before:", s)
    print("After: ", normalize_text(text=s))
    print(80 * "-")

Before: ‡∏Ç‡∏≠‡∏ö‡∏Ñ‡∏∏‡∏ì‡∏à‡πâ‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤    ‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤
After:  ‡∏Ç‡∏≠‡∏ö‡∏Ñ‡∏∏‡∏ì‡∏à‡πâ‡∏≤ ‡∏≤
--------------------------------------------------------------------------------
Before: ‡∏î‡∏µ‡∏°‡∏≤‡∏Å‡∏Å‡∏Å‡∏Å‡∏Å!!!! ‡πÄ‡∏¢‡∏µ‡πà‡∏¢‡∏°‡∏°‡∏°‡∏°‡∏°‡∏° ü§©ü§©ü§©ü§©ü§©
After:  ‡∏î‡∏µ‡∏°‡∏≤‡∏Å‡∏Å‡∏Å! ‡πÄ‡∏¢‡∏µ‡πà‡∏¢‡∏°‡∏°‡∏° ü§©
--------------------------------------------------------------------------------
Before: 55555555 ‡∏™‡∏∏‡∏î‡πÜ‡πÜ‡πÜ‡πÜ‡πÜ‡πÜ
After:  555 ‡∏™‡∏∏‡∏î‡πÜ‡πÜ‡πÜ
--------------------------------------------------------------------------------
Before: ‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤‡∏≤
After:  ‡∏≤
--------------------------------------------------------------------------------
