# This is a sample Jupyter Notebook

Below is an example of a code cell. 
Put your cursor into the cell and press Shift+Enter to execute it and select the next one, or click 'Run Cell' button.

Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

To learn more about Jupyter Notebooks in PyCharm, see [help](https://www.jetbrains.com/help/pycharm/ipython-notebook-support.html).
For an overview of PyCharm, go to Help -> Learn IDE features or refer to [our documentation](https://www.jetbrains.com/help/pycharm/getting-started.html).

# Thai Grapheme Classification Algorithm

This algorithm classifies Thai graphemes into **three main classes** (each with subclasses), plus a fourth **exception class** that handles the special case of **อ**.

1. **ฐาน (tan)** - *foundation class*
   Consonant letters that act as the **base** for dependent marks (vowels, tone marks, etc.).

2. **สระ (sara)** - *vowel class*
   Vowel graphemes (both independent and dependent), which attach to a foundation consonant.

3. **ยุกต์ (yuk)** - *dependent class*
   Tone marks and other diacritics that cannot exist without a foundation consonant.

4. **ข้อยกเว้น (kho yok waen)** - *exception class*
   The consonant **อ** is treated separately, since it functions both as a **foundation** (carrier consonant) and as part of certain **vowel symbols**.



In [4]:
# Test cases.
# Notation used in the comments below (same as in the vowel-pattern strings):
#   x = initial consonant(s), f = final consonant,
#   any other character = a literal vowel/diacritic mark of the pattern.
thai1 = "ยา"  # simple: x=ย, vowel=า
thai2 = "เด็ก"  # pattern เx็f: x=ด, vowel=เ็, f=ก
thai3 = "คน"  # x=ค, f=น (hidden vowel: nothing is written between x and f)
thai4 = "เลว"  # AMBIGUOUS: could be เxf (x=ล, f=ว) OR cluster ลว + เx

# Harder cases: clusters, อ as a carrier, and vowels written around the consonant.
thaihard1 = "เกรียน"  # pattern เxียf: x=กร (cluster), vowel=เีย, f=น
thaihard2 = "เอา"  # pattern เxา: x=อ (silent), vowel=เา
thaihard3 = "อย่า"  # x=อ (silent), vowel=่า  -- NOTE(review): ย is not accounted for in this breakdown; confirm the intended segmentation
thaihard4 = "เอือม"  # pattern เxือf: x=อ, vowel=เือ, f=ม
thaihard5 = "ไกล"  # pattern ไxf: x=ก, vowel=ไ, f=ล

In [5]:
import json
from typing import List, Dict, Set, Tuple

# Load data.
# Both paths are relative to the current working directory of the kernel.
# The first file must contain a "foundation" key holding the consonant list.
with open("../../res/foundation/foundation.json", "r", encoding="utf-8") as f:
    data_foundation = json.load(f)

# The second file must contain a "patterns" mapping; only its keys (the
# pattern strings) are used here.
with open("../../thai_vowels_tagged_9-21-2025-2-31-pm.json", "r", encoding="utf-8") as f:
    tagged_vowel_data = json.load(f)

# Set for O(1) membership tests in the matchers ('x' and 'f' slots).
foundation = set(data_foundation["foundation"])
# Pattern strings in the order they appear in the JSON mapping; the matchers
# iterate this list in order, so it also fixes the order of reported readings.
vowel_patterns = list(tagged_vowel_data["patterns"].keys())

# Sanity-check output so a wrong/empty data file is noticed immediately.
print(f"Loaded {len(foundation)} foundation consonants")
print(f"Loaded {len(vowel_patterns)} vowel patterns")

# Characters that can be ambiguous (can be part of cluster OR part of vowel).
# NOTE(review): not referenced by any of the cells visible here.
ambiguous_chars = {"ว", "ย", "อ"}  # ว and ย can be final consonants OR part of vowels; อ can be initial OR part of vowel

Loaded 42 foundation consonants
Loaded 72 vowel patterns


In [6]:
def find_all_reading_orders(text: str) -> List[Dict]:
    """
    Find all possible reading orders for Thai text based on vowel patterns.

    The text is consumed left to right. At the first unconsumed position every
    entry of the module-level ``vowel_patterns`` is tried with
    ``try_match_pattern``; each successful match starts its own branch of the
    search. If no pattern matches at a position, that single character is kept
    as a 'standalone' segment and the search continues after it.

    Args:
        text: The Thai string to analyse.

    Returns:
        A list of interpretations (empty for empty input). Each one is a dict:
        - 'segments': list of (grapheme, reading_order, role) tuples, sorted by
          reading_order. reading_order is numbered continuously over the whole
          interpretation (0, 1, 2, ...), so segments of later syllables always
          sort after segments of earlier ones. role is one of 'initial',
          'vowel_pattern', 'final', 'standalone' or 'unmatched'; for
          'vowel_pattern' the grapheme is the pattern string itself.
        - 'ambiguity': whether this reading has ambiguous elements.
        - 'ambiguity_reason': why it is ambiguous, or None.
    """
    if not text:
        return []

    text_len = len(text)
    interpretations = []

    def record(segments: List, ambiguous: bool, reason) -> None:
        """Store one finished interpretation."""
        interpretations.append({
            'segments': sorted(segments, key=lambda seg: seg[1]),
            'ambiguity': ambiguous,
            'ambiguity_reason': reason,
        })

    def try_patterns(pos: int, consumed: Set[int], current_segments: List):
        """Extend the partial reading in current_segments from position pos."""
        if len(consumed) == text_len:
            # Every character is accounted for: a complete interpretation.
            record(current_segments, False, None)
            return

        # Skip already consumed positions.
        if pos in consumed:
            try_patterns(pos + 1, consumed, current_segments)
            return

        if pos >= text_len:
            # Defensive fallback: ran off the end with characters left over.
            # Work on a copy so the caller's list is never mutated.
            leftovers = list(current_segments)
            for i, char in enumerate(text):
                if i not in consumed:
                    leftovers.append((char, len(leftovers), 'unmatched'))
            record(leftovers, True, 'unmatched characters')
            return

        matched_any = False
        for pattern in vowel_patterns:
            match_result = try_match_pattern(text, pos, pattern, consumed)
            if not match_result:
                continue

            _matched_text, positions, x_pos, f_pos = match_result
            if not positions:
                # A match that consumes nothing would recurse forever.
                continue

            new_consumed = consumed | set(positions) | set(x_pos) | set(f_pos or ())
            new_segments = list(current_segments)

            # reading_order continues from the segments already collected, so
            # it never restarts at 0 for the second and later syllables.
            # Initial consonant(s) are read first ...
            if x_pos:
                x_text = ''.join(text[p] for p in x_pos)
                new_segments.append((x_text, len(new_segments), 'initial'))

            # ... then the vowel (identified by its pattern string) ...
            has_vowel_part = any(
                p not in x_pos and (not f_pos or p not in f_pos)
                for p in positions
            )
            if has_vowel_part:
                new_segments.append((pattern, len(new_segments), 'vowel_pattern'))

            # ... and the final consonant last.
            if f_pos:
                f_text = ''.join(text[p] for p in f_pos)
                new_segments.append((f_text, len(new_segments), 'final'))

            # Continue matching from the first position not yet consumed.
            next_pos = 0
            while next_pos < text_len and next_pos in new_consumed:
                next_pos += 1

            try_patterns(next_pos, new_consumed, new_segments)
            matched_any = True

        # No pattern matched here: keep the character on its own and move on.
        if not matched_any:
            standalone = (text[pos], len(current_segments), 'standalone')
            try_patterns(pos + 1, consumed | {pos}, current_segments + [standalone])

    try_patterns(0, set(), [])

    # Flag readings in which ว was taken as a consonant: it could equally be
    # part of a cluster or of the vowel pattern.
    for interp in interpretations:
        if any('ว' in seg[0] and seg[2] in ('initial', 'final')
               for seg in interp['segments']):
            interp['ambiguity'] = True
            interp['ambiguity_reason'] = 'ว could be part of cluster or vowel pattern'

    return interpretations

def try_match_pattern(text: str, start_pos: int, pattern: str, consumed: Set[int]) -> Tuple:
    """
    Try to match a vowel pattern against ``text`` starting at ``start_pos``.

    Each pattern character consumes exactly one text character:
    - 'x' matches one initial consonant (a member of the module-level
      ``foundation`` set);
    - 'f' matches one final consonant (also a member of ``foundation``);
    - any other character must equal the text character exactly.

    This matcher returns a single match, so 'x' always takes exactly one
    consonant; it does not enumerate consonant clusters.

    Args:
        text: The string being analysed.
        start_pos: Index in ``text`` where the pattern must begin.
        pattern: The pattern string to match.
        consumed: Indices that are already taken; a match may not touch them.

    Returns:
        (matched_text, all_positions, x_positions, f_positions) on success,
        or None when the pattern is empty, runs past the end of the text,
        hits a consumed position, or any character fails to match.
    """
    # An empty pattern would "match" without consuming anything, which gives
    # callers that loop until all text is consumed no way to make progress.
    if not pattern:
        return None

    positions = []
    x_positions = []
    f_positions = []
    text_idx = start_pos

    for pattern_char in pattern:
        # Pattern still has characters left but the text does not, or the
        # next character already belongs to another match.
        if text_idx >= len(text) or text_idx in consumed:
            return None

        char = text[text_idx]
        if pattern_char == 'x':
            if char not in foundation:
                return None
            x_positions.append(text_idx)
        elif pattern_char == 'f':
            if char not in foundation:
                return None
            f_positions.append(text_idx)
        elif char != pattern_char:
            return None

        positions.append(text_idx)
        text_idx += 1

    return (''.join(text[p] for p in positions), positions, x_positions, f_positions)

In [None]:
def find_all_possible_readings(text: str) -> List[List[Tuple]]:
    """
    Find ALL possible reading orders for Thai text by trying every valid pattern match.

    The text is segmented left to right into syllables, each covered by one
    entry of the module-level ``vowel_patterns``. Within a pattern, 'x' stands
    for the initial consonant(s) and is tried with cluster sizes 1 to 3, 'f'
    stands for a single final consonant, and every other character must match
    literally. Consonants are the members of the module-level ``foundation``
    set.

    Args:
        text: The Thai string to analyse.

    Returns:
        A list of distinct readings in discovery order (patterns in
        ``vowel_patterns`` order, smaller clusters first). Each reading is a
        list of (initial_consonant(s), vowel_pattern, final_consonant) tuples,
        where the initial and final entries are None when the pattern has no
        such part. Empty when ``text`` is empty or when some part of the text
        cannot be covered by any pattern.
    """
    if not text:
        return []

    text_len = len(text)
    max_cluster = 3  # 'x' is tried with 1..max_cluster consonants

    def match_at(start_pos: int, pattern: str, cluster_size: int):
        """
        Match pattern at start_pos, with 'x' taking exactly cluster_size consonants.

        Returns (syllable, end_pos) where end_pos is the index just past the
        match, or None if the pattern does not fit.
        """
        idx = start_pos
        x_text = ""
        f_text = ""

        for p_char in pattern:
            if p_char == 'x':
                cluster = text[idx:idx + cluster_size]
                # Too few characters left, or a non-consonant inside the cluster.
                if len(cluster) != cluster_size or any(ch not in foundation for ch in cluster):
                    return None
                x_text = cluster
                idx += cluster_size
            elif idx >= text_len:
                # Pattern continues but the text is exhausted.
                return None
            elif p_char == 'f':
                if text[idx] not in foundation:
                    return None
                f_text = text[idx]
                idx += 1
            elif text[idx] == p_char:
                idx += 1
            else:
                return None

        if idx == start_pos:
            # Nothing consumed (empty pattern): not a usable match.
            return None

        return (x_text or None, pattern, f_text or None), idx

    # Matches depend only on the start position, so compute them once per
    # position instead of once per pattern per recursion step.
    matches_by_start: Dict[int, List] = {}

    def syllables_at(pos: int) -> List:
        """Return every (syllable, end_pos) that can start at pos."""
        cached = matches_by_start.get(pos)
        if cached is None:
            cached = []
            for pattern in vowel_patterns:
                if 'x' in pattern:
                    sizes = range(1, min(max_cluster + 1, text_len - pos + 1))
                else:
                    sizes = (0,)  # no 'x' slot, cluster size is irrelevant
                for size in sizes:
                    match = match_at(pos, pattern, size)
                    if match is not None:
                        cached.append(match)
            matches_by_start[pos] = cached
        return cached

    all_readings = []

    def explore(pos: int, current_reading: List) -> None:
        """Depth-first search over segmentations of text[pos:]."""
        if pos == text_len:
            all_readings.append(list(current_reading))
            return
        # Matches are contiguous and everything before pos is already covered,
        # so only matches that begin exactly at pos can extend the reading.
        # If none exists this branch is a dead end and yields no reading.
        for syllable, end_pos in syllables_at(pos):
            current_reading.append(syllable)
            explore(end_pos, current_reading)
            current_reading.pop()

    explore(0, [])

    # Remove duplicates while keeping first-seen order.
    unique_readings = []
    seen = set()
    for reading in all_readings:
        key = tuple(reading)
        if key not in seen:
            seen.add(key)
            unique_readings.append(reading)

    return unique_readings

# Use the new algorithm: the cells below call find_canonical_reading_order,
# so binding that name here routes them to find_all_possible_readings.
find_canonical_reading_order = find_all_possible_readings

In [None]:
# Test the algorithm
def _dedupe_readings(readings):
    """Return readings without exact duplicates, keeping first-seen order."""
    unique = []
    seen = set()
    for reading in readings:
        key = str(reading)
        if key not in seen:
            seen.add(key)
            unique.append(reading)
    return unique


def _format_syllable(initial, vowel, final):
    """Render one (initial, vowel_pattern, final) syllable; final is omitted when absent."""
    if final:
        return f"{initial} + {vowel} + {final}"
    return f"{initial} + {vowel}"


print("="*60)
print("FINDING ALL POSSIBLE READING ORDERS")
print("="*60)

test_cases = [
    (thai1, "simple case"),
    (thai2, "vowel before consonant"),
    (thai3, "hidden vowel"),
    (thai4, "AMBIGUOUS: ว could be final OR part of cluster"),
    (thaihard1, "cluster case"),
    (thaihard2, "อ as silent initial"),
    (thaihard3, "อ with tone mark"),
    (thaihard4, "อ in complex pattern"),
    (thaihard5, "ไ before consonant"),
]

for thai_text, description in test_cases:
    print(f"\n'{thai_text}' - {description}")
    print("-" * 40)

    readings = find_canonical_reading_order(thai_text)
    unique_readings = _dedupe_readings(readings)

    if not unique_readings:
        print("  No valid readings found!")
    elif len(unique_readings) == 1:
        print("  UNAMBIGUOUS - Single reading:")
        for initial, vowel, final in unique_readings[0]:
            print(f"    {_format_syllable(initial, vowel, final)}")
    else:
        print(f"  AMBIGUOUS - {len(unique_readings)} possible readings:")
        for i, reading in enumerate(unique_readings[:10]):  # Show first 10 readings
            print(f"  Reading {i+1}:")
            for initial, vowel, final in reading:
                print(f"    {_format_syllable(initial, vowel, final)}")
        if len(unique_readings) > 10:
            print(f"  ... and {len(unique_readings) - 10} more readings")

print("\n" + "="*60)
print("ANALYSIS OF AMBIGUITIES")
print("="*60)

# Analyze เลว specifically: it is expected to have more than one reading.
print("\nDetailed analysis of 'เลว':")
readings = find_canonical_reading_order("เลว")
unique_readings = _dedupe_readings(readings)

if len(unique_readings) > 1:
    print(f"  Found {len(unique_readings)} interpretations:")
    for i, reading in enumerate(unique_readings):
        print(f"  Interpretation {i+1}:")
        for initial, vowel, final in reading:
            # Append an explanation for the two readings we expect to see.
            if vowel == "เxf" and initial == "ล" and final == "ว":
                note = "  → ล is initial, ว is final consonant"
            elif vowel == "เx" and initial == "ลว" and not final:
                note = "  → ลว is a consonant cluster"
            else:
                note = ""
            print(f"    {_format_syllable(initial, vowel, final)}{note}")
elif len(unique_readings) == 1:
    print("  Only one interpretation found (should be ambiguous!)")
    print("  This suggests the algorithm needs improvement.")
else:
    # Previously reported as "Only one interpretation", which was misleading
    # when the algorithm produced no reading at all.
    print("  No interpretations found (should be ambiguous!)")
    print("  This suggests the algorithm needs improvement.")

In [None]:
# Debug: list the vowel patterns that are candidates for เลว and confirm that
# the three patterns we rely on are present in the loaded data.
print("Debugging เลว patterns:")
print("-" * 40)

test_text = "เลว"
print(f"Text: {test_text}")
print(f"Characters: {list(test_text)}")
print()

# A pattern is a candidate when it contains "เx"; that single substring test
# also covers the longer forms "เxf" and "เxว" (and implies "เ" is present).
matching_patterns = []
for pattern in vowel_patterns:
    if "เx" not in pattern:
        continue
    matching_patterns.append(pattern)
    print(f"Potential pattern: {pattern}")

print(f"\nFound {len(matching_patterns)} patterns with เ")

# Check that each pattern behind the expected interpretations exists:
#   เxf -> เ[ล]f, ว is the final consonant
#   เx  -> เ[ลว], ลว is a consonant cluster
#   เxว -> เ[ล]ว, ว is part of the vowel pattern
expected_patterns = [
    ("\n1. Testing เxf pattern (ล as x, ว as f):", "เxf"),
    ("\n2. Testing เx pattern (ลว as cluster x):", "เx"),
    ("\n3. Testing เxว pattern (ล as x, ว part of vowel):", "เxว"),
]
for heading, wanted in expected_patterns:
    print(heading)
    status = "exists" if wanted in vowel_patterns else "NOT FOUND!"
    print(f"   Pattern {wanted} {status}")

In [None]:
# Run the new algorithm on เลว and explain every reading it reports.
print("\n" + "="*60)
print("TESTING เลว WITH NEW ALGORITHM")
print("="*60)

text = "เลว"
print(f"\nAnalyzing: {text}")

# Each reading is a list of (initial, pattern, final) syllable tuples.
readings = find_canonical_reading_order(text)

print(f"Found {len(readings)} possible reading(s):")

for number, reading in enumerate(readings, start=1):
    print(f"\nReading {number}:")
    for initial, vowel_pattern, final in reading:
        # Build the "initial=... + pattern=... + final=..." summary, leaving
        # out the parts this syllable does not have.
        parts = []
        if initial:
            parts.append(f"initial={initial}")
        parts.append(f"pattern={vowel_pattern}")
        if final:
            parts.append(f"final={final}")
        print(f"  {' + '.join(parts)}")

        # Add a plain-language explanation for the three expected readings.
        if vowel_pattern == "เxว" and initial == "ล":
            print("    → ล is initial, ว is part of vowel pattern เxว")
        elif vowel_pattern == "เxf" and initial == "ล" and final == "ว":
            print("    → ล is initial, ว is final consonant")
        elif vowel_pattern == "เx" and initial == "ลว":
            print("    → ลว is a consonant cluster")

if len(readings) >= 2:
    print("\n✓ Successfully found ambiguity!")
else:
    print("\n⚠️ PROBLEM: Should find multiple interpretations but only found", len(readings))
    print("Expected at least:")
    print("  1. ล + เxว (ว part of vowel)")
    print("  2. ล + เxf (ว as final)")
    print("  3. ลว + เx (cluster)")