In [None]:
!pip install tqdm numpy



In [None]:
import re
from collections import Counter
from typing import List, Tuple, Dict, Generator, Set
import tqdm

# Definition of valid Amharic characters
# This set is the whitelist consulted by is_amharic_char(). Anything that is
# not listed here (whitespace, Latin letters, digits, punctuation such as
# '።', ...) is not a member.
AMHARIC_CHARS: Set[str] = {
    # Fidel - base characters
    # Each row below lists seven forms of one base consonant.
    'ሀ', 'ሁ', 'ሂ', 'ሃ', 'ሄ', 'ህ', 'ሆ',
    'ለ', 'ሉ', 'ሊ', 'ላ', 'ሌ', 'ል', 'ሎ',
    'ሐ', 'ሑ', 'ሒ', 'ሓ', 'ሔ', 'ሕ', 'ሖ',
    'መ', 'ሙ', 'ሚ', 'ማ', 'ሜ', 'ም', 'ሞ',
    'ሠ', 'ሡ', 'ሢ', 'ሣ', 'ሤ', 'ሥ', 'ሦ',
    'ረ', 'ሩ', 'ሪ', 'ራ', 'ሬ', 'ር', 'ሮ',
    'ሰ', 'ሱ', 'ሲ', 'ሳ', 'ሴ', 'ስ', 'ሶ',
    'ሸ', 'ሹ', 'ሺ', 'ሻ', 'ሼ', 'ሽ', 'ሾ',
    'ቀ', 'ቁ', 'ቂ', 'ቃ', 'ቄ', 'ቅ', 'ቆ',
    'በ', 'ቡ', 'ቢ', 'ባ', 'ቤ', 'ብ', 'ቦ',
    'ተ', 'ቱ', 'ቲ', 'ታ', 'ቴ', 'ት', 'ቶ',
    'ቸ', 'ቹ', 'ቺ', 'ቻ', 'ቼ', 'ች', 'ቾ',
    'ኀ', 'ኁ', 'ኂ', 'ኃ', 'ኄ', 'ኅ', 'ኆ',
    'ነ', 'ኑ', 'ኒ', 'ና', 'ኔ', 'ን', 'ኖ',
    'ኘ', 'ኙ', 'ኚ', 'ኛ', 'ኜ', 'ኝ', 'ኞ',
    'አ', 'ኡ', 'ኢ', 'ኣ', 'ኤ', 'እ', 'ኦ',
    'ከ', 'ኩ', 'ኪ', 'ካ', 'ኬ', 'ክ', 'ኮ',
    'ኸ', 'ኹ', 'ኺ', 'ኻ', 'ኼ', 'ኽ', 'ኾ',
    'ወ', 'ዉ', 'ዊ', 'ዋ', 'ዌ', 'ው', 'ዎ',
    'ዐ', 'ዑ', 'ዒ', 'ዓ', 'ዔ', 'ዕ', 'ዖ',
    'ዘ', 'ዙ', 'ዚ', 'ዛ', 'ዜ', 'ዝ', 'ዞ',
    'ዠ', 'ዡ', 'ዢ', 'ዣ', 'ዤ', 'ዥ', 'ዦ',
    'የ', 'ዩ', 'ዪ', 'ያ', 'ዬ', 'ይ', 'ዮ',
    'ደ', 'ዱ', 'ዲ', 'ዳ', 'ዴ', 'ድ', 'ዶ',
    'ጀ', 'ጁ', 'ጂ', 'ጃ', 'ጄ', 'ጅ', 'ጆ',
    'ገ', 'ጉ', 'ጊ', 'ጋ', 'ጌ', 'ግ', 'ጎ',
    'ጠ', 'ጡ', 'ጢ', 'ጣ', 'ጤ', 'ጥ', 'ጦ',
    'ጨ', 'ጩ', 'ጪ', 'ጫ', 'ጬ', 'ጭ', 'ጮ',
    'ጰ', 'ጱ', 'ጲ', 'ጳ', 'ጴ', 'ጵ', 'ጶ',
    'ጸ', 'ጹ', 'ጺ', 'ጻ', 'ጼ', 'ጽ', 'ጾ',
    'ፀ', 'ፁ', 'ፂ', 'ፃ', 'ፄ', 'ፅ', 'ፆ',
    'ፈ', 'ፉ', 'ፊ', 'ፋ', 'ፌ', 'ፍ', 'ፎ',
    'ፐ', 'ፑ', 'ፒ', 'ፓ', 'ፔ', 'ፕ', 'ፖ',
    'ቨ', 'ቩ', 'ቪ', 'ቫ', 'ቬ', 'ቭ', 'ቮ',

    # Labialized characters
    # One additional form per listed consonant (26 entries in total).
    'ሏ', 'ሟ', 'ሯ', 'ሷ', 'ሿ', 'ቋ', 'ቧ', 'ቷ', 'ቿ', 'ኋ', 'ኗ', 'ኟ','ዃ',
    'ኳ', 'ዟ', 'ዧ', 'ዷ', 'ጇ', 'ጓ', 'ጧ', 'ጯ', 'ጷ', 'ጿ', 'ፏ', 'ፗ', 'ቯ',

    # Numbers
    # Ethiopic numerals: first row is 1-10, second row is 20-90, 100 and 10000.
    '፩', '፪', '፫', '፬', '፭', '፮', '፯', '፰', '፱', '፲',
    '፳', '፴', '፵', '፶', '፷', '፸', '፹', '፺', '፻', '፼'
}

def is_amharic_char(char: str) -> bool:
    """
    Report whether ``char`` is listed in the ``AMHARIC_CHARS`` whitelist.

    The test is a plain set-membership lookup, so any value not present in
    the set (spaces, punctuation, Latin letters, multi-character strings)
    yields ``False``.
    """
    in_whitelist = char in AMHARIC_CHARS
    return in_whitelist


def clean_amharic_text(text: str) -> str:
    """
    Reduce ``text`` to whitelisted Amharic characters separated by single spaces.

    Every character rejected by ``is_amharic_char`` is turned into a space;
    the result is then split on whitespace and re-joined, which collapses
    runs of spaces and trims both ends.
    """
    pieces = []
    for ch in text:
        # Non-Amharic characters become separators rather than being dropped,
        # so "ab-cd" style input still splits into two words.
        pieces.append(ch if is_amharic_char(ch) else ' ')
    words = ''.join(pieces).split()
    return ' '.join(words)

def read_vert_file_lines(filename: str, chunk_size: int = 1000000) -> Generator[List[str], None, None]:
    """
    Lazily read a text file and yield its lines in lists of ``chunk_size``.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's default encoding. Each line has leading and trailing
    whitespace stripped. The final chunk may be shorter than ``chunk_size``;
    an empty chunk is never yielded.

    Args:
        filename: Path of the file to read.
        chunk_size: Maximum number of lines per yielded list; must be >= 1.

    Yields:
        Lists of stripped lines, in file order.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if chunk_size < 1:
        # A non-positive size would otherwise produce empty chunks forever.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    with open(filename, 'r', encoding='utf-8') as file:
        lines = []
        for line in file:
            lines.append(line.strip())
            if len(lines) == chunk_size:
                yield lines
                lines = []
        # Flush whatever is left after the last full chunk.
        if lines:
            yield lines

def process_chunk(chunk: List[str]) -> List[str]:
    """
    Turn a chunk of vertical-format lines into cleaned sentence strings.

    ``<s>`` starts a new sentence and ``</s>`` closes the current one. For any
    other line the ``<g/>`` marker is removed and the leading run of
    non-whitespace characters is collected as a token. On close, every token
    is passed through ``clean_amharic_text``; tokens that end up empty are
    dropped, and the sentence is kept only if something remains.

    NOTE: the token buffer is local to this call, so a sentence whose ``<s>``
    and ``</s>`` fall in different chunks is not reassembled.
    """
    finished = []
    tokens = []

    for raw in chunk:
        if raw == '<s>':
            tokens = []
            continue

        if raw == '</s>' and tokens:
            # Clean each token, then discard the ones that cleaned to nothing.
            words = [w for w in (clean_amharic_text(tok) for tok in tokens) if w.strip()]
            if words:
                finished.append(" ".join(words))
            tokens = []
            continue

        # Ordinary line (or a '</s>' with nothing buffered): keep its first field.
        without_glue = raw.replace('<g/>', '')
        tokens.extend(re.findall(r"^\S+", without_glue))

    return finished

def process_sentences_in_batches(sentences: List[str], batch_size: int = 5000) -> Generator[List[str], None, None]:
    """
    Yield consecutive slices of ``sentences``, each at most ``batch_size`` long.

    The slices are produced in order; the last one may be shorter.
    """
    total = len(sentences)
    offsets = range(0, total, batch_size)
    for start in offsets:
        stop = start + batch_size
        yield sentences[start:stop]

def count_amharic_chars(text: str) -> Counter:
    """
    Tally how often each whitelisted Amharic character appears in ``text``.

    Characters rejected by ``is_amharic_char`` are ignored.
    """
    tally = Counter()
    for ch in text:
        if is_amharic_char(ch):
            tally[ch] += 1
    return tally

def select_sentences_memory_efficient(sentences_generator: Generator,
                                   target_count: int = 24000,
                                   min_words: int = 6,
                                   min_char_occurrences: int = 2000,
                                   batch_size: int = 5000) -> Tuple[List[str], Dict, Dict]:
    """
    Select ``target_count`` sentences and rebalance them for character coverage.

    First pass: every incoming sentence is cleaned with ``clean_amharic_text``.
    Sentences with at least ``min_words`` words are eligible, and the first
    ``target_count`` eligible ones form the initial selection.

    Second pass: for each character that occurs fewer than
    ``min_char_occurrences`` times in the selection -- including members of
    ``AMHARIC_CHARS`` that do not occur at all -- selected sentences lacking
    that character are replaced, front to back, by not-yet-selected eligible
    sentences containing it. This continues until the threshold is reached or
    no replacement is possible.

    Limitations:
        * The list of deficient characters is computed once. A swap made for
          one character can lower the count of another, and that is not
          re-checked.
        * A sentence swapped out of the selection is not returned to the
          candidate pool.
        * All eligible sentences are kept in memory for the second pass.

    Args:
        sentences_generator: Iterable yielding batches (lists) of sentences.
        target_count: Number of sentences to select.
        min_words: Minimum number of words a cleaned sentence must have.
        min_char_occurrences: Desired minimum count for every Amharic character.
        batch_size: Accepted for backward compatibility. It never influenced
            the result and is not used by this implementation.

    Returns:
        A tuple ``(sentences, char_frequencies, failed_chars)``:
        the selected sentences, each suffixed with ``"::"``; a ``Counter`` of
        Amharic character counts over the final selection; and a dict mapping
        each character that stayed below the threshold to its final count.

    Raises:
        ValueError: If fewer than ``target_count`` eligible sentences exist.
    """
    selected_sentences = []
    char_frequencies = Counter()
    all_eligible_sentences = []

    print("First pass: collecting eligible sentences...")
    for sentences_batch in sentences_generator:
        # Clean sentences and filter by word count
        eligible_batch = []
        for s in sentences_batch:
            cleaned = clean_amharic_text(s)
            if len(cleaned.split()) >= min_words:
                eligible_batch.append(cleaned)

        all_eligible_sentences.extend(eligible_batch)

        needed = target_count - len(selected_sentences)
        if needed > 0:
            batch_selection = eligible_batch[:needed]
            selected_sentences.extend(batch_selection)
            for sentence in batch_selection:
                char_frequencies.update(count_amharic_chars(sentence))

    if len(selected_sentences) < target_count:
        raise ValueError(f"Only found {len(selected_sentences)} eligible sentences, needed {target_count}")

    # Characters seen at least once keep their first-seen order. Characters that
    # never occurred are absent from the Counter, so they are added explicitly
    # (sorted, to keep the processing order reproducible).
    deficient_chars = [char for char, count in char_frequencies.items()
                       if count < min_char_occurrences]
    if min_char_occurrences > 0:
        deficient_chars.extend(sorted(char for char in AMHARIC_CHARS
                                      if char not in char_frequencies))

    failed_chars = {}

    if deficient_chars:
        print("Second pass: optimizing character frequencies...")
        already_selected = set(selected_sentences)
        # A dict is used as an insertion-ordered set: candidates are tried in
        # corpus order, which makes the outcome independent of string hashing.
        remaining_sentences = dict.fromkeys(
            s for s in all_eligible_sentences if s not in already_selected
        )

        for char in tqdm.tqdm(deficient_chars):
            if not remaining_sentences:
                failed_chars[char] = char_frequencies[char]
                continue

            # Every slot before swap_index is known to contain `char` (either
            # it always did, or it was just replaced), so the search for the
            # next slot to replace never has to restart from the beginning.
            swap_index = 0
            used_candidates = []

            for candidate in remaining_sentences:
                if char_frequencies[char] >= min_char_occurrences:
                    break
                if char not in candidate:
                    continue

                while (swap_index < len(selected_sentences)
                       and char in selected_sentences[swap_index]):
                    swap_index += 1
                if swap_index == len(selected_sentences):
                    # Every selected sentence already contains `char`.
                    break

                # Keep the running counts in step with the swap.
                char_frequencies.subtract(count_amharic_chars(selected_sentences[swap_index]))
                char_frequencies.update(count_amharic_chars(candidate))

                selected_sentences[swap_index] = candidate
                used_candidates.append(candidate)
                swap_index += 1

            # Remove consumed candidates only after iterating the pool.
            for candidate in used_candidates:
                del remaining_sentences[candidate]

            if char_frequencies[char] < min_char_occurrences:
                failed_chars[char] = char_frequencies[char]

    return [s + "::" for s in selected_sentences], char_frequencies, failed_chars


def main():
    """
    Run the full pipeline: read the corpus, select sentences, report, save.

    Reads the vertical-format corpus at a fixed path, extracts sentences,
    selects and rebalances them with ``select_sentences_memory_efficient``,
    prints statistics, and writes the selected sentences (one per line) to
    ``selected_amharic_sentences.txt`` as UTF-8.

    A ``ValueError`` raised during processing is reported on stdout instead of
    propagating.
    """
    filename = '/content/am131516.vert'
    batch_size = 5000
    # Single source of truth for the threshold: it is passed to the selection
    # step and also quoted in the failure report below.
    min_char_occurrences = 2000

    try:
        print("Processing file in chunks...")
        all_sentences = []

        for chunk in read_vert_file_lines(filename):
            sentences = process_chunk(chunk)
            all_sentences.extend(sentences)
            print(f"\rCollected {len(all_sentences)} sentences...", end="")

        print(f"\nTotal sentences collected: {len(all_sentences)}")

        # Print character set information
        print("\nValid Amharic characters in our set:")
        for char in sorted(AMHARIC_CHARS):
            print(f"'{char}' (U+{ord(char):04X})")

        sentences_generator = process_sentences_in_batches(all_sentences, batch_size)

        print("\nSelecting and optimizing sentences...")
        selected_sentences, char_frequencies, failed_chars = select_sentences_memory_efficient(
            sentences_generator,
            min_char_occurrences=min_char_occurrences,
            batch_size=batch_size
        )

        # Filter character frequencies to show only valid Amharic characters
        amharic_frequencies = {char: freq for char, freq in char_frequencies.items()
                             if is_amharic_char(char)}

        print("\nStatistics:")
        print(f"Number of selected sentences: {len(selected_sentences)}")
        print(f"Total Amharic characters: {sum(amharic_frequencies.values())}")
        print(f"Unique Amharic characters: {len(amharic_frequencies)}")
        print("\nAmharic character frequencies:")
        for char, freq in sorted(amharic_frequencies.items(), key=lambda x: x[1], reverse=True):
            print(f"'{char}' (U+{ord(char):04X}): {freq}")

        # Defensive check: report any counted character that is not in our set.
        invalid_chars = {char: freq for char, freq in char_frequencies.items()
                        if not is_amharic_char(char) and char.strip()}
        if invalid_chars:
            print("\nWARNING: Found and removed these non-Amharic characters:")
            for char, freq in sorted(invalid_chars.items(), key=lambda x: x[1], reverse=True):
                print(f"'{char}' (U+{ord(char):04X}): {freq} occurrences")

        if failed_chars:
            print("\nWARNING: Could not reach minimum occurrence threshold for these Amharic characters:")
            for char, count in failed_chars.items():
                print(f"'{char}' (U+{ord(char):04X}): Only found {count} occurrences (needed {min_char_occurrences})")

        print("\nSaving results...")
        output_filename = 'selected_amharic_sentences.txt'
        # Explicit UTF-8 so the Amharic text is written identically on every platform.
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.writelines(sentence + '\n' for sentence in selected_sentences)
        print(f"Selected sentences saved to {output_filename}")

    except ValueError as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    main()

Processing file in chunks...
Collected 1167859 sentences...
Total sentences collected: 1167859

Valid Amharic characters in our set:
'ሀ' (U+1200)
'ሁ' (U+1201)
'ሂ' (U+1202)
'ሃ' (U+1203)
'ሄ' (U+1204)
'ህ' (U+1205)
'ሆ' (U+1206)
'ለ' (U+1208)
'ሉ' (U+1209)
'ሊ' (U+120A)
'ላ' (U+120B)
'ሌ' (U+120C)
'ል' (U+120D)
'ሎ' (U+120E)
'ሏ' (U+120F)
'ሐ' (U+1210)
'ሑ' (U+1211)
'ሒ' (U+1212)
'ሓ' (U+1213)
'ሔ' (U+1214)
'ሕ' (U+1215)
'ሖ' (U+1216)
'መ' (U+1218)
'ሙ' (U+1219)
'ሚ' (U+121A)
'ማ' (U+121B)
'ሜ' (U+121C)
'ም' (U+121D)
'ሞ' (U+121E)
'ሟ' (U+121F)
'ሠ' (U+1220)
'ሡ' (U+1221)
'ሢ' (U+1222)
'ሣ' (U+1223)
'ሤ' (U+1224)
'ሥ' (U+1225)
'ሦ' (U+1226)
'ረ' (U+1228)
'ሩ' (U+1229)
'ሪ' (U+122A)
'ራ' (U+122B)
'ሬ' (U+122C)
'ር' (U+122D)
'ሮ' (U+122E)
'ሯ' (U+122F)
'ሰ' (U+1230)
'ሱ' (U+1231)
'ሲ' (U+1232)
'ሳ' (U+1233)
'ሴ' (U+1234)
'ስ' (U+1235)
'ሶ' (U+1236)
'ሷ' (U+1237)
'ሸ' (U+1238)
'ሹ' (U+1239)
'ሺ' (U+123A)
'ሻ' (U+123B)
'ሼ' (U+123C)
'ሽ' (U+123D)
'ሾ' (U+123E)
'ሿ' (U+123F)
'ቀ' (U+1240)
'ቁ' (U+1241)
'ቂ' (U+1242)
'ቃ' (U+1243)
'ቄ' (U+1244)
'ቅ' (U+12

100%|██████████| 162/162 [01:06<00:00,  2.43it/s]



Statistics:
Number of selected sentences: 24000
Total Amharic characters: 1759298
Unique Amharic characters: 284

Amharic character frequencies:
'ን' (U+1295): 96923
'ት' (U+1275): 67196
'በ' (U+1260): 58325
'የ' (U+12E8): 58172
'ው' (U+12CD): 57360
'መ' (U+1218): 48076
'ር' (U+122D): 46549
'ም' (U+121D): 46410
'ለ' (U+1208): 46368
'አ' (U+12A0): 45217
'ስ' (U+1235): 44527
'ል' (U+120D): 43849
'ተ' (U+1270): 37042
'ነ' (U+1290): 36062
'እ' (U+12A5): 35659
'ይ' (U+12ED): 35029
'ያ' (U+12EB): 34555
'ና' (U+1293): 30193
'ደ' (U+12F0): 26513
'ች' (U+127D): 25877
'ብ' (U+1265): 25573
'ገ' (U+1308): 25452
'ማ' (U+121B): 24285
'ግ' (U+130D): 24147
'ከ' (U+12A8): 23370
'ላ' (U+120B): 22464
'ረ' (U+1228): 21737
'ድ' (U+12F5): 20808
'ሰ' (U+1230): 19553
'ባ' (U+1263): 18697
'ራ' (U+122B): 18604
'ሚ' (U+121A): 18186
'ታ' (U+1273): 17957
'ወ' (U+12C8): 17937
'ህ' (U+1205): 15479
'ጥ' (U+1325): 13775
'ክ' (U+12AD): 13768
'ቀ' (U+1240): 13739
'ሳ' (U+1233): 12073
'ሆ' (U+1206): 11993
'ጠ' (U+1320): 11586
'ቸ' (U+1278): 11413
'ቅ' (U+1245): 

# Generating Forms


In [None]:
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/244.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


# Sample Form

In [None]:
from docx import Document
from docx.shared import Pt, Cm
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT

def read_first_n_lines(file_path: str, n: int) -> list:
    """
    Read up to the first ``n`` lines of a UTF-8 text file.

    Each returned line has leading and trailing whitespace stripped. If the
    file has fewer than ``n`` lines, the available lines are returned instead
    of raising ``StopIteration``. A non-positive ``n`` gives an empty list.

    Args:
        file_path: Path of the file to read.
        n: Maximum number of lines to return.

    Returns:
        A list of at most ``n`` stripped lines, in file order.
    """
    from itertools import islice

    with open(file_path, 'r', encoding='utf-8') as file:
        # islice stops cleanly at end-of-file; max() guards against negative n,
        # which islice would reject.
        return [line.strip() for line in islice(file, max(n, 0))]

def create_single_page_form(sentences: list, form_number: int, output_path: str) -> None:
    """
    Build a handwriting-collection form and save it as a .docx file.

    Layout, top to bottom: the zero-padded form number (centred), a rule of
    underscores, one paragraph holding all sentences, a second rule, eight
    spacer paragraphs, and a footer with a rule and a "Writer Number" blank.

    Each sentence has leading/trailing ':' characters stripped and the
    Ethiopic full stop '።' appended; sentences are separated by a single
    space run.

    Args:
        sentences: Sentences to print on the form.
        form_number: Number shown at the top, formatted with four digits.
        output_path: Destination path of the .docx file.
    """
    document = Document()
    page = document.sections[0]

    # Same 1.27 cm margin on all four sides.
    for side in ("top_margin", "bottom_margin", "left_margin", "right_margin"):
        setattr(page, side, Cm(1.27))

    # Heading: the form number, centred.
    number_para = document.add_paragraph()
    number_para.add_run(f"{form_number:04d}")
    number_para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
    number_para.paragraph_format.space_after = Pt(6)

    # Rule above the text.
    rule = "_" * 105
    document.add_paragraph(rule)

    # Text to be copied by the writer: one run per sentence, one per separator.
    body = document.add_paragraph()
    body.paragraph_format.space_after = Pt(12)
    for index, sentence in enumerate(sentences):
        body.add_run(sentence.strip("::") + "።")
        is_last = not (index < len(sentences) - 1)
        if not is_last:
            body.add_run(" ")

    # Rule below the text.
    document.add_paragraph(rule)

    # Blank writing area: eight paragraphs, each containing a line break.
    for _ in range(8):
        document.add_paragraph("\n")

    # Footer: reuse its first paragraph when there is one.
    existing = page.footer.paragraphs
    footer_para = existing[0] if existing else page.footer.add_paragraph()
    footer_para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

    footer_para.add_run("_" * 98).font.size = Pt(12)
    footer_para.add_run("\n\n")
    footer_para.add_run("Writer Number: " + "_" * 20)

    document.save(output_path)

# Smoke test: build one form from the first six sentences of the input file.
test_input_path = 'selected_amharic_sentences(2).txt'
try:
    test_sentences = read_first_n_lines(test_input_path, 6)
    create_single_page_form(test_sentences, 1, 'test_form.docx')
    print("Single-page form created as 'test_form.docx'")
except FileNotFoundError:
    # Report the path that was actually opened, not a different file name.
    print(f"Error: Could not find '{test_input_path}'")
except Exception as e:
    print(f"An error occurred: {e}")

Single-page form created as 'test_form.docx'
