Amharic Text Processing Script

Cleans and analyzes Amharic text by removing unwanted symbols, English letters, numbers, and emojis.


In [1]:
# Install dependencies
!pip install emoji --quiet

import re
import emoji
from collections import Counter


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/590.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Read the whole source document into memory as UTF-8 text
with open('/content/11.md', mode='r', encoding='utf-8') as source_file:
    text = source_file.read()

In [16]:
import re
import os
from typing import Dict, Tuple

# The emoji package is optional. When it cannot be imported, a warning is
# printed and the name `emoji` is bound to None so that later code can test
# `if emoji:` and fall back to plain Unicode-range matching.
try:
    import emoji
except ImportError:
    print("Warning: emoji library not found. Install with: pip install emoji")
    emoji = None

def load_text_file(file_path: str) -> str:
    """Read a UTF-8 text file and return its entire contents.

    Failures are reported on stdout instead of being raised.

    Args:
        file_path: Location of the file to read.

    Returns:
        The decoded contents of the file. An empty string is returned when
        the file does not exist or when any other error occurs while opening
        or reading it (an empty file also yields an empty string).
    """
    contents = ""
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            contents = handle.read()
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    except Exception as err:
        # Best effort by design: any other failure is reported, not raised.
        print(f"Error reading file: {err}")
    return contents

def amharic_only_cleaned(text: str) -> str:
    """
    Reduce text to its Ethiopic-script words, one word per line.

    Emojis, English words, numbers and math expressions are stripped first
    (via remove_emojis and remove_english_and_numbers). Ethiopic punctuation
    is then treated as a word boundary, every remaining character that is
    neither in the Ethiopic block (U+1200-U+137F) nor whitespace is deleted,
    and the surviving words are joined with newlines.

    Note that anything else inside the retained block, such as Ethiopic
    numerals, is kept because only the punctuation marks are singled out.

    Args:
        text: Raw input text.

    Returns:
        The extracted words separated by newline characters, or an empty
        string when no Ethiopic characters remain.
    """
    text = remove_emojis(text)
    text = remove_english_and_numbers(text)

    # U+1361-U+1368 are the Ethiopic wordspace, full stop, comma, semicolon,
    # colon, preface colon, question mark and paragraph separator. They
    # delimit words, often with no surrounding space, so they become a space
    # instead of being deleted; otherwise "ሰላም፡ዓለም" would fuse into one word.
    text = re.sub(r'[\u1361-\u1368]', ' ', text)

    # Whatever else is neither Ethiopic nor whitespace (ASCII punctuation,
    # math symbols, leftover Latin letters or digits) is deleted outright.
    amharic_text = re.sub(r'[^\u1200-\u137F\s]', '', text)

    # str.split() with no argument never yields empty strings.
    return '\n'.join(amharic_text.split())

def remove_emojis(text: str) -> str:
    """Return ``text`` with emoji characters deleted.

    When the module-level ``emoji`` name is falsy (the optional package was
    not importable), a fixed set of Unicode ranges is stripped with a regular
    expression instead; otherwise ``emoji.replace_emoji`` does the work.

    Args:
        text: Input text.

    Returns:
        The text with every matched emoji replaced by the empty string.
    """
    if not emoji:
        # Basic emoji removal using Unicode ranges if emoji library unavailable
        fallback_ranges = r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF\U00002700-\U000027BF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F]'
        return re.sub(fallback_ranges, '', text)

    return emoji.replace_emoji(text, replace='')

def remove_english_and_numbers(text: str) -> str:
    """Remove English words, numbers, mathematical expressions, and related symbols.

    Latin letters and digits are removed wherever they occur, including
    inside tokens that mix letters, digits or underscores (for example
    "COVID19" or "3D"). Such tokens contain no word boundary between their
    parts, so boundary-anchored patterns would leave them in place.

    Args:
        text: Input text.

    Returns:
        The text with the matched content deleted and every run of
        whitespace (newlines included) collapsed to a single space. Leading
        and trailing spaces are not stripped.
    """
    # Remove mathematical expressions and formulas
    text = re.sub(r'[0-9]+\s*[+\-*/=×÷]\s*[0-9]+', '', text)  # Simple math expressions (covers fractions)
    text = re.sub(r'[0-9]+\s*%', '', text)  # Percentages
    text = re.sub(r'\([^)]*[0-9][^)]*\)', '', text)  # Content in parentheses with numbers

    # Remove numbers (integers and decimals). No \b anchors: a digit run
    # glued to a letter has no word boundary and would otherwise be skipped.
    text = re.sub(r'\d+(?:\.\d+)?', '', text)

    # Remove English letters, again without \b so that the letter part of
    # mixed tokens is removed as well.
    text = re.sub(r'[A-Za-z]+', '', text)

    # Remove mathematical symbols and operators
    math_symbols = r'[+\-*/=×÷±≠≤≥∞√∑∏∫∂∆∇°%~]'
    text = re.sub(math_symbols, '', text)

    # Collapse whitespace runs left behind by the removals
    text = re.sub(r'\s+', ' ', text)

    return text

def remove_english_numbers(text: str) -> str:
    """Strip emojis, then English words, numbers and math expressions.

    Args:
        text: Input text.

    Returns:
        The result of passing ``text`` through remove_emojis followed by
        remove_english_and_numbers.
    """
    return remove_english_and_numbers(remove_emojis(text))

def remove_emojis_english_numbers(text: str) -> str:
    """Strip emojis first, then English words, numbers and math expressions.

    Args:
        text: Input text.

    Returns:
        The output of remove_english_and_numbers applied to the
        emoji-free text produced by remove_emojis.
    """
    without_emojis = remove_emojis(text)
    cleaned = remove_english_and_numbers(without_emojis)
    return cleaned

def analyze_text(text: str) -> Dict[str, int]:
    """Count several categories of characters and patterns in ``text``.

    Every statistic except ``total_length`` and ``emojis`` is the number of
    matches of one regular expression. Emojis are counted per character via
    ``emoji.EMOJI_DATA`` when the ``emoji`` module is available, and via a
    fixed set of Unicode ranges otherwise.

    Args:
        text: Text to analyse.

    Returns:
        A dict mapping statistic names to counts, in this insertion order:
        total_length, amharic_chars, english_letters, english_words,
        numbers, math_expressions, percentages, parentheses_content,
        math_symbols, punctuation, whitespace_chars, emojis.
    """
    # (statistic name, pattern whose match count is reported), in report order
    counted_patterns = (
        ("amharic_chars", r'[\u1200-\u137F]'),
        ("english_letters", r'[A-Za-z]'),
        ("english_words", r'\b[A-Za-z]+\b'),
        ("numbers", r'\d'),  # counts individual digits
        ("math_expressions", r'[0-9]+\s*[+\-*/=×÷]\s*[0-9]+'),
        ("percentages", r'[0-9]+\s*%'),
        ("parentheses_content", r'\([^)]*\)'),
        ("math_symbols", r'[+\-*/=×÷±≠≤≥∞√∑∏∫∂∆∇°%~]'),
        ("punctuation", r'[^\w\s\u1200-\u137F]'),
        ("whitespace_chars", r'\s'),
    )

    analysis = {"total_length": len(text)}
    for label, pattern in counted_patterns:
        analysis[label] = len(re.findall(pattern, text))

    if not emoji:
        # Basic emoji counting using Unicode ranges
        fallback_ranges = r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF\U00002700-\U000027BF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F]'
        emoji_total = len(re.findall(fallback_ranges, text))
    else:
        emoji_total = sum(1 for ch in text if ch in emoji.EMOJI_DATA)
    analysis["emojis"] = emoji_total

    return analysis

def text_to_word_lines(text: str) -> str:
    """Convert text to word-per-line format after cleaning.

    Emojis, English words, numbers and math expressions are removed first
    (via remove_emojis and remove_english_and_numbers). Ethiopic punctuation
    is then treated as a word boundary, the remaining listed ASCII and
    mathematical symbols are deleted, and the words are joined with newlines.
    Unlike a strict Ethiopic-only filter, characters that are not in the
    symbol list are left in place.

    Args:
        text: Raw input text.

    Returns:
        The remaining whitespace-separated words joined by newline
        characters, or an empty string when nothing remains.
    """
    text = remove_emojis(text)
    text = remove_english_and_numbers(text)

    # U+1361-U+1368 are the Ethiopic wordspace, full stop, comma, semicolon,
    # colon, preface colon, question mark and paragraph separator. They
    # delimit words, often with no surrounding space, so they become a space
    # instead of being deleted; otherwise "ሰላም፡ዓለም" would fuse into one word.
    text = re.sub(r'[\u1361-\u1368]', ' ', text)

    # The other unwanted symbols are deleted without inserting a space.
    symbols_to_remove = r"[?!.,;:\-\(\)\[\]\{\}\"'/\\@#$%&*\+=<>|~^_≠≤≥∞±×÷√∑∏∫∂∆∇°]"
    cleaned = re.sub(symbols_to_remove, '', text)

    # str.split() with no argument never yields empty strings.
    return '\n'.join(cleaned.split())

def save_processed_text(results: Dict[str, str], analysis: Dict[str, int], output_path: str):
    """Write the processed texts and the statistics to one UTF-8 report file.

    The file is opened in write mode, so existing content is replaced.

    Args:
        results: Processed texts; the keys "amharic_only",
            "no_english_numbers", "no_english_numbers_emojis" and
            "word_lines" are read.
        analysis: Statistic names mapped to counts, written one per line in
            the dict's iteration order.
        output_path: Destination file path.
    """
    # (section heading, key into results) for the three leading sections
    leading_sections = (
        ("=== AMHARIC ONLY (CLEANED & WORD PER LINE) ===\n", "amharic_only"),
        ("=== REMOVE ENGLISH WORDS, NUMBERS & MATH EXPRESSIONS ===\n", "no_english_numbers"),
        ("=== REMOVE ENGLISH, NUMBERS, MATH, EMOJIS ===\n", "no_english_numbers_emojis"),
    )

    with open(output_path, "w", encoding="utf-8") as report:
        for heading, result_key in leading_sections:
            report.write(heading)
            report.write(results[result_key] + "\n\n")

        report.write("=== TEXT ANALYSIS ===\n")
        for stat_name, stat_value in analysis.items():
            report.write(f"{stat_name}: {stat_value}\n")

        # The final section is written without a trailing newline.
        report.write("\n=== WORD-PER-LINE (FULL TEXT) ===\n")
        report.write(results["word_lines"])

def main():
    """Run the pipeline: load the input, transform it, save and report.

    Reads "11.md", produces four cleaned variants of the text plus character
    statistics of the original text, writes everything to
    "final_processed_text.txt" and prints the statistics. Returns early,
    after printing a message, when the input file is missing or yields no
    text.
    """
    # Configuration
    input_file = "11.md"  # Change this to your input file path
    output_file = "final_processed_text.txt"

    # Guard: nothing to do without an input file
    if not os.path.exists(input_file):
        print(f"Input file '{input_file}' not found.")
        print("Please ensure the file exists or update the input_file variable.")
        return

    print(f"Loading text from: {input_file}")
    text = load_text_file(input_file)
    if not text:
        print("No text loaded. Exiting.")
        return

    print("Processing text...")

    # Result key -> transformation applied to the raw text, in output order
    transformations = {
        "amharic_only": amharic_only_cleaned,
        "no_english_numbers": remove_english_numbers,
        "no_english_numbers_emojis": remove_emojis_english_numbers,
        "word_lines": text_to_word_lines,
    }
    results = {label: transform(text) for label, transform in transformations.items()}

    # Statistics describe the original, unprocessed text
    analysis = analyze_text(text)

    save_processed_text(results, analysis, output_file)

    print(f"Processing complete! Results saved to: {output_file}")
    print("\nText Analysis Summary:")
    for stat_name, stat_value in analysis.items():
        print(f"  {stat_name}: {stat_value}")

# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Loading text from: 11.md
Processing text...
Processing complete! Results saved to: final_processed_text.txt

Text Analysis Summary:
  total_length: 444262
  amharic_chars: 125662
  english_letters: 140709
  english_words: 71355
  numbers: 25430
  math_expressions: 115
  percentages: 19
  parentheses_content: 583
  math_symbols: 11414
  punctuation: 32664
  whitespace_chars: 116705
  emojis: 8370
