# The most important idea behind training the model

`curriculum learning` is the main idea behind train the model. because we use character level tokenization for input (transformer encoding layer) and sub-word level tokenization for output (transformer decoder layer) it's important to learn alignment of input-output token pair first. unless input output token alignment the model cannot produce correct output from given input even the correct input were given.

### The first step: Generate individual words from all the text dataset.

In [88]:
from pathlib import Path

data_dir = Path.cwd().parent.parent / ".data" / "text"
files_to_be_included = [f for f in data_dir.iterdir() if f.is_file() and f.name.endswith(".json")]
# files_to_be_included = [data_dir / f"{i * 1000}.json" for i in range(25)]
print(f"files: {", ".join([i.name for i in files_to_be_included])}")
print(f"count: {len(files_to_be_included)}")

files: 59000.json, 31000.json, 69000.json, 34000.json, 101000.json, 143000.json, 122000.json, 108000.json, 14000.json, 43000.json, 90000.json, 107000.json, 56000.json, 49000.json, 55000.json, 20000.json, 28000.json, 106000.json, 4000.json, 70000.json, 62000.json, 39000.json, 33000.json, 103000.json, 61000.json, 11000.json, 137000.json, 87000.json, 17000.json, 127000.json, 124000.json, 65000.json, 48000.json, 50000.json, 94000.json, 52000.json, 138000.json, 66000.json, 7000.json, 134000.json, 123000.json, 8000.json, 24000.json, 116000.json, 19000.json, 125000.json, 131000.json, 79000.json, 104000.json, 118000.json, 84000.json, 60000.json, 53000.json, 5000.json, 76000.json, 73000.json, 1000.json, 35000.json, 6000.json, 93000.json, 92000.json, 67000.json, 88000.json, 105000.json, 16000.json, 36000.json, 21000.json, 78000.json, 18000.json, 64000.json, 100000.json, 97000.json, 140000.json, 10000.json, 135000.json, 12000.json, 109000.json, 41000.json, 130000.json, 42000.json, dataset.json, 3

Read all the file contents into a variable.

In [89]:
from tqdm import tqdm
import json

sentences: list[str] = []

for i in tqdm(files_to_be_included, total=len(files_to_be_included)):
    with open(i, "r") as f:
        sentences_in_file: list[str] = json.load(f)
        expanded_sentences: list[str] = []
        for s in sentences_in_file:
            ss = s.split(".")
            for s in ss:
                s = s.strip()
                if s == "":
                    continue
                expanded_sentences.append(s)
    sentences.extend(expanded_sentences)

print(f"All the sentence count: {len(sentences)}")

100%|██████████| 146/146 [00:02<00:00, 71.29it/s]

All the sentence count: 2866036





split them with whitespace and -, : symbols

In [90]:
import re

def split_and_clean_sentences(sentences: list[str]) -> list[str]:
    """
    Splits each sentence in the input list by common symbols. like whitespace, -, :, =, ،, ., !, ?, ), (, », «, ؛
    Returns a flat list of cleaned sentence fragments (non-empty).
    """
    common_symbols = r"[.\s:\-=،!?؟()»«؛،.›‹]+|[\d_]"
    lists = [re.split(common_symbols, sentence) for sentence in tqdm(sentences)]
    return [item for sublist in tqdm(lists) for item in sublist]
words = split_and_clean_sentences(sentences)
print(f"Extracted {len(words)} words.")


100%|██████████| 2866036/2866036 [00:21<00:00, 130785.89it/s]
100%|██████████| 2866036/2866036 [00:00<00:00, 3642803.67it/s]

Extracted 53308345 words.





In [91]:
def deduplicate_words(words: list[str]) -> list[str]:
    """
    Removes duplicate words while preserving order of first appearance.
    """
    print(len(words))
    return list(set(words))

unique_words = deduplicate_words(words)
print(f"Unique word count: {len(unique_words)}")

with open(data_dir.parent / "individual_words.json", "w") as f:
    json.dump(unique_words, f, ensure_ascii=False, indent=2)

53308345
Unique word count: 715371


In [92]:
# The unique words contain meaningless characters like ".", "!", "?", and English words. 
# Since this is an Uyghur language dataset of individual words, we need to remove them to make the dataset clean.

# First, clean common symbols like ".", "!", "?", ")", "(" and numbers.

import re

def clean_common_symbols(words: list[str]) -> list[str]:
    """
    Cleans common symbols from the input list of words.
    
    Removes punctuation marks, numbers, and other non-alphabetic characters
    from each word in the list. Returns a list of cleaned words.
    """

    common_symbols = r"[.\s:\-=،!?؟()»«؛،.›‹]+|[\d_]"

    cleaned_words = []
    for word in words:
        # Remove common symbols, numbers, and punctuation
        cleaned = re.sub(common_symbols, '', word)
        if cleaned:  # Only add non-empty words
            cleaned_words.append(cleaned)
    return cleaned_words

# Remove empty words
unique_words = [word for word in unique_words if word]
print(f"Unique word count after cleaning: {len(unique_words)}")

# Remove English characters from words

def clean_english_characters(word: str) -> str:
    """
    Removes all English alphabet characters from a word.
    
    Args:
        word: The word to clean
        
    Returns:
        The word with all English characters removed
    """
    # Remove all English alphabet characters (both uppercase and lowercase)
    return re.sub(r'[a-zA-Z]', '', word)

def clean_non_uyghur_words(words: list[str]) -> list[str]:
    """
    Cleans the list of words by removing English characters.
    
    Args:
        words: List of words to clean
        
    Returns:
        List of cleaned words with English characters removed
    """
    # First clean common symbols
    cleaned_words = clean_common_symbols(words)
    
    # Then remove English characters from each word
    filtered_words = []
    for word in cleaned_words:
        cleaned_word = clean_english_characters(word)
        if cleaned_word:  # Only keep non-empty words
            filtered_words.append(cleaned_word)
    
    return filtered_words

# Clean the unique words by removing symbols and English characters
cleaned_unique_words = clean_non_uyghur_words(unique_words)
cleaned_unique_words = deduplicate_words(cleaned_unique_words)
print(f"Unique Uyghur word count after removing English characters: {len(cleaned_unique_words)}")

# Save the cleaned words to a new file
with open(data_dir.parent / "individual_words.json", "w") as f:
    json.dump(cleaned_unique_words, f, ensure_ascii=False, indent=2)


Unique word count after cleaning: 715370
686103
Unique Uyghur word count after removing English characters: 678761


In [93]:
#sort the words by length
cleaned_unique_words.sort(key=len)

cleaned_unique_words = cleaned_unique_words[10_000:500_000]

#save the sorted words to a new file
with open(data_dir.parent / "individual_words.json", "w") as f:
    json.dump(cleaned_unique_words, f, ensure_ascii=False, indent=2)


In [3]:
import json

with open("/home/dream-lab/project/spell_correction/.data/words.json", "r") as f:
    lines = f.read()

with open("/home/dream-lab/project/spell_correction/.data/words.json", "w") as f:
    json.dump(lines, f, ensure_ascii=False, indent=2)