# Basic data cleaning and tokenization

## Cleaning

Some simple, regex-based cleaning is performed on train and dev datasets, e.g. to remove HTML tags from Wikipedia articles, non-verbal cues from subtitles, or even to correct I's that were incorrectly recognized as l's in OCR'ed uppercase text.

In [1]:
import os
from pathlib import Path

from mrclean import *

In [15]:
# Root folder holding the raw BabyLM splits. Defaults to the original local
# path; set the BABYLM_DATA_ROOT environment variable to run the notebook on
# another machine without editing this cell (an empty value falls back to the default).
DATA_ROOT = Path(os.environ.get("BABYLM_DATA_ROOT") or "F:/llm-deploy-data/data/Babyllama")
SEQ_LENGTH = 128 # this is a legacy parameter, it does not affect cleaning
# Sub-folders of DATA_ROOT that contain the corpus files to clean.
DATA_SPLITS = ['babylm_10M', 'babylm_dev']

# Maps a corpus file stem (the file name without its suffix) to the cleanup
# function applied to that file's text. Each function is called as
# func(text, SEQ_LENGTH) and its return value is written out as the cleaned text.
CLEANUP_FUNCTIONS = {
    'aochildes': cleanup_aochildes,
    'bnc_spoken': cleanup_bnc_spoken,
    'cbt': cleanup_cbt,
    # NOTE(review): 'childes' is routed through the children-stories cleaner;
    # confirm this is intended (a separate cleanup_aochildes also exists).
    'childes': cleanup_children_stories,
    'gutenberg': cleanup_gutenberg,
    'open_subtitles': cleanup_open_subtitles,
    'qed': cleanup_qed,
    'simple_wiki': cleanup_simple_wikipedia,
    'switchboard': cleanup_switchboard,
    'wikipedia': cleanup_wikipedia,
}


In [16]:
# Clean every corpus file of every split and write the result, under the same
# file name, into a "<split>_clean" folder next to the raw split.
for split in DATA_SPLITS:
    INPUT_DIR = DATA_ROOT / split
    OUTPUT_DIR = DATA_ROOT / f'{split}_clean'

    # List the inputs BEFORE creating anything: a wrong DATA_ROOT or split name
    # then fails on the read, instead of leaving an empty "_clean" folder behind.
    # Sorting makes the processing (and log) order independent of the file system.
    split_files = sorted(
        f for f in INPUT_DIR.iterdir()
        if f.is_file() and f.suffix in ('.train', '.dev')
    )

    # Fail fast, before any file is written, if a corpus has no registered
    # cleaner; otherwise the run would stop half-way with a bare KeyError.
    unknown = [f.name for f in split_files if f.stem not in CLEANUP_FUNCTIONS]
    if unknown:
        raise KeyError(
            f"No cleanup function registered for {unknown} in '{INPUT_DIR}'; "
            "add the missing stems to CLEANUP_FUNCTIONS"
        )

    OUTPUT_DIR.mkdir(exist_ok=True)

    for file in split_files:
        text = file.read_text(encoding='utf-8')
        cleaned_text = CLEANUP_FUNCTIONS[file.stem](text, SEQ_LENGTH)
        (OUTPUT_DIR / file.name).write_text(cleaned_text, encoding='utf-8')
        # Character counts before/after give a quick sanity check of how much was removed.
        print(f"üßπ Cleaned '{file.name}' (size {len(text)} -> {len(cleaned_text)}) in {split}")


üßπ Cleaned 'bnc_spoken.train' (size 4883879 -> 4851676) in babylm_10M
üßπ Cleaned 'childes.train' (size 15482927 -> 15482927) in babylm_10M
üßπ Cleaned 'gutenberg.train' (size 13910986 -> 13910986) in babylm_10M
üßπ Cleaned 'open_subtitles.train' (size 10806305 -> 10804026) in babylm_10M
üßπ Cleaned 'simple_wiki.train' (size 8411630 -> 8387062) in babylm_10M
üßπ Cleaned 'switchboard.train' (size 719322 -> 719322) in babylm_10M
üßπ Cleaned 'bnc_spoken.dev' (size 6538139 -> 6503778) in babylm_dev
üßπ Cleaned 'childes.dev' (size 14638378 -> 14638378) in babylm_dev
üßπ Cleaned 'gutenberg.dev' (size 15490473 -> 15490473) in babylm_dev
üßπ Cleaned 'open_subtitles.dev' (size 11016133 -> 11014854) in babylm_dev
üßπ Cleaned 'simple_wiki.dev' (size 8149513 -> 8128239) in babylm_dev
üßπ Cleaned 'switchboard.dev' (size 724013 -> 724013) in babylm_dev


## Training a tokenizer

In [17]:
from pathlib import Path
from tokenizers import (Tokenizer, decoders, models, pre_tokenizers,
                        processors, trainers)
from tokenizers.normalizers import NFKC

In [18]:
# We train the tokenizer on the train data only
# Built from DATA_ROOT (like tokenizer_path further down) so the data location
# is defined in exactly one place.
data_dir = DATA_ROOT / "babylm_10M_clean"

# Keep regular files with a ".train" suffix; this already excludes ".DS_Store"
# and the dev files. Sorted so the training input order is the same on every run.
paths = sorted(str(f) for f in data_dir.glob("*") if f.is_file() and f.suffix == ".train")

print(len(paths))
assert len(paths) > 0, 'No data files found'

6


In [19]:
# Untrained BPE tokenizer; the components below are attached in the order
# text flows through them (the assignments themselves are independent).
tokenizer = Tokenizer(models.BPE())

# 1. Normalize the raw text with Unicode NFKC.
tokenizer.normalizer = NFKC()
# 2. Byte-level pre-tokenization; add_prefix_space=True asks for a leading
#    space on the input so the first word is treated like the following ones.
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
# 3. Byte-level post-processing with offset trimming enabled.
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
# 4. Byte-level decoder, used by tokenizer.decode() to turn tokens back into text.
tokenizer.decoder = decoders.ByteLevel()

In [20]:
# BPE trainer: 16k-entry vocabulary, a minimum frequency of 2, and the three
# special tokens used for padding and sequence boundaries.
# NOTE(review): no initial_alphabet is passed, so presumably only symbols that
# occur in the training files enter the vocabulary. Consider
# pre_tokenizers.ByteLevel.alphabet() if unseen characters must round-trip;
# doing so would change the resulting vocabulary, so confirm before switching.
trainer = trainers.BpeTrainer(
    vocab_size=16000,
    min_frequency=2,
    special_tokens=["<pad>", "<s>", "</s>"],
)

# Learn the vocabulary and merges from the cleaned ".train" files collected in `paths`.
tokenizer.train(paths, trainer)

In [22]:
# Persist the trained tokenizer as pretty-printed JSON under DATA_ROOT/models.
tokenizer_path = DATA_ROOT / "models" / "gpt-clean-16000.json"
# Nothing earlier in the notebook creates the "models" folder, so create it
# here; otherwise the save fails on a fresh data directory.
tokenizer_path.parent.mkdir(parents=True, exist_ok=True)
tokenizer.save(str(tokenizer_path), pretty=True)

## Testing the tokenizer

In [23]:

# Reload from disk so this check exercises the saved JSON file rather than
# the in-memory tokenizer that was just trained.
tokenizer = Tokenizer.from_file(str(tokenizer_path))

# Alternative sample containing non-ASCII characters; swap it in to inspect
# how such text is tokenized:
# text = 'Shiro Okada (岡田志郎, "Okada Shirō", June 9, 1949; Hirakata, Osaka {age 71} - ) is a Japanese guitarist who participate in the Group Sound band, the Ox. His nickname was Shiro (シロー) and his real name is Shiro Okamoto (岡田史郎).'
text = "The quick brown fox jumps over the lazy dog."

# Encode, then decode the ids again to check the round trip.
encoding = tokenizer.encode(text)
round_trip = tokenizer.decode(encoding.ids)

print(f"Encoded String: {encoding.tokens}")
print(f"Encoded IDs: {encoding.ids}")
print(f"Decoded String: {round_trip}")


Encoded String: ['ĠThe', 'Ġquick', 'Ġbrown', 'Ġfox', 'Ġjumps', 'Ġover', 'Ġthe', 'Ġlazy', 'Ġdog', '.']
Encoded IDs: [302, 1784, 3266, 5712, 15961, 541, 190, 11553, 1469, 16]
Decoded String:  The quick brown fox jumps over the lazy dog.
