In [1]:
# Build the byte-level base tokenizer that the remaining cells of this notebook use.
# NOTE(review): TokenTrie is imported here but not referenced in any of the visible cells.
from magicab.etoken import TokenTrie 
from magicab import ETokenizer 

# mode="byte" selects the byte-level configuration of ETokenizer
tokenizer = ETokenizer(mode="byte")

In [2]:
import time
import warnings
from nest_asyncio import apply

# Silence every RuntimeWarning for the rest of this kernel session.
# NOTE(review): this filter also hides "coroutine ... was never awaited". The
# parallel timing recorded for this cell is in the tens of microseconds for
# 3000 texts, which suggests the parallel call may return before any encoding
# has run -- confirm that mode="parallel" actually completes the work before
# trusting the speed-up figure.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Patch asyncio so an event loop can be re-entered from inside the notebook kernel
apply()

texts = ["I am super duper", "Hey how is it going?", "I am not super duper"] * 1000


def _time_encoding(mode, batch):
    """Return the elapsed seconds of one ``encode_with_chunking`` pass.

    Args:
        mode: Value forwarded as the ``mode`` keyword ("sequential" or "parallel").
        batch: List of texts to encode.

    Returns:
        Elapsed time in seconds measured with ``time.perf_counter()``, which is
        monotonic and high-resolution (``time.time()`` is neither guaranteed).
    """
    started = time.perf_counter()
    tokenizer.encode_with_chunking(batch, mode=mode)
    return time.perf_counter() - started


time_sequential = _time_encoding("sequential", texts)
print(f"Time for encoding (sequential): {time_sequential} seconds")

time_parallel = _time_encoding("parallel", texts)
print(f"Time for encoding (parallel): {time_parallel} seconds")

# Guard the ratio: a parallel pass faster than the timer resolution would
# otherwise raise ZeroDivisionError.
speedup = time_sequential / time_parallel if time_parallel > 0 else float("inf")
print("Parallel Speed-ups:")
print(f"Sequential / Parallel: {speedup}")


Time for encoding (sequential): 0.11640429496765137 seconds
Time for encoding (parallel): 3.719329833984375e-05 seconds
Parallel Speed-ups:
Sequential / Parallel: 3129.7115384615386


In [14]:
# Time a single sequential pass. perf_counter() is monotonic and has the highest
# available resolution, so it is the right clock for short benchmarks
# (time.time() can jump with system clock adjustments).
start = time.perf_counter()
tokenizer.encode_with_chunking(texts, mode="sequential")
end = time.perf_counter()
print(f"Time taken for sequential: {end - start} seconds")

Time taken for sequential: 0.0461881160736084 seconds


In [10]:
from magicab.etoken import chunk_text, _encode_chunks



Time taken for sequential: 0.004302263259887695 seconds


In [None]:
# Debug run of the pre-processing pipeline on the fineweb-edu dataset
import os
from data.composio.process_pt_data import (
    load_from_disk,
    process_dataset,
    process_fineweb_edu,
)

# ------------------- input ---------------------------
# (dataset directory name, per-dataset processor function)
dataset_config = ('fineweb-edu', process_fineweb_edu)
dataset_name, processor_fn = dataset_config
datasets_dir = "data/composio"
block_size, num_proc, val_size = 512, 1, 10

# Load the stored dataset and keep the last `val_size` examples' worth out of the train count
dataset = load_from_disk(os.path.join(datasets_dir, dataset_name))
train_size = len(dataset) - val_size

train_dataset, val_dataset = process_dataset(
    dataset=dataset,
    processor_fn=processor_fn,
    tokenizer=tokenizer,
    block_size=block_size,
    num_proc=num_proc,
    train_size=train_size,
    val_size=val_size,
    desc_prefix=dataset_name,
)

Processed fineweb-edu: 190 train, 10 val


In [3]:
# Scratch test: save & load ragged ('list-of-list') token ids, with padding on load
import torch

# Three constant-valued sequences of different lengths: 600, 800 and 100 ids
all_train_ids = [
    [token_id] * length
    for token_id, length in ((1, 600), (2, 800), (3, 100))
]

# Step 1: write the nested list to a .bin file
train_path = "test.bin"
torch.save(all_train_ids, train_path)

# Step 2 (no code in this cell yet): get_batch -- load from .bin, in-sample slice with padding


In [61]:
# Example: encode a few texts, store them for memmap access, then draw one padded slice
from magicab.data import get_batch_slice, save_sequences_for_memmap

text_list = [
    "I am super duper",
    "Hey how is it going?",
    "I am not super duper",
]
# One id sequence per text, in the same order as text_list
all_train_ids = list(map(tokenizer.encode, text_list))

save_sequences_for_memmap(all_train_ids, "optimized_data.bin")

random_slice = get_batch_slice(
    "optimized_data.bin",
    pad_token_id=tokenizer.pad_token_id,
    block_size=512,
)

In [60]:
# Draw one (input, target) slice of 8 tokens and decode both halves for inspection
random_slice = get_batch_slice("optimized_data.bin", pad_token_id=tokenizer.pad_token_id, block_size=8)
x, y = random_slice

# Convert to plain Python ints before decoding; iterating the numpy array
# directly yields the same elements as wrapping it in list() first.
# NOTE(review): assumes x and y are 1-D -- confirm against get_batch_slice.
input_ids = [int(i) for i in x.cpu().numpy()]
target_ids = [int(i) for i in y.cpu().numpy()]

# Do not bind the name `input`: in a notebook that would shadow the builtin
# input() for every later cell of the kernel session.
input_text = tokenizer.decode(input_ids)
target_text = tokenizer.decode(target_ids)

print("Input: ", input_text)
print("Target: ", target_text)

Input:  am not s
Target:  m not su


In [29]:
# Debug cell for a reported failure of: tok.encode("I am super duper")
# It runs three encoding entry points on the same text, then decodes the result.

from magicab.etoken import encode_bytes
# `self` aliases the tokenizer; later cells in this notebook read both `self` and `text`.
# NOTE(review): presumably this lets code copied out of tokenizer methods run unchanged -- confirm.
self = tokenizer
text = "I am super duper"

# encoding: each assignment overwrites `ids`, so only the tokenizer.encode()
# result is kept; the first two calls are executed but their results are discarded.
ids = encode_bytes(text, self.special_tokens, self.special2idx, self.byte2idx)
ids = tokenizer.encode_id(text)
ids = tokenizer.encode(text)

# decoding: last expression of the cell, so the decoded string is the cell output
tokenizer.decode(ids)

'I am super duper'

In [None]:
# Chunk-wise encoding of every text, one text after another.
# NOTE(review): the loop itself is sequential; any parallelism would have to
# happen inside _encode_chunks -- confirm.
from magicab.etoken import chunk_text, _encode_chunks
chunk_size = 1024

ids_list = []
# Iterate the list `texts`; iterating the single string `text` would feed one
# character at a time to chunk_text.
for t in texts:
    chunks = chunk_text(t, chunk_size)
    # Pass the tokenizer directly rather than through the `self` alias, and do
    # not rebind the global `ids` used by the encode/decode debug cell.
    ids_list.append(_encode_chunks(chunks, tokenizer, chunk_size))

In [67]:
# async encoding (parallel processing): imports and inputs for the coroutines defined below

from magicab.etoken import chunk_text, _encode_chunks
import asyncio
from concurrent.futures import ThreadPoolExecutor

# Rebinds `texts` to three short samples (the benchmark cell bound it to the same samples repeated 1000 times)
texts = ["I am super duper", "Hey how is it going?", "I am not super duper"]
# Chunk size handed to both chunk_text and _encode_chunks
chunk_size = 1024 

async def encode_text(t, tokenizer, chunk_size):
    """Chunk one text and encode its chunks in a worker thread.

    Args:
        t: Text to encode; split with ``chunk_text(t, chunk_size)``.
        tokenizer: Tokenizer object forwarded to ``_encode_chunks``.
        chunk_size: Chunk size forwarded to ``chunk_text`` and ``_encode_chunks``.

    Returns:
        The value returned by ``_encode_chunks(chunks, tokenizer, chunk_size)``.
    """
    chunks = chunk_text(t, chunk_size)
    # Inside a coroutine the running loop is the one to use; get_event_loop()
    # is the legacy spelling and is deprecated for this purpose.
    loop = asyncio.get_running_loop()
    # executor=None selects the loop's shared default thread pool, so a batch of
    # N texts no longer builds (and synchronously shuts down) N separate pools.
    # NOTE(review): threads only overlap work that releases the GIL; if
    # _encode_chunks is pure-Python CPU-bound code this gives concurrency, not
    # a speed-up -- confirm.
    return await loop.run_in_executor(None, _encode_chunks, chunks, tokenizer, chunk_size)

async def encode_all_texts(texts, tokenizer, chunk_size):
    """Encode all texts concurrently.

    Schedules one ``encode_text`` coroutine per element of ``texts`` and waits
    for all of them; the returned list follows the order of ``texts``.
    """
    return await asyncio.gather(
        *(encode_text(text, tokenizer, chunk_size) for text in texts)
    )

# Run the async function.
# The notebook kernel already has a running event loop; nest_asyncio patches
# asyncio so that asyncio.run() can be called from inside it.
import nest_asyncio
nest_asyncio.apply()

# Pass the tokenizer object itself instead of the `self` alias, which only
# exists if the encode/decode debug cell was executed earlier in the session.
ids_list = asyncio.run(encode_all_texts(texts, tokenizer, chunk_size))