In [None]:
import time

import pandas as pd
import tiktoken
from bpe_encode_rust import PyBpeEncode

In [None]:
def format_size(bytes_size):
    """Render a byte count as a string with two decimals and a binary unit.

    The value is divided by 1024 once per unit step (B -> KB -> MB -> GB);
    anything that is still >= 1024 after the GB step is reported in TB.
    """
    suffix = 'TB'  # fallback when no smaller unit fits
    for candidate in ('B', 'KB', 'MB', 'GB'):
        if bytes_size < 1024.0:
            suffix = candidate
            break
        bytes_size = bytes_size / 1024.0
    return f"{bytes_size:.2f} {suffix}"

def print_metrics(encode_time, num_tokens, chars_per_sec, bytes_per_sec, tokens_per_sec, num_chars, num_bytes):
    """Print a throughput and compression report for one encoding run.

    All values are computed by the caller; this function only formats them.

    Args:
        encode_time: Duration of the encode call, in seconds.
        num_tokens: Number of tokens produced.
        chars_per_sec: Characters encoded per second.
        bytes_per_sec: Bytes encoded per second (shown via ``format_size``).
        tokens_per_sec: Tokens produced per second.
        num_chars: Number of characters in the input text.
        num_bytes: Number of bytes in the input text.
    """
    rule = "-" * 40

    print(f"Encoding completed in {encode_time:.4f} seconds, Tokens: {num_tokens:,}")
    print(rule)
    print("Performance Metrics")
    print(rule)

    # Throughput section.
    print("Throughput:")
    print(rule)
    print(f"  {chars_per_sec:,.0f} chars/sec")
    print(f"  {format_size(bytes_per_sec)}/sec")
    print(f"  {tokens_per_sec:,.0f} tokens/sec")

    # Compression section.
    print(rule)
    print("Compression:")
    print(rule)
    if num_tokens:
        print(f"  {num_chars / num_tokens:.2f} chars/token")
        print(f"  {num_bytes / num_tokens:.2f} bytes/token")
    else:
        # An empty encoding has no meaningful ratio; avoid ZeroDivisionError
        # halfway through the report.
        print("  n/a chars/token")
        print("  n/a bytes/token")

In [None]:
# Read the plain-text input file and report its size.
file_path = "data/TinyStoriesV2-GPT4-valid.txt"
print(f"Reading input file: {file_path}")
read_start = time.perf_counter()
# Decode explicitly as UTF-8: the byte count below is computed in UTF-8, and
# the platform default encoding is not guaranteed to match.
with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()
read_time = time.perf_counter() - read_start

num_chars = len(text)
num_bytes = len(text.encode('utf-8'))
num_lines = text.count("\n")

print(f"File read in {read_time:.4f} seconds")
print(f"Characters: {num_chars:,}")
print(f"Bytes (UTF-8): {num_bytes:,} ({format_size(num_bytes)})")
print(f"Lines: {num_lines:,}")


In [None]:
# Parquet input: how many leading rows to use, and which file to load.
nrows = 5_000
file_path = 'data/013_00000.parquet'

In [None]:
# Load the parquet file and build one benchmark string from the first
# `nrows` entries of its 'text' column.
print(f"Reading input file: {file_path}")
read_start = time.perf_counter()
df = pd.read_parquet(file_path)
text_column = df['text'][:nrows]
print(len(text_column))

# Layout: <|endoftext|> doc1 <|endoftext|> doc2 <|endoftext|> ...
# Joined in a single pass instead of growing the string with += per row.
eot = "<|endoftext|>"
text = eot + "".join(doc + eot for doc in text_column)
read_time = time.perf_counter() - read_start

num_chars = len(text)
num_bytes = len(text.encode('utf-8'))
num_lines = text.count("\n")

print(f"File read in {read_time:.4f} seconds")
print(f"Characters: {num_chars:,}")
print(f"Bytes (UTF-8): {num_bytes:,} ({format_size(num_bytes)})")
print(f"Lines: {num_lines:,}")

In [None]:
# Benchmark: tiktoken with the GPT-2 encoding.
tokenizer = tiktoken.get_encoding("gpt2")

# perf_counter() is the clock intended for measuring short durations;
# time.time() is wall-clock time and can be too coarse for a fast encode.
encode_start = time.perf_counter()
# allowed_special='all' lets special-token strings in the text be encoded
# as special tokens rather than being rejected.
tokens = tokenizer.encode(text, allowed_special='all')
encode_time = time.perf_counter() - encode_start

num_tokens = len(tokens)

# Throughput figures for the report.
chars_per_sec = num_chars / encode_time
bytes_per_sec = num_bytes / encode_time
tokens_per_sec = num_tokens / encode_time

print_metrics(encode_time, num_tokens, chars_per_sec, bytes_per_sec, tokens_per_sec, num_chars, num_bytes)

In [None]:
# Benchmark: PyBpeEncode with the tinystoriesv2-gpt4-train-2048 model.
tokenizer = PyBpeEncode()
tokenizer.load('data/tinystoriesv2-gpt4-train-2048.model')

# perf_counter() is the clock intended for measuring short durations;
# only the encode call is timed, not the model load.
encode_start = time.perf_counter()
# NOTE(review): the 'all' argument presumably mirrors tiktoken's
# allowed_special='all' -- confirm against bpe_encode_rust.
tokens = tokenizer.encode(text, 'all')
encode_time = time.perf_counter() - encode_start

num_tokens = len(tokens)

# Throughput figures for the report.
chars_per_sec = num_chars / encode_time
bytes_per_sec = num_bytes / encode_time
tokens_per_sec = num_tokens / encode_time

print_metrics(encode_time, num_tokens, chars_per_sec, bytes_per_sec, tokens_per_sec, num_chars, num_bytes)

In [None]:
# Benchmark: PyBpeEncode with the tinystoriesv2-gpt4-train-8192 model.
tokenizer = PyBpeEncode()
tokenizer.load('data/tinystoriesv2-gpt4-train-8192.model')

# perf_counter() is the clock intended for measuring short durations;
# only the encode call is timed, not the model load.
encode_start = time.perf_counter()
# NOTE(review): the 'all' argument presumably mirrors tiktoken's
# allowed_special='all' -- confirm against bpe_encode_rust.
tokens = tokenizer.encode(text, 'all')
encode_time = time.perf_counter() - encode_start

num_tokens = len(tokens)

# Throughput figures for the report.
chars_per_sec = num_chars / encode_time
bytes_per_sec = num_bytes / encode_time
tokens_per_sec = num_tokens / encode_time

print_metrics(encode_time, num_tokens, chars_per_sec, bytes_per_sec, tokens_per_sec, num_chars, num_bytes)

In [None]:
# Benchmark: PyBpeEncode with the tinystoriesv2-gpt4-train-16384 model.
tokenizer = PyBpeEncode()
tokenizer.load('data/tinystoriesv2-gpt4-train-16384.model')

# perf_counter() is the clock intended for measuring short durations;
# only the encode call is timed, not the model load.
encode_start = time.perf_counter()
# NOTE(review): the 'all' argument presumably mirrors tiktoken's
# allowed_special='all' -- confirm against bpe_encode_rust.
tokens = tokenizer.encode(text, 'all')
encode_time = time.perf_counter() - encode_start

num_tokens = len(tokens)

# Throughput figures for the report.
chars_per_sec = num_chars / encode_time
bytes_per_sec = num_bytes / encode_time
tokens_per_sec = num_tokens / encode_time

print_metrics(encode_time, num_tokens, chars_per_sec, bytes_per_sec, tokens_per_sec, num_chars, num_bytes)

In [None]:
# Benchmark: PyBpeEncode with the fineweb-000_00003-gpt4-train-16384 model.
tokenizer = PyBpeEncode()
tokenizer.load('data/fineweb-000_00003-gpt4-train-16384.model')

# perf_counter() is the clock intended for measuring short durations;
# only the encode call is timed, not the model load.
encode_start = time.perf_counter()
# NOTE(review): the 'all' argument presumably mirrors tiktoken's
# allowed_special='all' -- confirm against bpe_encode_rust.
tokens = tokenizer.encode(text, 'all')
encode_time = time.perf_counter() - encode_start

num_tokens = len(tokens)

# Throughput figures for the report.
chars_per_sec = num_chars / encode_time
bytes_per_sec = num_bytes / encode_time
tokens_per_sec = num_tokens / encode_time

print_metrics(encode_time, num_tokens, chars_per_sec, bytes_per_sec, tokens_per_sec, num_chars, num_bytes)