In [None]:
import eventlet

# Plain-text book editions on wolnelektury.pl, grouped by author surname.
# Only the file slug differs between entries, so the common URL prefix and the
# ".txt" suffix live in one template.
_BOOK_URL_TEMPLATE = "https://wolnelektury.pl/media/book/txt/{slug}.txt"

_BOOK_SLUGS = {
    "Mickiewicz": (
        "pan-tadeusz",
        "dziady-dziady-widowisko-czesc-i",
        "dziady-dziadow-czesci-iii-ustep-do-przyjaciol-moskali",
        "ballady-i-romanse-pani-twardowska",
        "ballady-i-romanse-powrot-taty",
        "ballady-i-romanse-switez",
        "dziady-dziady-poema-dziady-czesc-iv",
    ),
    "Sienkiewicz": (
        "quo-vadis",
        "sienkiewicz-we-mgle",
        "potop-tom-pierwszy",
        "potop-tom-drugi",
        "potop-tom-trzeci",
    ),
    "Orzeszkowa": (
        "orzeszkowa-kto-winien",
        "nad-niemnem-tom-pierwszy",
        "nad-niemnem-tom-drugi",
        "nad-niemnem-tom-trzeci",
        "gloria-victis-dziwna-historia",
        "z-pozogi",
        "pani-dudkowa",
        "dymy",
        "syn-stolarza",
        "dobra-pani",
        "cnotliwi",
        "kilka-slow-o-kobietach",
        "patryotyzm-i-kosmopolityzm",
        "julianka",
    ),
    "Prus": (
        "lalka-tom-drugi",
        "lalka-tom-pierwszy",
        "antek",
        "katarynka",
        "prus-anielka",
        "prus-placowka",
    ),
    "Reymont": (
        "ziemia-obiecana-tom-pierwszy",
        "chlopi-czesc-pierwsza-jesien",
        "reymont-chlopi-zima",
        "chlopi-czesc-trzecia-wiosna",
        "chlopi-czesc-czwarta-lato",
    ),
}

# Mapping of author surname -> list of download URLs, in the order listed above.
book_files = {
    author: [_BOOK_URL_TEMPLATE.format(slug=slug) for slug in slugs]
    for author, slugs in _BOOK_SLUGS.items()
}

In [None]:
# Create the download directory from Python so this cell does not depend on a
# POSIX shell being available to the kernel. An existing directory is left as is.
import os

os.makedirs("data", exist_ok=True)

In [None]:
from urllib import request
import os


def fetch(url):
    """Download ``url`` unless a file with the same base name already exists in ``data``.

    This function never writes to disk; the caller is expected to store the
    returned bytes at the returned path.

    Args:
        url: Address of the file to download. Its last path component is used
            as the local file name.

    Returns:
        A ``(file_path, data)`` tuple holding the target path under ``data`` and
        the downloaded bytes, or ``(None, None)`` when the target file already
        exists and the download is skipped.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request fails.
    """
    file_path = os.path.join("data", os.path.basename(url))
    if os.path.exists(file_path):
        return None, None
    # Close the HTTP response deterministically instead of leaving the
    # connection open until the object happens to be garbage collected.
    with request.urlopen(url) as response:
        data = response.read()
    return file_path, data


def download():
    """Download every book listed in ``book_files`` and write it to disk.

    URLs are handed to ``fetch`` through an eventlet ``GreenPool``. ``fetch``
    returns ``(None, None)`` for files that already exist, and those entries
    are skipped, so re-running this function only fetches what is missing.

    NOTE(review): ``fetch`` uses the standard-library ``urllib``. Unless eventlet
    monkey-patching is enabled elsewhere, those calls presumably block and the
    pool will not overlap downloads -- confirm.
    """
    # One pool serves all authors; there is no need to rebuild it per author.
    pool = eventlet.GreenPool()

    for urls in book_files.values():
        for file_path, data in pool.imap(fetch, urls):
            if not file_path:
                continue  # Already downloaded on a previous run.
            # Do not rely on an earlier shell cell having created the directory.
            target_dir = os.path.dirname(file_path)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)
            with open(file_path, mode="wb") as f:
                f.write(data)

# Fetch any books that are not yet present locally, then report completion.
download()
print("DONE")

In [None]:
# List the contents of the data directory in long format, hidden entries included.
!ls -la data

In [None]:
# Model identifiers whose tokenizers are compared; each one is passed to
# AutoTokenizer.from_pretrained further down in the notebook.
tokenizers = [
    "unsloth/Llama-3.3-70B-Instruct-bnb-4bit",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "microsoft/phi-4",
    "deepseek-ai/DeepSeek-R1",
]

In [None]:
from pathlib import Path

def create_corpus_and_stats(data_dir="./data"):
    """Build one whitespace-normalized corpus from the ``*.txt`` files in ``data_dir``.

    Files are concatenated in sorted path order so the corpus, and everything
    derived from it, is identical across runs and machines.

    Args:
        data_dir: Directory that is searched (non-recursively) for ``*.txt`` files.

    Returns:
        A ``(corpus, stats)`` tuple. ``corpus`` is a single string in which every
        run of whitespace has been collapsed to one space. ``stats`` is a dict
        with ``num_chars`` (length of the corpus) and ``num_words`` (number of
        space-separated words; 0 for an empty corpus).
    """
    # Path.glob yields entries in arbitrary, filesystem-dependent order; sort
    # them so the concatenation order is reproducible.
    text_files = sorted(Path(data_dir).glob("*.txt"))
    corpus = " ".join(file.read_text(encoding="utf-8") for file in text_files)
    # Normalize whitespace quickly with split/join (faster than regex on huge texts; see https://stackoverflow.com/q/2077897)
    corpus = " ".join(corpus.split())
    num_chars = len(corpus)
    # After normalization words are separated by exactly one space.
    num_words = corpus.count(" ") + 1 if corpus else 0
    return corpus, {"num_chars": num_chars, "num_words": num_words}

def _iter_chunks(text, limit):
    """Yield consecutive pieces of ``text``, cutting at the last space inside each window of ``limit`` characters."""
    total = len(text)
    pos = 0
    while pos < total:
        stop = pos + limit
        if stop >= total:
            # The remainder fits into a single window.
            yield text[pos:]
            return
        # Prefer the last space inside [pos, stop); fall back to a hard cut.
        cut = text.rfind(" ", pos, stop)
        if cut <= pos:
            cut = stop
        piece = text[pos:cut]
        pos = cut  # The space itself stays at the head of the next piece.
        if piece:
            yield piece
        else:
            pos += 1  # Guarantee forward progress on an empty piece.


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when the denominator is zero."""
    return numerator / denominator if denominator else 0


def tokenize_and_compute_stats(corpus, corpus_stats, tokenizer):
    """Tokenize ``corpus`` and derive per-word and per-token averages.

    The corpus is tokenized in one call when its character count does not exceed
    ``tokenizer.model_max_length``. Otherwise it is cut into pieces of at most
    that many characters, preferably at a space, and the token ids of all
    pieces are concatenated. Special tokens are never added.

    Args:
        corpus: Text to tokenize.
        corpus_stats: Dict providing ``num_chars`` and ``num_words`` for ``corpus``.
        tokenizer: Callable returning a mapping with ``input_ids``; it must also
            expose ``model_max_length``, ``name_or_path`` and ``len()``.

    Returns:
        A ``(tokens, stats)`` tuple: the token ids and a dict with the keys
        ``tokenizer_name``, ``num_tokens``, ``avg_tokens_per_word``,
        ``avg_chars_per_token``, ``avg_words_per_token`` and ``vocab_size``.
        Ratios with a zero denominator are reported as 0.
    """
    limit = tokenizer.model_max_length
    if len(corpus) <= limit:
        token_ids = tokenizer(corpus, add_special_tokens=False)["input_ids"]
    else:
        token_ids = []
        for piece in _iter_chunks(corpus, limit):
            token_ids.extend(tokenizer(piece, add_special_tokens=False)["input_ids"])

    token_count = len(token_ids)
    char_count = corpus_stats["num_chars"]
    word_count = corpus_stats["num_words"]
    summary = {
        "tokenizer_name": tokenizer.name_or_path,
        "num_tokens": token_count,
        "avg_tokens_per_word": _safe_ratio(token_count, word_count),
        "avg_chars_per_token": _safe_ratio(char_count, token_count),
        "avg_words_per_token": _safe_ratio(word_count, token_count),
        "vocab_size": len(tokenizer),
    }
    return token_ids, summary

In [None]:
# Build the corpus once; later cells reuse `corpus` and `corpus_stats`.
corpus, corpus_stats = create_corpus_and_stats()

In [None]:
# Show the character and word counts computed for the corpus.
corpus_stats

In [None]:
from transformers import AutoTokenizer


def tokenize_stats(tokenizer_name, corpus=corpus, corpus_stats=corpus_stats):
    """Load a tokenizer by name, tokenize the corpus with it and print a short report.

    Args:
        tokenizer_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        corpus: Text to tokenize. The default is the module-level ``corpus`` as
            it was when this function was defined; re-run this cell after
            rebuilding the corpus, or pass the new value explicitly.
        corpus_stats: Dict with ``num_chars`` and ``num_words`` for ``corpus``;
            the default is captured at definition time in the same way.

    Returns:
        The statistics dict produced by ``tokenize_and_compute_stats``.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    # Only the statistics are needed here; the token ids are discarded.
    _, stats = tokenize_and_compute_stats(corpus, corpus_stats, tokenizer)
    print(f"Tokenizer: {tokenizer_name}")
    print(f"Average tokens per word: {stats['avg_tokens_per_word']:.2f}")
    print(f"Average characters per token: {stats['avg_chars_per_token']:.2f}")
    # Use the precomputed, zero-guarded ratio. Inverting avg_tokens_per_word
    # raises ZeroDivisionError when the corpus is empty or yields no tokens.
    print(f"Average words per token: {stats['avg_words_per_token']:.2f}")
    print("\n")
    return stats

In [None]:
import pandas as pd

# Run every configured tokenizer over the corpus and tabulate one row of
# statistics per tokenizer; the bare name on the last line renders the table.
stats_df = pd.DataFrame(list(map(tokenize_stats, tokenizers)))
stats_df