Generate `chunks_trimmed_<MAX_TOKENS>.jsonl` (e.g. `chunks_trimmed_300.jsonl`) from `chunks_raw.jsonl`, where each chunk is limited to a maximum token count (e.g. 300 tokens). This is helpful for:

- Ensuring compatibility with embedding models
- Improving semantic precision
- Making chunking more consistent across your dataset

In [11]:
# Upgrade pip itself before installing anything else.
%pip install --upgrade pip

# Install the notebook's dependencies from the local requirements file (-q: quiet output).
%pip install -q -r ./requirements.txt

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [12]:
import tiktoken

# Use the encoding tiktoken associates with gpt-3.5-turbo (good for estimation).
# NOTE(review): counts are only an estimate for any other model — confirm
# against the tokenizer of the embedding model actually used downstream.
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [13]:



def count_tokens(text: str) -> int:
    """Return the number of tokens the notebook-level ``tokenizer`` yields for ``text``."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

In [14]:
def split_into_token_chunks(text, max_tokens=300):
    """Split ``text`` into whitespace-joined chunks of at most ``max_tokens`` tokens.

    Words (as produced by ``str.split()``) are packed greedily: a word is added
    to the running chunk as long as the re-joined chunk still fits the budget
    according to ``count_tokens``; otherwise the running chunk is emitted and a
    new one is started.

    A single word that is over budget on its own is cut into character slices
    that each fit, instead of being emitted whole as an over-limit chunk.

    Args:
        text: The text to split. Runs of whitespace are collapsed to single
            spaces in the output because words are re-joined with " ".
        max_tokens: Maximum token count allowed per chunk.

    Returns:
        A list of chunk strings, in original order. Empty or whitespace-only
        input yields an empty list.
    """

    def _slice_long_word(word):
        """Cut one over-budget word into consecutive slices that fit the budget."""
        pieces = []
        start = 0
        while start < len(word):
            end = len(word)
            # Halve the candidate slice until it fits. A lone character is kept
            # even if it is still over budget: it cannot be split any further.
            while end - start > 1 and count_tokens(word[start:end]) > max_tokens:
                end = start + (end - start) // 2
            pieces.append(word[start:end])
            start = end
        return pieces

    chunks = []
    current = []

    for word in text.split():
        # Check projected token count *before* adding the word.
        # NOTE: this re-tokenizes the whole running chunk for every word, so the
        # cost grows quadratically with chunk length.
        projected = " ".join(current + [word])
        if count_tokens(projected) <= max_tokens:
            current.append(word)
            continue

        # The word does not fit: emit what has been collected so far.
        if current:
            chunks.append(" ".join(current))
            current = []

        if count_tokens(word) <= max_tokens:
            current = [word]  # start new chunk with current word
        else:
            # Over budget on its own: emit all full slices and keep the last
            # one open so following words can still be packed after it.
            *full_pieces, tail = _slice_long_word(word)
            chunks.extend(full_pieces)
            current = [tail]

    if current:
        chunks.append(" ".join(current))
    return chunks

In [15]:
import json

# Per-chunk token budget; also used in the output filename and the final report.
MAX_TOKENS = 300
# Output records, appended to by the cell that reads chunks_raw.jsonl.
trimmed_chunks = []

In [16]:
import json


def get_chunk_writer(filename):
    """Open ``filename`` for writing and return a JSONL writer plus its closer.

    The file is opened in "w" mode, which truncates existing content; use "a"
    instead to append.

    Args:
        filename: Path of the JSONL file to create.

    Returns:
        A ``(write_chunk, close)`` tuple. ``write_chunk(chunk)`` writes
        ``json.dumps(chunk)`` followed by a newline; ``close`` is the file
        object's ``close`` method and must be called by the caller when done.
    """
    out_file = open(filename, mode="w", encoding="utf-8")

    def write_chunk(chunk):
        serialized = json.dumps(chunk)
        out_file.write(serialized + "\n")

    return write_chunk, out_file.close

In [17]:
# Rebuild the output list from scratch so that re-running this cell never
# appends a second copy of every chunk to a list left over from an earlier run.
trimmed_chunks = []

with open("./chunks_raw.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        original = json.loads(line)
        text = original["chunk"]

        if count_tokens(text) <= MAX_TOKENS:  # No need to split
            trimmed_chunks.append(original)
            continue

        # Over budget: emit one record per sub-chunk, copying every other field
        # from the original and tagging the heading with a 1-based part number.
        split_chunks = split_into_token_chunks(text, max_tokens=MAX_TOKENS)
        for part_no, sub_chunk in enumerate(split_chunks, start=1):
            trimmed_chunks.append(
                {
                    **original,
                    "chunk": sub_chunk,
                    "heading": f"{original['heading']} (part {part_no})",
                }
            )

In [18]:
# Open the output file (named after the token limit) and get the writer/closer pair.
# The underlying file handle stays open until close_writer() is called.
write_chunk, close_writer = get_chunk_writer(f"chunks_trimmed_{MAX_TOKENS}.jsonl")

In [19]:

try:
    for chunk in trimmed_chunks:
        write_chunk(chunk)  # Write each chunk to the file
finally:
    # Always close the handle so buffered lines are flushed to disk and the
    # file is not left open. To write again, recreate the writer first by
    # re-running the get_chunk_writer(...) cell.
    close_writer()

In [20]:
# Sanity report: every written chunk should be within the token budget.
token_lengths = [count_tokens(c["chunk"]) for c in trimmed_chunks]
over_limit = sum(t > MAX_TOKENS for t in token_lengths)

print(f"Total trimmed chunks: {len(trimmed_chunks)}")
if token_lengths:
    print(f"Token range: {min(token_lengths)} to {max(token_lengths)}")
else:
    # min()/max() raise ValueError on an empty list; report that explicitly.
    print("Token range: n/a (no chunks)")
print(f"Chunks over {MAX_TOKENS} tokens: {over_limit} (should be 0)")

Total trimmed chunks: 201
Token range: 0 to 300
Chunks over 300 tokens: 0 (should be 0)
