In [None]:
import requests

from src.data_models import TextChunks


In [2]:
# Base URL of the local LLM server. The cells below POST to its
# `/tokenize` and `/v1/chat/completions` endpoints.
LLAMA_SERVER_URL = "http://localhost:32000"

In [3]:
# System-role instructions for the chunking request: task description, strict
# rules, boundary heuristics, output format, and three worked examples.
#
# NOTE(review): the "### Inference" section ends with "Here is the text to
# process:" but is followed by the examples, not by the document. The document
# is actually sent in the user message (see chunk_text). Consider rewording or
# moving that section so the model does not mistake the examples for the input.
#
# NOTE(review): Example 1's expected output omits the blank line that separates
# the two input paragraphs, which looks inconsistent with the rule "Every
# character must appear in exactly one chunk" — confirm which is intended.
system_prompt = """## Text Chunk Extraction

### Task description

You are an expert at dividing long-form documents into **coherent, topic-focused text chunks**.

Each chunk must focus on **one clear topic or idea** and be **semantically complete** on its own.  
Chunks should not cut off sentences, definitions, arguments, or important context.

The goal is to produce **clean, self-contained text segments** that can later be used to generate precise questions or embeddings.

Chunks **do not need to be small or uniform in size**.  
Instead, group sentences and paragraphs that naturally belong together, such as:
- A single argument or analysis thread
- One news item or case study
- A complete explanation of a concept
- A short Q&A or FAQ exchange
- A subsection of a longer essay or newsletter

### Important rules (strict)

- **Do not paraphrase, summarize, rewrite, or edit** the text.
- **Do not drop or skip any text**. Every character must appear in exactly one chunk.
- **Preserve original wording and formatting**, including:
  - Paragraph breaks
  - Lists and bullet points
  - Quotes
  - Emphasis (bold, italics, headings, etc.)
- Keep **tables, figures, footnotes, or parenthetical explanations** with the text that references them.
- Do **not merge unrelated topics** into a single chunk.
- Do **not split a single topic** across multiple chunks unless the topic clearly shifts.
- Output **only valid JSON**, matching the provided schema.
- Do **not include explanations, commentary, or metadata outside the JSON**.

### Chunking guidance (heuristics)

Use topic boundaries such as:
- A clear shift in subject matter
- A new example or case study
- A transition like “But,” “However,” “Meanwhile,” or “Separately” when it introduces a new idea
- Section headers or implicit newsletter breaks

Avoid splitting:
- Mid-argument
- Mid-example
- Between a claim and its explanation
- Between a question and its answer

### Output format

Use the supplied JSON schema to return a well-formed JSON object.

The schema corresponds to the following Pydantic model:

class TextChunks(BaseModel):
    chunks: List[str]

Your output must be a JSON object with a single key "chunks", whose value is an array of strings.
Each string is one extracted chunk of text.

### Inference

Here is the text to process:

---

## Examples

### Example 1: Simple analytical paragraph

Input text:
Apple reported earnings yesterday. Revenue was flat year over year, but margins improved due to lower component costs.
This matters because Apple has been under pressure from investors to show pricing power.

Separately, the Fed released minutes from its last meeting.

Expected output:
{
  "chunks": [
    "Apple reported earnings yesterday. Revenue was flat year over year, but margins improved due to lower component costs.\nThis matters because Apple has been under pressure from investors to show pricing power.",
    "Separately, the Fed released minutes from its last meeting."
  ]
}

---

### Example 2: Newsletter-style argument with continuation

Input text:
The interesting thing about convertible bonds is that they sit between debt and equity.
They pay interest like bonds, but can convert into shares.

That optionality is valuable when volatility is high.
It also creates weird incentives for issuers.

Expected output:
{
  "chunks": [
    "The interesting thing about convertible bonds is that they sit between debt and equity.\nThey pay interest like bonds, but can convert into shares.\n\nThat optionality is valuable when volatility is high.\nIt also creates weird incentives for issuers."
  ]
}

---

### Example 3: Bulleted list that must stay together

Input text:
There are three ways this trade can go wrong:
- Rates fall faster than expected
- Liquidity dries up
- The counterparty fails

Each of these risks is manageable, but not trivial.

Expected output:
{
  "chunks": [
    "There are three ways this trade can go wrong:\n- Rates fall faster than expected\n- Liquidity dries up\n- The counterparty fails\n\nEach of these risks is manageable, but not trivial."
  ]
}
"""

In [4]:
# Load the document to chunk. The text contains non-ASCII characters
# (non-breaking spaces, curly quotes), so decode explicitly as UTF-8 instead of
# relying on the platform default, and close the handle as soon as it is read.
with open("data/text/money_stuff.md", encoding="utf-8") as source_file:
    extracted_text = source_file.read()
extracted_text

'# AI rollup\n\nPeople have been worried for a while\xa0about private equity\xa0buying up every company and coming to dominate the economy. “Private equity,” in this worry, tends to mean specifically the large private-equity firms that have their roots in doing leveraged buyouts of mature cash-flowing companies. But the fun hipster alternative is, what if\xa0*venture capital*buys up every company and comes to dominate the economy? Historically no\xa0one\xa0worried about that much, because historically venture capital was about making concentrated bets on small startups that might change the world, not about buying the local pest-control company or medical practice in every town in America. But that’s changing.\xa0We have talked a few times about “AI rollups,” where a venture capital firm buys a bunch of small companies, combines them, and sprinkles them with artificial intelligence.\n\nOne way to think about it is that each of PE and VC has a powerful general-purpose technology that it

In [5]:
def count_tokens(prompt: str) -> int:
    """Return how many tokens the server's tokenizer produces for ``prompt``.

    Posts ``{"content": prompt}`` to the ``/tokenize`` endpoint under
    ``LLAMA_SERVER_URL`` and counts the entries of the ``"tokens"`` list in the
    JSON reply. A reply without a ``"tokens"`` key counts as zero tokens.

    Args:
        prompt: Text to tokenize.

    Returns:
        The token count, or ``-1`` if the HTTP request fails. The error is
        printed rather than raised.
    """
    try:
        # ``json=`` makes requests serialize the body and set the JSON
        # Content-Type header itself, so this function does not rely on a
        # ``json`` module import that the notebook never performs.
        response = requests.post(
            url=f"{LLAMA_SERVER_URL}/tokenize",
            json={"content": prompt},
        )
        response.raise_for_status()

        return len(response.json().get("tokens", []))

    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return -1

system_prompt_tokens = count_tokens(system_prompt)
extracted_text_tokens = count_tokens(extracted_text)

# count_tokens returns -1 when the request fails. Adding such sentinels would
# print a plausible-looking but wrong total, so propagate the failure instead.
if system_prompt_tokens < 0 or extracted_text_tokens < 0:
    total_tokens = -1
else:
    total_tokens = system_prompt_tokens + extracted_text_tokens

print(f"System prompt tokens  : {system_prompt_tokens}")
print(f"Extracted text tokens : {extracted_text_tokens}")
print(f"Total tokens          : {total_tokens}")

System prompt tokens  : 868
Extracted text tokens : 1548
Total tokens          : 2416


In [6]:
def chunk_text(system_prompt, user_prompt, text):
    """Ask the chat-completions endpoint to split ``text`` into chunks.

    Sends ``system_prompt`` as the system message and ``user_prompt`` followed
    by a blank line and ``text`` as the user message, with temperature 0 and a
    ``response_format`` carrying the JSON schema of ``TextChunks``.

    Args:
        system_prompt: Content of the system message.
        user_prompt: Prefix of the user message, placed before ``text``.
        text: Document text appended to ``user_prompt``.

    Returns:
        The ``TextChunks`` instance validated from the model's reply, or
        ``None`` if the HTTP request fails (the error is printed, not raised).

    Raises:
        Anything raised by ``TextChunks.model_validate_json`` for a malformed
        reply, or ``KeyError``/``IndexError`` for an unexpected response shape;
        only request errors are caught here.
    """
    payload = {
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt + "\n\n" + text},
        ],
        "temperature": 0,
        "response_format": {
            "type": "json_object",
            "schema": TextChunks.model_json_schema(),
        },
    }

    try:
        # ``json=`` serializes the payload and sets the JSON Content-Type
        # header; the previous ``data=json.dumps(...)`` sent no such header and
        # needed a ``json`` import that the notebook never performs.
        response = requests.post(
            url=f"{LLAMA_SERVER_URL}/v1/chat/completions",
            json=payload,
        )
        response.raise_for_status()

        content = response.json()["choices"][0]["message"]["content"]
        return TextChunks.model_validate_json(content)

    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        # Explicit so callers can see that a failed request yields None.
        return None

In [7]:
# User-role message prefix. chunk_text appends a blank line and the document
# text after the trailing "EXTRACTED TEXT:" marker, so the marker must stay
# the last line of this string.
user_prompt = """Below is the extracted text from a document.

Your task is to read the text carefully and split it into **coherent, topic-focused chunks**.

Rules:
- Each chunk must focus on **one clear topic or idea**.
- Do **not** summarize, paraphrase, rewrite, or modify the text in any way.
- Do **not** skip or omit any content. Every character must appear in exactly one chunk.
- Preserve the **exact text and formatting** as it appears, including:
  - Paragraph breaks
  - Lists and bullet points
  - Punctuation and capitalization
  - Quotes and emphasis
- Each chunk must be **self-contained and semantically meaningful** on its own.
- Do not split sentences, arguments, examples, or lists across chunks unless the topic clearly changes.

Return your output **exactly** in the format specified by the provided schema.
Do not include any explanations or text outside the JSON object.

EXTRACTED TEXT:
"""
# Sends the request to the server. `result` is the validated TextChunks object,
# or None when the HTTP request failed (chunk_text prints the error and does
# not raise in that case).
result = chunk_text(system_prompt, user_prompt, extracted_text)

In [8]:
# chunk_text returns None when the HTTP request fails; stop with a clear
# message instead of an AttributeError on None.
if result is None:
    raise RuntimeError("chunk_text returned no result; nothing to write")

# Write as UTF-8 (the chunks contain non-ASCII characters) and close the file
# deterministically so the JSON is fully flushed to disk.
with open("data/text/money_stuff_chunked.json", "w", encoding="utf-8") as chunk_file:
    chunk_file.write(result.model_dump_json(indent=2))

7845