# Advanced Build: Semantic Chunking Strategy


In [1]:
import os
import numpy as np
from getpass import getpass
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Prompt interactively only when the key is missing or empty, so the secret is
# never hard-coded in the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Please enter your OpenAI API key!")

In [2]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [3]:
from langchain_community.document_loaders import TextLoader

# Single source of truth for the input file so both readers stay in sync.
GUIDE_PATH = "data/HealthWellnessGuide.txt"

# Decode explicitly as UTF-8: without an encoding argument both TextLoader and
# open() fall back to the platform's locale encoding, which can mis-decode or
# fail on non-ASCII characters on some systems.
loader = TextLoader(GUIDE_PATH, encoding="utf-8")
# LangChain Document objects for the guide.
docs = loader.load()

# Plain-string copy of the same file.
with open(GUIDE_PATH, "r", encoding="utf-8") as f:
    raw_text = f.read()

print(f"Document length: {len(raw_text)} characters")

Document length: 16206 characters


Implement semantic chunking using the following steps:
1. Split the document into individual sentences using `nltk.sent_tokenize`
2. Embed all sentences
3. Greedily merge adjacent sentences: if the next sentence is semantically similar
   (cosine similarity >= threshold) to the average embedding of the current chunk AND
   adding it stays within the max chunk size, merge it in. Otherwise, close the chunk and start a new one.
4. Second pass: greedily merge adjacent chunks using the same logic.

In [None]:
import nltk
# Download the "punkt_tab" resource quietly before tokenizing; presumably this
# is the data sent_tokenize needs at call time — confirm for the installed NLTK.
nltk.download("punkt_tab", quiet=True)
from nltk.tokenize import sent_tokenize


def split_into_sentences(text: str) -> list[str]:
    """Break ``text`` into a flat list of sentences.

    The text is first cut on newline characters so that headers and bullet
    points stay separate units. Each line is stripped of surrounding
    whitespace, blank lines are discarded, and every remaining line is passed
    through ``sent_tokenize`` so that lines holding several sentences are
    split further. Sentences are returned in document order.
    """
    stripped_lines = (raw_line.strip() for raw_line in text.split("\n"))
    return [
        sentence
        for stripped in stripped_lines
        if stripped
        for sentence in sent_tokenize(stripped)
    ]


def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity between two vectors.

    Args:
        a: First vector.
        b: Second vector, same length as ``a``.

    Returns:
        The dot product of ``a`` and ``b`` divided by the product of their
        Euclidean norms, as a plain Python float. When either vector has zero
        norm the similarity is undefined; 0.0 is returned instead of NaN so
        that threshold comparisons behave predictably.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Avoid a divide-by-zero warning and a NaN result for all-zero vectors.
        return 0.0
    return float(np.dot(a, b) / denominator)


def _greedy_merge(texts: list[str], text_embeddings: list[np.ndarray], similarity_threshold: float, max_chunk_size: int) -> tuple[list[str], list[np.ndarray]]:
    """Greedily merge adjacent texts based on semantic similarity.

    Walks ``texts`` in order while keeping one open group. Each next text is
    compared (cosine similarity) against the mean embedding of the open group
    and joins it when the similarity is >= ``similarity_threshold`` and the
    space-joined result is at most ``max_chunk_size`` characters. Otherwise
    the open group is closed and the text starts a new group.

    Args:
        texts: Texts in document order.
        text_embeddings: One embedding per entry of ``texts``, same order.
        similarity_threshold: Minimum cosine similarity required to merge.
        max_chunk_size: Maximum length, in characters, of a merged chunk.
            A single input text that is already longer than this is still
            emitted unchanged as its own chunk.

    Returns:
        A ``(chunks, chunk_embeddings)`` pair: the space-joined text of each
        group and the mean of that group's embeddings. Both lists are empty
        when ``texts`` is empty.

    Raises:
        ValueError: If ``texts`` and ``text_embeddings`` differ in length.
    """
    if len(texts) != len(text_embeddings):
        raise ValueError(
            f"texts and text_embeddings must have the same length "
            f"(got {len(texts)} and {len(text_embeddings)})"
        )
    if len(texts) == 0:
        # Nothing to merge; avoids indexing texts[0] on an empty list.
        return [], []

    chunks: list[str] = []
    chunk_embeddings: list[np.ndarray] = []

    current_texts = [texts[0]]
    current_embs = [text_embeddings[0]]
    # Length of " ".join(current_texts), tracked incrementally so the candidate
    # string does not have to be rebuilt on every iteration.
    current_len = len(texts[0])

    for text, emb in zip(texts[1:], text_embeddings[1:]):
        current_avg_emb = np.mean(current_embs, axis=0)
        sim = cosine_similarity(current_avg_emb, emb)

        # One separating space plus the new text.
        merged_len = current_len + 1 + len(text)

        if sim >= similarity_threshold and merged_len <= max_chunk_size:
            current_texts.append(text)
            current_embs.append(emb)
            current_len = merged_len
        else:
            chunks.append(" ".join(current_texts))
            chunk_embeddings.append(np.mean(current_embs, axis=0))
            current_texts = [text]
            current_embs = [emb]
            current_len = len(text)

    # Flush the group that was still open when the input ran out.
    chunks.append(" ".join(current_texts))
    chunk_embeddings.append(np.mean(current_embs, axis=0))

    return chunks, chunk_embeddings


def semantic_chunk(
    text: str,
    embedding_model,
    similarity_threshold: float = 0.25,
    max_chunk_size: int = 750,
) -> list[str]:
    """Split ``text`` into chunks by two rounds of greedy semantic merging.

    The text is broken into sentences, every sentence is embedded in a single
    ``embed_documents`` call, and adjacent sentences are merged into chunks.
    The resulting chunks (with their averaged embeddings) are then run through
    the same merge once more to combine adjacent chunks.

    Args:
        text: The full document text.
        embedding_model: Object exposing ``embed_documents(list_of_str)``.
        similarity_threshold: Cosine similarity needed for a merge.
        max_chunk_size: Upper bound on merged chunk length, in characters.

    Returns:
        The chunk strings after the second merge pass, or an empty list when
        the text yields no sentences.
    """
    sentences = split_into_sentences(text)
    if len(sentences) == 0:
        return []

    # One batched embedding request; each vector becomes a numpy array.
    vectors = [np.array(vec) for vec in embedding_model.embed_documents(sentences)]

    merge_settings = (similarity_threshold, max_chunk_size)

    # Pass 1: sentences -> chunks.
    first_pass_texts, first_pass_vectors = _greedy_merge(sentences, vectors, *merge_settings)

    # Pass 2: chunks -> larger chunks; the embeddings of this pass are not needed.
    final_chunks, _unused_vectors = _greedy_merge(first_pass_texts, first_pass_vectors, *merge_settings)
    return final_chunks

## Test the Semantic Chunker

In [None]:
# similarity_threshold and max_chunk_size were tuned so the average chunk size is ~500 characters
semantic_chunks = semantic_chunk(raw_text, embeddings, similarity_threshold=0.23, max_chunk_size=800)

# Character length of every chunk, for the summary line.
sizes = list(map(len, semantic_chunks))
print(f"Number of chunks: {len(semantic_chunks)}")
print(f"Chunk sizes (chars): min={min(sizes)}, max={max(sizes)}, mean={np.mean(sizes):.0f}")
print()

# Preview the first five chunks: header line, chunk text, then a blank line.
for i, chunk in enumerate(semantic_chunks[:5]):
    print(f"--- Chunk {i} ({len(chunk)} chars) ---", chunk, "", sep="\n")

Number of chunks: 33
Chunk sizes (chars): min=2, max=800, mean=487

--- Chunk 0 (778 chars) ---
The Personal Wellness Guide A Comprehensive Resource for Health and Well-being PART 1: EXERCISE AND MOVEMENT Chapter 1: Understanding Exercise Basics Exercise is one of the most important things you can do for your health. Regular physical activity can improve your brain health, help manage weight, reduce the risk of disease, strengthen bones and muscles, and improve your ability to do everyday activities. The four main types of exercise are aerobic (cardio), strength training, flexibility, and balance exercises. A well-rounded fitness routine includes all four types. Adults should aim for at least 150 minutes of moderate-intensity aerobic activity per week, along with muscle-strengthening activities on 2 or more days per week. Chapter 2: Exercises for Common Problems

--- Chunk 1 (744 chars) ---
Lower Back Pain Relief Lower back pain affects approximately 80% of adults at some point in thei

### Synthetic Data Generation for Evaluation

Generate a synthetic test set using RAGAS. It will be used to evaluate both
the baseline and the semantic chunking systems, following the same approach as the other notebook.

In [6]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI

# LLM and embedding model wrapped for RAGAS; both are handed to TestsetGenerator.
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1"))
# NOTE(review): OpenAIEmbeddings() is built without a model name here, unlike the
# `embeddings` instance above, which pins "text-embedding-3-small" — confirm this is intended.
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

In [7]:
from ragas.testset import TestsetGenerator

# Build the RAGAS generator from the wrapped LLM and embeddings, then request a
# synthetic test set of size 10 from the loaded LangChain documents.
generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(docs, testset_size=10)

Applying HeadlinesExtractor:   0%|          | 0/1 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/1 [00:00<?, ?it/s]

Applying SummaryExtractor:   0%|          | 0/1 [00:00<?, ?it/s]

Applying CustomNodeFilter:   0%|          | 0/4 [00:00<?, ?it/s]

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/9 [00:00<?, ?it/s]

Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/1 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/2 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/11 [00:00<?, ?it/s]

In [8]:
dataset.to_pandas()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,How can Neck Rolls help with neck and shoulder...,[The Personal Wellness Guide A Comprehensive R...,Neck Rolls can provide relief from neck and sh...,single_hop_specifc_query_synthesizer
1,what help lower back pain exercises,[The Personal Wellness Guide A Comprehensive R...,Gentle stretching and strengthening exercises ...,single_hop_specifc_query_synthesizer
2,what cbt-i do for sleep?,[PART 3: SLEEP AND RECOVERY Chapter 7: The Sci...,Cognitive Behavioral Therapy for Insomnia (CBT...,single_hop_specifc_query_synthesizer
3,What is Cognitive Behavioral Therapy for Insom...,[PART 3: SLEEP AND RECOVERY Chapter 7: The Sci...,Cognitive Behavioral Therapy for Insomnia (CBT...,single_hop_specifc_query_synthesizer
4,What are the key signs of poor work-life balan...,[PART 5: BUILDING HEALTHY HABITS Chapter 13: T...,"According to Chapter 19, signs of poor work-li...",single_hop_specifc_query_synthesizer
5,Wut are the main tipz for boostin immune funct...,[PART 5: BUILDING HEALTHY HABITS Chapter 13: T...,Chapter 18 recomends the following for boostin...,single_hop_specifc_query_synthesizer
6,What strategies from Chapter 9 can help manage...,[<1-hop>\n\nPART 5: BUILDING HEALTHY HABITS Ch...,Chapter 9 outlines several strategies for mana...,multi_hop_specific_query_synthesizer
7,Drawing on the guidance from Chapter 7 regardi...,[<1-hop>\n\nPART 5: BUILDING HEALTHY HABITS Ch...,"Establishing a consistent sleep schedule, as r...",multi_hop_specific_query_synthesizer
8,How does the guidance in Chapter 9 on understa...,[<1-hop>\n\nPART 5: BUILDING HEALTHY HABITS Ch...,Chapter 9 discusses insomnia as difficulty fal...,multi_hop_specific_query_synthesizer
9,How do the strategies for managing insomnia in...,[<1-hop>\n\nPART 5: BUILDING HEALTHY HABITS Ch...,The strategies for managing insomnia in Chapte...,multi_hop_specific_query_synthesizer


### Baseline RAG - Fixed-size Chunking + Naive Retrieval

Re-use the baseline RAG system using `RecursiveCharacterTextSplitter` (chunk_size=500) with naive top-k retrieval (i.e., no reranking).

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

# Fixed-size chunking: 500-character chunks with a 30-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=30)
baseline_split_docs = text_splitter.split_documents(docs)
print(f"Baseline chunks: {len(baseline_split_docs)}")

# Vector store: in-memory Qdrant collection using cosine distance.
baseline_client = QdrantClient(":memory:")
baseline_client.create_collection(
    collection_name="baseline",
    # size=1536 is assumed to match the output dimension of `embeddings` — confirm
    # if the embedding model is changed.
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)
baseline_vector_store = QdrantVectorStore(
    client=baseline_client,
    collection_name="baseline",
    embedding=embeddings,
)
# The return value of add_documents is not needed; bind it to `_` so the
# notebook does not echo it.
_ = baseline_vector_store.add_documents(documents=baseline_split_docs)

# Naive retriever (top-k with k=3, no reranking)
baseline_retriever = baseline_vector_store.as_retriever(search_kwargs={"k": 3})

Baseline chunks: 44


In [10]:
from langchain.prompts import ChatPromptTemplate
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
from langchain_core.documents import Document

# Prompt template with two placeholders, {question} and {context}; it instructs
# the model to answer from the provided context only.
RAG_PROMPT = """You are a helpful assistant who answers questions based on provided context. You must only use the provided context, and cannot use your own knowledge.

### Question
{question}

### Context
{context}
"""
rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)
# Answering model used by generate(); the same node function is reused by both
# the baseline and the semantic graph.
llm = ChatOpenAI(model="gpt-4.1-nano")


class State(TypedDict):
    """State passed between the retrieve and generate graph nodes."""

    # The user's question; read by the retrieve nodes and by generate().
    question: str
    # Documents returned by a retrieve node; joined into the prompt by generate().
    context: List[Document]
    # Text of the model's answer, written by generate().
    response: str


def baseline_retrieve(state):
    """Fetch context for the question from the fixed-size-chunk retriever.

    Reads ``state["question"]`` and returns a partial state update whose
    ``"context"`` entry is whatever ``baseline_retriever.invoke`` returned.
    """
    question = state["question"]
    return {"context": baseline_retriever.invoke(question)}


def generate(state):
    """Answer the question from the retrieved context.

    Joins the ``page_content`` of every document in ``state["context"]`` with
    blank lines, fills ``rag_prompt`` with that text and the question, invokes
    ``llm``, and returns the reply's ``content`` under the ``"response"`` key.
    """
    context_block = "\n\n".join([document.page_content for document in state["context"]])
    prompt_messages = rag_prompt.format_messages(
        question=state["question"],
        context=context_block,
    )
    reply = llm.invoke(prompt_messages)
    return {"response": reply.content}


# Chain the two nodes in order: retrieve, then generate.
baseline_graph = StateGraph(State).add_sequence([baseline_retrieve, generate])
# Entry edge; the node is addressed by the function's name.
baseline_graph.add_edge(START, "baseline_retrieve")
# Rebind the name to the compiled, invokable graph.
baseline_graph = baseline_graph.compile()

In [11]:
# Sanity check: run one hand-written question through the baseline graph and
# print the generated answer.
response = baseline_graph.invoke({"question": "What exercises help with lower back pain?"})
print(response["response"])

Exercises that help with lower back pain include:
- Cat-Cow Stretch: alternately arching and sagging your back while on hands and knees, doing 10-15 repetitions.
- Bird Dog: extending opposite arm and leg from hands and knees, holding each for 5 seconds, and doing 10 repetitions per side.
- Pelvic Tilts: lying on your back with knees bent, tilting your pelvis to flatten your back against the floor, holding for 10 seconds, and repeating 8-12 times.


In [12]:
import copy

# Work on a deep copy so the shared `dataset` stays untouched and can be reused
# for the semantic-chunking run.
baseline_dataset = copy.deepcopy(dataset)

# Fill each sample in place with the baseline graph's answer and the text of
# the documents it retrieved.
for test_row in baseline_dataset:
    response = baseline_graph.invoke({"question": test_row.eval_sample.user_input})
    test_row.eval_sample.response = response["response"]
    test_row.eval_sample.retrieved_contexts = [
        context.page_content for context in response["context"]
    ]

In [13]:
from ragas import EvaluationDataset, evaluate, RunConfig
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity

# Judge model and run configuration; both are reused for the semantic-chunking
# evaluation so the two runs are scored the same way.
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1-mini"))
# timeout=360 — presumably seconds per evaluation job; confirm against the RAGAS RunConfig docs.
custom_run_config = RunConfig(timeout=360)

# Metric set shared by both evaluations.
metrics = [LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()]

# Round-trip through pandas to turn the filled-in test set into an EvaluationDataset.
baseline_eval_dataset = EvaluationDataset.from_pandas(baseline_dataset.to_pandas())

baseline_result = evaluate(
    dataset=baseline_eval_dataset,
    metrics=metrics,
    llm=evaluator_llm,
    run_config=custom_run_config,
)
# Last expression in the cell: display the aggregate scores.
baseline_result

Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

{'context_recall': 0.6019, 'faithfulness': 0.6578, 'factual_correctness': 0.6073, 'answer_relevancy': 0.9535, 'context_entity_recall': 0.3586, 'noise_sensitivity_relevant': 0.1149}

### Semantic Chunking RAG + Naive Retrieval

Build a second RAG system using semantic chunks with the same naive top-k retrieval.

In [None]:
# Wrap each semantic chunk string in a LangChain Document (no metadata attached).
semantic_documents = [Document(page_content=chunk) for chunk in semantic_chunks]
print(f"Semantic chunks: {len(semantic_documents)}")

# Vector store: separate in-memory Qdrant collection, configured like the baseline one.
semantic_client = QdrantClient(":memory:")
semantic_client.create_collection(
    collection_name="semantic",
    # size=1536 is assumed to match the output dimension of `embeddings` — confirm
    # if the embedding model is changed.
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)
semantic_vector_store = QdrantVectorStore(
    client=semantic_client,
    collection_name="semantic",
    embedding=embeddings,
)
# The return value of add_documents is not needed; bind it to `_` so the
# notebook does not echo it.
_ = semantic_vector_store.add_documents(documents=semantic_documents)

# Naive retriever (top-k with k=3, no reranking) — same settings as the baseline.
semantic_retriever = semantic_vector_store.as_retriever(search_kwargs={"k": 3})

Semantic chunks: 33


In [15]:
def semantic_retrieve(state):
    """Fetch context for the question from the semantic-chunk retriever.

    Reads ``state["question"]`` and returns a partial state update whose
    ``"context"`` entry is whatever ``semantic_retriever.invoke`` returned.
    """
    question = state["question"]
    return {"context": semantic_retriever.invoke(question)}

# Same two-step pipeline as the baseline, with the semantic retriever node
# swapped in; generate() is shared.
semantic_graph = StateGraph(State).add_sequence([semantic_retrieve, generate])
# Entry edge; the node is addressed by the function's name.
semantic_graph.add_edge(START, "semantic_retrieve")
# Rebind the name to the compiled, invokable graph.
semantic_graph = semantic_graph.compile()

In [None]:
# Sanity check: same hand-written question as the baseline check, now answered
# through the semantic-chunk graph.
response = semantic_graph.invoke({"question": "What exercises help with lower back pain?"})
print(response["response"])

Exercises that help with lower back pain include:

- Cat-Cow Stretch: Start on hands and knees, alternate between arching your back up (cat) and letting it sag down (cow). Do 10-15 repetitions.
- Bird Dog: From hands and knees, extend opposite arm and leg while keeping your core engaged. Hold for 5 seconds, then switch sides. Do 10 repetitions per side.
- Partial Crunches: Lie on your back with knees bent, cross arms over chest, tighten stomach muscles and raise shoulders off floor. Hold briefly, then lower. Do 8-12 repetitions.
- Knee-to-Chest Stretch: Lie on your back, pull one knee toward your chest while keeping the other foot flat. Hold for 15-30 seconds, then switch legs.
- Pelvic Tilts: Lie on your back with knees bent, flatten your back against the floor by tightening abs and tilting pelvis up slightly. Hold for 10 seconds, repeat 8-12 times.


### Evaluate Semantic Chunking with RAGAS

In [17]:
# Fresh deep copy of the shared test set, so this run starts from the same
# questions and references as the baseline run.
semantic_dataset = copy.deepcopy(dataset)

# Fill each sample in place with the semantic graph's answer and the text of
# the documents it retrieved.
for test_row in semantic_dataset:
    response = semantic_graph.invoke({"question": test_row.eval_sample.user_input})
    test_row.eval_sample.response = response["response"]
    test_row.eval_sample.retrieved_contexts = [
        context.page_content for context in response["context"]
    ]

In [18]:
# Round-trip through pandas to turn the filled-in test set into an EvaluationDataset.
semantic_eval_dataset = EvaluationDataset.from_pandas(semantic_dataset.to_pandas())

# Score with the same metrics, judge LLM and run config as the baseline evaluation.
semantic_result = evaluate(
    dataset=semantic_eval_dataset,
    metrics=metrics,
    llm=evaluator_llm,
    run_config=custom_run_config,
)
# Last expression in the cell: display the aggregate scores.
semantic_result

Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Exception raised in Job[53]: AttributeError('StringIO' object has no attribute 'statements')


{'context_recall': 0.8604, 'faithfulness': 0.6264, 'factual_correctness': 0.6055, 'answer_relevancy': 0.9584, 'context_entity_recall': 0.3997, 'noise_sensitivity_relevant': 0.1663}

## Step 10: Compare and Contrast Results

In [21]:
import pandas as pd

# One column per metric; presumably one row per evaluated sample — confirm
# against the RAGAS result object.
baseline_df = pd.DataFrame(baseline_result.scores)
semantic_df = pd.DataFrame(semantic_result.scores)

# Column means. pandas skips NaN by default, so a sample whose metric failed to
# compute is left out of that metric's average.
baseline_means = baseline_df.mean()
# Align on metric name rather than position, so every row of the table compares
# the same metric even if the two results list their columns in a different order.
semantic_means = semantic_df.mean().reindex(baseline_means.index)

comparison = pd.DataFrame({
    "Metric": baseline_means.index,
    "Baseline (Fixed-size)": [f"{v:.4f}" for v in baseline_means.values],
    "Semantic Chunking": [f"{v:.4f}" for v in semantic_means.values],
})

# Last expression in the cell: display the side-by-side table.
comparison

Unnamed: 0,Metric,Baseline (Fixed-size),Semantic Chunking
0,context_recall,0.6019,0.8604
1,faithfulness,0.6578,0.6264
2,factual_correctness,0.6073,0.6055
3,answer_relevancy,0.9535,0.9584
4,context_entity_recall,0.3586,0.3997
5,noise_sensitivity_relevant,0.1149,0.1663


### Analysis

The semantic chunking approach provides the biggest gain in `context_recall` compared to the baseline fixed size chunking, from 0.6019 to 0.8604.

This is reasonable: fixed-size chunking at around 500 characters can split the discussion of a single topic across multiple chunks, while semantic chunking can keep the full topic together. As a result, during the retrieval phase the retriever is able to retrieve many more of the ground-truth claims.

Most other metrics are relatively flat, though, which shows the LLM could provide reasonable answers based on the retrieved chunks for both the baseline and the semantic chunking approach. This suggests we may need to apply additional retrieval improvements, such as re-ranking, to further improve the results, as we did in the first assignment.