# Data preparation

This notebook contains the code for data loading, preparation and chunking.

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from tqdm import tqdm

from datasets import load_dataset

import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util

In [24]:
squad = load_dataset("squad")

In [25]:
train_squad = squad["train"]
val_squad = squad["validation"]

In [26]:
train_df = train_squad.to_pandas()
val_df = val_squad.to_pandas()

In [30]:
# ---------- Coreference Resolution Stub ----------
def apply_coreference(text):
    """
    Replace with actual coreference resolution model later.
    For now, returns original text.
    """
    return text

In [31]:
# ---------- Sliding Window Chunking ----------
def sliding_window_chunks(text, chunk_size=300, stride=100):
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunks.append((text[start:end], start, end))
        if end == len(text):
            break
        start += stride
    return chunks

In [32]:
# ---------- Semantic Chunking ----------
model = SentenceTransformer("all-MiniLM-L6-v2")

def semantic_chunks(text, sim_threshold=0.6, max_chunk_size=500):
    sentences = sent_tokenize(text)
    # compute char offsets robustly
    offsets = []
    pos = 0
    for s in sentences:
        start = text.find(s, pos)
        if start == -1:
            start = pos
        end = start + len(s)
        offsets.append((s, start, end))
        pos = end

    if not offsets:
        return []

    sents = [s for (s, _, _) in offsets]
    emb = model.encode(sents, convert_to_tensor=True)
    chunks = []
    cur_sents = [offsets[0]]
    prev_emb = emb[0]

    for i in range(1, len(offsets)):
        sent, st, ed = offsets[i]
        cur_emb = emb[i]
        sim = util.cos_sim(prev_emb, cur_emb).item()
        cur_text_len = offsets[i][2] - offsets[i - len(cur_sents)][1]

        if sim >= sim_threshold and cur_text_len <= max_chunk_size:
            cur_sents.append(offsets[i])
        else:
            start_char = cur_sents[0][1]
            end_char = cur_sents[-1][2]
            chunk_text = text[start_char:end_char]
            chunks.append((chunk_text, start_char, end_char))
            cur_sents = [offsets[i]]
        prev_emb = cur_emb

    if cur_sents:
        start_char = cur_sents[0][1]
        end_char = cur_sents[-1][2]
        chunks.append((text[start_char:end_char], start_char, end_char))

    return chunks


In [33]:
# ---------- Answer Containment ----------
def is_answer_in_chunk(chunk_text, answer_text):
    return answer_text.lower().strip() in chunk_text.lower()

In [35]:
# ---------- Build Chunk DataFrame ----------
def build_chunk_df(df, chunk_func, coref=False, **kwargs):
    all_chunks = []

    for idx, row in tqdm(df.iterrows()):
        orig_context = row["context"]
        context = orig_context
        if coref:
            # For embeddings: resolved text; offsets stay on orig_context
            resolved_context = apply_coreference(orig_context)
            context_for_offsets = orig_context
            context_for_embedding = resolved_context
        else:
            context_for_offsets = orig_context
            context_for_embedding = orig_context

        chunks_offsets = chunk_func(context_for_offsets, **kwargs)

        for i, (chunk_text, start, end) in enumerate(chunks_offsets):
            # chunk text for embedding (coref-resolved if coref=True)
            chunk_embed_text = context_for_embedding[start:end]

            for ans_text, ans_start in zip(row["answers"]["text"], row["answers"]["answer_start"]):
                in_chunk = is_answer_in_chunk(chunk_text, ans_text)

                all_chunks.append({
                    "context_id": idx,
                    "chunk_id": f"{idx}_{i}",
                    "chunk_text": chunk_text,
                    "chunk_embed_text": chunk_embed_text,
                    "title": row.get("title", ""),
                    "chunk_start": start,
                    "chunk_end": end,
                    "question": row["question"],
                    "answer_text": ans_text,
                    "answer_start": ans_start,
                    "answer_in_chunk": in_chunk,
                    "coref": coref,
                    "chunking_type": chunk_func.__name__
                })

    return pd.DataFrame(all_chunks)

In [36]:
df_sliding_train = build_chunk_df(train_df, sliding_window_chunks, coref=False, chunk_size=300, stride=100)
df_sliding_val = build_chunk_df(val_df, sliding_window_chunks, coref=False, chunk_size=300, stride=100)

87599it [00:02, 29648.76it/s]
10570it [00:00, 16452.77it/s]


In [12]:
df_semantic_train = build_chunk_df(train_df, semantic_chunks, coref=False, sim_threshold=0.6, max_chunk_size=500)
df_semantic_val = build_chunk_df(val_df, semantic_chunks, coref=False, sim_threshold=0.6, max_chunk_size=500)

87599it [24:55, 58.58it/s] 
10570it [02:54, 60.72it/s]


In [None]:
#df_sliding_coref = build_chunk_df(train_df, sliding_window_chunks, coref=True, chunk_size=300, stride=100)
#df_semantic_coref = build_chunk_df(train_df, semantic_chunks, coref=True, sim_threshold=0.6, max_chunk_size=500)

In [None]:
# ---------- Quick Check ----------
print("Sliding train:", df_sliding_train.shape)
print("Semantic train:", df_semantic_train.shape)
#print("Sliding+Coref:", df_sliding_coref.shape)
#print("Semantic+Coref:", df_semantic_coref.shape)

In [80]:
df_semantic_train.head(3)

Unnamed: 0,context_id,chunk_id,chunk_text,chunk_embed_text,chunk_start,chunk_end,question,answer_text,answer_start,answer_in_chunk,coref,chunking_type
0,0,0_0,"Architecturally, the school has a Catholic cha...","Architecturally, the school has a Catholic cha...",0,53,To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,515,False,False,semantic_chunks
1,0,0_1,Atop the Main Building's gold dome is a golden...,Atop the Main Building's gold dome is a golden...,54,127,To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,515,False,False,semantic_chunks
2,0,0_2,Immediately in front of the Main Building and ...,Immediately in front of the Main Building and ...,128,270,To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,515,False,False,semantic_chunks


In [81]:
df_sliding_train.head(3)

Unnamed: 0,context_id,chunk_id,chunk_text,chunk_embed_text,chunk_start,chunk_end,question,answer_text,answer_start,answer_in_chunk,coref,chunking_type
0,0,0_0,"Architecturally, the school has a Catholic cha...","Architecturally, the school has a Catholic cha...",0,300,To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,515,False,False,sliding_window_chunks
1,0,0_1,statue of the Virgin Mary. Immediately in fro...,statue of the Virgin Mary. Immediately in fro...,100,400,To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,515,False,False,sliding_window_chunks
2,0,0_2,tue of Christ with arms upraised with the lege...,tue of Christ with arms upraised with the lege...,200,500,To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,515,False,False,sliding_window_chunks


In [13]:
df_semantic_train.to_excel("./data/prepared/squad_train_v1_semantic_chunking.xlsx", index=False)
df_semantic_val.to_excel("./data/prepared/squad_val_v1_semantic_chunking.xlsx", index=False)