In [1]:
import os

import pandas as pd

# Location of the preprocessed stories CSV.
# The original absolute path is kept as the default so existing runs behave
# exactly as before; set the PREPROCESSED_DATA_CSV environment variable to
# point the notebook at the file on another machine without editing this cell.
DATA_CSV_PATH = os.environ.get(
    "PREPROCESSED_DATA_CSV",
    r"C:\Users\diwas\Downloads\preprocessed_data.csv~1\preprocessed_data.csv",
)

# `df` is the raw input frame consumed by the chunking cells further down.
df = pd.read_csv(DATA_CSV_PATH)

In [2]:
import pandas as pd
import numpy as np
import nltk

# Download the "punkt" and "punkt_tab" NLTK packages, in that order.
# Packages that are already present are simply reported as up-to-date.
for _nltk_package in ("punkt", "punkt_tab"):
    nltk.download(_nltk_package)

from nltk.tokenize import sent_tokenize

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\diwas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\diwas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
# Embedding model: the name passed to SentenceTransformer(...) in chunk_dataset.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Chunk-size bounds, measured in *estimated* tokens (see estimate_tokens).
# A split is requested when adding the next sentence would push the running
# chunk above MAX_TOKENS_PER_CHUNK, but a chunk is only closed once it already
# holds at least MIN_TOKENS_PER_CHUNK.
MAX_TOKENS_PER_CHUNK = 300
MIN_TOKENS_PER_CHUNK = 120
# A split is also requested when the cosine similarity between the running
# chunk's mean embedding and the next sentence falls below this value.
SIMILARITY_THRESHOLD = 0.55  # keep


# Token estimation: divisor applied to the whitespace-split word count in
# estimate_tokens.
WORDS_PER_TOKEN = 0.75            # rough approximation: 1 token ≈ 0.75 words


In [4]:
def estimate_tokens(text: str) -> int:
    """
    Approximate how many tokens ``text`` contains.

    The text is split on whitespace, the resulting word count is divided by
    the module-level ``WORDS_PER_TOKEN`` ratio, and the quotient is truncated
    to an ``int``.
    """
    word_count = len(text.split())
    approx_tokens = word_count / WORDS_PER_TOKEN
    return int(approx_tokens)


In [5]:
def semantic_chunk_story(
    story_text: str,
    model: SentenceTransformer,
    similarity_threshold: float = SIMILARITY_THRESHOLD,
    max_tokens: int = MAX_TOKENS_PER_CHUNK,
    min_tokens: int = MIN_TOKENS_PER_CHUNK
) -> list:
    """
    Split one story into chunks of consecutive, semantically related sentences.

    The text is sentence-tokenized and every sentence is embedded with
    ``model``. Sentences are then consumed in order while a running mean
    embedding of the current chunk is maintained. A split is requested when
    the next sentence's cosine similarity to that mean drops below
    ``similarity_threshold`` or when adding it would exceed ``max_tokens``;
    the split is only carried out once the current chunk already holds at
    least ``min_tokens`` estimated tokens. Because the minimum takes priority,
    a chunk can end up larger than ``max_tokens``.

    Args:
        story_text: Full text of the story. Anything that is not a ``str``
            (e.g. ``None`` or the ``NaN`` pandas uses for a missing cell) and
            blank strings are treated as "no text".
        model: Embedding model; its ``encode`` is called once with the list
            of sentences and is expected to return one vector per sentence
            that supports ``.reshape(1, -1)``.
        similarity_threshold: Cosine-similarity value below which a split is
            requested.
        max_tokens: Estimated-token size above which a split is requested.
        min_tokens: Minimum estimated-token size a chunk must reach before it
            may be closed.

    Returns:
        List of chunk strings (sentences joined with a single space), in
        story order. Empty list when there is no text to chunk.
    """
    # Missing CSV cells arrive as NaN/None rather than str and would make
    # sent_tokenize raise; report "no chunks" for them instead.
    if not isinstance(story_text, str) or not story_text.strip():
        return []

    sentences = sent_tokenize(story_text)
    if not sentences:
        return []

    # Embed every sentence up front in a single call.
    sentence_embeddings = model.encode(sentences)
    chunks = []

    # State of the chunk currently being built, seeded with the first sentence.
    current_chunk = [sentences[0]]
    current_tokens = estimate_tokens(sentences[0])
    current_embedding = sentence_embeddings[0].reshape(1, -1)

    for i in range(1, len(sentences)):
        sentence = sentences[i]
        sentence_embedding = sentence_embeddings[i].reshape(1, -1)
        sentence_tokens = estimate_tokens(sentence)

        # Compare the candidate sentence against the mean of the running chunk.
        similarity = cosine_similarity(
            current_embedding,
            sentence_embedding
        )[0][0]

        should_split = (
            similarity < similarity_threshold or
            current_tokens + sentence_tokens > max_tokens
        )

        if should_split and current_tokens >= min_tokens:
            # Close the running chunk and start a new one with this sentence.
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_tokens = sentence_tokens
            current_embedding = sentence_embedding
        else:
            current_chunk.append(sentence)
            current_tokens += sentence_tokens
            # Incremental mean: the chunk now holds n sentences, so the old
            # mean (over n - 1 sentences) is re-weighted before adding the
            # new vector and dividing by n.
            current_embedding = (
                current_embedding * (len(current_chunk) - 1) + sentence_embedding
            ) / len(current_chunk)

    # Flush whatever is left in the running chunk.
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    # Post-process: fold a too-small trailing chunk into its predecessor.
    # The size is re-estimated from the joined text here, not taken from the
    # running per-sentence total.
    if len(chunks) >= 2:
        last_tokens = estimate_tokens(chunks[-1])
        if last_tokens < min_tokens:
            chunks[-2] = chunks[-2] + " " + chunks[-1]
            chunks.pop()

    return chunks

In [6]:
def chunk_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply semantic chunking to every story in the dataset.

    Args:
        df: Input frame with at least a ``title`` and a ``text`` column; one
            row per story.

    Returns:
        A new DataFrame with one row per chunk and the columns ``title``,
        ``chunk_id`` (0-based position of the chunk within its story),
        ``chunk_text`` and ``token_count`` (estimated tokens of the chunk).
        The columns are present even when no chunk was produced.

    Raises:
        ValueError: If ``df`` lacks the ``title`` or ``text`` column.
    """

    required_columns = {"title", "text"}
    if not required_columns.issubset(df.columns):
        raise ValueError(f"Dataset must contain columns: {required_columns}")

    # Fixed output schema so an empty result still exposes every column that
    # downstream cells index into.
    output_columns = ["title", "chunk_id", "chunk_text", "token_count"]

    records = []

    # Only pay for loading the embedding model when there is something to chunk.
    if not df.empty:
        model = SentenceTransformer(MODEL_NAME)

        # Iterate the two needed columns directly instead of building a
        # Series per row with iterrows().
        for title, text in zip(df["title"], df["text"]):
            # Missing text cells are NaN/None, not str; skip them rather than
            # letting one bad row abort the whole run.
            if not isinstance(text, str):
                continue

            story_chunks = semantic_chunk_story(text, model)

            for chunk_id, chunk_text in enumerate(story_chunks):
                records.append({
                    "title": title,
                    "chunk_id": chunk_id,
                    "chunk_text": chunk_text,
                    "token_count": estimate_tokens(chunk_text)
                })

    return pd.DataFrame(records, columns=output_columns)


In [7]:
# Chunk every story in `df`; `chunked_df` holds one row per chunk.
chunked_df = chunk_dataset(df)

# Plain-text preview of the first five rows, then the overall chunk count.
print(chunked_df.head(5))
print("Total chunks:", len(chunked_df.index))

# Kept as the final expression so the cell displays the summary statistics
# of the estimated chunk sizes.
chunked_df.loc[:, "token_count"].describe()


                          title  chunk_id  \
0  A DESCENT INTO THE MAELSTRÖM         0   
1  A DESCENT INTO THE MAELSTRÖM         1   
2  A DESCENT INTO THE MAELSTRÖM         2   
3  A DESCENT INTO THE MAELSTRÖM         3   
4  A DESCENT INTO THE MAELSTRÖM         4   

                                          chunk_text  token_count  
0  The ways of God in Nature, as in Providence, a...          185  
1  You suppose me a very old man—but I am not. It...          176  
2  Nothing would have tempted me to within half a...          130  
3  “You must get over these fancies,” said the gu...          200  
4  A panorama more deplorably desolate no human i...          148  
Total chunks: 2852


count    2852.000000
mean      154.103086
std        36.453762
min       120.000000
25%       130.000000
50%       142.000000
75%       165.000000
max       550.000000
Name: token_count, dtype: float64

In [8]:
# Show the chunk table as this cell's output.
chunked_df

Unnamed: 0,title,chunk_id,chunk_text,token_count
0,A DESCENT INTO THE MAELSTRÖM,0,"The ways of God in Nature, as in Providence, a...",185
1,A DESCENT INTO THE MAELSTRÖM,1,You suppose me a very old man—but I am not. It...,176
2,A DESCENT INTO THE MAELSTRÖM,2,Nothing would have tempted me to within half a...,130
3,A DESCENT INTO THE MAELSTRÖM,3,"“You must get over these fancies,” said the gu...",200
4,A DESCENT INTO THE MAELSTRÖM,4,A panorama more deplorably desolate no human i...,148
...,...,...,...,...
2847,X-ING A PARAGRAPH,15,"The true reason, perhaps, is that x is rather ...",145
2848,X-ING A PARAGRAPH,16,Next morning the population of Nopolis were ta...,276
2849,X-ING A PARAGRAPH,17,The first definite idea entertained by the pop...,122
2850,X-ING A PARAGRAPH,18,One gentleman thought the whole an X-ellent jo...,133


In [9]:
# Inspect the full text of the chunk stored under index label 1.
chunked_df.loc[1, "chunk_text"]

'You suppose me a very old man—but I am not. It took less than a single day to change these hairs from a jetty black to white, to weaken my limbs, and to unstring my nerves, so that I tremble at the least exertion, and am frightened at a shadow. Do you know I can scarcely look over this little cliff without getting giddy?”     The “little cliff,” upon whose edge he had so carelessly thrown himself down to rest that the weightier portion of his body hung over it, while he was only kept from falling by the tenure of his elbow on its extreme and slippery edge—this “little cliff” arose, a sheer unobstructed precipice of black shining rock, some fifteen or sixteen hundred feet from the world of crags beneath us.'

In [10]:
# Write the chunk table to a CSV in the current working directory, without
# the index column.
# NOTE(review): the file name 'samentic_chunk.csv' looks like a typo for
# 'semantic' — confirm nothing downstream reads this exact name before renaming.
chunked_df.to_csv('samentic_chunk.csv', index=False)