In [None]:
import re
import nltk
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Download NLTK's 'punkt' tokenizer package (network access on first run).
nltk.download('punkt')


# Sample input text: a short loan-policy document made of four numbered
# sections ("Section <n>: <title>" followed by a one- or two-sentence body).
# Every chunker below is run against this same string.
sample_text = """
Section 1: Loan Eligibility
To qualify for a personal loan, the customer must be between 21 and 60 years old, have a minimum salary of ₹25,000, and a credit score above 700.

Section 2: Disbursal and Timelines
Loan disbursal typically occurs within 48 hours of approval. Delays can occur due to incomplete documentation.

Section 3: EMI Defaults
If the customer misses 2 or more EMIs, penalties apply. Further defaults may lead to legal action or freezing of accounts.

Section 4: Foreclosure
Customers can foreclose their loan after 6 EMIs have been paid. A foreclosure fee of 2% is applicable.
"""

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# --- 1. Fixed-Size Chunking ---
def fixed_size_chunking(text, chunk_size=40, overlap=10):
    """Split text into fixed-size, overlapping chunks (sizes measured in words).

    Args:
        text: Input string; it is split on whitespace with ``str.split()``.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by two consecutive chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings, each made of up to ``chunk_size`` words
        re-joined with single spaces. Empty or whitespace-only text yields
        an empty list. The last chunk is the first window that reaches the
        final word, so no chunk consists solely of words already contained
        in the previous one.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size would give a non-positive step, so the
        # window would never advance and the loop would never terminate.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(' '.join(words[start:end]))
        if end >= len(words):
            # This window already covers the final word; another iteration
            # would only repeat the overlap tail as a redundant chunk.
            break
    return chunks

In [None]:
# --- 2. Hierarchical Chunking ---
def hierarchical_chunking(text):
    """Break a document into one chunk per ``Section <n>:`` header.

    The stripped text is cut at every newline that is immediately followed
    by ``Section``, a space, one or more digits and a colon. The header
    stays at the start of its chunk because the lookahead does not consume
    it. Each piece is whitespace-trimmed and empty pieces are discarded.
    """
    # A newline that sits directly in front of a "Section <digits>:" header.
    header_boundary = re.compile(r'\n(?=Section \d+:)')

    pieces = []
    for raw_piece in header_boundary.split(text.strip()):
        trimmed = raw_piece.strip()
        if trimmed:
            pieces.append(trimmed)
    return pieces

In [None]:
# --- 3. Semantic Chunking ---
def semantic_chunking(text, threshold=0.3, model=None):
    """Group consecutive sentences into chunks by embedding similarity.

    The text is split into sentences, every sentence is embedded, and each
    sentence is compared with the one directly before it using cosine
    similarity. A sentence joins the current chunk when the similarity is
    at least ``threshold``; otherwise the current chunk is closed and the
    sentence starts a new one.

    Args:
        text: Document to chunk. Leading/trailing whitespace is removed
            first so the first chunk does not begin with a blank line.
        threshold: Minimum cosine similarity between neighbouring sentences
            for them to stay in the same chunk.
        model: Optional object exposing ``encode(list_of_sentences)``. When
            omitted, a ``SentenceTransformer('all-MiniLM-L6-v2')`` is
            created on each call; pass a model in to reuse one across calls.

    Returns:
        A list of chunk strings, each the space-joined sentences of one
        group. Empty or whitespace-only text yields an empty list.
    """
    # Nothing to tokenize: return early instead of failing on sentences[0].
    if not text or not text.strip():
        return []

    # Imported locally so the function does not rely on a later notebook
    # cell having already brought sent_tokenize into the namespace.
    from nltk.tokenize import sent_tokenize

    sentences = sent_tokenize(text.strip())
    if not sentences:
        return []

    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(sentences)

    chunks = []
    current_chunk_sentences = [sentences[0]]

    for i in range(1, len(sentences)):
        # Compare the current sentence with the one immediately before it.
        similarity = cosine_similarity([embeddings[i]], [embeddings[i - 1]])[0][0]

        if similarity >= threshold:
            # Similar enough: keep extending the current chunk.
            current_chunk_sentences.append(sentences[i])
        else:
            # Similarity dropped below the threshold: close the current
            # chunk and start a new one with this sentence.
            chunks.append(' '.join(current_chunk_sentences))
            current_chunk_sentences = [sentences[i]]

    # The group still being built always holds at least one sentence here.
    chunks.append(' '.join(current_chunk_sentences))
    return chunks

In [None]:
from nltk.tokenize import sent_tokenize

# --- Utility to display the results ---
def display_chunks(chunks, title):
    """Print a titled banner, the chunk count, and every chunk in order.

    Each chunk is preceded by a blank line and a 1-based ``[Chunk N]`` label.
    """
    total = len(chunks)
    print(f"\n--- {title} ---")
    print(f"Total Chunks: {total}")
    for number, chunk in enumerate(chunks, start=1):
        # Label and chunk go out as two newline-separated lines.
        print(f"\n[Chunk {number}]", chunk, sep="\n")

In [None]:
# --- Fixed-size chunker: split the sample text and print every chunk ---
fixed_chunks = fixed_size_chunking(text=sample_text)
display_chunks(chunks=fixed_chunks, title="Fixed-Size Chunking")


--- Fixed-Size Chunking ---
Total Chunks: 4

[Chunk 1]
Section 1: Loan Eligibility To qualify for a personal loan, the customer must be between 21 and 60 years old, have a minimum salary of ₹25,000, and a credit score above 700. Section 2: Disbursal and Timelines Loan disbursal typically

[Chunk 2]
above 700. Section 2: Disbursal and Timelines Loan disbursal typically occurs within 48 hours of approval. Delays can occur due to incomplete documentation. Section 3: EMI Defaults If the customer misses 2 or more EMIs, penalties apply. Further defaults may

[Chunk 3]
misses 2 or more EMIs, penalties apply. Further defaults may lead to legal action or freezing of accounts. Section 4: Foreclosure Customers can foreclose their loan after 6 EMIs have been paid. A foreclosure fee of 2% is applicable.

[Chunk 4]
been paid. A foreclosure fee of 2% is applicable.


In [None]:
# --- Hierarchical chunker: one chunk per "Section <n>:" header ---
hierarchical_chunks = hierarchical_chunking(text=sample_text)
display_chunks(chunks=hierarchical_chunks, title="Hierarchical Chunking")


--- Hierarchical Chunking ---
Total Chunks: 4

[Chunk 1]
Section 1: Loan Eligibility
To qualify for a personal loan, the customer must be between 21 and 60 years old, have a minimum salary of ₹25,000, and a credit score above 700.

[Chunk 2]
Section 2: Disbursal and Timelines
Loan disbursal typically occurs within 48 hours of approval. Delays can occur due to incomplete documentation.

[Chunk 3]
Section 3: EMI Defaults
If the customer misses 2 or more EMIs, penalties apply. Further defaults may lead to legal action or freezing of accounts.

[Chunk 4]
Section 4: Foreclosure
Customers can foreclose their loan after 6 EMIs have been paid. A foreclosure fee of 2% is applicable.


In [None]:
import nltk

# Fetch the 'punkt_tab' tokenizer data before semantic_chunking calls
# sent_tokenize (presumably the resource the installed NLTK release looks
# up - confirm against the NLTK version in use).
nltk.download('punkt_tab')

# --- Semantic chunker: group neighbouring sentences by embedding similarity ---
semantic_chunks = semantic_chunking(text=sample_text)
display_chunks(chunks=semantic_chunks, title="Semantic Chunking")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


--- Semantic Chunking ---
Total Chunks: 4

[Chunk 1]

Section 1: Loan Eligibility
To qualify for a personal loan, the customer must be between 21 and 60 years old, have a minimum salary of ₹25,000, and a credit score above 700.

[Chunk 2]
Section 2: Disbursal and Timelines
Loan disbursal typically occurs within 48 hours of approval. Delays can occur due to incomplete documentation.

[Chunk 3]
Section 3: EMI Defaults
If the customer misses 2 or more EMIs, penalties apply. Further defaults may lead to legal action or freezing of accounts.

[Chunk 4]
Section 4: Foreclosure
Customers can foreclose their loan after 6 EMIs have been paid. A foreclosure fee of 2% is applicable.
