# Sliding window chunking

In [1]:
def sliding_window_chunk(text, window_size, step_size):
    chunks = []
    for i in range(0, len(text) - window_size + 1, step_size):
        chunks.append(text[i:i + window_size])
    return chunks

text = "This is an example text for sliding window chunking."
chunks = sliding_window_chunk(text, 10, 5)
print(chunks)


['This is an', 'is an exam', ' example t', 'ple text f', 'ext for sl', 'or sliding', 'iding wind', ' window ch', 'ow chunkin']


# Semantic chunking (sentence wise)

In [2]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

def semantic_chunk(text):
    return sent_tokenize(text)

text = "This is an example text. It demonstrates semantic chunking. Each sentence is a chunk."
chunks = semantic_chunk(text)
print(chunks)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['This is an example text.', 'It demonstrates semantic chunking.', 'Each sentence is a chunk.']


# Fixed Size Chunking

In [3]:
def fixed_length_chunk(text, chunk_size):
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

text = "This is an example text for fixed-length chunking."
chunks = fixed_length_chunk(text, 10)
print(chunks)


['This is an', ' example t', 'ext for fi', 'xed-length', ' chunking.']


# Topic Based Chunking

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def topic_based_chunk(text, num_topics):
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform([text])
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=0)
    lda.fit(X)
    topics = lda.transform(X)
    return topics

text = "This is an example text for topic-based chunking. It demonstrates how text can be divided based on topics."
chunks = topic_based_chunk(text, 2)
print(chunks)


[[0.94643071 0.05356929]]
