In [1]:
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the "en_core_web_sm" spaCy pipeline once, at module level.
# semantic_splitting() calls this shared `nlp` object and reads `doc.sents`
# from the result to obtain sentence boundaries.
nlp = spacy.load("en_core_web_sm")

# Semantic splitting based on sentence boundaries and similarity
def semantic_splitting(text: str, threshold: float = 0.3):
    """Split ``text`` into chunks of consecutive, lexically similar sentences.

    The text is segmented into sentences with the module-level spaCy pipeline
    ``nlp``. Every sentence is turned into a TF-IDF vector, and a sentence is
    appended to the current chunk when its cosine similarity to the sentence
    immediately before it is at least ``threshold``; otherwise it opens a new
    chunk.

    Args:
        text: The raw text to split.
        threshold: Minimum cosine similarity between two adjacent sentences
            for them to be placed in the same chunk.

    Returns:
        A list of strings, one per chunk, each made of that chunk's sentences
        joined by a single space. The list is empty when no sentence is found.

    Note:
        With two or more sentences the TF-IDF vectorizer is fitted on them;
        scikit-learn is expected to raise ``ValueError`` if none of them
        contains a usable token (empty vocabulary) -- confirm against the
        installed version if that case matters to the caller.
    """
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]

    # With zero or one sentence there is nothing to compare. Returning early
    # also avoids indexing into an empty list and fitting a vectorizer on
    # no data.
    if len(sentences) < 2:
        return sentences

    # norm="l2" is the vectorizer's default, spelled out because the next step
    # relies on it: rows have unit length (or are all zeros), so the dot
    # product of two rows equals their cosine similarity.
    tfidf = TfidfVectorizer(norm="l2").fit_transform(sentences).toarray()

    # Only neighbouring sentences are ever compared, so compute just those
    # n-1 similarities rather than the full n x n pairwise matrix.
    adjacent_similarity = (tfidf[:-1] * tfidf[1:]).sum(axis=1)

    # The first sentence always opens the first chunk.
    chunks = [[sentences[0]]]
    for sentence, sim_score in zip(sentences[1:], adjacent_similarity):
        if sim_score >= threshold:
            # Similar enough to the previous sentence: extend the open chunk.
            chunks[-1].append(sentence)
        else:
            # Topic shift: start a new chunk with this sentence.
            chunks.append([sentence])

    # Join the sentences in each chunk to form one string per chunk.
    return [' '.join(chunk) for chunk in chunks]

# Example usage: a "related work" style excerpt that covers three topics
# (long-context LLMs, retrieval-augmented generation, long-context evaluation).
# The hard line breaks inside the string are part of the input and are passed
# to the splitter unchanged.
text = """
Long-context LLMs. There has long been efforts for enabling LLMs to handle long contexts 
(Guo et al., 2022; Beltagy et al., 2020; Chen et al., 
2023b). While recent LLMs like Gemini-1.5 (Reid
et al., 2024), GPT-4 (Achiam et al., 2023), Claude3 (Anthropic, 2024) achieve significantly larger
context window size, long-context prompting is
still expensive due to the quadratic computation
cost of transformers regarding to the input token
numbers. Recent work proposes methods to reduce
cost by prompt compression (Jiang et al., 2023),
model distillation (Hsieh et al., 2023), or LLM cascading (Chen et al., 2023a).

Retrieval-augmented generation. Augmenting
LLMs with relevant information retrieved from
various sources (Lewis et al., 2020), i.e., RAG,
has been successful in complementing LLMs with
external knowledge. RAG achieves good performance on various of tasks like language modeling
(Khandelwal et al., 2019; Shi et al., 2023) and QA
(Guu et al., 2020; Izacard and Grave, 2020), with
a significantly lower computation cost (Borgeaud
et al., 2022). Related to but different from our work,
recently works augment RAG with correction (Yan
et al., 2024), critique (Asai et al., 2023), or verification (Li et al., 2023) to improve retrieval quality
on knowledge-intensive tasks.
Long-context evaluation. Evaluating long-context
models is challenging due to the difficulty in
collecting and analyzing long texts. Recent researchers propose both synthetic tests like needlein-a-haystack (Greg Kamradt, 2023), Ruler (Hsieh
et al., 2024), or Counting Stars (Song et al., 2024),
and real datasets including LongBench (Bai et al.,
2023), ∞Bench (Zhang et al., 2024), L-Eval (An
et al., 2023), and others (Shaham et al., 2022; Yuan
et al., 2024; Maharana et al., 2024). Evaluating
on these datasets, recent works study the performance degradation over various context lengths
(Levy et al., 2024; Hsieh et al., 2024), the lostin-the-middle phenomenon (Liu et al., 2024), and
explore solutions (Kuratov et al., 2024). Related
to our work, Xu et al. (2023) compare RAG and
long-context prompting and find that long-context
models still lags behind RAG. This is different
from our findings, possibly due to consideration of
stronger LLMs and longer contexts in our work.
"""

# Split the sample text, relying on the default similarity threshold
semantic_chunks = semantic_splitting(text)

# Show each chunk under a 1-based "Chunk N:" heading, followed by a blank line
for chunk_number, chunk in enumerate(semantic_chunks, start=1):
    print(f"Chunk {chunk_number}:\n{chunk}\n")


Chunk 1:

Long-context LLMs.

Chunk 2:
There has long been efforts for enabling LLMs to handle long contexts 
(Guo et al., 2022; Beltagy et al., 2020; Chen et al., 
2023b).

Chunk 3:
While recent LLMs like Gemini-1.5 (Reid
et al., 2024), GPT-4 (Achiam et al., 2023), Claude3 (Anthropic, 2024) achieve significantly larger
context window size, long-context prompting is
still expensive due to the quadratic computation
cost of transformers regarding to the input token
numbers.

Chunk 4:
Recent work proposes methods to reduce
cost by prompt compression (Jiang et al., 2023),
model distillation (Hsieh et al., 2023), or LLM cascading (Chen et al., 2023a).



Chunk 5:
Retrieval-augmented generation.

Chunk 6:
Augmenting
LLMs with relevant information retrieved from
various sources (Lewis et al., 2020), i.e., RAG,
has been successful in complementing LLMs with
external knowledge.

Chunk 7:
RAG achieves good performance on various of tasks like language modeling
(Khandelwal et al., 2019; Shi et al