In [2]:
# Environment setup for the whole notebook.
from dotenv import load_dotenv, find_dotenv
import os
import pprint
# Locate a .env file and load its entries into the process environment.
# Presumably this is where the API key used by the OpenAI clients further
# down comes from, since none is passed explicitly - verify your .env.
load_dotenv(find_dotenv())

# NOTE: this rebinds the name `print` to rich's implementation, so every
# print(...) call in the cells below goes through rich, not the built-in.
from rich import print

# Character Text Splitting Algorithm

In [3]:
from langchain_core.documents import Document

# Sample corpus shared by every splitter in this notebook: four sentences
# about four unrelated topics.
text = """
The astronauts on the International Space Station conducted a spacewalk to repair a malfunctioning solar panel. 
The aroma of freshly baked croissants wafted through the charming French bakery. 
The new policy aimed to reduce carbon emissions by 50% within the next decade. 
The ancient Egyptian pharaohs were known for their elaborate headdresses and ornate jewelry."""

# Manual Splitting
# Slice the text into consecutive, non-overlapping windows of `chunk_size`
# characters; the final window is shorter when the length is not a multiple.
chunk_size = 35 # Characters
chunks = [
    text[start:start + chunk_size]
    for start in range(0, len(text), chunk_size)
]

# Wrap each window in a Document tagged with a fixed "local" source.
documents = [
    Document(page_content=piece, metadata={"source": "local"})
    for piece in chunks
]
print(documents)

# Character Text Splitting Library

In [4]:
from langchain.text_splitter import CharacterTextSplitter

# Library counterpart of a fixed-width split: 35-character chunks, no overlap,
# an empty separator, and whitespace stripping turned off.
text_splitter = CharacterTextSplitter(
    separator='',
    chunk_size=35,
    chunk_overlap=0,
    strip_whitespace=False,
)

# create_documents takes a list of texts; here it is just the one sample string.
documents = text_splitter.create_documents([text])
print(documents)

# Recursive Text Splitting

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Same 35-character budget with no overlap; no separators are passed, so the
# splitter runs with whatever defaults the library defines.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=35,
    chunk_overlap=0,
)

# The result is printed directly and not stored, so `documents` keeps the
# value assigned by an earlier cell.
print(text_splitter.create_documents([text]))


# Semantic Chunking

In [None]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.embeddings.ollama import OllamaEmbeddings

# Percentile - all differences between sentences are calculated, and then any
# difference greater than the X percentile is split

# Pick ONE embedding backend. Model names are backend-specific:
# "nomic-embed-text" is served by Ollama and is not an OpenAI embedding model,
# so the OpenAI client must be given an OpenAI model name instead.
#embed_function = OllamaEmbeddings(model="nomic-embed-text")
embed_function = OpenAIEmbeddings(model="text-embedding-3-small")
text_splitter = SemanticChunker(
    embeddings=embed_function,
    breakpoint_threshold_type="percentile", # "percentile" "standard_deviation", "interquartile",
    breakpoint_threshold_amount=60
)

# Overwrites `documents`; later cells that read it will see these chunks.
documents = text_splitter.create_documents([text])
print(documents)


# AI Split

In [8]:
from langchain_community.chat_models.ollama import ChatOllama
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate


# Prompt sent to the chat model. It has two placeholders, {context} and
# {question}, which must both be supplied when the prompt is formatted.
# The numbered rules ask the model to reproduce the context verbatim rather
# than rewrite, extend or summarize it.
prompt_template="""Based on the following context answer the question:
=============================
Context: {context}
==============================
Question: {question}

Extra instructions: 
1. DO NOT ALTER ANY SENTENCE IN THE CONTEXT. 
2. CONSIDER ALL THE SENTENCES IN THE CONTEXT. 
3. DO NOT INVENT ANY NEW CONTEXT
4. DO NOT SUMMARIZE THE CONTEXT WHEN PROVIDING A REPLY
"""
prompt: ChatPromptTemplate = ChatPromptTemplate.from_template(prompt_template)

# Choose the chat model that drives the chain.
#
# Option A - a local model (uncomment to use):
#local_llm = ChatOllama(
#    verbose=True,
#    model="phi3",
#    base_url="http://127.0.0.1:11434",
#    temperature=0)
#
# Option B - a paid hosted model, much faster. No model name is given, so the
# library default is used. The variable keeps the name `local_llm` so the rest
# of the notebook works with either option.
local_llm = ChatOpenAI(
    temperature=0,
)

# Pipeline: fill the prompt -> call the chat model -> reduce the reply to a str.
#
# The chain is invoked with a dict that already has "context" and "question"
# keys, which is exactly the input the prompt expects, so the payload is piped
# straight into it. Do NOT put
#     {"context": RunnablePassthrough(), "question": RunnablePassthrough()}
# in front: each branch of such a mapping receives the ENTIRE input, so both
# placeholders would be filled with the whole payload dict instead of their
# own values.
chain = prompt | local_llm | StrOutputParser()

# Payload for the chain: the splitting instruction plus the chunks to re-split.
# NOTE: `documents` is whatever the most recently executed cell assigned to it
# (several cells above do), so the outcome depends on execution order.
chain_input = {
    "question":"""
        Split the page_content based on semantic differences if you find necessary
        return your reply in the same schema of the context received""",
    "context": documents,
}

result = chain.invoke(chain_input)
print(result)

# Expected execution time: 51s