### Semenatic chunking
- SemanticChunker is a dacoument splitter tha uses embedding similarity between sentences to decide chunk boundaries
- it ensures the each chunk is semantically coherent and not cu off mid-thought like traditional character/token splitters

In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [8]:
# Initialize the model
model = SentenceTransformer('all-MiniLM-L6-v2')

## Sample etxt
text = ""
with open ("langchain_intro.txt", "r") as file:
    text = file.read()
    
## Step 1: Split into sentences
sentences = [s.strip() for s in text.split("\n") if s.strip()]

## Step 2: Embed each sentence
embeddings = model.encode(sentences)

## initialize the parameters
threshold = 0.7
chunks = []
current_chunk=[sentences[0]]

## Step 4: Semantic grouing based on threshold
for i in range(1, len(sentences)):
    sim = cosine_similarity(
        [embeddings[i-1]],
        [embeddings[i]]
    )[0][0]

    if sim > threshold:
        current_chunk.append(sentences[i])
    else:
        chunks.append(" ".join(current_chunk))
        current_chunk=[sentences[i]]
        
        
# Append the last chunk
chunks.append(" ".join(current_chunk))

## Output the chunks
for idx, chunk in enumerate(chunks):
    print(f"\nChunk {idx+1}:\n{chunk}")


Chunk 1:
LangChain is a framework for building applications with LLMs. Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.

Chunk 2:
You can create chains, agents, memory, and retrievers.

Chunk 3:
The Eiffel Tower is located in Paris.

Chunk 4:
France is a popular tourist destination.


## RAG Pipeline - Modular Coding

In [21]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEndpointEmbeddings
from langchain_ollama import ChatOllama
from langchain.schema.runnable import RunnableLambda, RunnableMap
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [25]:
### Custom semantic chunker with threshold

class ThresholdSemanticChunker:
    def __init__(
            self, embedding_model: str = "http://localhost:8080", threshold: float = 0.7):
        self.threshold = threshold
        self.embeddings = HuggingFaceEndpointEmbeddings(model=embedding_model)
        
    
    def split(self, text: str):
        sentences = [s.strip() for s in text.split("\n") if s.strip()]
        embeddings = model.encode(sentences)
        chunks = []
        current_chunk=[sentences[0]]

        for i in range(1, len(sentences)):
            sim = cosine_similarity([embeddings[i-1]],[embeddings[i]])[0][0]

            if sim > threshold:
                current_chunk.append(sentences[i])
            else:
                chunks.append(" ".join(current_chunk) + ".")
                current_chunk=[sentences[i]]     
        
        chunks.append(". ".join(current_chunk) + ".")
        return chunks 
           
    
    def split_documents(self, docs):
        result = []
        for doc in docs:
            for chunk in self.split(doc.page_content):
                result.append(Document(page_content=chunk, metadata=doc.metadata))
        return result

In [26]:
## Sample etxt
text = ""
with open ("langchain_intro.txt", "r") as file:
    text = file.read()
    
doc = Document(page_content=text)
doc

Document(metadata={}, page_content='LangChain is a framework for building applications with LLMs.\nLangchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.\nYou can create chains, agents, memory, and retrievers.\nThe Eiffel Tower is located in Paris.\nFrance is a popular tourist destination.')

In [27]:
## Chunking
chunker = ThresholdSemanticChunker(threshold=0.7)
chunks=chunker.split_documents([doc])
chunks

[Document(metadata={}, page_content='LangChain is a framework for building applications with LLMs. Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone..'),
 Document(metadata={}, page_content='You can create chains, agents, memory, and retrievers..'),
 Document(metadata={}, page_content='The Eiffel Tower is located in Paris..'),
 Document(metadata={}, page_content='France is a popular tourist destination..')]

In [33]:
### Vectorestore
from langchain_huggingface import HuggingFaceEndpointEmbeddings
vectorestore = FAISS.from_documents(chunks, HuggingFaceEndpointEmbeddings(model="http://localhost:8080"))
retriever = vectorestore.as_retriever(search_kwargs={"k": 3}, search_type="similarity")

In [31]:
### Prompttemplate
prompt = PromptTemplate.from_template(
    """
    Your are a helpful assistant, please provide the answers to the following questions based on the given context:
    
    {context}
    
    Question: {question}
    """
)

prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='\n    Your are a helpful assistant, please provide the answers to the following questions based on the given context:\n\n    {context}\n\n    Question: {question}\n    ')

In [None]:
## LLM Model
from langchain_ollama import ChatOllama

llm = ChatOllama(
    model="gemma2:9b-instruct-q4_K_M",
    temperature=0.4
)

In [36]:
### LCEL Chain
rag_chain = (
    RunnableMap({
        "context": lambda x: retriever.invoke(x['question']),
        "question": lambda x: x["question"]
    })
    | prompt
    | llm
    | StrOutputParser()
)

In [40]:
result = rag_chain.invoke({"question": "What is the capital of France"})
result

"While the provided text mentions that France is a popular tourist destination and that the Eiffel Tower is located in Paris, it doesn't explicitly state what the capital of France is.  \n\nHowever, it can be inferred from the context that **Paris** is the capital of France. \n"

## Semantic chunker with Langchain

In [45]:
from langchain_huggingface import HuggingFaceEndpointEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain.document_loaders import TextLoader
 
loader = TextLoader("langchain_intro.txt")
docs =loader.load()

embeddings = HuggingFaceEndpointEmbeddings()

semantic_chunker = SemanticChunker(embeddings=embeddings)
chunks = semantic_chunker.split_documents(docs)

for i, chunk in enumerate(chunks):
    print(f"\n chunk {i+1}: \n{chunk.page_content}")


 chunk 1: 
LangChain is a framework for building applications with LLMs. Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone. You can create chains, agents, memory, and retrievers.

 chunk 2: 
The Eiffel Tower is located in Paris. France is a popular tourist destination.
