# Chunking

In [None]:
import argparse
import os
import shutil
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.vectorstores.chroma import Chroma

In [None]:
# Directory passed to Chroma as `persist_directory` when the vector store is opened.
CHROMA_PATH = "chroma"
# Name of the input data directory; presumably the folder holding the source PDFs
# (NOTE(review): confirm that the document loader actually reads from this constant).
DATA_PATH = "data"

In [None]:
from langchain.document_loaders.pdf import PyPDFDirectoryLoader

def load_documents():
    """Load the PDF documents found in the ``DATA_PATH`` directory.

    Returns:
        Whatever ``PyPDFDirectoryLoader.load()`` returns for that directory
        (expected to be a list of LangChain ``Document`` objects).
    """
    # Use the shared module-level constant instead of repeating the literal
    # "data", so the input directory is configured in exactly one place.
    document_loader = PyPDFDirectoryLoader(path=DATA_PATH)
    return document_loader.load()

In [None]:
import re
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

def preprocess_text(text: str) -> str:
    """Normalise whitespace in *text*.

    Every run of whitespace characters (spaces, tabs, line breaks, ...) is
    collapsed into a single space, and leading/trailing whitespace is removed.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned, single-spaced text.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def split_documents(documents: list[Document]):
    """Clean each document's text and split the result into overlapping chunks.

    Args:
        documents: The documents to split. Each one's ``page_content`` is run
            through ``preprocess_text``; its ``metadata`` is passed through
            unchanged.

    Returns:
        The value returned by ``RecursiveCharacterTextSplitter.split_documents``
        for the cleaned documents.
    """
    # Clean up the text before splitting.
    normalised_docs = []
    for source_doc in documents:
        normalised_docs.append(
            Document(
                page_content=preprocess_text(source_doc.page_content),
                metadata=source_doc.metadata,
            )
        )

    # Sizes are measured with len(), i.e. in characters.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,  # try with 500
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )
    return splitter.split_documents(normalised_docs)


In [None]:
# Load the source documents and split them into chunks; `chunks` is reused by
# the later cells that assign IDs and store the chunks.
documents = load_documents()
chunks = split_documents(documents)
# Preview the first chunk as a quick sanity check
# (raises IndexError if no chunks were produced).
print(chunks[0])

# Embedding functions

In [None]:
from langchain_community.embeddings.ollama import OllamaEmbeddings

def get_embedding_function():
    """Create the embedding model used by this notebook.

    Returns:
        An ``OllamaEmbeddings`` instance configured with the
        "nomic-embed-text" model.
    """
    return OllamaEmbeddings(model="nomic-embed-text")

In [None]:
def calculate_chunk_ids(chunks):
    """Assign an ``id`` entry to the metadata of every chunk, in place.

    IDs look like "data/monopoly.pdf:6:2", i.e.
    ``<source>:<page>:<chunk index within that page>``. ``source`` and
    ``page`` are read from ``chunk.metadata``; a missing key is rendered as
    "None".

    The index is tracked per source/page pair rather than by comparing with
    the previous chunk, so IDs stay unique even when chunks belonging to the
    same page are not adjacent in ``chunks``. For input where each page's
    chunks are contiguous the result is unchanged.

    Args:
        chunks: Iterable of objects exposing a mutable ``metadata`` mapping.

    Returns:
        The same ``chunks`` object that was passed in, with
        ``metadata["id"]`` set on each element.
    """
    # Next free chunk index for each "<source>:<page>" key.
    next_index_by_page = {}

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        page_id = f"{source}:{page}"

        chunk_index = next_index_by_page.get(page_id, 0)
        next_index_by_page[page_id] = chunk_index + 1

        # Add the ID to the chunk's metadata.
        chunk.metadata["id"] = f"{page_id}:{chunk_index}"

    return chunks


# Data storage

In [None]:
from langchain.vectorstores import Chroma

def store_chunks_in_chroma(chunks):
    """Embed ``chunks`` and add them to the Chroma store at ``CHROMA_PATH``.

    When every chunk carries a distinct ``metadata["id"]``, those IDs are
    passed to Chroma so that re-running this cell re-uses the same records
    instead of appending duplicates (assumes the Chroma wrapper upserts by
    id - TODO confirm for the installed version). Otherwise Chroma is left
    to generate IDs itself, as before.

    Args:
        chunks: Sized sequence of documents to store. An empty sequence is
            accepted and results in no write.

    Returns:
        The ``Chroma`` vector store instance.
    """
    # Initialize the embedding model
    embedding_function = get_embedding_function()

    # Initialize the Chroma database
    vectorstore = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    if chunks:
        chunk_ids = [chunk.metadata.get("id") for chunk in chunks]
        ids_usable = all(chunk_ids) and len(set(chunk_ids)) == len(chunk_ids)
        if ids_usable:
            # Stable IDs make repeated runs idempotent.
            vectorstore.add_documents(chunks, ids=chunk_ids)
        else:
            vectorstore.add_documents(chunks)

    # Save the vector database to disk. Some Chroma wrapper versions persist
    # automatically and no longer expose persist(), so only call it if present.
    persist = getattr(vectorstore, "persist", None)
    if callable(persist):
        persist()
    print(f"✅ {len(chunks)} chunks stored successfully in Chroma!")

    return vectorstore

# Calling the function
# First attach an ID to every chunk, then embed the chunks and store them in Chroma.
chunks_with_ids = calculate_chunk_ids(chunks)
vectorstore = store_chunks_in_chroma(chunks_with_ids)


# Example

In [None]:
def query_chroma(query, k=3):
    """Run a similarity search against the Chroma store and print the hits.

    Args:
        query: The question or search text.
        k: Number of results to request from ``similarity_search``.
            Defaults to 3, the value that was previously hard-coded.

    Returns:
        None. The source and content of each result are printed.

    Raises:
        KeyError: If a result's metadata has no "source" entry.
    """
    # Load the existing database
    vectorstore = Chroma(persist_directory=CHROMA_PATH, embedding_function=get_embedding_function())

    # Perform a similarity search
    results = vectorstore.similarity_search(query, k=k)

    print("\n🔎 Query Results:")
    for result in results:
        print(f"Source: {result.metadata['source']}")
        print(f"Chunk: {result.page_content}\n")
        
# Example query (in French): asks for the establishment code of the SoWeSign application.
query_chroma("Quel est le code établissement pour l'application SoWeSign ?")


In [None]:
# Example query (in French): asks whether smoking is allowed on the school premises.
query_chroma("Est-il autorisé de fumer dans l'enceinte de l'école ?")