In [21]:
import os
from langchain_ollama import OllamaLLM
# Shared Ollama-backed LLM instance. ask_question() overwrites its
# `temperature` and `repeat_penalty` attributes on every request.
llm = OllamaLLM(model="llama3.2")

In [22]:
from PyPDF2 import PdfReader
import os
import re
from pathlib import Path
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption


# Make sure the markdown export directory exists. `exist_ok=True` is idempotent
# and avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs('output', exist_ok=True)

def convert_pdf_to_markdown(file_path: str) -> str:
    """Convert a PDF to markdown with Docling and return the cleaned text.

    As a side effect the cleaned markdown is also written to
    ``output/sample.md`` (UTF-8), creating the ``output`` directory if needed.

    Args:
        file_path: Path of the PDF to convert.

    Returns:
        The exported markdown with ``<!-- image -->`` placeholders removed.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    input_doc_path = Path(file_path)

    if not input_doc_path.exists():
        raise FileNotFoundError(f"Input file not found: {input_doc_path}")

    # Prepare pipeline options: OCR plus table-structure recognition with cell matching.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    # Initialize document converter
    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options
            )
        }
    )

    conv_result = doc_converter.convert(input_doc_path)

    # Export to markdown and drop the image placeholder comments.
    markdown_output = conv_result.document.export_to_markdown()
    cleaned_markdown = re.sub(r'<!--\s*image\s*-->', '', markdown_output, flags=re.IGNORECASE)

    # Persist a copy for inspection. The directory is created here so the
    # function does not depend on an earlier cell having run, and the file is
    # written as UTF-8 explicitly (the platform default encoding may not be
    # able to represent every character in the document).
    os.makedirs('output', exist_ok=True)
    output_path = os.path.join('output', 'sample.md')
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_markdown)

    # Return the in-memory text directly; re-reading the file just written
    # adds I/O without changing the content.
    return cleaned_markdown

def read_file(file,chunking_method=None):
    """Load the text content of a PDF or Markdown file.

    Args:
        file: Path (``str`` or ``os.PathLike``) of the file to read.
        chunking_method: When ``"MarkdownHeaderSplitter"`` and the file is a
            PDF, the PDF is converted to markdown via
            ``convert_pdf_to_markdown`` so header structure is available;
            otherwise PDFs are read with ``PdfReader``.

    Returns:
        The extracted text as a single string.

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.md``.
    """
    # Normalise to a plain path once; the previous `.md` branch used
    # `file.name`, which does not exist on a path string.
    path = os.fspath(file)
    ext = os.path.splitext(path)[-1].lower()

    if ext == ".pdf":
        if chunking_method == "MarkdownHeaderSplitter":
            return convert_pdf_to_markdown(path)

        reader = PdfReader(path)
        # Guard against pages for which extract_text() yields None.
        return "\n".join((page.extract_text() or "") for page in reader.pages)

    if ext == ".md":
        with open(path, "r", encoding="utf-8") as f:
            return f.read()

    raise ValueError("Unsupported file format. Please upload a PDF or Markdown (.md) file.")

In [23]:
from pathlib import Path
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from rapidfuzz import fuzz, process
import re

def prepare_pipeline_options() -> PdfPipelineOptions:
    """Return PDF pipeline options with OCR, table structure and cell matching switched on."""
    pdf_options = PdfPipelineOptions()
    # Top-level switches first, then the nested table-structure setting.
    for flag_name in ("do_ocr", "do_table_structure"):
        setattr(pdf_options, flag_name, True)
    pdf_options.table_structure_options.do_cell_matching = True
    return pdf_options

def normalize_text(text):
    """Collapse each run of whitespace into one space and trim both ends."""
    single_spaced = re.sub(r'\s+', ' ', text)
    return single_spaced.strip()

def split_text_into_candidates(text, window_size=300, step=150):
    """
    Slice ``text`` into overlapping windows for fuzzy matching.

    Windows start every ``step`` characters and span up to ``window_size``
    characters; windows of 20 characters or fewer are dropped. Each result is
    a ``(window_text, start_offset)`` tuple.
    """
    windows = (
        (text[offset:offset + window_size], offset)
        for offset in range(0, len(text), step)
    )
    return [(snippet, offset) for snippet, offset in windows if len(snippet) > 20]

def _empty_match_result(chunk):
    """Return the placeholder record for a chunk that could not be located in the PDF."""
    return {
        "original_chunk": chunk.strip(),
        "matched_text": "",
        "match_type": "none",
        "pages": [],
        "bounding_boxes": [],
        "section_headers": []
    }


def _build_normalized_index(document):
    """Flatten ``document.texts`` into one normalized string plus per-element spans.

    Returns ``(norm_flat_text, spans)``. Each span records the half-open
    character range ``[start, end)`` that the element occupies in
    ``norm_flat_text`` (including its trailing separator space) together with
    the element's page number, bounding box and enclosing section header.
    """
    parts = []
    spans = []
    cursor = 0
    current_section = None
    for idx, text_element in enumerate(document.texts):
        if getattr(text_element, "label", None) == "section_header":
            current_section = text_element.text

        norm_element = normalize_text(text_element.text or "")
        if not norm_element:
            # Whitespace-only elements vanish during normalization.
            continue

        # Elements without provenance still contribute text and section
        # headers, just no page / bounding box.
        prov_list = getattr(text_element, "prov", None)
        first_prov = prov_list[0] if prov_list else None

        spans.append({
            "start": cursor,
            "end": cursor + len(norm_element) + 1,  # +1: separator belongs to this element
            "page_no": first_prov.page_no if first_prov is not None else None,
            "bbox": first_prov.bbox if first_prov is not None else None,
            "section_header": current_section,
            "text_element_idx": idx,
        })
        parts.append(norm_element)
        cursor += len(norm_element) + 1

    # Joining already-normalized parts with single spaces yields the same
    # string as normalizing the raw concatenation, but with known offsets.
    return " ".join(parts), spans


def match_chunks_to_pdf(chunks, pdf_file_path):
    """
    Matches the provided chunks to the PDF content and extracts metadata.
    Handles multi-page, multi-section, and multi-bbox chunks.

    The PDF is converted with Docling and its text elements are flattened into
    a single whitespace-normalized string. Each chunk is first searched for
    verbatim (after the same normalization); if that fails, the best
    overlapping 300-character window is chosen with rapidfuzz
    (``partial_ratio`` >= 85). Offsets and element spans live in the same
    normalized coordinate space, so the reported pages, boxes and headers
    belong to the text that actually matched.

    Args:
        chunks: Iterable of chunk strings to locate.
        pdf_file_path: Path of the source PDF.

    Returns:
        A list with exactly one dict per chunk, in input order, with keys
        ``original_chunk``, ``matched_text``, ``match_type`` (``"exact"``,
        ``"fuzzy"`` or ``"none"``), ``pages``, ``bounding_boxes`` and
        ``section_headers``.

    Raises:
        FileNotFoundError: If ``pdf_file_path`` does not exist.
    """
    pdf_path = Path(pdf_file_path)
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF file not found: {pdf_file_path}")

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=prepare_pipeline_options()
            )
        }
    )
    document = doc_converter.convert(pdf_path).document

    norm_flat_text, spans = _build_normalized_index(document)

    # Pre-split flat text into overlapping candidates for fuzzy match
    candidates = split_text_into_candidates(norm_flat_text)
    candidate_texts = [c[0] for c in candidates]
    candidate_offsets = [c[1] for c in candidates]

    results = []
    for chunk in chunks:
        norm_chunk = normalize_text(chunk)
        if not norm_chunk:
            # A blank chunk would otherwise "match" the empty string at offset 0.
            results.append(_empty_match_result(chunk))
            continue

        start = norm_flat_text.find(norm_chunk)
        if start != -1:
            match_type = "exact"
            end = start + len(norm_chunk)
        else:
            fuzzy_result = None
            if candidate_texts:
                fuzzy_result = process.extractOne(
                    norm_chunk,
                    candidate_texts,
                    scorer=fuzz.partial_ratio,
                    score_cutoff=85
                )
            if not fuzzy_result:
                results.append(_empty_match_result(chunk))
                continue

            match_type = "fuzzy"
            best_match, _score, candidate_idx = fuzzy_result
            # Candidates are slices of norm_flat_text, so the recorded offset
            # is the window's position; the match span is the whole window.
            start = candidate_offsets[candidate_idx]
            end = start + len(best_match)

        # Elements whose span overlaps [start, end); spans are in document order.
        involved = [span for span in spans if span["start"] < end and span["end"] > start]

        pages = sorted({span["page_no"] for span in involved if span["page_no"] is not None})
        bbox_info = [
            {
                "page_no": span["page_no"],
                "bbox": {
                    "l": span["bbox"].l,
                    "t": span["bbox"].t,
                    "r": span["bbox"].r,
                    "b": span["bbox"].b
                }
            }
            for span in involved
            if span["bbox"] is not None
        ]
        # De-duplicate headers while keeping document order.
        section_headers = list(dict.fromkeys(
            span["section_header"] for span in involved if span["section_header"]
        ))

        results.append({
            "original_chunk": chunk.strip(),
            "matched_text": norm_flat_text[start:end].strip(),
            "match_type": match_type,
            "pages": pages,
            "bounding_boxes": bbox_info,
            "section_headers": section_headers
        })

    return results

In [24]:
from qdrant_client import QdrantClient
from langchain_qdrant import Qdrant
from langchain_huggingface import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from httpx import Timeout

# Client for the local Qdrant server. The timeout is deliberately huge
# (9999, presumably seconds) so long-running upserts are not cut short.
qdrant_client = QdrantClient(
    url="http://localhost:6333",  # or your Qdrant server URL
    timeout= 9999.0
)

# The same MiniLM model is loaded twice: once behind the LangChain embeddings
# interface used by the vector store, once as a raw SentenceTransformer used by
# the similarity-based chunkers.
embeddings_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
sen_model = SentenceTransformer("all-MiniLM-L6-v2")

# Best-effort creation: re-running the cell against an existing collection
# fails here, which is fine, but the reason is reported instead of being
# silently discarded (and KeyboardInterrupt is no longer swallowed).
try:
    qdrant_client.create_collection(
        collection_name="rag_collection",
        vectors_config={
            "size": 384,
            "distance": "Cosine"
        },
        optimizers_config={"default_segment_number": 1},
        on_disk_payload=True
    )
except Exception as exc:
    print(f"Skipped creating 'rag_collection': {exc}")

vectorstore = Qdrant(
    client=qdrant_client,
    collection_name="rag_collection",
    embeddings=embeddings_model
)
collections = qdrant_client.get_collections()
print(collections)

# def store_embeddings(text_chunks, file_id):
#     metadatas = [
#         {"file_id": file_id, "chunk_num": i + 1}
#         for i in range(len(text_chunks))
#     ]
#     vectorstore.add_texts(texts=text_chunks, metadatas=metadatas)
#     print(f"Successfully stored {len(text_chunks)} chunks in vectorstore.")

def store_embeddings(chunks, file_id, pdf_path):
    """Embed ``chunks`` into the vector store together with PDF location metadata.

    Args:
        chunks: List of chunk strings to store.
        file_id: Identifier recorded in each chunk's ``file_id`` metadata field.
        pdf_path: Path of the source PDF handed to ``match_chunks_to_pdf``.

    Each stored chunk carries ``file_id``, a 1-based ``chunk_num``,
    ``original_chunk`` and the match fields (``matched_text``, ``match_type``,
    ``pages``, ``bounding_boxes``, ``section_headers``).
    """
    chunk_metadata = match_chunks_to_pdf(chunks, pdf_path)

    # Index the match results once by normalized chunk text. `setdefault`
    # keeps the first result for duplicated chunks, matching the previous
    # first-hit scan without re-normalizing every result for every chunk.
    meta_by_text = {}
    for item in chunk_metadata:
        meta_by_text.setdefault(normalize_text(item["original_chunk"]), item)

    metadatas = []
    for i, chunk in enumerate(chunks):
        # Empty dict when no result exists for this chunk; the defaults below
        # then describe an unmatched chunk.
        matched_meta = meta_by_text.get(normalize_text(chunk)) or {}

        metadatas.append({
            "file_id": file_id,
            "chunk_num": i + 1,
            "original_chunk": chunk.strip(),
            "matched_text": matched_meta.get("matched_text", ""),
            "match_type": matched_meta.get("match_type", "none"),
            "pages": matched_meta.get("pages", []),
            "bounding_boxes": matched_meta.get("bounding_boxes", []),
            "section_headers": matched_meta.get("section_headers", [])
        })

    vectorstore.add_texts(texts=chunks, metadatas=metadatas)
    print(f"Successfully stored {len(chunks)} chunks in vectorstore with metadata.")

collections=[CollectionDescription(name='rag_collection')]


In [25]:
import nltk
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK 'punkt' resource when the cell runs; the chunkers below call
# nltk.sent_tokenize.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead - confirm
# against the installed version.
nltk.download('punkt')

def semantic_chunking(text, sen_model, threshold=0.75, window_before=1, window_after=1, max_chunk_len=300):
    """Group consecutive sentences into chunks using context-window embeddings.

    Every sentence is embedded together with its neighbours
    (``window_before`` sentences before, ``window_after`` after). A sentence
    joins the current chunk when the cosine similarity between its window
    embedding and the previous sentence's window embedding is at least
    ``threshold`` and the chunk stays within ``max_chunk_len`` characters
    (sum of sentence lengths, joining spaces not counted); otherwise a new
    chunk starts.

    Args:
        text: Text to split.
        sen_model: Model exposing ``encode(list_of_str, normalize_embeddings=True)``.
        threshold: Minimum similarity to keep extending a chunk.
        window_before: Number of preceding sentences included in each window.
        window_after: Number of following sentences included in each window.
        max_chunk_len: Character budget per chunk.

    Returns:
        List of chunk strings; empty when ``text`` holds no sentences.
    """
    # Blank input has no first sentence to seed a chunk with.
    if not text or not text.strip():
        return []

    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return []

    sentence_groups = []
    for i in range(len(sentences)):
        context = sentences[max(0, i - window_before): min(len(sentences), i + window_after + 1)]
        sentence_groups.append(" ".join(context))

    embeddings = sen_model.encode(sentence_groups, normalize_embeddings=True)

    chunks = []
    current_chunk = [sentences[0]]
    current_len = len(sentences[0])

    for i in range(1, len(sentences)):
        sim = cosine_similarity([embeddings[i - 1]], [embeddings[i]])[0][0]
        sentence_len = len(sentences[i])

        if sim >= threshold and (current_len + sentence_len) <= max_chunk_len:
            current_chunk.append(sentences[i])
            current_len += sentence_len
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentences[i]]
            current_len = sentence_len

    # current_chunk always holds at least one sentence at this point.
    chunks.append(" ".join(current_chunk))

    return chunks


def sentence_similarity_chunking(text, sen_model, threshold=0.3, max_chunk_len=300):
    """Group consecutive sentences into chunks by sentence-to-sentence similarity.

    A sentence joins the current chunk when its cosine similarity to the
    previous sentence is at least ``threshold`` and the chunk stays within
    ``max_chunk_len`` characters (sum of sentence lengths, joining spaces not
    counted); otherwise a new chunk starts.

    Args:
        text: Text to split.
        sen_model: Model exposing ``encode(list_of_str, normalize_embeddings=True)``.
        threshold: Minimum similarity to keep extending a chunk.
        max_chunk_len: Character budget per chunk.

    Returns:
        List of chunk strings; empty when ``text`` holds no sentences.
    """
    # Blank input has no first sentence to seed a chunk with.
    if not text or not text.strip():
        return []

    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return []

    embeddings = sen_model.encode(sentences, normalize_embeddings=True)
    chunks = []
    current_chunk = [sentences[0]]
    current_len = len(sentences[0])

    for i in range(1, len(sentences)):
        sim = cosine_similarity([embeddings[i - 1]], [embeddings[i]])[0][0]
        next_len = current_len + len(sentences[i])

        if sim >= threshold and next_len <= max_chunk_len:
            current_chunk.append(sentences[i])
            current_len = next_len
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentences[i]]
            current_len = len(sentences[i])

    # current_chunk always holds at least one sentence at this point.
    chunks.append(" ".join(current_chunk))
    return chunks

def agentic_chunking(text):
    """Split ``text`` into chunks with phidata's LLM-driven ``AgenticChunking``.

    Args:
        text: Raw text to split.

    Returns:
        A list of chunk strings, so the result can be printed, stripped and
        embedded like the output of the other chunkers.
    """
    from phi.document.base import Document
    from phi.document.chunking.agentic import AgenticChunking
    from phi.model.ollama import Ollama

    # The chunker only needs the model; no Agent wrapper is required for that.
    chunker = AgenticChunking(model=Ollama(id="llama3.2"))

    # NOTE(review): AgenticChunking.chunk is understood to take a phi Document
    # and return a list of Documents - confirm against the installed phidata
    # version. The text is wrapped on the way in and unwrapped on the way out.
    document_chunks = chunker.chunk(Document(content=text))
    return [
        piece.content if hasattr(piece, "content") else str(piece)
        for piece in document_chunks
    ]

[nltk_data] Downloading package punkt to /home/diva001/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [26]:
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter

def split_text(text, chunking_method="RecursiveTextSplitter",model=None):
    """Split ``text`` with the strategy named by ``chunking_method``.

    Recognised names are ``"RecursiveTextSplitter"``,
    ``"MarkdownHeaderSplitter"``, ``"SemanticChunking"``,
    ``"SentenceSimilarityChunking"`` and ``"AgenticChunking"``. ``model`` is
    forwarded to the two embedding-based chunkers. Any other name returns the
    whole text as a single chunk.
    """
    if chunking_method == "RecursiveTextSplitter":
        # Fixed-size character windows with a small overlap.
        return RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50).split_text(text)

    if chunking_method == "SemanticChunking":
        return semantic_chunking(text,model)

    if chunking_method == "SentenceSimilarityChunking":
        return sentence_similarity_chunking(text,model)

    if chunking_method == "AgenticChunking":
        return agentic_chunking(text)

    if chunking_method == "MarkdownHeaderSplitter":
        # Split on markdown headers "#" .. "######" first, then re-split each
        # section body with the recursive splitter.
        header_levels = [("#" * depth, f"Header {depth}") for depth in range(1, 7)]
        header_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=header_levels)
        sections = header_splitter.split_text(text)
        size_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50)
        pieces = []
        for section in sections:
            pieces.extend(size_splitter.split_text(section.page_content))
        return pieces

    # Unknown method: hand the text back untouched as one chunk.
    return [text]


In [27]:
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# Prompt handed to the RetrievalQA "stuff" chain in ask_question(). It declares
# two placeholders, {context} and {question}; presumably the chain fills
# {context} with the retrieved chunks and {question} with the user's query.
# The trailing .strip() removes the leading/trailing newlines introduced by the
# triple-quoted literal, so the prompt ends exactly at "Answer:".
custom_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are an intelligent assistant. Use the following context to answer the question accurately.Don't hallucinate and generate precise answers.

Context:
{context}

Question:
{question}

Answer:
""".strip()
)


In [28]:
def ask_question(query, top_k,temp,rep_panalty):
    """Answer ``query`` with retrieval-augmented generation over the vector store.

    Args:
        query: The user's question.
        top_k: Maximum number of chunks to retrieve.
        temp: Sampling temperature applied to the shared ``llm``.
        rep_panalty: Repetition penalty applied to the shared ``llm``.

    Returns:
        A string containing the model's answer followed by the retrieved
        chunks with their file id, pages, section headers and bounding boxes.
        A short hint is returned instead when ``query`` is blank.
    """
    # Nothing to retrieve or answer for a blank question.
    if not query or not query.strip():
        return "Please enter a question."

    # NOTE: these assignments mutate the module-level `llm`, so the settings
    # persist until the next call overwrites them.
    llm.temperature = temp
    llm.repeat_penalty = rep_panalty
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever= vectorstore.as_retriever(
        search_type="similarity_score_threshold",
        # Slider values may arrive as floats; k must be an integer.
        search_kwargs={"score_threshold": 0.5, "k": int(top_k)}
    ),
        chain_type="stuff",
        return_source_documents=True,
        chain_type_kwargs={"prompt": custom_prompt}
    )

    # `invoke` replaces the deprecated direct call `qa_chain({...})`.
    response = qa_chain.invoke({"query": query})
    answer = response["result"]
    sources = response["source_documents"]

    chunks_text = "\n\n".join([
    f"Chunk {doc.metadata.get('chunk_num', i + 1)} \n "
    f"(File ID: {doc.metadata.get('file_id', 'unknown')}\n | "
    f"Page(s): {doc.metadata.get('pages', [])}\n| "
    f"Section(s): {doc.metadata.get('section_headers', [])} \n| "
    f"Bounding Box(es): {doc.metadata.get('bounding_boxes', [])}):\n"
    f"{doc.page_content}"
    for i, doc in enumerate(sources)
    ])
    print(chunks_text)
    return f"Output:\n{answer}\n\nRetrieved Chunks:\n{chunks_text}"

In [29]:
def delete_collection(collection_name="rag_collection"):
    """Delete a Qdrant collection and return a confirmation message.

    Args:
        collection_name: Name of the collection to drop; defaults to the
            collection used throughout this notebook.

    Returns:
        A human-readable status string.
    """
    qdrant_client.delete_collection(collection_name=collection_name)
    # Interpolating a variable avoids nesting double quotes inside a
    # double-quoted f-string, which is a SyntaxError before Python 3.12.
    return f"Collection {collection_name} deleted successfully."

In [30]:
import shutil

# Directory that holds the uploaded file. It is created up front so that
# save_pdf() can list and clear it.
SAVE_DIR = "uploaded_files"
os.makedirs(SAVE_DIR, exist_ok=True)

# === HELPER FUNCTIONS ===
def clear_folder(folder_path):
    """Remove every entry (file or sub-directory) found directly inside ``folder_path``."""
    for entry_name in os.listdir(folder_path):
        target = os.path.join(folder_path, entry_name)
        # Directories are removed recursively, anything else as a single file.
        remover = shutil.rmtree if os.path.isdir(target) else os.remove
        remover(target)

def save_pdf(file_path):
    """Move the uploaded file into SAVE_DIR and report where it ended up.

    Returns a ``(saved_path, status_message)`` pair; ``saved_path`` is ``None``
    when no file was supplied. SAVE_DIR is emptied before the move.
    """
    if not file_path:
        return None, "No file uploaded."

    clear_folder(SAVE_DIR)
    destination = os.path.join(SAVE_DIR, os.path.basename(file_path))
    shutil.move(file_path, destination)
    return destination, f"File saved at: {destination}"

In [31]:
def process_and_store(filepath, chunking_method):
    """Read a saved file, chunk it, and store the chunks in the vector store.

    Args:
        filepath: Path of the saved file (also used as its ``file_id``).
        chunking_method: Name of the chunking strategy passed to
            ``read_file`` and ``split_text``.

    Returns:
        A multi-line status message. Failures during processing are caught,
        their traceback is printed, and the error is appended to the message
        instead of propagating to the caller.
    """
    if not filepath:
        return "No file to process. Please upload and save a file first."

    file_id = filepath
    status_message = f"Processing file: {file_id}\n"
    status_message += f"Chunking method selected: {chunking_method}\n"

    # The error handling lives inside the function: a try/except wrapped
    # around the `def` statement only guards the definition, not the calls.
    try:
        try:
            qdrant_client.create_collection(
                collection_name="rag_collection",
                vectors_config={"size": 384, "distance": "Cosine"},
                optimizers_config={"default_segment_number": 1},
                on_disk_payload=True
            )
        except Exception:
            # Best-effort: creation fails when the collection already exists.
            # A genuinely unreachable server surfaces in store_embeddings below.
            pass

        text = read_file(filepath,chunking_method)
        chunks = split_text(text, chunking_method=chunking_method, model=sen_model)
        status_message += f"Text split into {len(chunks)} chunks using '{chunking_method}'.\n"

        print("\n=====Chunks Preview =====\n")
        for i, chunk in enumerate(chunks):
            print(f"Chunk {i+1}:\n{chunk}\n{'-'*80}")

        store_embeddings(chunks, file_id,filepath)
        status_message += f"Successfully stored {len(chunks)} chunks in vectorstore."
        return status_message
    except Exception as e:
        import traceback
        print("Error occurred:")
        traceback.print_exc()
        return status_message + f"Error occurred: {e}"


In [None]:
import gradio as gr

# Gradio UI: upload/save a PDF, choose a chunking method and index the file,
# then ask questions against the indexed chunks.
with gr.Blocks() as app:
    # --- Row 1: upload and persist the file ---
    with gr.Row():
        file_input =  gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
        save_button = gr.Button("Save File")
        file_status = gr.Textbox(label="File Save Status", interactive=False)

    # Holds the saved file's path between the "Save File" and "Process File" clicks.
    file_path_state = gr.State()

    save_button.click(
        fn=save_pdf,
        inputs=[file_input],
        outputs=[file_path_state, file_status]
    )

    # --- Row 2: chunking choice, collection reset, indexing ---
    with gr.Row():
        chunking_method_input = gr.Dropdown(
            choices=[
                "RecursiveTextSplitter",
                "MarkdownHeaderSplitter",
                "SemanticChunking",
                "SentenceSimilarityChunking",
                "AgenticChunking"
            ],
            label="Select Chunking Method",
            value="RecursiveTextSplitter"
        )
        delete_btn = gr.Button("Delete Collection")
        popup = gr.Textbox(visible=True, label="Status of deletion", interactive=False)
        upload_btn = gr.Button("Process File")

    file_output = gr.Textbox(label="Status")

    # --- Row 3: question and generation settings ---
    with gr.Row():
        question_input = gr.Textbox(label="Ask a question")
        top_k = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Top-K Chunks to Retrieve")
        # step=0.1 keeps the 0.7 default on the slider grid; a 0.5 step only
        # allowed 0.0 / 0.5 / 1.0.
        temp = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.1, label="Temperature")
        rep_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.2, step=0.1, label="Repetition Penalty")
        ask_btn = gr.Button("Submit")

    answer_output = gr.Textbox(label="Answer")

    # Wire the buttons to the pipeline functions defined in earlier cells.
    upload_btn.click(fn=process_and_store, inputs=[file_path_state, chunking_method_input], outputs=file_output)
    ask_btn.click(fn=ask_question, inputs=[question_input, top_k,temp,rep_penalty], outputs=answer_output)
    delete_btn.click(fn=delete_collection, outputs=popup)
app.launch()
