# 🧱 Modular RAG Pipeline
This notebook walks you through building a modular Retrieval-Augmented Generation (RAG) pipeline using:
- `docling` for document parsing
- `lancedb` for vector search
- `OpenAI` for answering queries with retrieved context

Each step is wrapped in its own function to keep the pipeline clean and reusable.


In [None]:
# Install the third-party packages used in this notebook.
!pip install docling lancedb openai pandas requests tqdm python-dotenv

In [None]:
# Download the course .env file into the working directory (overwrites any existing .env).
# NOTE(review): presumably provides the ENKI_API_KEY / OPENAI_API_KEY values read via os.getenv further down — confirm.
!wget -O .env https://farmsilo.blob.core.windows.net/mlcourse/.env

In [None]:
from tqdm.notebook import tqdm

In [None]:
import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment so os.getenv() can read them.
load_dotenv()

In [None]:
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions

# PDF pipeline settings: switch on OCR and table-structure extraction (with
# cell matching) and set the OCR language list to English.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.ocr_options.lang = ["en"]


# Shared converter used by parse_document(); the options above are registered for PDF input.
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

def parse_document(file_path):
    """
    Convert a document with the shared `doc_converter` and return it as markdown.

    Args:
        file_path: Location of the document, passed to the converter as `source`.

    Returns:
        The value of `export_to_markdown()` called on the converted document.
    """
    print("📄 Loading and parsing document...")
    conversion = doc_converter.convert(source=file_path)

    # Conversion errors are only reported; the export below still runs.
    error_count = len(conversion.errors)
    if error_count > 0:
        print(f"❌ Conversion has {error_count} errors")

    page_count = len(conversion.pages)
    print(f"✅ Parsed {page_count} pages.")
    print("----------------------------------\n")

    markdown = conversion.document.export_to_markdown()
    return markdown

In [None]:
# On Colab: upload the PDF to data/bitcoin.pdf before running this cell.
doc = parse_document("data/bitcoin.pdf")

In [None]:
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 100):
    """
    Naively splits a markdown string into fixed-size chunks with optional overlap.

    Each chunk starts `chunk_size - overlap` characters after the previous one.
    Splitting stops as soon as a chunk reaches the end of the text, so the
    result never ends with a fragment that is already fully contained in the
    preceding chunk.

    Args:
        text (str): The markdown string to split.
        chunk_size (int): Maximum number of characters per chunk.
        overlap (int): Number of overlapping characters between chunks.

    Returns:
        List[str]: List of text chunks (empty when `text` is empty).

    Raises:
        ValueError: If `overlap` is negative, or `chunk_size` is not greater
            than `overlap`.
    """
    print("✂️ Chunking document with overlap...")

    if overlap < 0:
        # A negative overlap would silently skip characters between chunks.
        raise ValueError("overlap must not be negative")
    if chunk_size <= overlap:
        # Otherwise the window would never advance (infinite loop).
        raise ValueError("chunk_size must be greater than overlap")

    step = chunk_size - overlap
    text_length = len(text)
    chunks = []
    start = 0
    while start < text_length:
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= text_length:
            # This chunk already covers the tail; another one would only
            # repeat characters from the overlap region.
            break
        start += step

    if chunks:
        print(f"✅ Created {len(chunks)} chunks of size {chunk_size} with overlap {overlap}. First chunk preview:\n{chunks[0][:min(200, chunk_size)]}...")
    else:
        print("⚠️ Input text is empty; no chunks created.")
    print("----------------------------------\n")
    return chunks

In [None]:
# Split the parsed markdown into 500-character chunks that overlap by 100 characters.
chunks = chunk_text(doc, chunk_size=500, overlap=100)

In [None]:
import requests

# Inference endpoint that embedd_chunk() posts its requests to.
EMBEDDER_URL = "https://embedserver.enki.farm/v2/models/embed/infer"
# API key read from the environment; os.getenv returns None when ENKI_API_KEY is unset.
EMBEDDER_API_KEY = os.getenv("ENKI_API_KEY")

# Request headers carrying the API key; passed along with each embedding request.
headers = {
    "X-ENKI-FARM-API-KEY": EMBEDDER_API_KEY
}

def embedd_chunk(chunk, timeout=30):
    """
    Request an embedding for a single text chunk from the embedding server.

    Args:
        chunk: Text to embed; sent as the only element of the `text_chunk` input.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The `data` field of the first entry in the response's `outputs`, or
        None if the request or the response handling fails (the error is
        printed rather than raised).
    """
    inference_request = {
        "inputs": [
            {
                "name": "text_chunk",
                "shape": [1],
                "datatype": "BYTES",
                "data": [chunk]
            }
        ]
    }

    try:
        # Without a timeout, requests waits indefinitely on an unresponsive server.
        response = requests.post(
            EMBEDDER_URL,
            json=inference_request,
            headers=headers,
            timeout=timeout,
        )
        # Report HTTP errors (e.g. a rejected API key) as such, instead of as a
        # missing-key error while reading the response body.
        response.raise_for_status()
        result = response.json()
        return result["outputs"][0]["data"]
    except Exception as ex:
        # Deliberately best-effort: callers treat None as "no embedding".
        print(f"Error: {ex}")
        return None



def get_embeddings(chunks):
    """
    Embed each chunk with `embedd_chunk` and collect the successful results.

    Chunks whose embedding comes back as None or empty are skipped, so the
    returned list can be shorter than `chunks`. A warning is printed when that
    happens, because the results then no longer line up one-to-one with the
    input.

    Args:
        chunks: Sized collection of text chunks to embed.

    Returns:
        list: The embeddings in input order, excluding skipped chunks
        (possibly empty).
    """
    print(f"🧠 Generating embeddings for {len(chunks)} chunks...")
    embeddings = []

    for chunk in tqdm(chunks):
        embedding = embedd_chunk(chunk)

        if embedding:
            embeddings.append(embedding)

    skipped = len(chunks) - len(embeddings)
    if skipped:
        print(f"⚠️ {skipped} chunk(s) could not be embedded; results no longer align one-to-one with the input.")

    if embeddings:
        print(f"✅ Generated {len(embeddings)} embeddings. Vector size: {len(embeddings[0])}")
    else:
        # Nothing to measure a vector size on; avoid indexing an empty list.
        print("❌ No embeddings were generated.")
    print("----------------------------------\n")
    return embeddings

In [None]:
# Embed every chunk; chunks without a usable embedding are left out of the result.
embeddings = get_embeddings(chunks)

In [None]:
import lancedb
import pandas as pd

def create_lancedb_table(db_path: str, texts, embeddings, table_name="documents"):
    """
    Store texts and their embedding vectors in a LanceDB table.

    Args:
        db_path: Path handed to `lancedb.connect`.
        texts: Values for the table's "text" column.
        embeddings: Values for the table's "vector" column.
        table_name: Name of the table, created with mode="overwrite".

    Returns:
        The table object returned by `create_table`.
    """
    print("📦 Creating LanceDB table...")
    connection = lancedb.connect(db_path)

    # One row per text, paired with its vector.
    columns = {"text": texts, "vector": embeddings}
    records = pd.DataFrame(columns)

    table = connection.create_table(table_name, data=records, mode="overwrite")

    record_count = len(texts)
    print(f"✅ Stored {record_count} records in '{table_name}' table.")
    print("----------------------------------\n")
    return table

In [None]:
# Store the chunks and their embeddings in a LanceDB database at ./bitcoin-embs.
document_table = create_lancedb_table("./bitcoin-embs", chunks, embeddings)

In [None]:
def retrieve_context(query, table, top_k=1):
    """
    Embed the query, search the table with it and return the matching texts.

    Args:
        query: Question text to embed and search for.
        table: Table object supporting `search(...).limit(...).to_pandas()`
            whose results have a "text" column.
        top_k: Maximum number of rows to retrieve.

    Returns:
        str: The "text" values of the retrieved rows joined by newlines.

    Raises:
        RuntimeError: If no embedding could be produced for the query.
    """
    print(f"🔍 Retrieving top {top_k} chunks for query: '{query}'")

    # get_embeddings works on a collection of chunks, so wrap the single
    # query in a list and take the only result.
    query_embeddings = get_embeddings([query])
    if not query_embeddings:
        raise RuntimeError("Could not generate an embedding for the query")
    query_embedding = query_embeddings[0]

    results = table.search(query_embedding).limit(top_k).to_pandas()
    context = "\n".join(results["text"])
    print(f"✅ Retrieved context:\n{context[:500]}...\n")
    print("----------------------------------\n")
    return context

In [None]:
# Example question used by the step-by-step retrieval and answer cells.
query = "What can the public see about bitcoin transactions?"

In [None]:
# Look up the stored chunk(s) for the query (top_k defaults to 1).
context = retrieve_context(query, document_table)

In [None]:
from openai import OpenAI

# API key read from the environment; os.getenv returns None when OPENAI_API_KEY is unset.
LLM_API_KEY = os.getenv("OPENAI_API_KEY")

# Shared OpenAI client used by generate_answer().
client = OpenAI(
    api_key=LLM_API_KEY,
)

def generate_answer(query, context, model="gpt-4o-mini"):
    """
    Ask the LLM to answer a question using the retrieved context.

    Args:
        query: The question to answer.
        context: Retrieved text inserted into the prompt ahead of the question.
        model: Name of the model passed to `client.responses.create`.

    Returns:
        The `output_text` of the response.
    """
    print("🤖 Generating answer using LLM...")

    response = client.responses.create(
        # Use the caller's model; previously the argument was ignored in
        # favour of a hard-coded model name.
        model=model,
        instructions="You are a helpful assistant. Use the following context to answer the question.",
        input=f"""
        Context:
        {context}

        Question:
        {query}

        Answer:
        """
    )

    answer = response.output_text
    print(f"✅ Answer generated:\n{answer}")
    print("----------------------------------\n")
    return answer

In [None]:
# Answer the example question from the retrieved context.
answer = generate_answer(query, context)

In [None]:
def run_rag_pipeline(
    file_path,
    query,
    db_path="./rag-db",
    table_name="documents",
    top_k=1,
    chunk_size=500,
    overlap=100,
    model="gpt-4o-mini",
):
    """
    Run the whole pipeline: parse, chunk, embed, store, retrieve and answer.

    Args:
        file_path: Document to parse, passed to `parse_document`.
        query: Question to answer.
        db_path: LanceDB location passed to `create_lancedb_table`.
        table_name: Name of the table that receives the chunks.
        top_k: Number of chunks to retrieve as context.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of overlapping characters between chunks.
        model: Model name forwarded to `generate_answer`.

    Returns:
        The answer returned by `generate_answer`.
    """
    doc = parse_document(file_path)
    chunks = chunk_text(doc, chunk_size=chunk_size, overlap=overlap)
    embeddings = get_embeddings(chunks)
    table = create_lancedb_table(db_path, chunks, embeddings, table_name)
    context = retrieve_context(query, table, top_k=top_k)
    answer = generate_answer(query, context, model=model)
    return answer

In [None]:
# ✅ Example Usage
# Runs every stage end to end; db_path is not given, so the vector store is
# written to the function's default location ("./rag-db").
run_rag_pipeline(
    file_path="data/bitcoin.pdf",
    query="What can the public see about bitcoin transactions?",
    chunk_size=500,
    overlap=100
)

In [None]:
# clean up: remove the LanceDB directories created by this notebook

import shutil

# "bitcoin-embs" is written by the step-by-step cells, "rag-db" by the
# run_rag_pipeline() example (its default db_path). ignore_errors lets this
# cell be re-run, or run after skipping a step, without raising.
for db_dir in ("bitcoin-embs/", "rag-db/"):
    shutil.rmtree(db_dir, ignore_errors=True)