# RAG pipeline

## Imports and environment checks

In [9]:
from bs4 import BeautifulSoup
import chromadb
import markdown2
import nltk
import numpy as np
import os
from sentence_transformers import SentenceTransformer
import shutil
import stat
import tiktoken
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [11]:
# Environment check: report the PyTorch build and whether a CUDA GPU is usable.
cuda_ready = torch.cuda.is_available()
print("Torch version: ", torch.__version__)
print("CUDA available: ", cuda_ready)
if cuda_ready:
    # index 0 is the first CUDA device
    print("GPU device name: ", torch.cuda.get_device_name(0))

Torch version:  2.9.1+cu130
CUDA available:  True
GPU device name:  NVIDIA GeForce RTX 5070 Ti Laptop GPU


In [12]:
# Local folder that holds Chroma's persistent data.
# It is wiped and recreated by reset_dir(DB_DIR) so each run starts fresh.
DB_DIR = "./chroma_db"

# function to handle read-only files during deletion
def remove_readonly(func, path, excinfo):
    """
    Clear the read-only attribute and retry deleting the file/folder.

    This is passed to shutil.rmtree as its error handler (``onexc`` on
    Python 3.12+, ``onerror`` on older versions).

    Args:
        func: The function that failed inside rmtree; it is called again
            with ``path`` after the permission change.
        path: The path ``func`` was operating on.
        excinfo: Error details supplied by rmtree (an exception instance
            for ``onexc``, an exc_info tuple for ``onerror``). Unused.

    Raises:
        OSError: If the retry fails as well. Making the file writable does
            not help when the file is held open by another process, so
            that error is propagated to the caller.
    """
    os.chmod(path, stat.S_IWRITE)
    func(path)


def reset_dir(path):
    """
    Delete ``path`` (if it exists) and recreate it as an empty directory.

    Read-only entries are made writable and retried via remove_readonly.
    Prints a confirmation message on success.

    Raises:
        OSError: If an entry still cannot be removed after the retry, for
            example a PermissionError for a file that another process
            still has open.
    """
    import sys  # local import: only needed to pick the rmtree keyword

    if os.path.exists(path):
        # `onerror` is deprecated since Python 3.12 in favour of `onexc`;
        # remove_readonly ignores its third argument, so it works for both.
        if sys.version_info >= (3, 12):
            shutil.rmtree(path, onexc=remove_readonly)
        else:
            shutil.rmtree(path, onerror=remove_readonly)
    os.makedirs(path, exist_ok=True)
    print(f"{path} reset successfully")

# Start from an empty database directory.
# NOTE: the recorded output of this cell shows a Windows
# "PermissionError: [WinError 32]" on chroma.sqlite3 - the file was still in
# use by another process. Presumably an earlier Chroma client was still
# holding it (e.g. the notebook was re-run without a kernel restart) - TODO confirm.
reset_dir(DB_DIR)

PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: './chroma_db\\chroma.sqlite3'

In [None]:
# Choose the compute device first, then load the embedding model onto it.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "sentence-transformers/all-mpnet-base-v2"
model = SentenceTransformer(model_name).to(DEVICE)

In [None]:
# check dimension
def embedding_dim_for_model(m):
    """Return the embedding width of *m*, measured by encoding a probe word.

    *m* must offer ``encode(texts, convert_to_numpy=True)`` returning an
    object with a ``shape``; the size of its last axis is returned.
    """
    return m.encode(["hello"], convert_to_numpy=True).shape[-1]

# sanity check: show the embedding width produced by the loaded model
print("Embedding dim: ", embedding_dim_for_model(model))

## Initialize Chroma DB

In [None]:
# Open the on-disk Chroma database stored under DB_DIR.
client = chromadb.PersistentClient(path=DB_DIR)
# Fetch the "embed_test" collection, creating it if it does not exist yet.
collection = client.get_or_create_collection(name="embed_test")

In [None]:
def rag(question, k=3):
    """Retrieve context for *question* and return a placeholder answer.

    The question is embedded with the module-level ``model`` and the ``k``
    nearest documents are fetched from ``collection``. No LLM is called
    here: the returned text ends with a fixed "(model answer here)" marker.

    NOTE: a later cell defines ``rag(query, k)``, which replaces this
    function once that cell has been executed.

    Args:
        question: Natural-language query string.
        k: Number of documents to retrieve. Defaults to 3, the value that
            was previously hard-coded.

    Returns:
        A string holding the retrieved documents (one per line) under a
        "Context:" header, followed by the placeholder answer line.
    """
    q_emb = model.encode([question], convert_to_numpy=True, device=DEVICE)

    results = collection.query(
        # pass plain nested lists, the same way retrieve() and
        # collection.add() hand embeddings to Chroma
        query_embeddings=q_emb.tolist(),
        n_results=k,
    )
    # tolerate a missing/None "documents" entry instead of raising
    documents = results.get("documents") or [[]]
    context = "\n".join(documents[0])
    return f"Context:\n{context}\n\nAnswer: (model answer here)"

## Load and Chunk Data

In [None]:
# Markdown document whose text is chunked, embedded and used as retrieval context
# alternative input: md_path = "rag_data.md"
md_path = "fictional_company.md"
# clean up markdown file
def md_to_plaintext(md):
    """Return the text content of the Markdown string *md*.

    The Markdown is rendered to HTML with markdown2; the HTML is then parsed
    with BeautifulSoup ("html.parser") and reduced to its text via get_text().
    """
    rendered_html = markdown2.markdown(md)
    return BeautifulSoup(rendered_html, "html.parser").get_text()
    
# Read the whole Markdown file at md_path into memory.
with open(md_path, encoding="utf-8") as md_file:
    markdown_text = md_file.read()

# Quick look at what was loaded: a 500-character preview plus size statistics.
print(markdown_text[:500])
print("Total characters:", len(markdown_text))
print("Total lines:", len(markdown_text.splitlines()))

In [None]:
# convert the Markdown to plain text, then preview its first 500 characters
plain_text = md_to_plaintext(markdown_text)
print(plain_text[:500])

In [None]:
# Token counter used to size chunks. "cl100k_base" is a tiktoken encoding,
# not the embedding model's own tokenizer, so the counts are approximate.
enc = tiktoken.get_encoding("cl100k_base")
# nltk.download('punkt_tab')  # run once if the sentence tokenizer data is missing
MAX_CHUNK_TOKENS = 120  # smaller chunks to increase granularity

# Greedily pack whole sentences into chunks of fewer than MAX_CHUNK_TOKENS tokens.
sentences = nltk.sent_tokenize(plain_text)
chunks = []
current = ""
for sent in sentences:
    # Measure the text exactly as it would be stored (separator included).
    candidate = f"{current} {sent}".strip()
    if len(enc.encode(candidate)) < MAX_CHUNK_TOKENS:
        current = candidate
    else:
        # Flush only non-empty text: when the very first sentence already
        # reaches the limit, `current` is still "" and must not become an
        # empty chunk.
        if current:
            chunks.append(current)
        # A sentence that reaches the limit on its own still becomes one
        # (oversized) chunk; sentences are never split.
        current = sent.strip()

# flush the trailing chunk
if current:
    chunks.append(current)

## Embed and Store Chunks

In [None]:
# embed every chunk with the sentence-transformer model on DEVICE
# (convert_to_numpy=True, so .tolist() is used before storing in Chroma)
embeds = model.encode(chunks, convert_to_numpy=True, device=DEVICE)

In [None]:
# Store each chunk with its embedding under a positional id
# ("chunk_0", "chunk_1", ...).
chunk_ids = [f"chunk_{idx}" for idx, _ in enumerate(chunks)]
collection.add(
    ids=chunk_ids,
    embeddings=embeds.tolist(),
    documents=chunks,
)

print("Stored chunks in Chroma: ", collection.count())

## Retrieval

In [None]:
# get relevant chunks
def retrieve(query, k=2):
    """Return up to *k* stored chunks for *query*, re-ranked by word overlap.

    The query is embedded with the module-level ``model`` and the ``k``
    nearest chunks are fetched from ``collection``. They are then ordered by
    how many distinct query words each chunk contains (most first).

    Args:
        query: Natural-language query string.
        k: Number of chunks to request from Chroma (default 2).

    Returns:
        A list of chunk strings; empty if Chroma returned no documents.
    """
    import re  # local import keeps this cell self-contained

    def words(text):
        # \w+ drops punctuation, so "ideas?" in the query matches "ideas"
        # in a chunk (a plain whitespace split never matched such words)
        return set(re.findall(r"\w+", text.lower()))

    # embed the query
    q_emb = model.encode([query], convert_to_numpy=True, device=DEVICE)
    # query Chroma using embeddings
    res = collection.query(query_embeddings=q_emb.tolist(), n_results=k)
    # extract the retrieved documents; tolerate a missing or None entry
    retrieved_chunks = (res.get("documents") or [[]])[0]

    query_terms = words(query)

    # minimal re-ranking: sorted() is stable, so chunks with the same overlap
    # keep the order Chroma returned them in
    return sorted(
        retrieved_chunks,
        key=lambda chunk: len(query_terms & words(chunk)),
        reverse=True,
    )

In [None]:
# smoke test: show the chunks retrieved for a sample question (default k=2)
retrieve("What do we do with strong ideas?")

In [None]:
# format the prompt with retrieved chunks and query
def build_prompt(query, retrieved_chunks):
    """Assemble the LLM prompt from the retrieved chunks and the query.

    The chunks are joined with newlines under a CONTEXT header, followed by
    the query under a QUESTION header and the trailing
    "RELEVANT SENTENCES:" / "ANSWER:" cue lines. The result starts and ends
    with a newline.
    """
    context = "\n".join(retrieved_chunks)
    prompt_lines = [
        "",
        # the trailing space on the next line is part of the original prompt
        "You are an expert agent at a company. ",
        "Provide your answer about the company based on the given context.",
        "Combine the information from the retrieved documents to provide a concise, reasoned answer.",
        "",
        "CONTEXT:",
        context,
        "",
        "QUESTION:",
        f"{query}",
        "",
        "RELEVANT SENTENCES:",
        "ANSWER:",
        "",
    ]
    return "\n".join(prompt_lines)

In [None]:
# preview the full prompt built for a sample question
sample_question = "What do we do with strong ideas?"
build_prompt(sample_question, retrieve(sample_question))

In [None]:
# Load the seq2seq LLM and its tokenizer, then move the model to DEVICE.
llm_model_name = "google/flan-t5-xl"
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model = AutoModelForSeq2SeqLM.from_pretrained(llm_model_name)
llm_model = llm_model.to(DEVICE)

def generate(prompt):
    """Return the LLM's decoded answer for *prompt*.

    The prompt is tokenized (with truncation enabled) and moved to DEVICE,
    ``llm_model.generate`` is run with ``max_length=128``, and the first
    generated sequence is decoded without special tokens.

    NOTE(review): truncation=True cuts an over-long prompt at the
    tokenizer's maximum length. build_prompt places the question after the
    context, so a long context could push the question out of the input -
    confirm the limit for this tokenizer.
    """
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)
    encoded = encoded.to(DEVICE)
    # more factual:  do_sample=False
    generated = llm_model.generate(**encoded, max_length=128)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
# default number of chunks to retrieve: 8, capped at the number of chunks stored
k = min(8, collection.count())

def rag(query, k):
    """Answer *query* with retrieval-augmented generation.

    Fetches *k* chunks through retrieve(), prints them for inspection,
    builds the prompt with build_prompt() and returns generate()'s answer.
    """
    retrieved = retrieve(query, k)  # use Chroma retrieval

    # print retrieved chunks for future analysis
    print("\n--- RETRIEVED CHUNKS ---")
    for position, chunk in enumerate(retrieved, start=1):
        print(f"[{position}] {chunk}\n")

    # combine query + context, then call the LLM
    return generate(build_prompt(query, retrieved))

In [None]:
# end-to-end run with the module-level default k
result = rag("In 20 words or less, what does the company do?", k)
print(result)

## Three Test Cases

### Test Case 1 (Factual): A question directly answerable by the retrieved context.

In [None]:
# factual question; retrieves 5 chunks (explicit k=5 instead of the module-level k)
query1 = "What service does Apex Horizon Agency offer?"
result1 = rag(query1, k=5)
print(result1)

### Test Case 2 (Foil/General): A question whose answer is not in the knowledge base (KB), so the LLM should either rely on its general knowledge or state that it cannot answer.

In [None]:
# foil question; unlike the other test cases this one passes the module-level k
query2 = "Can you mail things using USPS?"
result2 = rag(query2, k)
print(result2)

### Test Case 3 (Synthesis): A question requiring the LLM to combine information from multiple retrieved chunks or synthesize an answer.

In [None]:
# synthesis question: asks about two services at once; retrieves 5 chunks
query3 = "If I were looking for a modeling agent and I was also interested in publishing a book, what services could I expect Horizon Agency to provide?"
result3 = rag(query3, k=5)
print(result3)

In [None]:
# additional query (not one of the three headed test cases); retrieves 5 chunks
query4 = "Do we take our clients seriously?"
result4 = rag(query4, k=5)
print(result4)

In [None]:
# additional query: the same question used in the retrieve/build_prompt previews; retrieves 5 chunks
query5 = "What do we do with strong ideas?"
result5 = rag(query5, k=5)
print(result5)