In [7]:
# ONE-CELL: Local RAG demo using chromadb + sentence-transformers + transformers (no langchain)
# Paste & run in Jupyter. Uses packages you already have: chromadb, sentence-transformers, transformers, torch, nltk.

import os, json
from pathlib import Path

# --- Ensure the NLTK sentence-tokenizer data is available (avoids LookupError in sent_tokenize)
import nltk
# Older NLTK releases load the Punkt model from the "punkt" resource, while recent
# releases (3.9+) look for "punkt_tab" instead. Request both so sent_tokenize works
# on either; download() reports failure via its return value rather than raising.
for _punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(_punkt_resource, quiet=True)
from nltk.tokenize import sent_tokenize

# --- Chroma + embedding function (uses sentence-transformers under the hood)
import chromadb
from chromadb.utils import embedding_functions

# --- Transformers for local generation
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# --- Project layout: source documents live in ROOT/data (created on first run)
ROOT = "rag_proj"
DATA_DIR = Path(ROOT) / "data"
os.makedirs(DATA_DIR, exist_ok=True)

# --- Seed the data directory with two small demo documents, but only when it is empty
if any(DATA_DIR.iterdir()):
    print("Using existing files in", DATA_DIR)
else:
    # (file name, file body) pairs written out verbatim as UTF-8 text
    sample_texts = [
        ("publication1.txt",
         "Title: AAIDC RAG Assistant Project\n\n"
         "This publication explains how to build a Retrieval-Augmented Generation (RAG) assistant using vector databases like Chroma. "
         "It describes chunking documents, creating embeddings, and answering questions using retrieved context. "
         "Limitations mention that small datasets are used for demo and retrieval accuracy depends on chunking and embedding quality."),
        ("publication2.txt",
         "Title: Example Models and Tools\n\n"
         "This short doc lists tools: Chroma (vector DB), sentence-transformers (embeddings), and Flan-T5 (local generation). "
         "It also notes to cite sources when answering.")
    ]
    for sample_name, sample_body in sample_texts:
        sample_path = DATA_DIR / sample_name
        sample_path.write_text(sample_body, encoding="utf-8")
    print("Created sample documents in", DATA_DIR)

# --- Simple safe sentence-based chunker
def chunk_text(text, max_words=300, overlap_words=50):
    """Group the sentences of ``text`` into word-bounded chunks.

    Sentences (as produced by ``sent_tokenize``) are accumulated until adding
    the next one would push the running word count past ``max_words``; the
    chunk is then emitted and a new one is started. A sentence is always
    accepted into an empty chunk, so a single over-long sentence becomes its
    own chunk instead of being dropped.

    Args:
        text: The text to split.
        max_words: Word budget checked before each sentence is added.
        overlap_words: Number of trailing words of an emitted chunk that are
            copied to the front of the next one; ``0`` or less disables it.

    Returns:
        A list of chunk strings (empty when no sentences are found).
    """
    finished = []
    pending = []        # pieces of the chunk being built (overlap prefix + sentences)
    pending_words = 0
    for sentence in sent_tokenize(text):
        n_words = len(sentence.split())
        fits = pending_words + n_words <= max_words
        if pending and not fits:
            # Emit the current chunk, then seed the next one with its tail.
            emitted = " ".join(pending)
            finished.append(emitted)
            if overlap_words > 0:
                tail = " ".join(emitted.split()[-overlap_words:])
            else:
                tail = ""
            pending = [tail, sentence] if tail else [sentence]
            # The overlap words count against the next chunk's budget.
            pending_words = len(" ".join(pending).split())
            continue
        pending.append(sentence)
        pending_words += n_words
    if pending:
        finished.append(" ".join(pending))
    return finished

# --- Read files and prepare the parallel lists (documents / metadatas / ids) for Chroma
SUPPORTED_SUFFIXES = {".txt", ".md", ".json"}
docs_texts = []
metadatas = []
ids = []

for f in sorted(DATA_DIR.iterdir()):
    suffix = f.suffix.lower()
    if not f.is_file() or suffix not in SUPPORTED_SUFFIXES:
        continue
    raw = f.read_text(encoding="utf-8").strip()
    if not raw:
        continue
    # Default: index the file content as-is. For JSON files, prefer a string
    # stored under "text" or "body" when the document is an object.
    text = raw
    if suffix == ".json":
        try:
            jobj = json.loads(raw)
        except ValueError:  # json.JSONDecodeError is a ValueError: not valid JSON
            jobj = None
        if isinstance(jobj, dict):
            # Only accept non-empty strings; anything else (numbers, lists,
            # nested objects) falls back to the raw file content so the
            # chunker always receives text.
            text = next(
                (v for v in (jobj.get("text"), jobj.get("body")) if isinstance(v, str) and v),
                raw,
            )
    chunks = chunk_text(text, max_words=300, overlap_words=50)
    for i, c in enumerate(chunks):
        # Use the full file name (not just the stem) so that e.g. notes.txt and
        # notes.md cannot produce the same id for the collection.
        _id = f"{f.name}_chunk_{i}"
        docs_texts.append(c)
        metadatas.append({"source_file": f.name, "chunk_id": _id})
        ids.append(_id)

print(f"Prepared {len(docs_texts)} chunk(s) from files in {DATA_DIR}")

# Fail early with an actionable message instead of building an empty collection.
if not docs_texts:
    raise RuntimeError("No documents or chunks found. Add .txt files to rag_proj/data and re-run this cell.")

# --- Create Chroma client and collection using SentenceTransformer embeddings
# EMBED_MODEL names the sentence-transformers model handed to the embedding function.
EMBED_MODEL = "all-MiniLM-L6-v2"
ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EMBED_MODEL)

client = chromadb.Client()  # in-memory demo client (no persistent dir)
COLL_NAME = "rag_demo_collection"
# Remove existing collection with same name to avoid duplicates (safe for demo).
# Deliberately best-effort: any error raised by the delete is ignored
# (presumably "collection does not exist" on a first run - TODO confirm).
try:
    client.delete_collection(name=COLL_NAME)
except Exception:
    pass

# Recreate the collection from scratch so re-running this cell starts clean.
collection = client.create_collection(name=COLL_NAME, embedding_function=ef)

# docs_texts, metadatas and ids are parallel lists: entry i of each describes the same chunk.
collection.add(documents=docs_texts, metadatas=metadatas, ids=ids)
print(f"Created Chroma collection '{COLL_NAME}' with {collection.count()} items (in-memory)")

# --- Retrieval helper
def retrieve_top_k(query, k=3):
    """Return up to ``k`` chunks from the module-level ``collection`` for ``query``.

    The number of requested results is capped at the number of items stored in
    the collection, so asking for more neighbours than exist (the demo indexes
    very few chunks) neither warns nor fails, and an empty collection or a
    non-positive ``k`` yields an empty list without querying.

    Args:
        query: The query text.
        k: Maximum number of chunks to return.

    Returns:
        A list of dicts with keys ``"text"``, ``"metadata"`` and ``"distance"``,
        in the order returned by the collection.
    """
    n_results = min(k, collection.count())
    if n_results <= 0:
        return []
    results = collection.query(
        query_texts=[query],
        n_results=n_results,
        include=["documents", "metadatas", "distances"],
    )
    # One query text was sent, so only the first (and only) result row is used.
    docs = (results.get("documents") or [[]])[0]
    metas = (results.get("metadatas") or [[]])[0]
    dists = (results.get("distances") or [[]])[0]
    return [
        {"text": doc, "metadata": meta, "distance": dist}
        for doc, meta, dist in zip(docs, metas, dists)
    ]

# --- Generator: Flan-T5 small, placed on the GPU when one is available
MODEL_NAME = "google/flan-t5-small"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Loading generator model '{MODEL_NAME}' on {device} (may download once)...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
# Move the weights to the device the tokenized inputs are sent to later.
gen_model = gen_model.to(device)

# --- RAG QA function: retrieve -> build limited context -> generate
def ask_question(question, k=3, max_context_chunks=3, max_new_tokens=128):
    """Answer ``question`` from retrieved chunks (retrieve -> build context -> generate).

    Args:
        question: The user question.
        k: Number of chunks requested from ``retrieve_top_k``.
        max_context_chunks: How many of the retrieved chunks are placed in the prompt.
        max_new_tokens: Generation length limit passed to ``gen_model.generate``.

    Returns:
        A ``(answer, sources)`` tuple: the stripped model output and the list of
        distinct ``source_file`` values of the chunks used, in retrieval order.
        When nothing is retrieved, a fixed "I don't know" message and ``[]``.
    """
    retrieved = retrieve_top_k(question, k=k)
    if not retrieved:
        return "I don't know — no relevant documents found.", []
    # Build the context from the top retrieved chunks, capping each chunk's
    # length so the prompt stays small.
    context_parts = []
    sources = []
    for r in retrieved[:max_context_chunks]:
        txt = r["text"].strip()
        # Keep at most the first 250 words of a chunk; mark the cut with " ...".
        words = txt.split()
        if len(words) > 250:
            txt = " ".join(words[:250]) + " ..."
        src = r["metadata"].get("source_file", "unknown")
        cid = r["metadata"].get("chunk_id", "unknown")
        context_parts.append(f"Source: {src} | Chunk: {cid}\n{txt}")
        # Preserve first-seen order while de-duplicating file names.
        if src not in sources:
            sources.append(src)
    context = "\n\n---\n\n".join(context_parts)
    # The fallback sentence is quoted for the model to reproduce verbatim, so it
    # must contain a real em dash rather than mis-decoded bytes.
    prompt = (
        "You are an assistant that must answer the question using ONLY the context below. "
        "If the answer is not in the context, reply exactly: \"I don't know — the information is not in the documents.\"\n\n"
        f"CONTEXT:\n{context}\n\nQUESTION: {question}\n\nAnswer concisely (1-3 sentences). Then on a new line list the sources used as: Sources: <filename1>, <filename2>."
    )
    # Tokenize + generate (the prompt is truncated to 1024 tokens if too long).
    # NOTE(review): the QUESTION sits at the end of the prompt; if truncation
    # drops the tail (presumably the tokenizer default), a very long context
    # would cut the question off - confirm and consider budgeting the context.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
    gen = gen_model.generate(**inputs, max_new_tokens=max_new_tokens, num_beams=4, early_stopping=True)
    out = tokenizer.decode(gen[0], skip_special_tokens=True)
    return out.strip(), sources

# --- Smoke-test the pipeline with a few fixed questions
demo_qs = [
    "What is this publication about?",
    "Which tools are recommended in the documents?",
    "What limitation is mentioned?"
]

print("\n=== Demo Q&A (answers + sources) ===")
for demo_question in demo_qs:
    demo_answer, demo_sources = ask_question(demo_question)
    joined_sources = ", ".join(demo_sources)
    print("\nQ:", demo_question)
    print("A:", demo_answer)
    print("Sources:", joined_sources)


Using existing files in rag_proj\data
Prepared 1 chunk(s) from files in rag_proj\data
Created Chroma collection 'rag_demo_collection' with 1 items (in-memory)
Loading generator model 'google/flan-t5-small' on cpu (may download once)...

=== Demo Q&A (answers + sources) ===

Q: What is this publication about?
A: Science/Tech
Sources: lesson1.txt

Q: Which tools are recommended in the documents?
A: AAIDC RAG Assistant Project
Sources: lesson1.txt

Q: What limitation is mentioned?
A: filename1>
Sources: lesson1.txt


In [8]:
# ---------------- SAVE RAG DEMO OUTPUT + LOGGING ----------------
import os, json
from pathlib import Path
import chromadb
import nltk
import torch
from nltk.tokenize import sent_tokenize
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Setup directories
ROOT = "rag_proj"
DATA_DIR = Path(ROOT) / "data"
SAVE_LOG = Path(ROOT) / "output_demo.txt"  # where the demo Q&A log is written
os.makedirs(DATA_DIR, exist_ok=True)

# Attach to the collection built by the ingestion cell.
# This cell adds no documents itself: it depends on "rag_demo_collection"
# already being populated in this kernel's in-memory Chroma client.
EMBED_MODEL = "all-MiniLM-L6-v2"
embedding_fn = SentenceTransformerEmbeddingFunction(model_name=EMBED_MODEL)

client = chromadb.Client()
collection = client.get_or_create_collection(name="rag_demo_collection", embedding_function=embedding_fn)

# get_or_create_collection silently creates an empty collection when none
# exists (e.g. after a kernel restart). Stop here with a clear message rather
# than letting the demo generate answers from an empty context.
if collection.count() == 0:
    raise RuntimeError(
        "Collection 'rag_demo_collection' is empty. Run the ingestion cell that builds it before running this cell."
    )

# Retrieval helper
def retrieve_top_k(query, k=3):
    """Return up to ``k`` chunks from the module-level ``collection`` for ``query``.

    The number of requested results is capped at the number of items stored in
    the collection, so asking for more neighbours than exist neither warns nor
    fails, and an empty collection or a non-positive ``k`` yields an empty list
    without querying.

    Args:
        query: The query text.
        k: Maximum number of chunks to return.

    Returns:
        A list of dicts with keys ``"text"`` (whitespace-stripped document),
        ``"metadata"`` and ``"distance"``, in the order returned by the collection.
    """
    n_results = min(k, collection.count())
    if n_results <= 0:
        return []
    r = collection.query(
        query_texts=[query],
        n_results=n_results,
        include=["documents", "metadatas", "distances"],
    )
    # One query text was sent, so only the first (and only) result row is used.
    docs = (r.get("documents") or [[]])[0]
    metas = (r.get("metadatas") or [[]])[0]
    dists = (r.get("distances") or [[]])[0]

    # Each loop variable has its own name so document text, metadata and
    # distance cannot be mixed up when building the result records.
    return [
        {"text": doc_text.strip(), "metadata": meta_val, "distance": dist_val}
        for doc_text, meta_val, dist_val in zip(docs, metas, dists)
    ]

# Generator model: Flan-T5 small, placed on the GPU when one is available
MODEL_NAME = "google/flan-t5-small"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
# Move the weights to the device the tokenized inputs are sent to later.
gen_model = gen_model.to(device)

# Chunker
def chunk_text(text, max_words=300, overlap=50, overlap_words=None):
    """Group the sentences of ``text`` into word-bounded chunks.

    Sentences (as produced by ``sent_tokenize``) are accumulated until adding
    the next one would push the running word count past ``max_words``; the
    chunk is then emitted and a new one is started. A sentence is always
    accepted into an empty chunk, so an over-long sentence becomes its own chunk.

    Args:
        text: The text to split.
        max_words: Word budget checked before each sentence is added.
        overlap: Number of trailing words of an emitted chunk that are copied
            to the front of the next one; ``0`` or less disables it.
        overlap_words: Alias for ``overlap``. This definition replaces an
            earlier ``chunk_text`` whose keyword was ``overlap_words``, so both
            spellings are accepted; when given, it takes precedence.

    Returns:
        A list of chunk strings (empty when no sentences are found).
    """
    if overlap_words is not None:
        overlap = overlap_words
    sents = sent_tokenize(text)
    chunks, buf, count = [], [], 0
    for s in sents:
        w = len(s.split())
        if count + w <= max_words or not buf:
            buf.append(s)
            count += w
        else:
            chunks.append(" ".join(buf))
            # Seed the next chunk with the tail of the one just emitted.
            prev = " ".join(" ".join(buf).split()[-overlap:]) if overlap > 0 else ""
            buf = [prev] if prev else []
            buf.append(s)
            # The overlap words count against the next chunk's budget.
            count = len(" ".join(buf).split())
    if buf:
        chunks.append(" ".join(buf))
    return chunks

# QA function
def ask(q, k=3, max_new=128, max_context_chunks=3):
    """Answer ``q`` from retrieved chunks and report which files were used.

    Args:
        q: The question text.
        k: Number of chunks requested from ``retrieve_top_k``.
        max_new: Generation length limit passed to ``gen_model.generate``.
        max_context_chunks: How many of the retrieved chunks go into the prompt
            (defaults to 3, the value that used to be hard-coded).

    Returns:
        A ``(answer, sources)`` tuple: the stripped model output and the list of
        distinct ``source_file`` values of the chunks used, in retrieval order.
        When nothing is retrieved, ``("I don't know.", [])`` without calling the model.
    """
    top = retrieve_top_k(q, k=k)
    if not top:
        # Nothing to ground an answer on: return the reply the prompt asks the
        # model to give for unknown answers instead of generating from an empty context.
        return "I don't know.", []

    ctx, srcs = [], []
    for t in top[:max_context_chunks]:
        txt = t["text"]
        # Keep at most the first 250 words of a chunk; mark the cut with " ...".
        words = txt.split()
        if len(words) > 250:
            txt = " ".join(words[:250]) + " ..."
        src = t["metadata"].get("source_file", "unknown")
        ctx.append(f"Source: {src}\n{txt}")
        # Preserve first-seen order while de-duplicating file names.
        if src not in srcs:
            srcs.append(src)

    prompt = "Answer using only context. If unknown say: I don't know.\n\nCONTEXT:\n" + "\n\n---\n".join(ctx) + f"\n\nQUESTION:\n{q}"
    inp = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
    gen = gen_model.generate(**inp, max_new_tokens=max_new)
    out = tokenizer.decode(gen[0], skip_special_tokens=True)
    return out.strip(), srcs

# Demo Queries
demo_qs = [
    "What is this publication about?",
    "Which tools are recommended in the documents?",
    "What limitation is mentioned?"
]

# Collect the same text that is printed so it can be written to SAVE_LOG afterwards.
log_lines = ["=== RAG Demo Output Log ===\n"]
print("\n✅ Running RAG Demo & Saving Log...\n")

for q in demo_qs:
    ans, src = ask(q)
    print("Q:", q)
    print("A:", ans)
    print("Sources:", ", ".join(src), "\n")
    log_lines.append(f"Q: {q}\nA: {ans}\nSources: {', '.join(src)}\n\n")

# Write as UTF-8 explicitly: without an encoding, write_text uses the locale
# default, which can fail on (or garble) non-ASCII characters in model output.
SAVE_LOG.write_text("".join(log_lines), encoding="utf-8")
print("📂 Demo log saved at:", SAVE_LOG)


✅ Running RAG Demo & Saving Log...

Q: What is this publication about?
A: Science/Tech
Sources: lesson1.txt 

Q: Which tools are recommended in the documents?
A: LangChain and vector databases
Sources: lesson1.txt 

Q: What limitation is mentioned?
A: small dataset demo and offline vector DB precision challenges
Sources: lesson1.txt 

📂 Demo log saved at: rag_proj\output_demo.txt


In [9]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Reload the generator. ask() sends its tokenized inputs to `device` (set in the
# earlier model-loading cell), so the reloaded model has to be moved there as
# well; otherwise generation fails with a device mismatch when `device` is "cuda".
MODEL_NAME = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)


In [10]:
# Re-run the demo questions and overwrite the saved log with the fresh answers.
log_lines = ["=== RAG Demo Output Log ===\n"]

for q in demo_qs:
    ans, src = ask(q)
    print("Q:", q)
    print("A:", ans)
    print("Sources:", ", ".join(src), "\n")
    log_lines.append(f"Q: {q}\nA: {ans}\nSources: {', '.join(src)}\n\n")

# Write as UTF-8 explicitly: without an encoding, write_text uses the locale
# default, which can fail on (or garble) non-ASCII characters in model output.
SAVE_LOG.write_text("".join(log_lines), encoding="utf-8")
print("📂 Demo log saved at:", SAVE_LOG)


Q: What is this publication about?
A: Science/Tech
Sources: lesson1.txt 

Q: Which tools are recommended in the documents?
A: LangChain and vector databases
Sources: lesson1.txt 

Q: What limitation is mentioned?
A: small dataset demo and offline vector DB precision challenges
Sources: lesson1.txt 

📂 Demo log saved at: rag_proj\output_demo.txt
