<a href="https://colab.research.google.com/github/gandhirajj/Agentic_AI_Workshop/blob/main/RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install streamlit langchain langchain-community faiss-cpu sentence-transformers transformers accelerate torch docx2txt python-pptx PyPDF2

Collecting streamlit
  Downloading streamlit-1.49.1-py3-none-any.whl.metadata (9.5 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.29-py3-none-any.whl.metadata (2.9 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting docx2txt
  Downloading docx2txt-0.9-py3-none-any.whl.metadata (529 bytes)
Collecting python-pptx
  Downloading python_pptx-1.0.2-py3-none-any.whl.metadata (2.5 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting requests<3,>=2.27 (from streamlit)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting XlsxWriter>=0.5.7 (from python-pptx)
  Downloadi

In [None]:
import io
import os
from typing import List, Tuple

# LangChain core / community
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.chains import ConversationalRetrievalChain
from langchain.docstore.document import Document

# Hugging Face inference
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# File readers
import docx2txt
from pptx import Presentation
from PyPDF2 import PdfReader


# -----------------------------
# Helpers: file parsing
# -----------------------------

def read_docx(path: str) -> str:
    """Return the plain text of the .docx file at *path*.

    Falls back to an empty string when ``docx2txt.process`` yields a falsy
    result, so callers always receive a ``str``.
    """
    extracted = docx2txt.process(path)
    return extracted if extracted else ""


def read_pptx(path: str) -> str:
    """Return the text of every slide in the presentation at *path*.

    Each slide that contains at least one text-frame shape contributes a
    section headed ``[Slide N]`` (1-based), with one line per shape. Sections
    are separated by a blank line; slides without text frames are omitted.
    """
    deck = Presentation(path)
    sections = []
    for number, slide in enumerate(deck.slides, start=1):
        # Only shapes that report a text frame are read via ``.text``.
        fragments = [
            shape.text
            for shape in slide.shapes
            if getattr(shape, "has_text_frame", False)
        ]
        if not fragments:
            continue
        body = "\n".join(fragments)
        sections.append(f"[Slide {number}]\n" + body)
    return "\n\n".join(sections)


def read_pdf(path: str) -> str:
    """Return the extractable text of the PDF at *path*, page by page.

    Every page with non-blank text becomes a section headed ``[Page N]``
    (1-based); sections are joined with a blank line. Pages whose extraction
    raises, or that yield no text, are left out.
    """
    pdf = PdfReader(path)
    labelled_pages = []
    for number, page in enumerate(pdf.pages, start=1):
        try:
            extracted = page.extract_text()
        except Exception:
            # Best effort: a page that cannot be parsed is skipped rather
            # than aborting the whole document.
            continue
        if extracted and extracted.strip():
            labelled_pages.append(f"[Page {number}]\n{extracted}")
    return "\n\n".join(labelled_pages)


def load_documents(paths: List[str]) -> List[Document]:
    """Read each file in *paths* and wrap its text in a ``Document``.

    The reader is chosen from the file extension (case-insensitive):
    ``.docx`` -> ``read_docx``, ``.ppt``/``.pptx`` -> ``read_pptx``,
    ``.pdf`` -> ``read_pdf``; anything else is read as UTF-8 text with
    undecodable bytes ignored.

    Args:
        paths: File paths to load. A path that repeats an earlier entry
            (after normalisation to an absolute path) is skipped, so the same
            file is never loaded twice.

    Returns:
        One ``Document`` per file that produced non-blank text, in input
        order, with ``metadata["source"]`` set to the file's base name.

    Raises:
        Whatever the underlying reader or ``open`` raises for a missing or
        unreadable file; errors are not suppressed here.
    """
    docs: List[Document] = []
    seen = set()
    for path in paths:
        # Loading one file twice only duplicates its content downstream.
        identity = os.path.normcase(os.path.abspath(path))
        if identity in seen:
            continue
        seen.add(identity)

        # splitext only looks at the final path component, so an
        # extension-less file that happens to be called "pdf" or "docx" is
        # treated as plain text instead of being handed to a binary parser.
        extension = os.path.splitext(path)[1].lower()
        if extension == ".docx":
            text = read_docx(path)
        elif extension in (".ppt", ".pptx"):
            # NOTE(review): python-pptx is documented for .pptx files; legacy
            # binary .ppt input will probably fail in the reader - confirm.
            text = read_pptx(path)
        elif extension == ".pdf":
            text = read_pdf(path)
        else:
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()

        if text.strip():
            docs.append(
                Document(
                    page_content=text,
                    metadata={"source": os.path.basename(path)},
                )
            )
    return docs


# -----------------------------
# Build vector store
# -----------------------------

def build_retriever(docs: List[Document], chunk_size=800, chunk_overlap=100, k=2):
    """Chunk *docs*, embed the chunks, and return a FAISS-backed retriever.

    Args:
        docs: Documents whose ``page_content`` is split into chunks. Each
            chunk carries a shallow copy of its parent's metadata.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared by neighbouring chunks.
        k: Number of chunks the retriever returns per query.

    Returns:
        The retriever produced by ``FAISS.as_retriever`` with
        ``search_kwargs={"k": k}``.

    Raises:
        ValueError: If *docs* yields no chunks (empty list or only empty
            content), since no vector index can be built from nothing.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Prefer paragraph, then line, then sentence, then word boundaries.
        separators=["\n\n", "\n", ". ", ".", " "],
    )

    splits: List[Document] = []
    for d in docs:
        splits.extend(
            Document(page_content=chunk, metadata={**d.metadata})
            for chunk in splitter.split_text(d.page_content)
        )

    # Fail early with a clear message instead of an opaque error from the
    # vector store when there is nothing to index.
    if not splits:
        raise ValueError("build_retriever: no text chunks to index (docs is empty)")

    embedder = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectordb = FAISS.from_documents(splits, embedder)
    return vectordb.as_retriever(search_kwargs={"k": k})


# -----------------------------
# LLM setup (No OpenAI)
# -----------------------------

def build_llm(model_name: str = "google/flan-t5-base", max_new_tokens: int = 256):
    """Load a seq2seq model and expose it as a LangChain LLM.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.
        max_new_tokens: Generation length limit handed to the pipeline.

    Returns:
        A ``HuggingFacePipeline`` wrapping a ``text2text-generation``
        pipeline configured with sampling disabled (``do_sample=False``).
    """
    seq2seq_tokenizer = AutoTokenizer.from_pretrained(model_name)
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    generation_settings = {
        "max_new_tokens": max_new_tokens,
        "do_sample": False,
    }
    text_generator = pipeline(
        task="text2text-generation",
        model=seq2seq_model,
        tokenizer=seq2seq_tokenizer,
        **generation_settings,
    )
    return HuggingFacePipeline(pipeline=text_generator)


# -----------------------------
# Conversational RAG chain
# -----------------------------

def build_cr_chain(retriever, llm):
    """Assemble a ``ConversationalRetrievalChain`` from *retriever* and *llm*.

    The chain uses the ``"stuff"`` chain type and is configured to return the
    retrieved source documents alongside the answer.
    """
    chain_options = {
        "return_source_documents": True,
        "chain_type": "stuff",
    }
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        **chain_options,
    )


# -----------------------------
# CLI interface
# -----------------------------

def main():
    """Run an interactive command-line question/answer loop over local files.

    Loads the files listed in ``file_paths``, builds the retriever, the LLM
    and the conversational chain, then repeatedly reads a question from
    stdin and prints the answer with up to 200 characters of each source
    chunk. Typing ``exit`` or ``quit`` (any case), or closing stdin, ends the
    loop. Blank input is ignored.
    """
    print("📚 RAG Study Bot — Local")

    # Example usage: pass file paths
    # Each file is listed once; indexing the same file twice makes the
    # retriever return the same passage twice.
    file_paths = [
        "/content/AI_General_English.pdf",
        # "slides.pptx"
    ]

    docs = load_documents(file_paths)
    if not docs:
        print("No readable documents.")
        return

    retriever = build_retriever(docs, chunk_size=800, chunk_overlap=100, k=2)
    llm = build_llm(model_name="google/flan-t5-base", max_new_tokens=256)
    rag_chain = build_cr_chain(retriever, llm)

    # ConversationalRetrievalChain treats each tuple in the history as one
    # (human message, AI message) exchange.
    chat_history = []

    while True:
        try:
            query = input("\n❓ Ask a question (or type 'exit'): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C ends the session instead of a traceback.
            print()
            break

        if query.lower() in ("exit", "quit"):
            break
        if not query:
            # Nothing to ask; do not spend a retrieval + generation round.
            continue

        # ``invoke`` replaces the deprecated ``chain(inputs)`` call form.
        result = rag_chain.invoke({"question": query, "chat_history": chat_history})
        answer = result.get("answer", "(No answer)")
        sources = result.get("source_documents", [])

        print("\n🤖 Answer:", answer)
        if sources:
            print("\n📎 Sources:")
            for i, doc in enumerate(sources, start=1):
                src = doc.metadata.get("source", "uploaded file")
                snippet = doc.page_content[:200].strip().replace("\n", " ")
                print(f"  {i}. {src}: {snippet}...")

        # One (question, answer) pair per turn - not separate role-tagged
        # tuples, which the chain would misread as two exchanges.
        chat_history.append((query, answer))


# Start the interactive CLI only when this code runs as the main program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


📚 RAG Study Bot — Local


  embedder = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu
  return HuggingFacePipeline(pipeline=gen_pipe)



❓ Ask a question (or type 'exit'): what is mean by Disease Diagnosis Using AI


  result = rag_chain({"question": query, "chat_history": chat_history})



🤖 Answer: help in the early detection of diseases that are difficult for doctors to diagnose at early stages, such as cancer, heart diseases, and neurological disorders

📎 Sources:
  1. AI_General_English.pdf: [Page 43] treatment, and medical research process. AI helps doctors improve diagnostic accuracy , provide treatments faster and more effectively , and reduce medical errors. • Disease Diagnosis Using...
  2. AI_General_English.pdf: [Page 43] treatment, and medical research process. AI helps doctors improve diagnostic accuracy , provide treatments faster and more effectively , and reduce medical errors. • Disease Diagnosis Using...

❓ Ask a question (or type 'exit'): exit
