In [None]:
!curl -fsSL https://ollama.com/install.sh  sh

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [7]:
!ollama --version



In [2]:
# !ollama pull dengcao/Qwen3-14B:Q5_K_M

/bin/bash: line 1: ollama: command not found


In [None]:
!ollama pull gemma3:12b-it-q8_0

In [None]:
!ollama pull gemma3:12b-it-qat

In [None]:
!pip install -q langchain langchain-community langchain-ollama chromadb pypdf langchain-chroma

In [11]:
# Install Python client (only needed once)
!pip install ollama

# Optional — upgrade to latest version
!pip install --upgrade ollama



In [16]:
# ollama_rag_chat.py – Qwen3-14B Q5_K_M via Ollama + simple RAG-style context
# Now reading real content from gemma3.pdf instead of fake documents

import ollama
import time
from typing import List, Tuple

try:
    import fitz  # PyMuPDF — pip install pymupdf  (very common in Kaggle)
except ImportError:
    try:
        from pypdf import PdfReader  # fallback
    except ImportError:
        raise ImportError("Neither PyMuPDF (fitz) nor pypdf is installed. "
                          "In Kaggle: !pip install pymupdf  or  !pip install pypdf")

# ───────────────────────────────────────────────
#  Model selection
# ───────────────────────────────────────────────
# MODEL = "dengcao/Qwen3-14B:Q5_K_M"          # Community quantized Qwen3-14B (≈11 GB)
# MODEL = "gemma3:12b-it-q8_0"
MODEL = "gemma3:12b-it-qat"
PDF_PATH = "/kaggle/input/rag-test-doc/400k.pdf"


# ───────────────────────────────────────────────
#  Load PDF once at startup (simple in-memory store)
# ───────────────────────────────────────────────
def load_pdf_documents(pdf_path: str) -> List[str]:
    """Extract text from all pages of the PDF"""
    documents = []
    
    try:
        # Preferred: PyMuPDF (fitz)
        doc = fitz.open(pdf_path)
        for page in doc:
            text = page.get_text("text").strip()
            if text:
                documents.append(text)
        doc.close()
        print(f"Loaded {len(documents)} pages from {pdf_path}")
        
    except NameError:
        # Fallback: pypdf
        reader = PdfReader(pdf_path)
        for page in reader.pages:
            text = page.extract_text() or ""
            text = text.strip()
            if text:
                documents.append(text)
        print(f"Loaded {len(documents)} pages from {pdf_path} (using pypdf fallback)")
        
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return []
    
    return documents


# Load once when script starts
ALL_DOCS = load_pdf_documents(PDF_PATH)


# ───────────────────────────────────────────────
#  Very naive "retriever" – top-k most relevant by length or simple keyword
#  (in production → use sentence-transformers + FAISS/Chroma)
# ───────────────────────────────────────────────
def retrieve(query: str, k: int = 4) -> List[Tuple[str, float]]:
    """
    Naive retrieval: returns k longest chunks that contain any query word.
    Replace this with real vector search in production!
    """
    if not ALL_DOCS:
        return [("PDF was not loaded or is empty.", 0.99)]
    
    query_words = set(w.lower() for w in query.split() if len(w) > 2)
    
    scored_docs = []
    for doc in ALL_DOCS:
        doc_lower = doc.lower()
        # very naive score = number of query words found + length bonus
        matches = sum(1 for w in query_words if w in doc_lower)
        score = matches * 0.4 + len(doc) / 4000.0  # normalize roughly
        scored_docs.append((doc, min(score, 0.99)))
    
    # Sort by score descending
    scored_docs.sort(key=lambda x: x[1], reverse=True)
    
    return scored_docs[:k]


# ───────────────────────────────────────────────
#  Prompt building – Qwen3 style
# ───────────────────────────────────────────────
def build_prompt(query: str, k: int = 4) -> str:
    retrieved = retrieve(query, k=k)
    
    context_parts = []
    for i, (doc, score) in enumerate(retrieved, 1):
        # Truncate very long chunks so prompt doesn't explode
        preview = doc.replace("\n", " ").strip()
        if len(preview) > 800:
            preview = preview[:750] + " … [truncated]"
        context_parts.append(f"[Doc {i} | score={score:.3f}] {preview}")
    
    context_str = "\n\n".join(context_parts)
    
    # Qwen3-style prompt
    prompt = f"""<|im_start|>system
Bạn là chuyên gia trợ giúp trả lời câu hỏi bằng tiếng VIệt, hảy lời các câu hỏi của người dùng và 
trả kết quả về tiếng Việt, không sử dụng tiếng Anh hoặc tiếng Trung<|im_end|>
<|im_start|>user
Context:
{context_str}

Question: {query}<|im_end|>
<|im_start|>assistant
"""
    return prompt


# ───────────────────────────────────────────────
#  Generate with streaming + history
# ───────────────────────────────────────────────
def generate_stream(query: str, max_tokens: int = 400, temperature: float = 0.7) -> str:
    prompt = build_prompt(query)
    
    print("Prompt preview (first 800 chars):")
    print(prompt[:800] + "..." if len(prompt) > 800 else prompt)
    print()
    
    messages = [{"role": "user", "content": prompt}]
    
    try:
        start_time = time.time()
        
        stream = ollama.chat(
            model=MODEL,
            messages=messages,
            stream=True,
            options={
                "temperature": temperature,
                "top_p": 0.9,
                # "num_predict": max_tokens,   # optional
            }
        )
        
        print("Qwen3-14B: ", end="", flush=True)
        full_response = ""
        
        for chunk in stream:
            content = chunk["message"]["content"]
            print(content, end="", flush=True)
            full_response += content
        
        print()  # final newline
        
        elapsed = time.time() - start_time
        print(f"({elapsed:.1f}s)\n")
        
        return full_response.strip()
    
    except Exception as e:
        print(f"\nError during generation: {e}")
        print("Make sure Ollama is running and the model is pulled:")
        print(f"  ollama pull {MODEL}")
        return ""


# ───────────────────────────────────────────────
#  Main – example usage
# ───────────────────────────────────────────────
if __name__ == "__main__":
    if not ALL_DOCS:
        print("Cannot continue — PDF loading failed.")
    else:
        print(f"Qwen3-14B Q5_K_M chat + RAG from gemma3.pdf via Ollama\n")
        print(f"Model : {MODEL}")
        print(f"PDF   : {PDF_PATH} ({len(ALL_DOCS)} pages loaded)")
        print("Type 'exit' or 'quit' to end the script early.\n")
        
        # Optional model check
        try:
            models = ollama.list()
            model_names = [m["name"] for m in models.get("models", [])]
            if not any(MODEL in name for name in model_names):
                print(f"Warning: Model '{MODEL}' not found.")
                print(f"Please run:  ollama pull {MODEL}\n")
        except Exception as e:
            print(f"Could not check Ollama models: {e}")
        
        # queries = [
        #     "What is the main topic of the document?",
        #     "Summarize the key contributions of the paper.",
        #     "What model sizes were released?",
        #     "How does the license work for Gemma 3?",
        #     "What is the context length of Gemma 3 models?"
        # ]

        queries = [
            "Ai được hưởng 400 nghìn?"
        ]
        
        
        for q in queries:
            print("\n" + "="*90)
            print(f"Query: {q}")
            print("-"*90)
            
            answer = generate_stream(q, max_tokens=320, temperature=0.7)
            print()

Loaded 6 pages from /kaggle/input/rag-test-doc/400k.pdf (using pypdf fallback)
Qwen3-14B Q5_K_M chat + RAG from gemma3.pdf via Ollama

Model : gemma3:12b-it-qat
PDF   : /kaggle/input/rag-test-doc/400k.pdf (6 pages loaded)
Type 'exit' or 'quit' to end the script early.

Could not check Ollama models: 'name'

Query: Ai được hưởng 400 nghìn?
------------------------------------------------------------------------------------------
Prompt preview (first 800 chars):
<|im_start|>system
Bạn là chuyên gia trợ giúp trả lời câu hỏi bằng tiếng VIệt, hảy lời các câu hỏi của người dùng và 
trả kết quả về tiếng Việt, không sử dụng tiếng Anh hoặc tiếng Trung<|im_end|>
<|im_start|>user
Context:
[Doc 1 | score=0.990] Chỉ  cần  người  dân  cung  cấp  thông  tin  tài  khoản,  mã  OTP  tài  khoản  ngân  hàng,  cài   các  app  (ứng  dụng)  lạ  trên  thiết  bị  thông  minh  sẽ  bị  chúng  chiếm  quyền  điều   khiển.   Từ   đó   tiếp   tục   dẫn   dụ   bị   hại   thao   tác   các   bước   xác   thực   nhằm  

In [18]:
# ollama_rag_chat_chroma_fixed.py
# Fixed version - works with modern ChromaDB (0.4.16+)
# Using Qwen / Gemma via Ollama + Chroma + sentence-transformers

import ollama
import time
from typing import List, Tuple
import torch

try:
    import fitz  # PyMuPDF
except ImportError:
    try:
        from pypdf import PdfReader
    except ImportError:
        raise ImportError("Install either pymupdf or pypdf: pip install pymupdf or pip install pypdf")

# ─── Core dependencies ──────────────────────────────────────────────────
try:
    import chromadb
    from chromadb.utils import embedding_functions
    from sentence_transformers import SentenceTransformer
except ImportError:
    raise ImportError(
        "Required packages missing. Install:\n"
        "pip install chromadb sentence-transformers"
    )

# ─── Configuration ──────────────────────────────────────────────────────
MODEL = "gemma3:12b-it-qat"              # or "qwen2:7b", "llama3.1:8b", etc.
# MODEL = "dengcao/Qwen3-14B:Q5_K_M"     # if you have this variant

PDF_PATH = "/kaggle/input/rag-test-doc/400k.pdf"

EMBEDDING_MODEL = "all-MiniLM-L6-v2"     # fast & good (~80 MB)
# Alternatives:
#   "paraphrase-multilingual-MiniLM-L12-v2"   # multilingual
#   "bkai-foundation-models/vietnamese-bi-encoder"   # Vietnamese-focused

CHROMA_PATH = "./chroma_db"
COLLECTION_NAME = "doc_400k"

CHUNK_SIZE = 600
CHUNK_OVERLAP = 120

# ─── 1. PDF → overlapping chunks ────────────────────────────────────────
def extract_and_chunk_pdf(pdf_path: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Extract text from PDF and split into overlapping chunks"""
    full_text = []

    # Prefer PyMuPDF (faster, better layout)
    try:
        doc = fitz.open(pdf_path)
        for page in doc:
            text = page.get_text("text").strip()
            if text:
                full_text.append(text)
        doc.close()
        print(f"Loaded {len(full_text)} pages using PyMuPDF")
    except Exception:
        # Fallback to pypdf
        reader = PdfReader(pdf_path)
        for page in reader.pages:
            text = page.extract_text() or ""
            text = text.strip()
            if text:
                full_text.append(text)
        print(f"Loaded {len(full_text)} pages using pypdf fallback")

    if not full_text:
        raise ValueError("No text could be extracted from the PDF")

    full_text_str = "\n\n".join(full_text)

    # Simple overlapping chunking with boundary awareness
    chunks = []
    start = 0
    while start < len(full_text_str):
        end = min(start + chunk_size, len(full_text_str))
        if end < len(full_text_str):
            # Try to end at natural break (paragraph, sentence)
            while end > start and full_text_str[end] not in "\n.?!":
                end -= 1
        chunk = full_text_str[start:end].strip()
        if chunk:
            chunks.append(chunk)
        # Move start position with overlap
        start = end - overlap if end - overlap > start else end

    print(f"Created {len(chunks)} overlapping chunks (≈{chunk_size} chars, overlap={overlap})")
    return chunks


# ─── 2. Chroma collection (create or load) ──────────────────────────────
def get_or_create_collection():
    client = chromadb.PersistentClient(path=CHROMA_PATH)

    # Modern ChromaDB-compatible embedding function
    embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=EMBEDDING_MODEL,
        device="cuda" if torch.cuda.is_available() else "cpu",
        normalize_embeddings=True
    )

    try:
        collection = client.get_collection(
            name=COLLECTION_NAME,
            embedding_function=embedding_function
        )
        print(f"Loaded existing collection '{COLLECTION_NAME}' ({collection.count()} items)")
    except:
        print(f"Creating new collection '{COLLECTION_NAME}' ...")

        collection = client.create_collection(
            name=COLLECTION_NAME,
            embedding_function=embedding_function,
            metadata={"hnsw:space": "cosine"}  # cosine / l2 / ip
        )

        # Index the document
        chunks = extract_and_chunk_pdf(PDF_PATH)
        if not chunks:
            raise RuntimeError("No chunks created from PDF")

        ids = [f"chunk_{i:04d}" for i in range(len(chunks))]
        collection.add(
            documents=chunks,
            ids=ids,
            metadatas=[{"source": "400k.pdf", "chunk_idx": i} for i in range(len(chunks))]
        )
        print(f"Indexed {len(chunks)} chunks into Chroma")

    return collection, embedding_function


# Global singleton (loaded once)
COLLECTION, EMBEDDING_FN = get_or_create_collection()


# ─── 3. Retrieve relevant chunks ────────────────────────────────────────
def retrieve(query: str, k: int = 5) -> List[Tuple[str, float, dict]]:
    results = COLLECTION.query(
        query_texts=[query],
        n_results=k,
        include=["documents", "distances", "metadatas"]
    )

    hits = []
    for doc, dist, meta in zip(
        results["documents"][0],
        results["distances"][0],
        results["metadatas"][0]
    ):
        similarity = 1.0 - dist if dist is not None else 0.0  # cosine distance → similarity
        hits.append((doc, similarity, meta))

    return hits


# ─── 4. Build prompt with context ───────────────────────────────────────
def build_prompt(query: str, k: int = 5) -> str:
    retrieved = retrieve(query, k=k)

    context_parts = []
    for i, (text, score, meta) in enumerate(retrieved, 1):
        preview = text.replace("\n", " ").strip()
        if len(preview) > 900:
            preview = preview[:850] + "… [truncated]"
        chunk_id = meta.get('chunk_idx', '?')
        context_parts.append(
            f"[Doc {i} | score={score:.3f} | chunk {chunk_id}] {preview}"
        )

    context_str = "\n\n".join(context_parts)

    prompt = f"""<|im_start|>system
Bạn là trợ lý AI hữu ích. Hãy trả lời ngắn gọn, chính xác dựa trên ngữ cảnh được cung cấp.
Nếu không có thông tin liên quan trong ngữ cảnh, hãy nói rõ "Không tìm thấy thông tin liên quan trong tài liệu".
Không bịa thông tin, không suy diễn quá mức.<|im_end|>
<|im_start|>user
Ngữ cảnh:
{context_str}

Câu hỏi: {query}<|im_end|>
<|im_start|>assistant
"""
    return prompt


# ─── 5. Generate answer (streaming) ─────────────────────────────────────
def generate_stream(query: str, max_tokens: int = 600, temperature: float = 0.7) -> str:
    prompt = build_prompt(query)

    print("┌──────────────────────────────────────────────────────────────┐")
    print("│ Prompt preview (first 1400 chars)                            │")
    print("└──────────────────────────────────────────────────────────────┘")
    preview = prompt[:1400] + "…" if len(prompt) > 1400 else prompt
    print(preview)
    print()

    messages = [{"role": "user", "content": prompt}]

    try:
        start_time = time.time()

        stream = ollama.chat(
            model=MODEL,
            messages=messages,
            stream=True,
            options={
                "temperature": temperature,
                "top_p": 0.92,
                # "num_predict": max_tokens,  # optional limit
            }
        )

        print(f"{MODEL}: ", end="", flush=True)
        full_response = ""

        for chunk in stream:
            content = chunk["message"]["content"]
            print(content, end="", flush=True)
            full_response += content

        print()  # final newline
        elapsed = time.time() - start_time
        print(f"({elapsed:.1f}s | ~{len(full_response.split())} tokens)\n")

        return full_response.strip()

    except Exception as e:
        print(f"\nError during generation: {e}")
        print(f"Make sure Ollama is running and model is pulled:  ollama pull {MODEL}")
        return ""


# ─── Main ────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    print(f"RAG pipeline ready • Model: {MODEL}")
    print(f"Embedding: {EMBEDDING_MODEL}")
    print(f"Collection: {COLLECTION_NAME} ({COLLECTION.count()} chunks total)\n")

    queries = [
        "Tiêu đề bài viết là gì?",
        "Ai được hưởng 400 nghìn từ chính sách của nhà nước?",
        "Tóm tắt nội dung chính của tài liệu",
        "Gemma 3 có những kích thước mô hình nào?",
        "Context length của Gemma 3 là bao nhiêu?",
    ]

    for q in queries:
        print("\n" + "═" * 80)
        print(f"Query: {q}")
        print("─" * 80)
        generate_stream(q, temperature=0.65)
        print()

Creating new collection 'doc_400k' ...
Loaded 6 pages using pypdf fallback
Created 14 overlapping chunks (≈600 chars, overlap=120)
Indexed 14 chunks into Chroma
RAG pipeline ready • Model: gemma3:12b-it-qat
Embedding: all-MiniLM-L6-v2
Collection: doc_400k (14 chunks total)


════════════════════════════════════════════════════════════════════════════════
Query: Tiêu đề bài viết là gì?
────────────────────────────────────────────────────────────────────────────────
┌──────────────────────────────────────────────────────────────┐
│ Prompt preview (first 1400 chars)                            │
└──────────────────────────────────────────────────────────────┘
<|im_start|>system
Bạn là trợ lý AI hữu ích. Hãy trả lời ngắn gọn, chính xác dựa trên ngữ cảnh được cung cấp.
Nếu không có thông tin liên quan trong ngữ cảnh, hãy nói rõ "Không tìm thấy thông tin liên quan trong tài liệu".
Không bịa thông tin, không suy diễn quá mức.<|im_end|>
<|im_start|>user
Ngữ cảnh:
[Doc 1 | score=0.624 | chunk 4]