# Vector Database Creation

Chroma DB ทำอะไรให้เราในโปรเจกต์นี้บ้าง

- เก็บ 933 chunks + embedding 768 มิติ (BAAI/bge-base-en-v1.5) ที่เราสร้างใน Step 2
- แปลงคำถามเป็น embedding → ไปหา chunk ที่ความหมายใกล้เคียงที่สุดโดยอัตโนมัติ
- ให้ผลลัพธ์ภายใน 0.01–0.05 วินาที แม้จะมีเป็นพัน chunk
- บันทึกถาวรลงดิสก์ ในโฟลเดอร์ models/chroma_db/
- → ปิดเครื่อง → เปิดใหม่ → ยังใช้ได้เลย ไม่ต้องสร้างใหม่
- รองรับการกรองด้วย similarity threshold → ใช้ตรวจจับ “no-answer questions” ได้ทันที

In [1]:

# Objectives:
# - Load chunks, embeddings & metadata from Step 2
# - Create persistent Chroma vector database (cosine similarity)
# - Add simple retrieval function
# - Test retrieval quality on various queries (including ones that should have no good match)
# - Prepare everything for the final RAG chatbot notebook

# %%
# Import libraries
import json
import numpy as np
import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer
from pathlib import Path

print('Libraries imported successfully!')

# %%
# 1. Load Model & Data from Step 2

# Queries must be embedded with the same model that produced the stored
# chunk embeddings in Step 2, so load that exact checkpoint here.
embedding_model = SentenceTransformer('BAAI/bge-base-en-v1.5')
print('Model loaded: BAAI/bge-base-en-v1.5')
print(f'   Dimension: {embedding_model.get_sentence_embedding_dimension()}')

# Step 2 artifacts: a JSON file with the chunk texts, their metadata and the
# chunking config, plus a NumPy file holding the embedding matrix.
chunks_dir = Path('../models/chunks_bge')

with (chunks_dir / 'chunks_data.json').open('r', encoding='utf-8') as f:
    chunks_data = json.load(f)

all_chunks, chunk_metadata, config = (
    chunks_data[key] for key in ('chunks', 'metadata', 'config')
)

embeddings = np.load(chunks_dir / 'embeddings.npy')

print(f'\nLoaded {len(all_chunks)} chunks')
print(f'   Embedding shape: {embeddings.shape}')

# %%
# 2. Create Persistent Chroma Vector Database

# Single source of truth for the on-disk location. The folder that is created
# and the folder the client opens must be the same one; it sits next to the
# other '../models/...' artifacts this notebook reads and writes.
chroma_dir = Path("../models/chroma_db")
chroma_dir.mkdir(parents=True, exist_ok=True)

# Initialize persistent client on that same folder
client = chromadb.PersistentClient(path=str(chroma_dir))

collection_name = "rag_collection"

# Optional: delete old collection (uncomment if you want a fresh start)
# client.delete_collection(name=collection_name)

# Create (or reopen) the collection; "hnsw:space": "cosine" makes query
# distances cosine distances, which retrieval converts to similarities.
collection = client.get_or_create_collection(
    name=collection_name,
    metadata={"hnsw:space": "cosine"},
)

# Prepare data for Chroma: one stable id and one flat metadata dict per chunk,
# in the same order as all_chunks / embeddings.
ids = [f"doc{meta['doc_id']}_chunk{meta['chunk_id']}" for meta in chunk_metadata]
metadatas = [
    {"doc_id": meta["doc_id"], "chunk_id": meta["chunk_id"], "chunk_length": meta["chunk_length"]}
    for meta in chunk_metadata
]

# Write all data. upsert (rather than add) keeps this cell idempotent: because
# the collection is reopened with get_or_create_collection, a re-run sees the
# same ids again, and upsert overwrites them with the current documents and
# embeddings instead of depending on how add() treats duplicate ids.
print("Adding chunks to Chroma (this may take a minute)...")
collection.upsert(
    documents=all_chunks,
    embeddings=embeddings.tolist(),
    metadatas=metadatas,
    ids=ids,
)

print(f'Vector database ready → {collection.count()} chunks indexed')
print(f'   Saved at: models/chroma_db/')

# %%
# 3. Simple Retrieval Function

def retrieve(query: str, top_k: int = 5, min_similarity: float = 0.0):
    """
    Return up to ``top_k`` chunks most similar to ``query``.

    The query is embedded with the module-level ``embedding_model`` and looked
    up in the module-level Chroma ``collection``. Hits whose cosine similarity
    is below ``min_similarity`` are dropped.

    Returns a dict with three parallel lists: 'documents' (chunk texts),
    'similarities' (cosine similarity of each hit) and 'metadatas'.
    """
    query_vector = embedding_model.encode([query]).tolist()

    # Over-fetch (3x) so that enough candidates remain after filtering.
    raw = collection.query(
        query_embeddings=query_vector,
        n_results=top_k * 3,
        include=["documents", "distances", "metadatas"],
    )

    # Only one query was sent, so its hits are at index 0 of each field.
    candidates = zip(raw['documents'][0], raw['distances'][0], raw['metadatas'][0])

    kept = []
    for text, distance, meta in candidates:
        score = 1 - distance  # the collection reports cosine distance
        if score >= min_similarity:
            kept.append((text, score, meta))
            if len(kept) >= top_k:
                break

    return {
        'documents': [text for text, _, _ in kept],
        'similarities': [score for _, score, _ in kept],
        'metadatas': [meta for _, _, meta in kept],
    }

print('Retrieval function ready!')

# %%
# 4. Test Retrieval Quality

# Mix of queries expected to match the corpus and queries expected to score
# low, to see how the similarity threshold behaves.
test_queries = [
    "What are Bullet Kin?",
    "How do Bullet Kin attack the player?",
    "What drops a key upon death?",
    "What is Retrieval-Augmented Generation (RAG)?",
    "Explain the EU AI Act risk categories",
    "What enemies use assault rifles?",
    "How much health does the Mutant Bullet Kin have?",  # expected: low similarity
    "What is machine learning?",                        # expected: unrelated / low similarity
]

banner = '=' * 100
divider = "-" * 90

for query in test_queries:
    # Header for this query
    print(f"\n{banner}")
    print(f"QUERY: {query}")
    print(banner)

    results = retrieve(query, top_k=5, min_similarity=0.3)

    if results['documents']:
        hits = zip(results['documents'], results['similarities'], results['metadatas'])
        for rank, (text, score, info) in enumerate(hits, start=1):
            print(f"#{rank} | Similarity: {score:.4f} | Doc {info['doc_id']} | Chunk {info['chunk_id']}")
            # Show a single-line preview of the chunk
            print(text[:280].replace('\n', ' ') + '...')
            print(divider)
    else:
        # Nothing cleared the threshold passed to retrieve() above
        print("No chunks above similarity threshold (0.30) → likely no answer in corpus")

# %%
# 5. Save Summary

# Machine-readable description of the index for the chatbot notebook.
vector_db_summary = {
    "collection_name": collection_name,
    "num_chunks": collection.count(),
    "embedding_model": "BAAI/bge-base-en-v1.5",
    # Taken from the embedding matrix that was actually indexed, so the
    # summary cannot drift from the data (a hard-coded value can go stale
    # when the embedding model changes).
    "embedding_dim": int(embeddings.shape[1]),
    "similarity_metric": "cosine",
    "database_path": "models/chroma_db",
    "status": "ready for RAG chatbot",
    "recommended_min_similarity_threshold": 0.30
}

# Ensure the directory the summary is written into exists (the same
# '../models' folder as the other artifacts), not a stray './models'.
summary_path = Path('../models/vector_db_summary.json')
summary_path.parent.mkdir(parents=True, exist_ok=True)
with summary_path.open('w', encoding='utf-8') as f:
    json.dump(vector_db_summary, f, indent=2, ensure_ascii=False)

print("\n" + "="*100)
print("VECTOR DATABASE CREATION COMPLETE")
print("="*100)
for k, v in vector_db_summary.items():
    print(f"{k}: {v}")
print("\nStep 3 Complete!")
print("Next → 04_rag_chatbot.ipynb (final chatbot with answer generation + no-answer detection)")

Libraries imported successfully!
Model loaded: BAAI/bge-base-en-v1.5
   Dimension: 768

Loaded 933 chunks
   Embedding shape: (933, 768)
Adding chunks to Chroma (this may take a minute)...
Vector database ready → 933 chunks indexed
   Saved at: models/chroma_db/
Retrieval function ready!

QUERY: What are Bullet Kin?
#1 | Similarity: 0.8050 | Doc 0 | Chunk 0
Bullet Kin Bullet Kin are one of the most common enemies. They slowly walk towards the player, occasionally firing a single bullet. They can flip tables and use them as cover. They will also deal contact damage if the player touches them. Occasionally, Bullet Kin will have assaul...
------------------------------------------------------------------------------------------
#2 | Similarity: 0.7093 | Doc 0 | Chunk 14
unless they are jammed. Red-Caped Bullet Kin's design may be based on The Kid from I Wanna Be The Guy. Rooms created by the Drill can have a Red-Caped Bullet Kin spawn inside them, even if a Red-Caped Bullet Kin has alread