# 1. Build Vector Index từ Knowledge Base

Notebook này tạo vector database (FAISS index, file metadata và file config) từ file `knowledge_base.json` để sử dụng cho RAG retrieval.


## 1. Cài đặt thư viện


In [None]:
%pip install -q sentence-transformers rank-bm25 faiss-cpu underthesea numpy pandas tqdm


## 2. Import thư viện


In [None]:
import json
import numpy as np
import pickle
from pathlib import Path
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import faiss


## 3. Cấu hình


In [None]:
# Input knowledge base and output locations.
# Adjust KB_JSON_PATH to match your own dataset.
KB_JSON_PATH = "/kaggle/input/vietnamese-knowledge-base/knowledge_base.json"
OUTPUT_DIR = "/kaggle/working/models"
# Path prefix for the vector database files, located inside OUTPUT_DIR.
VECTOR_DB_PATH = OUTPUT_DIR + "/vector_db"

# Name of the model used to compute the embeddings.
EMBEDDING_MODEL = "keepitreal/vietnamese-sbert"

# Make sure the output directory (and any missing parents) exists;
# an already existing directory is not an error.
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(exist_ok=True, parents=True)


## 4. Load Knowledge Base


In [None]:
# Read the whole knowledge base file as UTF-8 text and parse it as JSON.
print("Loading knowledge base...")
kb_data = json.loads(Path(KB_JSON_PATH).read_text(encoding='utf-8'))

# Report how many top-level entries were parsed.
entry_count = len(kb_data)
print(f"Loaded {entry_count} entries")


## 5. Tạo embeddings


In [None]:
def _field_to_text(value):
    """Return a knowledge-base field value as plain text for embedding.

    ``None`` becomes an empty string, lists/tuples are flattened and their
    non-empty items joined with single spaces, and any other value is
    converted with ``str`` and stripped of surrounding whitespace.
    """
    if value is None:
        return ""
    if isinstance(value, (list, tuple)):
        return " ".join(part for part in map(_field_to_text, value) if part)
    return str(value).strip()


print("Loading embedding model...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL)

# Build one searchable text per entry from entity, facts and summary.
# Plain f-string interpolation would embed the literal word "None" for null
# fields and the Python repr (brackets, quotes) for list-valued fields, so
# every field is normalised first and empty fields are skipped entirely.
SEARCH_FIELDS = ("entity", "facts", "summary")
texts = []
for item in tqdm(kb_data, desc="Preparing texts"):
    field_texts = (_field_to_text(item.get(key)) for key in SEARCH_FIELDS)
    texts.append(" ".join(text for text in field_texts if text))

print(f"Creating embeddings for {len(texts)} documents...")
# texts[i] is built from kb_data[i], and the embeddings are requested as a
# NumPy array computed in batches of 32.
embeddings = embedding_model.encode(
    texts,
    show_progress_bar=True,
    convert_to_numpy=True,
    batch_size=32
)

print(f"Embeddings shape: {embeddings.shape}")


## 6. Tạo FAISS Index


In [None]:
# FAISS works on C-contiguous float32 matrices only. Coerce explicitly so a
# model that returns another dtype or memory layout does not fail inside
# FAISS. When the array already qualifies, no copy is made.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

# Fail early with a clear message instead of an IndexError on shape[1]
# when the knowledge base produced no embeddings.
if embeddings.ndim != 2 or embeddings.shape[0] == 0:
    raise ValueError(
        f"Expected a non-empty 2-D embedding matrix, got shape {embeddings.shape}"
    )

dimension = embeddings.shape[1]

# L2-normalise the vectors in place so that the inner product computed by
# IndexFlatIP equals cosine similarity.
faiss.normalize_L2(embeddings)

# Exact (flat) inner-product index over the normalised vectors.
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)

print(f"Index created with {index.ntotal} vectors")


## 7. Lưu Index và Metadata


In [None]:
# All three artifacts share the VECTOR_DB_PATH prefix.
index_path = f"{VECTOR_DB_PATH}.index"
metadata_path = f"{VECTOR_DB_PATH}_metadata.pkl"
config_path = f"{VECTOR_DB_PATH}_config.json"

# 1) The FAISS index itself.
faiss.write_index(index, index_path)
print(f"FAISS index saved to {index_path}")

# 2) The knowledge base entries, pickled as metadata.
# NOTE: pickle files must only be loaded from trusted sources.
with open(metadata_path, 'wb') as metadata_file:
    pickle.dump(kb_data, metadata_file)
print(f"Metadata saved to {metadata_path}")

# 3) A small JSON description of how the index was built.
config = dict(
    embedding_model=EMBEDDING_MODEL,
    dimension=dimension,
    num_vectors=len(kb_data),
)
with open(config_path, 'w', encoding='utf-8') as config_file:
    config_file.write(json.dumps(config, indent=2, ensure_ascii=False))
print(f"Config saved to {config_path}")

print("\n Index building completed!")
