In [18]:
import os
from dotenv import find_dotenv, load_dotenv
# Locate a .env file and load it before the LlamaIndex import below; the return
# value is deliberately discarded. Presumably this supplies the API key used by
# the later OpenAI embedding calls — confirm against the .env contents.
_ = load_dotenv(find_dotenv())

from llama_index.core import SimpleDirectoryReader

In [2]:
# Folder holding the raw source text to index, relative to the notebook's
# working directory.
SOURCE_DIR = os.path.join('data', 'source', 'lee')

# os.listdir returns entries in arbitrary order; sort them so the displayed
# listing is reproducible across runs and machines.
sorted(os.listdir(SOURCE_DIR))

['lee.txt']

In [14]:
# Read every file under SOURCE_DIR into LlamaIndex documents.
documents = SimpleDirectoryReader(SOURCE_DIR).load_data()
print(f"len: {len(documents)}")

# Sanity-check the first document: where it came from and how much text it has.
doc = documents[0]
# The label previously ran straight into the path ("file_path/home/...");
# format it with an explicit separator.
print(f"file_path: {doc.metadata['file_path']}")
txt = doc.text_resource.text
print(f'txt len: {len(txt)}')
# Preview only the first 100 characters to keep the output short.
print(txt[:100])


len: 1
file_path/home/poyuan/workspace/rag30/days/day23/data/source/lee/lee.txt
txt len: 5264
各位同學大家好 我們來上課吧
剛才只是用Google的VO3
這個可以生成影片的人工智慧
隨便生了一些影片
作為開場,我們現在來正式開始這門課
我們先把投影片開起來
歡迎來到這門超級有趣的課程
很高興


In [16]:
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding

In [19]:
# build a vector store from scratch: https://developers.llamaindex.ai/python/examples/low_level/vector_store/

# Faiss vector store demo (the approach used in the next cell): https://developers.llamaindex.ai/python/examples/vector_stores/faissindexdemo/

In [22]:
# Requires: pip install llama-index-vector-stores-faiss faiss-cpu
import faiss
from llama_index.core import StorageContext
from llama_index.vector_stores.faiss import FaissVectorStore

# Folder that receives the persisted index files.
INDEX_DIR = "storage"
os.makedirs(INDEX_DIR, exist_ok=True)

# Step 1: embedding model used to build the index. Queries issued later must be
# embedded by a model with the same vector dimensionality.
embed_model = OpenAIEmbedding(model_name="text-embedding-3-small")

# Step 2: flat L2 Faiss index. Its dimensionality has to match the embedding
# model's output size (1536 is assumed for the model above — re-check if the
# model is changed).
d = 1536
faiss_index = faiss.IndexFlatL2(d)

# Step 3: wrap the raw Faiss index in LlamaIndex's vector-store adapter.
vector_store = FaissVectorStore(faiss_index=faiss_index)

# Step 4: storage context backed by that vector store. No persist_dir is given
# here because the index is being created, not loaded.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Step 5: build the high-level index; the documents are embedded and the
# vectors are written into the Faiss index.
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    embed_model=embed_model,
)

# Not enabled: an earlier experiment chunked the text explicitly with
# SentenceSplitter(chunk_size=800, chunk_overlap=100) before indexing.

# Step 6: persist the Faiss index together with LlamaIndex's own metadata
# into INDEX_DIR.
index.storage_context.persist(persist_dir=INDEX_DIR)
print("Index built and persisted to:", INDEX_DIR)


2025-10-07 15:04:16,230 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Index built and persisted to: storage


In [23]:
from llama_index.core import load_index_from_storage

# Reload the persisted index from disk: restore the Faiss vector store first,
# then build a StorageContext that reads the remaining stores from INDEX_DIR.
vector_store = FaissVectorStore.from_persist_dir(INDEX_DIR)
storage_context = StorageContext.from_defaults(
    vector_store=vector_store, persist_dir=INDEX_DIR
)

# Pass the embedding model that was used at build time. Without it the reloaded
# index falls back to the globally configured default embedding model, so query
# vectors would not live in the same space as the stored ones.
index = load_index_from_storage(
    storage_context=storage_context,
    embed_model=embed_model,
)

2025-10-07 15:04:53,376 - INFO - Loading llama_index.vector_stores.faiss.base from storage/default__vector_store.json.
2025-10-07 15:04:53,379 - INFO - Loading all indices.


Loading llama_index.core.storage.kvstore.simple_kvstore from storage/docstore.json.
Loading llama_index.core.storage.kvstore.simple_kvstore from storage/index_store.json.


In [24]:
# Number of entries held in the reloaded index's docstore. The recorded run
# shows 9 entries for a single source file — presumably one per text chunk
# created during indexing.
len(index.docstore.docs)

9

In [26]:
# Display the docstore's full mapping of node id -> stored node.
# NOTE(review): this dumps every node with its metadata, including absolute
# local file paths; consider previewing a single entry and clearing the output
# before sharing the notebook.
index.docstore.docs

{'eba16e20-d000-4b3e-b05a-b1589f6b9fe9': TextNode(id_='eba16e20-d000-4b3e-b05a-b1589f6b9fe9', embedding=None, metadata={'file_path': '/home/poyuan/workspace/rag30/days/day23/data/source/lee/lee.txt', 'file_name': 'lee.txt', 'file_type': 'text/plain', 'file_size': 14112, 'creation_date': '2025-10-07', 'last_modified_date': '2025-09-19'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='0b63edfd-2de7-45d8-993a-786b2b1967a6', node_type='4', metadata={'file_path': '/home/poyuan/workspace/rag30/days/day23/data/source/lee/lee.txt', 'file_name': 'lee.txt', 'file_type': 'text/plain', 'file_size': 14112, 'creation_date': '2025-10-07', 'last_modified_date': '2025-09-19'}, hash='93ea7cda1ebeec3bd19b0131a8b7a66cdc7f985e