In [None]:
from uuid import uuid4

import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import CharacterTextSplitter

# Load one local source file and split it into chunks for indexing.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable path relative to the project instead.
loader = TextLoader("/Users/luxun/workspace/ai/mine/codeCopilot/train/local_training.py")
documents = loader.load()

# NOTE(review): the original note here said chunk_size "is not a character
# count". Per the LangChain docs the length is measured with len() by default,
# but CharacterTextSplitter only cuts at its separator, so a chunk can end up
# longer than chunk_size -- TODO confirm against the installed version.
text_splitter = CharacterTextSplitter(chunk_size=10, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

# Store #1: built directly from the file chunks.
embeddings = OllamaEmbeddings(model="quentinz/bge-large-zh-v1.5")
vectorstore = FAISS.from_documents(texts, embeddings)

# Store #2: assembled by hand from an empty flat L2 index.
# The index holds the embedding vectors and answers similarity queries with
# numeric positions; its dimension is probed by embedding a sample string.
embedding_dim = len(embeddings.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

# The docstore keeps document content and metadata, and
# index_to_docstore_id links each index position to a docstore entry.
vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

# Two small sample documents with different "source" metadata.
document_1, document_2 = (
    Document(page_content=text, metadata={"source": source})
    for text, source in (
        (
            "I had chocalate chip pancakes and scrambled eggs for breakfast this morning.",
            "tweet",
        ),
        (
            "The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
            "news",
        ),
    )
)

# Note: `documents` is rebound here from the loaded file to the sample list.
documents = [document_1, document_2]
uuids = [str(uuid4()) for _ in documents]
vector_store.add_documents(documents, ids=uuids)

# NOTE(review): this prints `vectorstore` (built from the file chunks), not
# the `vector_store` populated just above -- confirm which one was intended.
print(vectorstore)


In [None]:
import pickle

# Persist the raw FAISS index (the vectors) to disk.
faiss.write_index(vector_store.index, "faiss_index.bin")

# Persist the document contents together with the mapping from index
# position to docstore id, so the store can be rebuilt later.
store_metadata = {
    "docstore": vector_store.docstore,
    "index_to_docstore_id": vector_store.index_to_docstore_id,
}
with open("vector_store_metadata.pkl", "wb") as metadata_file:
    pickle.dump(store_metadata, metadata_file)

In [None]:
import pickle
import faiss
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings
import os

embeddings = OllamaEmbeddings(model="quentinz/bge-large-zh-v1.5")

# 1. Load the FAISS index from disk.
index = faiss.read_index("faiss_index.bin")

# 2. Load the document store and the index-position -> docstore-id mapping.
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load a pickle that this notebook (or another trusted source) wrote.
with open("vector_store_metadata.pkl", "rb") as f:
    print(os.path.abspath(f.name))  # show which file is actually being read
    metadata = pickle.load(f)
    docstore = metadata["docstore"]
    index_to_docstore_id = metadata["index_to_docstore_id"]

# 3. Rebuild the vector store from the loaded pieces.
vector_store = FAISS(
    embedding_function=embeddings,  # the embedding function is not persisted; set it again
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
)

print("Vector store successfully restored!")

results = vector_store.similarity_search_with_score(
    "LangChain provides abstractions to make working with LLMs easy",
    k=2,
    # filter={"source": "tweet"},  # optional metadata filter
)

# Why the score is greater than 0: assuming faiss_index.bin was written from
# an IndexFlatL2 index, the score is an L2 distance rather than a similarity,
# so positive values are expected and a smaller value means a closer match
# -- TODO confirm against the FAISS / LangChain docs.
for res, score in results:
    # ".3f" = three decimals; the previous "3f" only set a minimum width of 3.
    print(f"* [SIM={score:.3f}] {res.page_content} [{res.metadata}]")

In [None]:
# Check which metric the index uses, to decide whether vectors need to be
# normalized. The second line prints the numeric values of FAISS's
# inner-product and L2 metric constants for comparison.
index_metric = vector_store.index.metric_type
print("Metric Type:", index_metric)
print(faiss.METRIC_INNER_PRODUCT, faiss.METRIC_L2)

In [None]:
# Top-1 search restricted to documents whose metadata has source == "news".
results = vector_store.similarity_search_with_score(
    "Will it be hot tomorrow?", k=1, filter={"source": "news"}
)
for res, score in results:
    # ".3f" = three decimals; the previous "3f" only set a minimum width of 3.
    print(f"* [SIM={score:.3f}] {res.page_content} [{res.metadata}]")

In [None]:
# Delete the first document that was added to the store.
# NOTE(review): `uuids` is defined in the cell that builds the store, so this
# cell fails on a fresh kernel that only ran the restore cell -- confirm the
# intended execution order.
first_document_id = uuids[0]
vector_store.delete(ids=[first_document_id])