In [14]:
import os

import torch
from langchain_community.document_loaders import CSVLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Location of the CSV file that is turned into documents.
csv_path = "/root/autodl-tmp/web/law_data_3k.csv"

# Parse the file as UTF-8, comma-delimited, with double-quoted fields.
loader = CSVLoader(
    csv_path,
    encoding="utf-8",
    csv_args=dict(delimiter=",", quotechar='"'),
)

# `documents` is consumed by the splitting step further down.
documents = loader.load()
print(f"Loaded {len(documents)} documents")


Loaded 3013 documents


In [15]:
# Split the loaded documents into chunks sized for the embedding model.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,      # maximum number of characters per chunk
    chunk_overlap=50,    # characters shared between neighbouring chunks
    # Separators are tried in order, from coarse to fine. The first two are
    # the original dataset-specific ones. The remaining entries are fallbacks:
    # without them, a piece that contains neither of the first two separators
    # is emitted as-is even when it is longer than `chunk_size`. The trailing
    # "" lets the splitter cut between characters as a last resort, so the
    # size limit is always honoured.
    separators=[
        "民法商法",
        "\"",
        "\n\n",
        "\n",
        "。",
        "；",
        "，",
        " ",
        "",
    ],
)

split_docs = text_splitter.split_documents(documents)
print(f"Split into {len(split_docs)} chunks")


Split into 3047 chunks


In [16]:
# Local directory holding the embedding model weights.
embedding_model_name = "/root/autodl-tmp/web/BAAI_bge"

# Pick the device automatically so the notebook also runs on machines without
# a GPU, instead of requiring a manual edit from "cuda" to "cpu".
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs={"device": embedding_device},
    encode_kwargs={
        # Unit-length vectors; the original author marks this as required
        # for bge models.
        "normalize_embeddings": True,
    },
)


In [17]:
# Embed every chunk with `embeddings` and build the FAISS vector store from them.
vectorstore = FAISS.from_documents(split_docs, embeddings)


In [15]:
# Target folder for the persisted index, relative to the current working directory.
save_dir = "faiss_index"

# Create the folder if it does not exist yet, then write the index into it.
os.makedirs(save_dir, exist_ok=True)
vectorstore.save_local(folder_path=save_dir)

print("FAISS index saved.")


FAISS index saved.
