In [2]:
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Requires the `langchain-community` package to be installed first.
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document


In [4]:
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
import pandas as pd

def build_faiss_from_dataframe(csv_path, output_folder, text_col="內文", metadata_cols=("分類", "標題", "URL"), model_name="shibing624/text2vec-base-multilingual"):
    """
    Build a FAISS vector index from the rows of a CSV file and save it locally.

    Every row whose ``text_col`` value is not null becomes one LangChain
    ``Document``: the text column is the page content and the requested
    metadata columns (those that actually exist in the CSV) are attached as
    string-valued metadata.

    Args:
        csv_path: Path of the CSV file to read with ``pandas.read_csv``.
        output_folder: Folder passed to ``vectorstore.save_local``.
        text_col: Name of the column holding the text to embed.
        metadata_cols: Iterable of column names to copy into each document's
            metadata. Names missing from the CSV are silently skipped.
        model_name: Model name passed to ``HuggingFaceEmbeddings``.

    Returns:
        None. The index is written to ``output_folder`` and a confirmation
        line is printed.

    Raises:
        KeyError: If ``text_col`` is not a column of the CSV.
        ValueError: If no row has a non-null ``text_col`` value.
    """
    df = pd.read_csv(csv_path)

    # Fail early with a readable message instead of an opaque pandas KeyError.
    if text_col not in df.columns:
        raise KeyError(
            f"text column {text_col!r} not found in {csv_path!r}; "
            f"available columns: {list(df.columns)}"
        )

    df = df.dropna(subset=[text_col]).reset_index(drop=True)

    # Check before the embedding model is loaded: an empty document list
    # cannot be indexed, and loading the model first would be wasted work.
    if df.empty:
        raise ValueError(
            f"no rows with a non-null {text_col!r} value in {csv_path!r}; nothing to index"
        )

    # Resolve the usable metadata columns once rather than once per row.
    present_cols = [col for col in metadata_cols if col in df.columns]

    # to_dict("records") keeps each column's own dtype; iterrows() would
    # coerce every row to a single common dtype and is much slower.
    docs = [
        Document(
            page_content=str(record[text_col]),
            metadata={col: str(record[col]) for col in present_cols},
        )
        for record in df.to_dict("records")
    ]

    # Embedding model used to vectorise every document.
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    # Build the FAISS store and persist it.
    vectorstore = FAISS.from_documents(docs, embedding=embeddings)
    vectorstore.save_local(output_folder)

    print(f"✅ FAISS 向量庫儲存成功：{output_folder}")

# Replace this path with the real CSV location before running the cell.
file_path = "EuropeTravel/【    出國事宜      】.csv"
build_faiss_from_dataframe(
    csv_path=file_path,
    output_folder="index_langchain_text2vec",
)


  embeddings = HuggingFaceEmbeddings(model_name=model_name)


✅ FAISS 向量庫儲存成功：index_langchain_text2vec


In [5]:
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.schema import Document
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from datetime import datetime
import pandas as pd

def load_index_and_setup_qa_local(index_path="index_langchain_text2vec", model_name="shibing624/text2vec-base-multilingual", llm_model="google/flan-t5-base", k=2, max_new_tokens=256, return_source_documents=False):
    """
    Load a saved FAISS index and wire it to a local seq2seq LLM as a RetrievalQA chain.

    Args:
        index_path: Folder holding the index written by ``save_local``.
        model_name: Embedding model name; it should be the same model that
            was used when the index was built.
        llm_model: Hugging Face model id loaded with
            ``AutoModelForSeq2SeqLM`` and used to generate answers.
        k: Number of most-similar passages the retriever returns per query.
        max_new_tokens: Generation length cap passed to the pipeline.
        return_source_documents: When True, the chain output also includes
            the retrieved documents (and therefore their metadata).

    Returns:
        The ``RetrievalQA`` chain.
    """
    # Vector search side.
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    # SECURITY: load_local unpickles data stored next to the index, which can
    # execute arbitrary code. Only load index folders you created yourself.
    vectorstore = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)
    retriever = vectorstore.as_retriever(search_kwargs={"k": k})

    # Local LLM side.
    # NOTE(review): the recorded run with the default model returned an empty
    # answer for a Chinese question; the default may not handle Chinese
    # input well — consider a multilingual seq2seq model. TODO confirm.
    tokenizer = AutoTokenizer.from_pretrained(llm_model)
    model = AutoModelForSeq2SeqLM.from_pretrained(llm_model)

    hf_pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_new_tokens=max_new_tokens)
    llm = HuggingFacePipeline(pipeline=hf_pipe)

    # Assemble the question-answering chain.
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=return_source_documents,
    )

    return qa_chain

def ask_and_log(qa_chain, question, log_path="qa_log.csv"):
    """
    Ask the chain a question, append the exchange to a CSV log, and return the response.

    Args:
        qa_chain: Object exposing ``invoke(question)``.
        question: The question text.
        log_path: CSV file the exchange is appended to; created if missing.

    Returns:
        Whatever ``qa_chain.invoke(question)`` returned, unchanged.
    """
    response = qa_chain.invoke(question)

    # The chain returns a dict such as {"query": ..., "result": ...}. Log only
    # the answer text; storing the whole dict in the answer column would write
    # its repr (including a copy of the question) into the CSV.
    if isinstance(response, dict):
        answer_text = response.get("result", response)
    else:
        answer_text = response

    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    new_row = pd.DataFrame([{"時間": now, "問題": question, "答案": answer_text}])

    try:
        df_log = pd.concat([pd.read_csv(log_path), new_row], ignore_index=True)
    except FileNotFoundError:
        # First entry: start the log from this row directly rather than
        # concatenating with an empty frame (deprecated in recent pandas).
        df_log = new_row

    df_log.to_csv(log_path, index=False)

    return response


In [6]:
# Build the QA chain from the saved index, ask one question, log it, and show the response.
qa = load_index_and_setup_qa_local()
ans = ask_and_log(
    qa_chain=qa,
    question="有哪些歐洲火車查詢網站？",
)
print(ans)

{'query': '有哪些歐洲火車查詢網站？', 'result': ''}


In [7]:
# TODO: include source metadata with each answer so the title / category / URL of the retrieved passages are shown as well.