In [26]:
# === Parameter settings ===
# Notebook-wide knobs; later cells read these names directly.
file_path = "nstc_jobs_full.csv"  # CSV handed to load_and_prepare_data
model_name = "all-MiniLM-L6-v2"  # embedding model name; also recorded in the result logs
chunk_size = 300  # chunk length passed to the chunking step and recorded in the logs
chunk_overlap = 30  # chunk overlap passed to the chunking step and recorded in the logs
encoding_batch_size = 32  # presumably the embedding batch size — TODO confirm where it is consumed
top_k = 5  # number of hits requested from rag_query
output_log_path = "rag_results_log.csv"  # CSV log path given to save_rag_log


In [27]:
import pandas as pd

def load_and_prepare_data(file_path):
    """Read the scraped job CSV and build one text plus one metadata dict per row.

    Rows missing any of the four required columns (title, publish date,
    link, detail body) are dropped before anything is generated.

    Args:
        file_path (str): Path of the CSV file to read.

    Returns:
        tuple[list[str], list[dict]]: ``texts`` holds the formatted posting
        text for each kept row; ``metadatas`` holds the matching
        ``{"職缺名稱": ..., "連結": ...}`` dict at the same position.

    Example:
        texts, metas = load_and_prepare_data('nstc_jobs_full.csv')
    """
    required_columns = ["職缺名稱", "發佈日期", "連結", "詳細內容"]
    complete_rows = pd.read_csv(file_path).dropna(subset=required_columns)

    texts = []
    metadatas = []
    # Single pass keeps each text and its metadata aligned by construction.
    for _, record in complete_rows.iterrows():
        texts.append(
            f"職缺名稱：{record['職缺名稱']}\n發佈日期：{record['發佈日期']}\n詳細內容：\n{record['詳細內容']}"
        )
        metadatas.append({"職缺名稱": record["職缺名稱"], "連結": record["連結"]})

    return texts, metadatas


In [28]:
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

def build_documents(texts, metadatas, chunk_size, chunk_overlap):
    """Wrap each text in a Document and split the lot into chunks.

    Args:
        texts: List of raw texts.
        metadatas: List of metadata dicts, paired with ``texts`` by position
            (pairing uses ``zip``, so extra items on the longer list are ignored).
        chunk_size: ``chunk_size`` forwarded to RecursiveCharacterTextSplitter.
        chunk_overlap: ``chunk_overlap`` forwarded to RecursiveCharacterTextSplitter.

    Returns:
        Whatever ``splitter.split_documents`` returns for the wrapped documents.

    Example:
        docs = build_documents(texts, metadatas, 300, 30)
    """
    source_docs = []
    for body, meta in zip(texts, metadatas):
        source_docs.append(Document(page_content=body, metadata=meta))

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(source_docs)


In [29]:
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

def hybrid_chunking(text, metadata, chunk_size=300, chunk_overlap=30):
    """Split on structural markers first, then length-split each segment.

    Args:
        text: A single posting's text.
        metadata: Metadata attached to every Document built from ``text``.
        chunk_size: ``chunk_size`` forwarded to RecursiveCharacterTextSplitter.
        chunk_overlap: ``chunk_overlap`` forwarded to RecursiveCharacterTextSplitter.

    Returns:
        list: All chunks produced by the splitter, in segment order.

    Example:
        chunks = hybrid_chunking(texts[0], metadatas[0])
    """
    # Step 1: cut before a 【heading】, at a line-initial "N. " list marker
    # (the marker itself is consumed by the split), or at a run of 2+ newlines.
    raw_segments = re.split(r"(?=【[^】]+】)|(?<=\n)\d+\.\s+|\n{2,}", text)

    # Step 2: pass every non-blank segment through the length-based splitter.
    length_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    collected = []

    for raw in raw_segments:
        cleaned = raw.strip()
        if not cleaned:
            continue
        segment_doc = Document(page_content=cleaned, metadata=metadata)
        collected.extend(length_splitter.split_documents([segment_doc]))

    return collected


In [30]:
from langchain.docstore.document import Document

def sentence_chunking(text, metadata):
    """Split text into one Document per sentence.

    A sentence ends at any of ``。 . ! ? ！ ？`` followed by optional
    whitespace. The full-width ``！`` and ``？`` are included because the
    corpus is Chinese; without them questions and exclamations were never
    separated from the following sentence.

    Args:
        text: Text to split.
        metadata: Metadata attached to every resulting Document.

    Returns:
        list: One Document per non-empty sentence, terminators removed.
    """
    import re

    # Note: a bare "." also splits decimals and URLs; callers should only
    # use this strategy when punctuation is clean.
    sentences = re.split(r'[。.!?！？]\s*', text)
    return [Document(page_content=s.strip(), metadata=metadata) for s in sentences if s.strip()]

def paragraph_chunking(text, metadata):
    """Split text into one Document per blank-line-separated paragraph.

    Args:
        text: Text to split on ``'\\n\\n'``.
        metadata: Metadata attached to every resulting Document.

    Returns:
        list: One Document per paragraph that is non-empty after stripping.
    """
    paragraph_docs = []
    for block in text.split('\n\n'):
        trimmed = block.strip()
        if trimmed:
            paragraph_docs.append(Document(page_content=trimmed, metadata=metadata))
    return paragraph_docs


In [31]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

def build_faiss_index(split_docs, model_name, batch_size=32):
    """Embed document chunks and build an L2 FAISS index over them.

    Args:
        split_docs: Iterable of Document chunks (each exposing ``page_content``).
        model_name: Name passed to ``SentenceTransformer``.
        batch_size: Batch size passed to ``model.encode``.

    Returns:
        tuple: ``(index, model, split_docs)`` — the populated
        ``faiss.IndexFlatL2``, the loaded embedding model, and the input
        chunks unchanged.

    Raises:
        ValueError: If ``split_docs`` contains no chunks. Checked before the
            model is loaded so the failure is immediate and explicit rather
            than an index error on the embedding shape.

    Example:
        index, model, docs = build_faiss_index(split_docs, 'all-MiniLM-L6-v2')
    """
    contents = [doc.page_content for doc in split_docs]
    if not contents:
        raise ValueError("split_docs is empty; nothing to embed or index")

    model = SentenceTransformer(model_name)
    embeddings = model.encode(contents, show_progress_bar=True, batch_size=batch_size)

    # FAISS expects a C-contiguous float32 matrix of shape (n, dim).
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)

    return index, model, split_docs
import os
from sentence_transformers import SentenceTransformer

def load_or_build_index(docs, model_name, save_path_prefix, batch_size=32):
    """Load a cached FAISS index if one exists and is coherent, else build it.

    The cache consists of ``{save_path_prefix}.index`` and
    ``{save_path_prefix}_docs.pkl``. When both exist they are loaded and
    checked: the number of vectors in the index must equal the number of
    pickled documents. A mismatched pair would make search results point at
    the wrong (or non-existent) documents, so it is discarded and rebuilt
    from ``docs``.

    Note that a coherent cache is returned as-is; it is NOT compared with
    ``docs`` or ``model_name``. Delete the cache files (or change the
    prefix) after changing the data, chunking strategy or model.

    Args:
        docs: Document chunks used only when the index has to be built.
        model_name: Name passed to ``SentenceTransformer``.
        save_path_prefix: Path prefix of the cache files.
        batch_size: Encoding batch size, used only when building.

    Returns:
        tuple: ``(index, embedding_model, documents)``.
    """
    index_file = f"{save_path_prefix}.index"
    docs_file = f"{save_path_prefix}_docs.pkl"

    if os.path.exists(index_file) and os.path.exists(docs_file):
        print(f"📦 載入已存在的 index：{save_path_prefix}")
        index, loaded_docs = load_faiss_index(save_path_prefix)
        if index.ntotal == len(loaded_docs):
            embedding_model = SentenceTransformer(model_name)
            return index, embedding_model, loaded_docs
        # Vector count and document count disagree: fall through and rebuild.
        print(
            f"⚠️ 快取不一致：index 有 {index.ntotal} 筆向量，"
            f"文件有 {len(loaded_docs)} 筆，將重新建立"
        )

    print(f"🛠️ 建立新 index 並儲存至：{save_path_prefix}")
    index, embedding_model, split_docs = build_faiss_index(docs, model_name, batch_size)
    save_faiss_index(index, split_docs, save_path_prefix)
    return index, embedding_model, split_docs


In [32]:
import pickle

def save_faiss_index(index, docs, save_path_prefix="faiss_index"):
    """Persist a FAISS index and its documents side by side.

    Args:
        index: FAISS index, written to ``{save_path_prefix}.index``.
        docs: Documents, pickled to ``{save_path_prefix}_docs.pkl``.
        save_path_prefix: Common path prefix of the two output files.
    """
    index_path = f"{save_path_prefix}.index"
    docs_path = f"{save_path_prefix}_docs.pkl"

    # The index itself goes through FAISS's own serializer.
    faiss.write_index(index, index_path)

    # The documents the index rows refer to are pickled alongside it.
    with open(docs_path, "wb") as handle:
        pickle.dump(docs, handle)

    print(f"✅ 已儲存 FAISS index 和文件到：{save_path_prefix}.index / _docs.pkl")
def load_faiss_index(save_path_prefix="faiss_index"):
    """Load a FAISS index and its pickled documents.

    Args:
        save_path_prefix: Common path prefix; reads ``{prefix}.index`` and
            ``{prefix}_docs.pkl``.

    Returns:
        tuple: ``(index, docs)``.
    """
    index_path = f"{save_path_prefix}.index"
    docs_path = f"{save_path_prefix}_docs.pkl"

    index = faiss.read_index(index_path)

    # SECURITY: pickle.load executes arbitrary code from the file; only load
    # cache files this notebook wrote itself.
    with open(docs_path, "rb") as handle:
        docs = pickle.load(handle)

    print(f"📂 成功載入 index 與 documents，筆數：{len(docs)}")
    return index, docs


In [33]:
def rag_query(query, top_k, embedding_model, index, indexed_docs):
    """Retrieve the chunks nearest to a query from the vector index.

    Args:
        query: Query string.
        top_k: Number of neighbours requested from the index.
        embedding_model: Model whose ``encode`` embeds the query.
        index: FAISS index to search.
        indexed_docs: Documents aligned with the index rows.

    Returns:
        list[dict]: One dict per valid hit with keys ``query``, ``rank``
        (1-based position in the raw search output), ``職缺名稱``, ``連結``
        and ``摘要`` (content with newlines flattened, cut to 200 chars
        plus ``...``). Invalid ids are skipped, so fewer than ``top_k``
        entries may be returned.

    Example:
        results = rag_query('關鍵字', 5, model, index, docs)
    """
    query_embedding = embedding_model.encode([query])
    _, neighbor_ids = index.search(np.array(query_embedding), top_k)

    results = []
    for position, raw_idx in enumerate(neighbor_ids[0]):
        idx = int(raw_idx)
        if idx < 0:
            # FAISS pads with -1 when it has fewer than top_k vectors; as a
            # Python index that would silently select the LAST document.
            continue
        if idx >= len(indexed_docs):
            # Index and document list are out of sync; do not crash mid-query.
            print(f"⚠️ 無效索引：{idx} 超出 indexed_docs 範圍（{len(indexed_docs)}）")
            continue

        doc = indexed_docs[idx]
        snippet = doc.page_content.strip().replace("\n", " ")
        if len(snippet) > 200:
            snippet = snippet[:200] + "..."

        results.append({
            "query": query,
            "rank": position + 1,
            "職缺名稱": doc.metadata["職缺名稱"],
            "連結": doc.metadata["連結"],
            "摘要": snippet
        })

    return results


In [34]:
import csv
import os
from datetime import datetime

def save_rag_log(results, model_name, chunk_size, chunk_overlap, save_path="rag_results_log.csv"):
    """Append query results to a CSV log, writing the header when needed.

    The header is written when the file does not exist yet OR exists but is
    empty (for example after being truncated), so the log never ends up as
    header-less rows.

    Args:
        results: Result dicts as produced by ``rag_query``.
        model_name: Model name recorded on every row.
        chunk_size: Chunk size setting recorded on every row.
        chunk_overlap: Chunk overlap setting recorded on every row.
        save_path: Path of the CSV file to append to.

    Raises:
        KeyError: If a result dict lacks one of the expected keys.

    Example:
        save_rag_log(results, model_name, chunk_size, chunk_overlap)
    """
    result_keys = ("query", "rank", "職缺名稱", "連結", "摘要")
    fieldnames = ["timestamp", *result_keys, "model_name", "chunk_size", "chunk_overlap"]

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    write_header = not os.path.exists(save_path) or os.path.getsize(save_path) == 0

    # utf-8-sig so spreadsheet tools detect the encoding of the Chinese text.
    with open(save_path, "a", newline='', encoding="utf-8-sig") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if write_header:
            writer.writeheader()
        for row in results:
            record = {"timestamp": timestamp}
            record.update({key: row[key] for key in result_keys})
            record.update({
                "model_name": model_name,
                "chunk_size": chunk_size,
                "chunk_overlap": chunk_overlap,
            })
            writer.writerow(record)


In [35]:
import pandas as pd
from datetime import datetime
import os

def save_rag_log_to_excel(results, model_name, chunk_size, chunk_overlap, chunking_strategy, save_path="rag_results_log.xlsx"):
    """Append query results to an Excel log, creating the file if needed.

    The existing workbook (if any) is read in full, the new rows are
    concatenated after it, and the combined table is written back.

    Args:
        results: Result dicts as produced by ``rag_query``.
        model_name: Model name recorded on every row.
        chunk_size: Chunk size setting recorded on every row.
        chunk_overlap: Chunk overlap setting recorded on every row.
        chunking_strategy: Label of the chunking strategy recorded on every row.
        save_path: Path of the Excel file.
    """
    run_stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # One record per hit, tagged with this run's settings.
    records = []
    for hit in results:
        records.append({
            "timestamp": run_stamp,
            "query": hit["query"],
            "rank": hit["rank"],
            "職缺名稱": hit["職缺名稱"],
            "連結": hit["連結"],
            "摘要": hit["摘要"],
            "model_name": model_name,
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
            "chunking_strategy": chunking_strategy,
        })
    new_df = pd.DataFrame(records)

    # Start from the new rows; prepend the previous log when one exists.
    combined_df = new_df
    if os.path.exists(save_path):
        previous_df = pd.read_excel(save_path)
        combined_df = pd.concat([previous_df, new_df], ignore_index=True)

    combined_df.to_excel(save_path, index=False)
    print(f"✅ 已寫入 {len(new_df)} 筆紀錄，累計：{len(combined_df)} 筆 ➜ {save_path}")


In [41]:
# 🚀 One-time system initialisation
texts, metadatas = load_and_prepare_data(file_path)

# Chunk EVERY posting. Chunking only texts[0] would make a fresh build index
# a single job posting.
docs = [
    chunk
    for text, metadata in zip(texts, metadatas)
    for chunk in hybrid_chunking(text, metadata, chunk_size, chunk_overlap)
]

# Use the configured model and batch size instead of repeating literals.
# NOTE(review): the cache prefix says "recursive" although the chunks come from
# hybrid_chunking, and an existing cache is loaded without being compared to
# `docs` — delete the cache files after changing data or chunking strategy.
index, embedding_model, indexed_docs = load_or_build_index(
    docs,
    model_name=model_name,
    save_path_prefix="faiss_recursive",
    batch_size=encoding_batch_size,
)



📦 載入已存在的 index：faiss_recursive
📂 成功載入 index 與 documents，筆數：3146


In [43]:
def rag_query(query, top_k, embedding_model, index, indexed_docs):
    """Retrieve the chunks nearest to a query, skipping invalid ids.

    This cell redefines (and therefore shadows) the ``rag_query`` declared
    in an earlier cell.

    Args:
        query: Query string.
        top_k: Number of neighbours requested from the index.
        embedding_model: Model whose ``encode`` embeds the query.
        index: FAISS index to search.
        indexed_docs: Documents aligned with the index rows.

    Returns:
        list[dict]: One dict per valid hit with keys ``query``, ``rank``
        (1-based position in the raw search output), ``職缺名稱``, ``連結``
        and ``摘要`` (content with newlines flattened, cut to 200 chars
        plus ``...``).
    """
    query_embedding = embedding_model.encode([query])
    D, I = index.search(np.array(query_embedding), top_k)

    results = []
    for i, idx in enumerate(I[0]):
        idx = int(idx)
        if idx < 0:
            # FAISS pads with -1 when fewer than top_k vectors exist; used as
            # a list index it would silently return the last document.
            continue
        if idx >= len(indexed_docs):
            print(f"⚠️ 無效索引：{idx} 超出 indexed_docs 範圍（{len(indexed_docs)}）")
            continue  # skip ids that do not map to a stored document

        doc = indexed_docs[idx]
        snippet = doc.page_content.strip().replace("\n", " ")
        if len(snippet) > 200:
            snippet = snippet[:200] + "..."

        results.append({
            "query": query,
            "rank": i + 1,
            "職缺名稱": doc.metadata["職缺名稱"],
            "連結": doc.metadata["連結"],
            "摘要": snippet
        })

    return results

In [46]:
query = "生物相關的職缺有哪些"
# Bind the hits to `results`: the logging cell below reads that name, and
# without this assignment it fails on a fresh kernel.
results = rag_query(query, top_k, embedding_model, index, indexed_docs)
results

[{'query': '生物相關的職缺有哪些',
  'rank': 1,
  '職缺名稱': '中國醫藥大學生物醫學研究所誠徵博士後研究員',
  '連結': 'https://www.nstc.gov.tw/folksonomy/detail/cc8706e2-836d-4f85-94d2-89396360a823?l=ch',
  '摘要': '生物醫學相關'},
 {'query': '生物相關的職缺有哪些',
  'rank': 2,
  '職缺名稱': '馬偕紀念醫院血液腫瘤科蘇迺文醫師誠徵國科會補助計畫專任助理',
  '連結': 'https://www.nstc.gov.tw/folksonomy/detail/a78b7d93-b5b4-4bb9-bff5-5888b2d695e6?l=ch',
  '摘要': '生命科學相關系所'},
 {'query': '生物相關的職缺有哪些',
  'rank': 3,
  '職缺名稱': '高雄榮總教研部生殖暨粒線體醫學研究室---誠徵博士後研究員',
  '連結': 'https://www.nstc.gov.tw/folksonomy/detail/47729f59-955a-4b43-addd-5a18d1affa86?l=ch',
  '摘要': '生物醫學相關領域畢業。'},
 {'query': '生物相關的職缺有哪些',
  'rank': 4,
  '職缺名稱': '台大醫院耳鼻喉部楊宗霖教授徵博士後研究員',
  '連結': 'https://www.nstc.gov.tw/folksonomy/detail/e2f4f22d-3604-4ce0-854e-94b9a0ce8c10?l=ch',
  '摘要': '細胞生物相關技術'},
 {'query': '生物相關的職缺有哪些',
  'rank': 5,
  '職缺名稱': '國家衛生研究院癌症研究所 誠徵院內博士後研究員或研究助理一名',
  '連結': 'https://www.nstc.gov.tw/folksonomy/detail/95244417-fdb2-451f-812c-315ae9e234c4?l=ch',
  '摘要': '具有生化、細胞、分生背景及細胞培養等相關研究經驗。'}]

In [48]:
# Record this query's hits in the Excel log, tagged with the chunking strategy.
save_rag_log_to_excel(
    results,
    model_name,
    chunk_size,
    chunk_overlap,
    chunking_strategy="hybrid_chunking",
    save_path="rag_results_log.xlsx",
)

# Record the same hits in the CSV log.
save_rag_log(results, model_name, chunk_size, chunk_overlap, save_path=output_log_path)

✅ 已寫入 5 筆紀錄，累計：15 筆 ➜ rag_results_log.xlsx
