In [1]:
# === 參數設定 ===
file_path = "nstc_jobs_full.csv"
model_name = "all-MiniLM-L6-v2"
chunk_size = 300
chunk_overlap = 30
encoding_batch_size = 32
top_k = 5
output_log_path = "rag_results_log.csv"


In [2]:
import pandas as pd

def load_and_prepare_data(file_path):
    df = pd.read_csv(file_path)
    df = df.dropna(subset=["職缺名稱", "發佈日期", "連結", "詳細內容"])

    texts = [
        f"職缺名稱：{row['職缺名稱']}\n發佈日期：{row['發佈日期']}\n詳細內容：\n{row['詳細內容']}"
        for _, row in df.iterrows()
    ]

    metadatas = [
        {"職缺名稱": row["職缺名稱"], "連結": row["連結"]}
        for _, row in df.iterrows()
    ]

    return texts, metadatas


In [3]:
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

def build_documents(texts, metadatas, chunk_size, chunk_overlap):
    docs = [Document(page_content=text, metadata=meta) for text, meta in zip(texts, metadatas)]
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(docs)


In [10]:
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

def hybrid_chunking(text, metadata, chunk_size=300, chunk_overlap=30):
    # Step 1: 先依據 【標題】或雙換行或條列符號切段
    segments = re.split(r"(?=【[^】]+】)|(?<=\n)\d+\.\s+|\n{2,}", text)

    # Step 2: 對每段做長度判斷，如太長則進一步切割
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    all_chunks = []

    for seg in segments:
        if len(seg.strip()) == 0:
            continue
        doc = Document(page_content=seg.strip(), metadata=metadata)
        sub_chunks = splitter.split_documents([doc])
        all_chunks.extend(sub_chunks)

    return all_chunks


In [None]:
from langchain.docstore.document import Document

def sentence_chunking(text, metadata):
    "Split text into sentences when punctuation is clear."
    import re
    sentences = re.split(r'[。.!?]\s*', text)
    return [Document(page_content=s.strip(), metadata=metadata) for s in sentences if s.strip()]

def paragraph_chunking(text, metadata):
    "Split text into paragraphs separated by blank lines."
    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
    return [Document(page_content=p, metadata=metadata) for p in paragraphs]


In [4]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

def build_faiss_index(split_docs, model_name, batch_size=32):
    model = SentenceTransformer(model_name)
    contents = [doc.page_content for doc in split_docs]
    embeddings = model.encode(contents, show_progress_bar=True, batch_size=batch_size)

    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(np.array(embeddings))

    return index, model, split_docs


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def rag_query(query, top_k, embedding_model, index, indexed_docs):
    query_embedding = embedding_model.encode([query])
    D, I = index.search(np.array(query_embedding), top_k)

    results = []
    for i, idx in enumerate(I[0]):
        doc = indexed_docs[idx]
        snippet = doc.page_content.strip().replace("\n", " ")
        if len(snippet) > 200:
            snippet = snippet[:200] + "..."

        results.append({
            "query": query,
            "rank": i + 1,
            "職缺名稱": doc.metadata["職缺名稱"],
            "連結": doc.metadata["連結"],
            "摘要": snippet
        })

    return results


In [6]:
import csv
import os
from datetime import datetime

def save_rag_log(results, model_name, chunk_size, chunk_overlap, save_path="rag_results_log.csv"):
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    write_header = not os.path.exists(save_path)

    with open(save_path, "a", newline='', encoding="utf-8-sig") as f:  # 👈 修正編碼為 utf-8-sig
        writer = csv.DictWriter(
            f,
            fieldnames=["timestamp", "query", "rank", "職缺名稱", "連結", "摘要", "model_name", "chunk_size", "chunk_overlap"]
        )
        if write_header:
            writer.writeheader()
        for row in results:
            writer.writerow({
                "timestamp": timestamp,
                "query": row["query"],
                "rank": row["rank"],
                "職缺名稱": row["職缺名稱"],
                "連結": row["連結"],
                "摘要": row["摘要"],
                "model_name": model_name,
                "chunk_size": chunk_size,
                "chunk_overlap": chunk_overlap
            })


In [7]:
import pandas as pd

def save_rag_log_to_excel(results, model_name, chunk_size, chunk_overlap, save_path="rag_results_log.xlsx"):
    from datetime import datetime
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    df = pd.DataFrame([{
        "timestamp": timestamp,
        "query": row["query"],
        "rank": row["rank"],
        "職缺名稱": row["職缺名稱"],
        "連結": row["連結"],
        "摘要": row["摘要"],
        "model_name": model_name,
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap
    } for row in results])

    df.to_excel(save_path, index=False)  # ❗ Excel 不用擔心編碼問題


In [8]:
# 🚀 一次性初始化系統
texts, metadatas = load_and_prepare_data(file_path)
split_docs = build_documents(texts, metadatas, chunk_size, chunk_overlap)
index, embedding_model, indexed_docs = build_faiss_index(split_docs, model_name)



Batches: 100%|██████████| 48/48 [00:54<00:00,  1.14s/it]


In [9]:
# 🎯 查詢並儲存紀錄
query = "材料相關的職缺有哪些？"
results = rag_query(query, top_k, embedding_model, index, indexed_docs)
save_rag_log(results, model_name, chunk_size, chunk_overlap, save_path=output_log_path)
save_rag_log_to_excel(results, model_name, chunk_size, chunk_overlap, save_path="rag_results_log.xlsx")
# 👀 顯示查詢結果
for r in results:
    print(f"🔎 {r['rank']}. {r['職缺名稱']}")
    print(f"🔗 {r['連結']}")
    print(f"📝 {r['摘要']}\n")


🔎 1. 國立中山大學新海研3號貴重儀器使用中心誠徵專任技術員1名
🔗 https://www.nstc.gov.tw/folksonomy/detail/ddc2e921-92c5-4004-8c2f-be2373c53f52?l=ch
📝 相關應徵資料予以保密，合者約談，不合者恕不另行通知。 發佈日期：2025-07-04 00:00:00

🔎 2. [徵才] 國立臺灣大學防災減害與韌性學程 (綠‧韌性研究室) 徵求都市規劃/景觀/地理資訊專長 [專任計畫助理]
🔗 https://www.nstc.gov.tw/folksonomy/detail/2793c7ef-b68d-4f00-9388-e011b78b9553?l=ch
📝 3.其他有利申請之相關文件 發佈日期：2025-07-21 00:00:00

🔎 3. 中國醫藥大學 癌症生物精準醫學研究中心  王紹椿老師實驗室 誠徵 博士後研究員
🔗 https://www.nstc.gov.tw/folksonomy/detail/701ca4f1-a9f5-4a61-9b66-c4cf60f5c093?l=ch
📝 歡迎對癌症研究有興趣的夥伴加入我們的團隊！ 發佈日期：2025-07-14 00:00:00

🔎 4. 中國醫藥大學 癌症生物精準醫學研究中心  王紹椿老師實驗室 誠徵 碩士級研究助理
🔗 https://www.nstc.gov.tw/folksonomy/detail/2521ae27-55c0-4f27-9ded-b4bc908c1aff?l=ch
📝 歡迎對癌症研究有興趣的夥伴加入我們的團隊！ 發佈日期：2025-07-14 00:00:00

🔎 5. 國立臺東大學通識教育中心徵聘專任助理教授以上教師徵才公告，收件至114年8月15日止。
🔗 https://www.nstc.gov.tw/folksonomy/detail/e407fdbc-62c9-4e09-b08a-35a897cc4186?l=ch
📝 其    它： 相關訊息，請至本校首頁徵人啟事https://psn.nttu.edu.tw/p/406-1047-165359,r595.php?Lang=zh-tw查詢下載。 聯絡人姓名: 李家婕小姐 聯絡人電話: 089-517492 電子信箱：evalee@nt

In [11]:
docs_recursive = build_documents(texts, metadatas, chunk_size, chunk_overlap)  # 原法
docs_hybrid = build_documents_hybrid(texts, metadatas, chunk_size, chunk_overlap)  # 改用 hybrid chunking


NameError: name 'build_documents_hybrid' is not defined