<h1>Cohere 版本</h1>

In [None]:
import os
import pandas as pd
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.schema import Document
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import SQLChatMessageHistory
from langchain_cohere import CohereRerank
from langchain_core.runnables import RunnableMap, RunnablePassthrough
# ✅ API Key 設定
os.environ["OPENAI_API_KEY"] = "api key"
os.environ["COHERE_API_KEY"] = "api key"

# ✅ 載入資料與建構 Chroma 向量庫
persist_dir = "./chroma_db"
text_units_df = pd.read_json("高商合金0406_text_units.json", lines=True)
embeddings_df = pd.read_json("高商合金0406_embeddings.text_unit.text.json", lines=True)
data_df = pd.merge(text_units_df, embeddings_df, on="id")

documents = [
    Document(page_content=row["text"], metadata={"id": row["id"]})
    for _, row in data_df.iterrows()
]

embedding_model = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    persist_directory=persist_dir
)
retriever = vectorstore.as_retriever(search_kwargs={"k": 100})

# ✅ 提示模板
prompt = ChatPromptTemplate.from_messages([
    ("system", 
     """You are a professional assistant specializing in materials science and engineering.
        Your main role is to help users search, analyze, and summarize materials-related literature and data.
        Always provide factual, source-backed responses and clearly state when information is uncertain or not available.
        When applicable, extract key material properties (e.g., conductivity, thermal stability, mechanical strength) and link them to relevant studies or datasets.
        Avoid making assumptions not supported by data.\n\n{context}"""),
    MessagesPlaceholder(variable_name="history"),
    ("human", "{input}")
])

# ✅ 設定 LLM
llm = ChatOpenAI(model_name="gpt-4o", temperature=0, max_tokens=8192)

# ✅ 建立文件問答鏈
document_chain = create_stuff_documents_chain(llm, prompt)

# ✅ 建立 Cohere reranker
reranker = CohereRerank(top_n=5, model="rerank-english-v2.0")

# ✅ 自定義 retriever 包含 rerank
class RerankRetriever:
    def __init__(self, base_retriever, reranker):
        self._base_retriever = base_retriever
        self._reranker = reranker

    def get_relevant_documents(self, query: str):
        # 取得候選文件（這裡得到的是一個 Document 物件列表）
        docs = self._base_retriever.get_relevant_documents(query)
        # 提取文本內容，因為 Cohere Rerank API 需要一個文字列表
        doc_texts = [doc.page_content for doc in docs]
        
        # 使用命名參數，傳入 query 及純文字文件列表
        reranked_texts = self._reranker.rerank(query=query, documents=doc_texts)
        
        # 由於 reranker API 返回的是重排序後的文件文字列表
        # 你可以依據返回的結果順序，在原始的候選文件中挑選出對應的 Document
        # 這裡假設返回的文字與原本的 text 完全一致，所以用一個簡單比對來還原文件物件
        reranked_docs = []
        for text in reranked_texts:
            for doc in docs:
                if doc.page_content == text:
                    reranked_docs.append(doc)
                    break
        return reranked_docs


# ✅ 包裝 retriever
rerank_retriever = RerankRetriever(retriever, reranker)

# ✅ 建立 Retrieval Chain with rerank
# retrieval_chain_with_rerank = create_retrieval_chain(rerank_retriever, document_chain)
# 手動組合 retrieval + document_chain

retrieval_chain_with_rerank = (
    RunnableMap({
        "context": lambda x: rerank_retriever.get_relevant_documents(x["input"]),
        "input": lambda x: x["input"],
        "history": lambda x: x.get("history", [])
    }) 
    | document_chain
)

# ✅ 定義 SQL 記憶儲存
def get_session_history(session_id):
    return SQLChatMessageHistory(session_id, "sqlite:///./langchain.db")

# ✅ 包裝成具記憶功能的多輪對話鏈（使用 rerank 版本）
chain_with_history = RunnableWithMessageHistory(
    retrieval_chain_with_rerank,
    get_session_history,
    input_messages_key="input",
    output_messages_key="answer",
    history_messages_key="history",
)


query_en = "Which element combinations are commonly used in high-entropy alloy catalysts?"
docs_test = rerank_retriever.get_relevant_documents(query_en)
print(f"➡️ 英文查詢 rerank 結果數量: {len(docs_test)}")



➡️ 英文查詢 rerank 結果數量: 0


<h1>Hugging Face 版本</h1>

In [None]:
import os
import pandas as pd
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.schema import Document
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import SQLChatMessageHistory
from langchain_core.runnables import RunnableMap

# ✅ 安裝需先手動執行：pip install sentence-transformers
from sentence_transformers import CrossEncoder

# ✅ HuggingFace Reranker 類別（取代 Cohere）
class HF_Reranker:
    def __init__(self, model_name="cross-encoder/ms-marco-MiniLM-L-6-v2", top_n=5):
        self.model = CrossEncoder(model_name)
        self.top_n = top_n

    def rerank(self, query, documents):
        pairs = [[query, doc] for doc in documents]
        scores = self.model.predict(pairs)
        scored_docs = list(zip(documents, scores))
        scored_docs.sort(key=lambda x: x[1], reverse=True)
        top_docs = [doc for doc, _ in scored_docs[:self.top_n]]
        return top_docs

# ✅ API Key 設定（HF 不需）
os.environ["OPENAI_API_KEY"] = "api key"

# ✅ 載入資料與建構 Chroma 向量庫
persist_dir = "./chroma_db"
text_units_df = pd.read_json("高商合金0406_text_units.json", lines=True)
embeddings_df = pd.read_json("高商合金0406_embeddings.text_unit.text.json", lines=True)
data_df = pd.merge(text_units_df, embeddings_df, on="id")

documents = [
    Document(page_content=row["text"], metadata={"id": row["id"]})
    for _, row in data_df.iterrows()
]

embedding_model = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    persist_directory=persist_dir
)
retriever = vectorstore.as_retriever(search_kwargs={"k": 100})

# ✅ 提示模板
prompt = ChatPromptTemplate.from_messages([
    ("system", 
     """You are a professional assistant specializing in materials science and engineering.
        Your main role is to help users search, analyze, and summarize materials-related literature and data.
        Always provide factual, source-backed responses and clearly state when information is uncertain or not available.
        When applicable, extract key material properties (e.g., conductivity, thermal stability, mechanical strength) and link them to relevant studies or datasets.
        Avoid making assumptions not supported by data.\n\n{context}"""),
    MessagesPlaceholder(variable_name="history"),
    ("human", "{input}")
])

# ✅ 設定 LLM
llm = ChatOpenAI(model_name="gpt-4o", temperature=0, max_tokens=8192)

# ✅ 建立文件問答鏈
document_chain = create_stuff_documents_chain(llm, prompt)

# ✅ 使用 HF reranker（取代 Cohere）
reranker = HF_Reranker(top_n=5)

# ✅ 自定義 retriever 包含 rerank
class RerankRetriever:
    def __init__(self, base_retriever, reranker):
        self._base_retriever = base_retriever
        self._reranker = reranker

    def get_relevant_documents(self, query: str):
        docs = self._base_retriever.get_relevant_documents(query)
        doc_texts = [doc.page_content for doc in docs]
        reranked_texts = self._reranker.rerank(query=query, documents=doc_texts)
        reranked_docs = []
        for text in reranked_texts:
            for doc in docs:
                if doc.page_content == text:
                    reranked_docs.append(doc)
                    break
        return reranked_docs

# ✅ 包裝 retriever
rerank_retriever = RerankRetriever(retriever, reranker)

# ✅ 建立 Retrieval Chain with rerank
retrieval_chain_with_rerank = (
    RunnableMap({
        "context": lambda x: rerank_retriever.get_relevant_documents(x["input"]),
        "input": lambda x: x["input"],
        "history": lambda x: x.get("history", [])
    }) 
    | document_chain
)

# ✅ 定義 SQL 記憶儲存
def get_session_history(session_id):
    return SQLChatMessageHistory(session_id, "sqlite:///./langchain.db")

# ✅ 包裝多輪對話鏈
chain_with_history = RunnableWithMessageHistory(
    retrieval_chain_with_rerank,
    get_session_history,
    input_messages_key="input",
    output_messages_key="answer",
    history_messages_key="history",
)

# ✅ 測試 HuggingFace rerank 結果
query_en = "Which element combinations are commonly used in high-entropy alloy catalysts?"
docs_test = rerank_retriever.get_relevant_documents(query_en)
print(f"➡️ 英文查詢 rerank 結果數量: {len(docs_test)}")
for i, doc in enumerate(docs_test, 1):
    print(f"\n🔹 Top {i}: {doc.page_content[:300]}")


  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


➡️ 英文查詢 rerank 結果數量: 5

🔹 Top 1: odes. 
They exhibit a high hydrogen evolution potential, which limits the 
hydrogen evolution reaction [16]. In the aluminium industry, primary 
aluminium typically contains approximately 0.1 wt% Fe after the 
smelting process. In response to demands for sustainable development, a 
significant porti

🔹 Top 2: odes. 
They exhibit a high hydrogen evolution potential, which limits the 
hydrogen evolution reaction [16]. In the aluminium industry, primary 
aluminium typically contains approximately 0.1 wt% Fe after the 
smelting process. In response to demands for sustainable development, a 
significant porti

🔹 Top 3: odes. 
They exhibit a high hydrogen evolution potential, which limits the 
hydrogen evolution reaction [16]. In the aluminium industry, primary 
aluminium typically contains approximately 0.1 wt% Fe after the 
smelting process. In response to demands for sustainable development, a 
significant porti

🔹 Top 4: odes. 
They exhibit a high hydroge

In [22]:
docs_raw = retriever.get_relevant_documents(query_en)
print(f"🧪 原始檢索結果數量: {len(docs_raw)}")
for i, doc in enumerate(docs_raw[:3], 1):
    print(f"\n🔹 Doc {i} 預覽：{doc.page_content[:300]}")

docs_test = rerank_retriever.get_relevant_documents(query_en)
print(f"➡️ rerank 結果數量: {len(docs_test)}")
for i, doc in enumerate(docs_test, 1):
    print(f"\n🔹 Top {i}: {doc.page_content[:300]}")

🧪 原始檢索結果數量: 100

🔹 Doc 1 預覽：odes. 
They exhibit a high hydrogen evolution potential, which limits the 
hydrogen evolution reaction [16]. In the aluminium industry, primary 
aluminium typically contains approximately 0.1 wt% Fe after the 
smelting process. In response to demands for sustainable development, a 
significant porti

🔹 Doc 2 預覽：odes. 
They exhibit a high hydrogen evolution potential, which limits the 
hydrogen evolution reaction [16]. In the aluminium industry, primary 
aluminium typically contains approximately 0.1 wt% Fe after the 
smelting process. In response to demands for sustainable development, a 
significant porti

🔹 Doc 3 預覽：odes. 
They exhibit a high hydrogen evolution potential, which limits the 
hydrogen evolution reaction [16]. In the aluminium industry, primary 
aluminium typically contains approximately 0.1 wt% Fe after the 
smelting process. In response to demands for sustainable development, a 
significant porti
➡️ rerank 結果數量: 5

🔹 Top 1: odes. 
They exhib

In [None]:
# 🔍 查看這個 query 被 rerank 選中的前五篇文件
query_zh = "請問在目前的高熵合金應用中，哪些元素組合常見於催化劑？"
docs_reranked = rerank_retriever.get_relevant_documents(query_zh)

print("\n📊 被選中的 Top 5 文檔如下：")
for i, doc in enumerate(docs_reranked, 1):
    print(f"\n🔹 Top {i}:")
    print(doc.page_content[:500])  # 顯示前 500 字
    print(f"📎 ID: {doc.metadata.get('id')}")


In [24]:
# ✅ 多輪查詢：使用相同 session_id 進行對話
config = {"configurable": {"session_id": "chad-session-0408"}}

# 第一輪提問
response1 = chain_with_history.invoke(
    {"input": "請問在目前的高熵合金應用中，哪些元素組合常見於催化劑？"},
    config=config
)
print(response1)

# 第二輪提問（延續上一題）
response2 = chain_with_history.invoke(
    {"input": "哪一組的表現比較好？"},
    config=config
)
print(response2)


Error in RootListenersTracer.on_chain_end callback: KeyError('answer')


在高熵合金（High-Entropy Alloys, HEAs）作為催化劑的應用中，以下元素組合較為常見：

1. **Ni-Co-Fe-Cr-Mn**：這組合在氧還原反應（ORR）和氫演化反應（HER）中被廣泛研究，因為這些元素提供了良好的催化活性和穩定性。

2. **Pt-Pd-Rh-Ru-Ir**：這些貴金屬組合在催化劑中非常常見，特別是在汽車尾氣處理和燃料電池中，因為它們具有優異的催化性能。

3. **Cu-Ni-Co-Cr-Fe**：這組合在甲烷重整和其他碳氫化合物轉化反應中表現出色，因為它們能有效地促進反應並提高選擇性。

4. **V-Nb-Ta-Mo-W**：這些過渡金屬組合在氨合成和其他高溫催化反應中被研究，因為它們具有高熔點和良好的化學穩定性。

這些組合的選擇通常基於它們的電子結構、原子半徑、熔點和化學穩定性等因素，以優化催化性能和耐久性。具體的應用和性能還需根據實驗數據和理論計算進行進一步驗證。


Error in RootListenersTracer.on_chain_end callback: KeyError('answer')


要確定哪一組高熵合金在催化應用中表現較好，通常需要具體的實驗數據和應用背景。不同的合金組合在不同的催化反應中可能表現出不同的優勢。以下是一些考量因素：

1. **氧還原反應（ORR）和氫演化反應（HER）**：Ni-Co-Fe-Cr-Mn組合在這些反應中通常表現良好，因為這些元素能提供良好的催化活性和穩定性。

2. **貴金屬催化劑**：Pt-Pd-Rh-Ru-Ir組合在許多催化應用中表現出色，特別是在汽車尾氣處理和燃料電池中，因為它們具有優異的催化性能。然而，這些合金的高成本可能限制其大規模應用。

3. **高溫反應**：V-Nb-Ta-Mo-W組合在高溫催化反應中可能表現較好，因為它們具有高熔點和良好的化學穩定性。

4. **碳氫化合物轉化**：Cu-Ni-Co-Cr-Fe組合在甲烷重整和其他碳氫化合物轉化反應中表現出色，因為它們能有效地促進反應並提高選擇性。

最終，哪一組合表現較好需要根據具體的應用需求、操作條件和經濟考量來決定。實驗研究和理論計算是評估這些合金性能的關鍵。若有具體的應用或反應需求，建議參考相關的科學文獻和研究報告以獲取更準確的信息。
