In [1]:
from qdrant_client import QdrantClient, models
from langchain_qdrant import QdrantVectorStore, FastEmbedSparse, RetrievalMode
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter, HTMLSectionSplitter
from langchain.storage import LocalFileStore
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage._lc_store import create_kv_docstore
import pandas as pd
from tqdm.auto import tqdm
import json
from qdrant_workflow import run_qdrant_rag_workflow
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Model identifiers for the two encoders handed to the vector store:
# `embeddings` is passed as the dense embedding, `sparse_embeddings` as the sparse one.
model_handle = "jinaai/jina-embeddings-v2-base-zh"
sparse_model_handle = "Qdrant/BM25"

embeddings = FastEmbedEmbeddings(model_name=model_handle)
sparse_embeddings = FastEmbedSparse(model_name=sparse_model_handle)

In [3]:
# Connection settings for the Qdrant instance and the collection queried below.
url = "http://localhost:6333"
collection_name = "zhiwei_DAG"

# Vector store over the existing collection, combining the dense and sparse
# encoders in hybrid retrieval mode.
client = QdrantClient(url=url, prefer_grpc=True)
vectorstore = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=embeddings,
    sparse_embedding=sparse_embeddings,
    retrieval_mode=RetrievalMode.HYBRID,
)

# File-backed key-value store, wrapped as a document store for the retriever.
fs = LocalFileStore("./store_location")
doc_store = create_kv_docstore(fs)

INFO:httpx:HTTP Request: GET http://localhost:6333 "HTTP/1.1 200 OK"


In [4]:
# Splitter handed to the retriever as its child splitter (chunk size 400).
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# Retriever wiring: vector store + document store, 5 results per search,
# with "source_id" as the id key.
retriever_settings = {
    "vectorstore": vectorstore,
    "docstore": doc_store,
    "child_splitter": child_splitter,
    "search_kwargs": {"k": 5},
    "id_key": "source_id",
}
retriever = ParentDocumentRetriever(**retriever_settings)

In [5]:
# Ground-truth table for retrieval evaluation; each CSV row becomes one dict.
# The evaluation loop reads the 'document' and 'question' keys of each record.
gold_standard_dat = pd.read_csv("ground-truth-data.csv")
gold_standard = gold_standard_dat.to_dict("records")

In [6]:
# For every ground-truth question, record one boolean per retrieved document:
# True where the document's metadata 'id' equals the expected document id.
result_total = []
for gold in tqdm(gold_standard):
    expected_id = gold['document']
    retrieved = retriever.invoke(gold['question'])
    result_total.append([doc.metadata['id'] == expected_id for doc in retrieved])

100%|██████████| 1140/1140 [00:36<00:00, 31.48it/s]


In [7]:
def calc_hit_rate(result_total):
    """Return the fraction of queries with at least one hit among their results.

    Args:
        result_total: Sequence with one entry per query. Each entry is a
            sequence of booleans, one per retrieved document, where ``True``
            marks a retrieved document that matched the expected one.

    Returns:
        The hit rate as a float in ``[0, 1]``. Returns ``0.0`` when
        ``result_total`` is empty instead of dividing by zero.
    """
    n_queries = len(result_total)
    if n_queries == 0:
        # No queries evaluated: define the rate as 0 rather than raising.
        return 0.0

    hits = sum(1 for result in result_total if True in result)
    return hits / n_queries

In [8]:
def calc_mrr(result_total):
    """Return the mean reciprocal rank (MRR) over all queries.

    For each query, the score is ``1 / rank`` of the FIRST hit in its result
    list (ranks start at 1), or 0 when the list contains no hit. The scores
    are then averaged over all queries.

    Args:
        result_total: Sequence with one entry per query. Each entry is a
            sequence of booleans ordered by rank, where a truthy value marks
            a retrieved document that matched the expected one.

    Returns:
        The MRR as a float in ``[0, 1]``. Returns ``0.0`` when
        ``result_total`` is empty instead of dividing by zero.
    """
    n_queries = len(result_total)
    if n_queries == 0:
        return 0.0

    total_score = 0.0
    for result in result_total:
        for rank, is_hit in enumerate(result, start=1):
            if is_hit:
                total_score += 1 / rank
                # Only the first hit counts; summing later hits as well would
                # let a single query contribute more than 1.
                break

    return total_score / n_queries

In [9]:
# Print the hit rate first, then the MRR, for the retrieval run above.
for metric in (calc_hit_rate, calc_mrr):
    print(metric(result_total))

0.5771929824561404
0.4469590643274848


# Offline evaluation - LLM as a judge (QA)

In [None]:
# Run the RAG workflow once per evaluation question and save the outputs
# that the LLM judge reads later.
questions = pd.read_csv("ground_truth_forLLMjudge.csv")

# Saved column name -> key in the workflow's output.
saved_fields = {
    "question": 'question',
    "chart": 'user_chart',
    "queries": 'queries',
    "answer": 'answer',
}

result = {}
for idx, item in enumerate(questions.itertuples(index=False)):
    workflow_output = run_qdrant_rag_workflow(
        birth_date=item.date,
        gender=item.gender,
        birth_hour=item.time,
        top_n_queries=5,
        question=item.question,
    )
    result[idx] = {
        column: workflow_output[key] for column, key in saved_fields.items()
    }

df = pd.DataFrame.from_dict(result, orient='index')
df.to_csv("LLM_QA_generated_answers.csv", index=False)

In [None]:
import json
from openai import OpenAI

# OpenAI client used by judge_example below. No credentials are passed here;
# presumably they are picked up from the environment — confirm.
# NOTE(review): this rebinds `client`, which earlier in the notebook held the
# QdrantClient; a distinct name would avoid confusion when re-running cells.
client = OpenAI()

# System message for the LLM judge. The prompt asks for integer scores (0-1,
# 1 is best) on two dimensions, faithfulness and relevance, plus short Chinese
# explanations, returned as a JSON object only.
# NOTE(review): the prompt tells the judge to rely on the "retrieved passages"
# and the chart, but make_user_prompt only supplies question, chart and answer.
# NOTE(review): the saved outputs at the end of the notebook show mean scores
# above 4, which a 0-1 scale cannot produce — those outputs look stale; re-run.
SYSTEM_PROMPT = """
角色：你是离线评测用的紫微斗数 RAG 评审官。
任务：仅依据“检索文段”和“命盘资料”判断助手最终回答的质量。
评分维度(整数0-1,1为最佳):
1. faithfulness —— 回答是否忠于命盘，不臆造星曜、格局或结论。
2. relevance —— 回答是否紧扣用户问题，避免跑题或遗漏问点。
请在判断后给出简短中文说明，解释评分理由。
只输出 JSON 对象,包含键:faithfulness、relevance、explanation_faithfulness,explanation_relevance;其中 explanation 为不超过 120 字的中文说明。
""".strip()

def make_user_prompt(row):
    """Build the user message shown to the judge for one generated answer.

    Args:
        row: Record exposing ``question``, ``chart`` and ``answer`` attributes.

    Returns:
        A string with three titled sections (question, chart, final answer)
        separated by blank lines, with surrounding whitespace stripped.
    """
    sections = (
        ("用户问题：", row.question),
        ("命盘资料(JSON):", row.chart),
        ("助手最终回答：", row.answer),
    )
    rendered = "\n\n".join(f"{title}\n{body}" for title, body in sections)
    return rendered.strip()



In [142]:
def judge_example(row, model="gpt-4o-mini"):
    """Ask the LLM judge to score one generated answer and return its verdict.

    Args:
        row: Record exposing ``question``, ``chart`` and ``answer`` attributes;
            it is rendered into the user message by ``make_user_prompt``.
        model: Name of the chat-completions model used as the judge.

    Returns:
        The JSON object found in the judge's reply, decoded with ``json.loads``.

    Raises:
        ValueError: If the reply is empty or contains no ``{...}`` span.
        json.JSONDecodeError: If the extracted span is not valid JSON.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": make_user_prompt(row)},
    ]
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0,  # keep the judge as deterministic as the API allows
    )
    # The API types `content` as optional, so guard against None before
    # calling .strip() on it.
    content = (completion.choices[0].message.content or "").strip()

    # Grab the JSON object from the reply, tolerating text around it
    # (e.g. Markdown code fences).
    start = content.find("{")
    end = content.rfind("}")
    # `end < start` also covers a missing "}" (end == -1) and a "}" that
    # appears before the first "{".
    if start == -1 or end < start:
        raise ValueError(f"Judge did not return JSON: {content}")
    payload = json.loads(content[start:end + 1])
    return payload

In [162]:
# Judge every saved answer; `results` collects one verdict per CSV row, in order.
df = pd.read_csv('LLM_QA_generated_answers.csv')
results = []
results.extend(judge_example(answer_row) for answer_row in df.itertuples(index=False))

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"

In [176]:
# Collect the judge verdicts into a table and report the mean score per metric.
result_dat = pd.DataFrame(results)
print("faithfulness:", np.mean(result_dat['faithfulness']))
print("relevance:", np.mean(result_dat['relevance']))

failthfulness: 4.705882352941177
relevance: 4.970588235294118
