In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from package.databases.management.longterm import LongTermManagement, LongTerm
from package.databases.management.document import DocumentManagement, Document
from package.databases.models.document import DocumentStatus
from package.databases.management.term import TermManagement, Term
from package.databases.session import Depends, get_session

ltm = LongTermManagement()
dm = DocumentManagement()
tm = TermManagement()

In [3]:
from package.embedding.baai import BAAIEmbedding

embedder = BAAIEmbedding()

  from .autonotebook import tqdm as notebook_tqdm


🔍 Loading model from: BAAI/bge-m3


Fetching 30 files: 100%|██████████| 30/30 [00:00<?, ?it/s]


In [4]:
from package.cross_encoder.cross_encoder import ReRanker

reranker = ReRanker()

In [5]:
from package.llm.ollama import BedrockOllamaChat

model = BedrockOllamaChat()

In [6]:
from package.agents.term_detector import TermDetector

term_detector = TermDetector()

In [7]:
documents = dm.read_documents(session=Depends(get_session))
documents


[Document(id='050e6a5c-8b0f-4649-bd05-547cf962f701', status=<DocumentStatus.COMPLETED: 'completed'>, updated_at=datetime.datetime(2025, 7, 3, 13, 44, 16, 562913), source='RARE Retrieval-Aware Robustness Evaluation for Retrieval-Augmented Generation Systems.pdf', type='pdf', created_at=datetime.datetime(2025, 7, 2, 17, 48, 50, 72407)),
 Document(id='0ab7ef49-011d-4409-ac69-254d6b88eba1', status=<DocumentStatus.COMPLETED: 'completed'>, updated_at=datetime.datetime(2025, 7, 3, 13, 44, 16, 562913), source='ClueAnchor Clue-Anchored Knowledge Reasoning Exploration and Optimization for Retrieval-Augmented Generation.pdf', type='pdf', created_at=datetime.datetime(2025, 7, 2, 17, 40, 10, 746752)),
 Document(id='16514705-ab87-4115-883c-8ebf8ed58f9d', status=<DocumentStatus.COMPLETED: 'completed'>, updated_at=datetime.datetime(2025, 7, 3, 13, 44, 16, 562913), source='DiscoVLA Discrepancy Reduction in Vision, Language, and Alignment for Parameter-Efficient Video-Text Retrieval.pdf', type='pdf', cr

In [8]:
import json

with open("./dataset/trainset.json", 'r', encoding='utf-8') as f:
    trainset = json.load(f)

In [9]:
for ts in trainset:
    ts['metadata']['source'] = ts['metadata']['source'].replace(":", "")

In [10]:
from pydantic import BaseModel

class Response(BaseModel):
    question:str
    ground_truth:str
    predict:str | None
    source:str


In [11]:
from tqdm import tqdm
def bare_model(trainset):
    responses = []
    for ts in tqdm(trainset):
        source = ts['metadata']['source']
        question = ts['question']
        ground_truth = ts['answer']
        system_prompt="You are a helpful assistant."
        query = f"{question}"
        messages = [model.UserMessage(text=query)]
        predict = model.run(system_prompt=system_prompt, messages=messages)
        responses.append(
            Response(
                question=question,
                ground_truth=ground_truth,
                predict=predict,
                source=source
            )
        )
    return responses


In [12]:
bare_response = bare_model(trainset=trainset)

100%|██████████| 33/33 [01:37<00:00,  2.95s/it]


In [13]:
from rouge_score import rouge_scorer
metric = "rougeL"
scorer = rouge_scorer.RougeScorer(rouge_types=[metric], use_stemmer=True)

In [14]:
def get_score(method, responses):
    scores = []
    for response in responses:
        score = scorer.score(target=response.ground_truth, prediction=response.predict)[metric]
        scores.append(
            dict(
                method=method,
                question=response.question,
                ground_truth=response.ground_truth,
                predict=response.predict,
                precision=score.precision,
                recall=score.recall,
                fmeasure=score.fmeasure
            )
        )
    return scores

In [15]:
bare_scores = get_score("bare_model", bare_response)

In [16]:
def simple_rag(trainset):
    responses = []
    for ts in tqdm(trainset):
        source = ts['metadata']['source']
        question = ts['question']
        ground_truth = ts['answer']
        system_prompt="You are a helpful assistant. Read QUESTION carefully and give an answer based on the provided CONTEXT."
        document = dm.read_document_by_source(source, session=Depends(get_session))
        vector = embedder.run(sentences=[question])[0]
        longterms = ltm.read_similar_text_with_like_source(vector, embed_method="raw", session=Depends(get_session), sources=[document.source])
        contexts = "\n".join([l.raw for l in longterms])
        query = f"CONTEXT:\n\n{contexts}\n\nQUESTION:\n\n{question}\n\n"
        messages = [model.UserMessage(text=query)]
        predict = model.run(system_prompt=system_prompt, messages=messages)
        responses.append(
            Response(
                question=question,
                ground_truth=ground_truth,
                predict=predict,
                source=source
            )
        )
    return responses

In [17]:
simple_rag_response = simple_rag(trainset)

  0%|          | 0/33 [00:00<?, ?it/s]You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 33/33 [00:57<00:00,  1.76s/it]


In [18]:
simple_rag_scores = get_score(method="simple_rag_score", responses=simple_rag_response)

In [19]:
def rag_with_rerank(trainset):
    responses = []
    for ts in tqdm(trainset):
        source = ts['metadata']['source']
        question = ts['question']
        ground_truth = ts['answer']
        system_prompt="You are a helpful assistant. Read QUESTION carefully and give an answer based on the provided CONTEXT."
        document = dm.read_document_by_source(source, session=Depends(get_session))
        vector = embedder.run(sentences=[question])[0]
        longterms = ltm.read_similar_text_with_like_source(vector, embed_method="raw", session=Depends(get_session), sources=[document.source], limit=10)
        reranked_longterms, _ = reranker.run(search_query=question, longterms=longterms, embed_method="raw")
        contexts = "\n".join([l.raw for l in reranked_longterms])
        query = f"CONTEXT:\n\n{contexts}\n\nQUESTION:\n\n{question}\n\n"
        messages = [model.UserMessage(text=query)]
        predict = model.run(system_prompt=system_prompt, messages=messages)
        responses.append(
            Response(
                question=question,
                ground_truth=ground_truth,
                predict=predict,
                source=source
            )
        )
    return responses

In [20]:
rag_with_rerank_response = rag_with_rerank(trainset)

100%|██████████| 33/33 [00:58<00:00,  1.79s/it]


In [21]:
rag_with_rerank_scores = get_score(method="rag_with_rerank", responses=rag_with_rerank_response)

In [22]:
def simple_rag_term(trainset):
    responses = []
    for ts in tqdm(trainset):
        source = ts['metadata']['source']
        question = ts['question']
        potential_terms = term_detector.run(message=question)
        ground_truth = ts['answer']
        system_prompt="You are a helpful assistant. Read QUESTION carefully and give an answer based on the provided TERM."
        document = dm.read_document_by_source(source, session=Depends(get_session))
        terms = []
        for term in potential_terms:
            similar_terms = tm.read_similar_terms(term=term, session=Depends(get_session), document_ids=[document.id])
            terms.extend([st for st in similar_terms if st])
        evidences = "\n".join([t.evidence for t in terms])

        
        # vector = embedder.run(sentences=[question])[0]
    #     longterms = ltm.read_similar_text_with_like_source(vector, embed_method="raw", session=Depends(get_session), sources=[document.source])
    #     contexts = "\n".join([l.raw for l in longterms])
        query = f"TERM:\n\n{evidences}\n\nQUESTION:\n\n{question}\n\n"
        messages = [model.UserMessage(text=query)]
        predict = model.run(system_prompt=system_prompt, messages=messages)
        responses.append(
            Response(
                question=question,
                ground_truth=ground_truth,
                predict=predict,
                source=source
            )
        )
    return responses

In [23]:
simple_rag_term_response = simple_rag_term(trainset)

100%|██████████| 33/33 [01:20<00:00,  2.44s/it]


In [24]:
simple_rag_term_scores = get_score(method="simple_rag_term", responses=simple_rag_term_response)

In [25]:
def simple_rag_term_context(trainset):
    responses = []
    for ts in tqdm(trainset):
        source = ts['metadata']['source']
        question = ts['question']
        potential_terms = term_detector.run(message=question)
        ground_truth = ts['answer']
        system_prompt="You are a helpful assistant. Read QUESTION carefully and give an answer based on the provided TERM and CONTEXT."
        document = dm.read_document_by_source(source, session=Depends(get_session))
        terms = []
        for term in potential_terms:
            similar_terms = tm.read_similar_terms(term=term, session=Depends(get_session), document_ids=[document.id])
            terms.extend([st for st in similar_terms if st])
        evidences = "\n".join([t.evidence for t in terms])
        vector = embedder.run(sentences=[question])[0]
        longterms = ltm.read_similar_text_with_like_source(vector, embed_method="raw", session=Depends(get_session), sources=[document.source])
        contexts = "\n".join([l.raw for l in longterms])
        query = f"TERM:\n\n{evidences}\n\nCONTEXT:\n\n{contexts}\n\nQUESTION:\n\n{question}\n\n"
        messages = [model.UserMessage(text=query)]
        predict = model.run(system_prompt=system_prompt, messages=messages)
        responses.append(
            Response(
                question=question,
                ground_truth=ground_truth,
                predict=predict,
                source=source
            )
        )
    return responses

In [26]:
simple_rag_term_context_response = simple_rag_term_context(trainset)

100%|██████████| 33/33 [01:35<00:00,  2.90s/it]


In [27]:
simple_rag_term_context_scores = get_score(method="simple_rag_term_context", responses=simple_rag_term_context_response)

In [28]:
def simple_rag_term_context_with_rerank(trainset):
    responses = []
    for ts in tqdm(trainset):
        source = ts['metadata']['source']
        question = ts['question']
        potential_terms = term_detector.run(message=question)
        ground_truth = ts['answer']
        system_prompt="You are a helpful assistant. Read QUESTION carefully and give an answer based on the provided TERM and CONTEXT."
        document = dm.read_document_by_source(source, session=Depends(get_session))
        terms = []
        for term in potential_terms:
            similar_terms = tm.read_similar_terms(term=term, session=Depends(get_session), document_ids=[document.id])
            terms.extend([st for st in similar_terms if st])
        evidences = "\n".join([t.evidence for t in terms])
        vector = embedder.run(sentences=[question])[0]
        longterms = ltm.read_similar_text_with_like_source(vector, embed_method="raw", session=Depends(get_session), sources=[document.source])
        reranked_longterms, _ = reranker.run(search_query=question, longterms=longterms, embed_method="raw")
        contexts = "\n".join([l.raw for l in reranked_longterms])        
        query = f"TERM:\n\n{evidences}\n\nCONTEXT:\n\n{contexts}\n\nQUESTION:\n\n{question}\n\n"
        messages = [model.UserMessage(text=query)]
        predict = model.run(system_prompt=system_prompt, messages=messages)
        responses.append(
            Response(
                question=question,
                ground_truth=ground_truth,
                predict=predict,
                source=source
            )
        )
    return responses

In [29]:
simple_rag_term_context_with_rerank_response = simple_rag_term_context_with_rerank(trainset)

100%|██████████| 33/33 [01:39<00:00,  3.00s/it]


In [30]:
simple_rag_term_context_with_rerank_scores = get_score(method="simple_rag_term_context_with_rerank", responses=simple_rag_term_context_with_rerank_response)

In [41]:
import pandas as pd

datas = [
    bare_scores, 
    simple_rag_scores, 
    rag_with_rerank_scores,
    simple_rag_term_scores,
    simple_rag_term_context_scores,
    simple_rag_term_context_with_rerank_scores
]

experiments = pd.concat([pd.DataFrame(data) for data in datas])

benchmark = experiments.groupby('method').agg(
    precision=pd.NamedAgg(column='precision', aggfunc='mean'),
    recall=pd.NamedAgg(column='recall', aggfunc='mean'),
    fmeasure=pd.NamedAgg(column='fmeasure', aggfunc='mean'),
).sort_values("fmeasure", ascending=False)

benchmark

Unnamed: 0_level_0,precision,recall,fmeasure
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
simple_rag_term_context,0.40423,0.867416,0.53181
rag_with_rerank,0.39629,0.826829,0.51862
simple_rag_score,0.36811,0.800594,0.487404
simple_rag_term_context_with_rerank,0.362819,0.804194,0.481223
simple_rag_term,0.406814,0.60714,0.442018
bare_model,0.041691,0.194941,0.064541


In [35]:
experiments.to_csv("./dataset/experiments.csv", index=False)
benchmark.to_csv("./dataset/benchmark.csv", index=False)

In [72]:
idx = 0

In [73]:
check = trainset[idx]['question']
ground_truth = trainset[idx]['answer']
idx += 1
print("QUESTION:", check)
print("ANSWER:", ground_truth)
check_df = experiments.loc[experiments['question']==check,:].drop(["question", "ground_truth"], axis=1).sort_values("fmeasure", ascending=False)
for p in check_df['predict'].tolist():
    print(p)
    print("="*10)
check_df

QUESTION: What does DiscoVLA stand for?
ANSWER: Discrepancy Reduction in Vision, Language, and Alignment
DiscoVLA stands for Discrepancy Reduction in Vision, Language, and Alignment.
DiscoVLA stands for Discrepancy Reduction in Vision, Language, and Alignment for Parameter-Efficient Video-Text Retrieval.
DiscoVLA stands for "Disc repancy Reducti o n in V ision, L anguage, and A lignment".
DiscoVLA stands for "Disc repancy Reducti o n in V ision, L anguage, and A lignment".
I couldn't find any information on "DiscoVLA". However, I found that VLA stands for Very Large Array, which is a radio astronomy observatory in New Mexico, USA.
Unfortunately, the provided TERM does not explicitly state what DiscoVLA stands for. However, based on the context, it seems to be a type of algorithm or model, possibly related to machine learning or data analysis.

Given the mention of "α" (alpha) and "parameter insensitivity," it appears that DiscoVLA is a model that achieves optimal performance at a certa

Unnamed: 0,method,predict,precision,recall,fmeasure
0,simple_rag_term_context,DiscoVLA stands for Discrepancy Reduction in V...,0.7,1.0,0.823529
0,rag_with_rerank,DiscoVLA stands for Discrepancy Reduction in V...,0.4375,1.0,0.608696
0,simple_rag_term_context_with_rerank,"DiscoVLA stands for ""Disc repancy Reducti o n ...",0.125,0.285714,0.173913
0,simple_rag_score,"DiscoVLA stands for ""Disc repancy Reducti o n ...",0.125,0.285714,0.173913
0,bare_model,"I couldn't find any information on ""DiscoVLA""....",0.035714,0.142857,0.057143
0,simple_rag_term,"Unfortunately, the provided TERM does not expl...",0.011111,0.142857,0.020619
