In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from package.databases.management.longterm import LongTermManagement, LongTerm
from package.databases.management.document import DocumentManagement, Document
from package.databases.models.document import DocumentStatus
from package.databases.management.term import TermManagement, Term
from package.databases.session import Depends, get_session

# Management objects shared by the retrieval functions in this notebook:
# ltm -> similar long-term text lookup, dm -> document lookup by source, tm -> similar-term lookup.
ltm = LongTermManagement()
dm = DocumentManagement()
tm = TermManagement()

In [3]:
from package.embedding.baai import BAAIEmbedding

# Embedding model instance; the retrieval functions call embedder.run(sentences=[question]) and take element 0.
embedder = BAAIEmbedding()

  from .autonotebook import tqdm as notebook_tqdm


🔍 Loading model from: BAAI/bge-m3


Fetching 30 files: 100%|██████████| 30/30 [00:00<?, ?it/s]


In [4]:
from package.cross_encoder.cross_encoder import ReRanker

# Cross-encoder instance; the *_rerank functions pass the retrieved long-term entries through reranker.run(...).
reranker = ReRanker()

In [5]:
from package.llm.ollama import BedrockOllamaChat

# Chat model used by every experiment via model.run(system_prompt=..., messages=[model.UserMessage(...)]).
model = BedrockOllamaChat()

In [6]:
from package.agents.term_detector import TermDetector

# Term detector; the term-based experiments call term_detector.run(message=question) to get candidate terms.
term_detector = TermDetector()

In [7]:
# documents = dm.read_documents(session=Depends(get_session))
# documents


In [8]:
import json

# Read the evaluation samples from the JSON dataset file (decoded as UTF-8).
with open("./dataset/trainset.json", mode="r", encoding="utf-8") as f:
    trainset = json.loads(f.read())

In [9]:
# Remove every ":" from each sample's source, in place.
# NOTE(review): presumably this matches how document sources are stored — confirm.
for ts in trainset:
    ts['metadata'].update(source=ts['metadata']['source'].replace(":", ""))

In [10]:
from pydantic import BaseModel

class Response(BaseModel):
    """One evaluated sample: the question, its reference answer and the model's prediction."""
    # Question text taken from the trainset sample.
    question:str
    # Reference answer the prediction is later scored against.
    ground_truth:str
    # Model output for the question; the annotation also admits None.
    predict:str | None
    # The sample's metadata 'source' value.
    source:str


In [11]:
from tqdm import tqdm
def bare_model(trainset):
    """Baseline run: ask the chat model each question without any retrieved context.

    Args:
        trainset: iterable of samples, each providing ``['metadata']['source']``,
            ``['question']`` and ``['answer']``.

    Returns:
        list[Response]: one entry per sample, in iteration order.
    """
    def answer(sample):
        # Read every field first, in the same order as before, so a malformed
        # sample fails before the model is called.
        source, question, ground_truth = (
            sample['metadata']['source'],
            sample['question'],
            sample['answer'],
        )
        # The question is sent verbatim as the only user message.
        query = f"{question}"
        predict = model.run(
            system_prompt="You are a helpful assistant.",
            messages=[model.UserMessage(text=query)],
        )
        return Response(
            question=question,
            ground_truth=ground_truth,
            predict=predict,
            source=source,
        )

    return [answer(sample) for sample in tqdm(trainset)]


In [12]:
bare_response = bare_model(trainset=trainset)

100%|██████████| 33/33 [02:12<00:00,  4.02s/it]


In [13]:
from rouge_score import rouge_scorer
# Name of the ROUGE variant; also the key used to pick the result out of scorer.score(...).
metric = "rougeL"
# Scorer restricted to that single ROUGE type, with stemming switched on.
scorer = rouge_scorer.RougeScorer(rouge_types=[metric], use_stemmer=True)

In [14]:
# def get_score(method, responses):
#     scores = []
#     for response in responses:
#         score = scorer.score(target=response.ground_truth, prediction=response.predict)[metric]
#         scores.append(
#             dict(
#                 method=method,
#                 question=response.question,
#                 ground_truth=response.ground_truth,
#                 predict=response.predict,
#                 precision=score.precision,
#                 recall=score.recall,
#                 fmeasure=score.fmeasure
#             )
#         )
#     return scores

In [15]:
import re

def preprocess_text(text):
    """Trim ``text`` and collapse each run of whitespace into a single space.

    Args:
        text: string to normalise; falsy values (``None``, ``""``) are accepted.

    Returns:
        str: the normalised text, or ``""`` for falsy input.
    """
    if text:
        # Strip the ends first, then squeeze the inner whitespace runs.
        return re.sub(r'\s+', ' ', text.strip())
    return ""

def get_score(method, responses):
    """Score every prediction against its ground truth with the notebook-level ``scorer``.

    Both texts go through ``preprocess_text`` before scoring, and the cleaned
    versions are what end up in the returned records.

    Args:
        method: label stored under ``"method"`` in every record.
        responses: iterable of objects exposing ``question``, ``ground_truth`` and ``predict``.

    Returns:
        list[dict]: one record per response with the keys ``method``, ``question``,
        ``ground_truth``, ``predict``, ``precision``, ``recall`` and ``fmeasure``.
    """
    def score_one(response):
        reference = preprocess_text(response.ground_truth)
        candidate = preprocess_text(response.predict)
        # scorer.score returns a mapping keyed by ROUGE type; only `metric` is used.
        result = scorer.score(target=reference, prediction=candidate)[metric]
        return {
            "method": method,
            "question": response.question,
            "ground_truth": reference,
            "predict": candidate,
            "precision": result.precision,
            "recall": result.recall,
            "fmeasure": result.fmeasure,
        }

    return [score_one(response) for response in responses]

In [16]:
bare_scores = get_score("bare_model", bare_response)

In [17]:
def simple_rag(trainset):
    """Retrieval-augmented run: answer each question from retrieved long-term texts.

    Per sample: look the document up by the sample's source, embed the question,
    fetch similar long-term entries (passing the document's source as ``sources``),
    join their ``raw`` texts into the CONTEXT part of the prompt and ask the model.

    Args:
        trainset: iterable of samples, each providing ``['metadata']['source']``,
            ``['question']`` and ``['answer']``.

    Returns:
        list[Response]: one entry per sample, in iteration order.
    """
    instructions = "You are a helpful assistant. Read QUESTION carefully and give an answer based on the provided CONTEXT."

    def answer(sample):
        source, question, ground_truth = (
            sample['metadata']['source'],
            sample['question'],
            sample['answer'],
        )
        document = dm.read_document_by_source(source, session=Depends(get_session))
        question_vector = embedder.run(sentences=[question])[0]
        passages = ltm.read_similar_text_with_like_source(
            question_vector,
            embed_method="raw",
            session=Depends(get_session),
            sources=[document.source],
        )
        contexts = "\n".join(passage.raw for passage in passages)
        query = f"CONTEXT:\n\n{contexts}\n\nQUESTION:\n\n{question}\n\n"
        predict = model.run(
            system_prompt=instructions,
            messages=[model.UserMessage(text=query)],
        )
        return Response(
            question=question,
            ground_truth=ground_truth,
            predict=predict,
            source=source,
        )

    return [answer(sample) for sample in tqdm(trainset)]

In [18]:
simple_rag_response = simple_rag(trainset)

  0%|          | 0/33 [00:00<?, ?it/s]You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 33/33 [00:52<00:00,  1.60s/it]


In [19]:
simple_rag_scores = get_score(method="simple_rag_score", responses=simple_rag_response)

In [20]:
def rag_with_rerank(trainset):
    """Retrieval-augmented run with a rerank step between retrieval and prompting.

    Per sample: look the document up by source, embed the question, fetch similar
    long-term entries with ``limit=10``, pass them through ``reranker.run`` and use
    the first element of its result as the CONTEXT passages.

    Args:
        trainset: iterable of samples, each providing ``['metadata']['source']``,
            ``['question']`` and ``['answer']``.

    Returns:
        list[Response]: one entry per sample, in iteration order.
    """
    instructions = "You are a helpful assistant. Read QUESTION carefully and give an answer based on the provided CONTEXT."

    def answer(sample):
        source, question, ground_truth = (
            sample['metadata']['source'],
            sample['question'],
            sample['answer'],
        )
        document = dm.read_document_by_source(source, session=Depends(get_session))
        question_vector = embedder.run(sentences=[question])[0]
        candidates = ltm.read_similar_text_with_like_source(
            question_vector,
            embed_method="raw",
            session=Depends(get_session),
            sources=[document.source],
            limit=10,
        )
        # reranker.run returns a pair; only its first element is needed here.
        reranked = reranker.run(search_query=question, longterms=candidates, embed_method="raw")[0]
        contexts = "\n".join(passage.raw for passage in reranked)
        query = f"CONTEXT:\n\n{contexts}\n\nQUESTION:\n\n{question}\n\n"
        predict = model.run(
            system_prompt=instructions,
            messages=[model.UserMessage(text=query)],
        )
        return Response(
            question=question,
            ground_truth=ground_truth,
            predict=predict,
            source=source,
        )

    return [answer(sample) for sample in tqdm(trainset)]

In [21]:
rag_with_rerank_response = rag_with_rerank(trainset)

100%|██████████| 33/33 [00:52<00:00,  1.59s/it]


In [22]:
rag_with_rerank_scores = get_score(method="rag_with_rerank", responses=rag_with_rerank_response)

In [23]:
def simple_rag_term(trainset, method="evidence"):
    """Term-only run: answer each question from stored term records, without text retrieval.

    Per sample: detect candidate terms in the question, look up similar stored terms
    for the sample's document, and put one text field of every hit into the TERM
    part of the prompt.

    Args:
        trainset: iterable of samples, each providing ``['metadata']['source']``,
            ``['question']`` and ``['answer']``.
        method: ``"evidence"`` uses each hit's ``evidence`` attribute; any other
            value uses ``explanation``.

    Returns:
        list[Response]: one entry per sample, in iteration order.
    """
    instructions = "You are a helpful assistant. Read QUESTION carefully and give an answer based on the provided TERM."
    # Attribute of each term hit that is fed to the prompt.
    field = "evidence" if method == "evidence" else "explanation"

    def answer(sample):
        source = sample['metadata']['source']
        question = sample['question']
        potential_terms = term_detector.run(message=question)
        ground_truth = sample['answer']
        document = dm.read_document_by_source(source, session=Depends(get_session))
        # Flatten the hits of every candidate term, dropping falsy entries.
        hits = [
            hit
            for candidate in potential_terms
            for hit in tm.read_similar_terms(
                term=candidate,
                session=Depends(get_session),
                document_ids=[document.id],
            )
            if hit
        ]
        evidences = "\n".join(getattr(hit, field) for hit in hits)
        query = f"TERM:\n\n{evidences}\n\nQUESTION:\n\n{question}\n\n"
        predict = model.run(
            system_prompt=instructions,
            messages=[model.UserMessage(text=query)],
        )
        return Response(
            question=question,
            ground_truth=ground_truth,
            predict=predict,
            source=source,
        )

    return [answer(sample) for sample in tqdm(trainset)]

In [24]:
simple_rag_term_response = simple_rag_term(trainset, method="evidence")

100%|██████████| 33/33 [01:14<00:00,  2.24s/it]


In [25]:
simple_rag_term_scores = get_score(method="simple_rag_term", responses=simple_rag_term_response)

In [26]:
simple_rag_term_explanation_response = simple_rag_term(trainset, method="explanation")

100%|██████████| 33/33 [01:13<00:00,  2.23s/it]


In [27]:
simple_rag_term_explanation_scores = get_score(method="simple_rag_term_explanation", responses=simple_rag_term_explanation_response)

In [28]:
def simple_rag_term_context(trainset, method="evidence"):
    """Combined run: answer each question from stored term records plus retrieved texts.

    Per sample: detect candidate terms and collect similar stored terms for the
    sample's document (TERM part), then embed the question and fetch similar
    long-term entries for that document's source (CONTEXT part).

    Args:
        trainset: iterable of samples, each providing ``['metadata']['source']``,
            ``['question']`` and ``['answer']``.
        method: ``"evidence"`` uses each term hit's ``evidence`` attribute; any
            other value uses ``explanation``.

    Returns:
        list[Response]: one entry per sample, in iteration order.
    """
    instructions = "You are a helpful assistant. Read QUESTION carefully and give an answer based on the provided TERM and CONTEXT."
    # Attribute of each term hit that is fed to the prompt.
    field = "evidence" if method == "evidence" else "explanation"

    def answer(sample):
        source = sample['metadata']['source']
        question = sample['question']
        potential_terms = term_detector.run(message=question)
        ground_truth = sample['answer']
        document = dm.read_document_by_source(source, session=Depends(get_session))
        # Flatten the hits of every candidate term, dropping falsy entries.
        hits = [
            hit
            for candidate in potential_terms
            for hit in tm.read_similar_terms(
                term=candidate,
                session=Depends(get_session),
                document_ids=[document.id],
            )
            if hit
        ]
        evidences = "\n".join(getattr(hit, field) for hit in hits)
        question_vector = embedder.run(sentences=[question])[0]
        passages = ltm.read_similar_text_with_like_source(
            question_vector,
            embed_method="raw",
            session=Depends(get_session),
            sources=[document.source],
        )
        contexts = "\n".join(passage.raw for passage in passages)
        query = f"TERM:\n\n{evidences}\n\nCONTEXT:\n\n{contexts}\n\nQUESTION:\n\n{question}\n\n"
        predict = model.run(
            system_prompt=instructions,
            messages=[model.UserMessage(text=query)],
        )
        return Response(
            question=question,
            ground_truth=ground_truth,
            predict=predict,
            source=source,
        )

    return [answer(sample) for sample in tqdm(trainset)]

In [29]:
simple_rag_term_context_response = simple_rag_term_context(trainset)

100%|██████████| 33/33 [01:33<00:00,  2.82s/it]


In [30]:
simple_rag_term_context_scores = get_score(method="simple_rag_term_context", responses=simple_rag_term_context_response)

In [31]:
simple_rag_term_explanation_context_response = simple_rag_term_context(trainset, method="explanation")

100%|██████████| 33/33 [01:30<00:00,  2.73s/it]


In [32]:
simple_rag_term_explanation_context_scores = get_score(method="simple_rag_term_explanation_context", responses=simple_rag_term_explanation_context_response)

In [33]:
def simple_rag_term_context_with_rerank(trainset, method="evidence"):
    """Combined run with reranking: term records plus reranked retrieved texts.

    Per sample: detect candidate terms and collect similar stored terms for the
    sample's document (TERM part), then embed the question, fetch similar long-term
    entries for that document's source, pass them through ``reranker.run`` and use
    the first element of its result as the CONTEXT passages.

    Args:
        trainset: iterable of samples, each providing ``['metadata']['source']``,
            ``['question']`` and ``['answer']``.
        method: ``"evidence"`` uses each term hit's ``evidence`` attribute; any
            other value uses ``explanation``.

    Returns:
        list[Response]: one entry per sample, in iteration order.
    """
    instructions = "You are a helpful assistant. Read QUESTION carefully and give an answer based on the provided TERM and CONTEXT."
    # Attribute of each term hit that is fed to the prompt.
    field = "evidence" if method == "evidence" else "explanation"

    def answer(sample):
        source = sample['metadata']['source']
        question = sample['question']
        potential_terms = term_detector.run(message=question)
        ground_truth = sample['answer']
        document = dm.read_document_by_source(source, session=Depends(get_session))
        # Flatten the hits of every candidate term, dropping falsy entries.
        hits = [
            hit
            for candidate in potential_terms
            for hit in tm.read_similar_terms(
                term=candidate,
                session=Depends(get_session),
                document_ids=[document.id],
            )
            if hit
        ]
        evidences = "\n".join(getattr(hit, field) for hit in hits)
        question_vector = embedder.run(sentences=[question])[0]
        # NOTE(review): rag_with_rerank retrieves with limit=10 before reranking, while this
        # call relies on the callee's default limit — confirm the difference is intended.
        candidates = ltm.read_similar_text_with_like_source(
            question_vector,
            embed_method="raw",
            session=Depends(get_session),
            sources=[document.source],
        )
        # reranker.run returns a pair; only its first element is needed here.
        reranked = reranker.run(search_query=question, longterms=candidates, embed_method="raw")[0]
        contexts = "\n".join(passage.raw for passage in reranked)
        query = f"TERM:\n\n{evidences}\n\nCONTEXT:\n\n{contexts}\n\nQUESTION:\n\n{question}\n\n"
        predict = model.run(
            system_prompt=instructions,
            messages=[model.UserMessage(text=query)],
        )
        return Response(
            question=question,
            ground_truth=ground_truth,
            predict=predict,
            source=source,
        )

    return [answer(sample) for sample in tqdm(trainset)]

In [34]:
simple_rag_term_context_with_rerank_response = simple_rag_term_context_with_rerank(trainset)

100%|██████████| 33/33 [01:31<00:00,  2.77s/it]


In [35]:
simple_rag_term_context_with_rerank_scores = get_score(method="simple_rag_term_context_with_rerank", responses=simple_rag_term_context_with_rerank_response)

In [36]:
simple_rag_term_explanation_context_with_rerank_response = simple_rag_term_context_with_rerank(trainset, method="explanation")

100%|██████████| 33/33 [01:33<00:00,  2.85s/it]


In [37]:
simple_rag_term_explanation_context_with_rerank_scores = get_score(method="simple_rag_term_explanation_context_with_rerank", responses=simple_rag_term_explanation_context_with_rerank_response)

In [38]:
import pandas as pd

# Per-question score records of every evaluated method, one list per method.
datas = [
    bare_scores,
    simple_rag_scores,
    rag_with_rerank_scores,
    simple_rag_term_scores,
    simple_rag_term_context_scores,
    simple_rag_term_context_with_rerank_scores,
    simple_rag_term_explanation_scores,
    simple_rag_term_explanation_context_scores,
    simple_rag_term_explanation_context_with_rerank_scores,
]

# Long table: one row per (method, question).
experiments = pd.concat([pd.DataFrame(records) for records in datas])

# Mean precision / recall / F-measure per method, best F-measure first.
benchmark = (
    experiments
    .groupby('method')
    .agg(
        precision=('precision', 'mean'),
        recall=('recall', 'mean'),
        fmeasure=('fmeasure', 'mean'),
    )
    .sort_values("fmeasure", ascending=False)
)

benchmark

Unnamed: 0_level_0,precision,recall,fmeasure
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rag_with_rerank,0.396685,0.828041,0.519255
simple_rag_term_context_with_rerank,0.379236,0.824551,0.498211
simple_rag_term_context,0.3724,0.841716,0.491798
simple_rag_score,0.360697,0.802277,0.477898
simple_rag_term_explanation_context_with_rerank,0.351793,0.768794,0.458704
simple_rag_term_explanation_context,0.348908,0.780887,0.45193
simple_rag_term,0.347323,0.610977,0.416818
simple_rag_term_explanation,0.294213,0.49734,0.329696
bare_model,0.042348,0.189722,0.065825


In [39]:
# `experiments` only carries a positional index, so it is dropped on export.
experiments.to_csv("./dataset/experiments.csv", index=False)
# `benchmark` is indexed by `method` (the groupby key). Writing it with index=False
# alone would drop the method names and leave three unlabeled score columns, so the
# index is promoted to a regular column first.
benchmark.reset_index().to_csv("./dataset/benchmark.csv", index=False)

In [40]:
idx = 0

In [41]:
# Inspect the sample at position `idx`: show its question and reference answer,
# then every method's prediction for it, best F-measure first.
check = trainset[idx]['question']
ground_truth = trainset[idx]['answer']
idx += 1  # advance so that re-running this cell steps to the next sample
print("QUESTION:", check)
print("ANSWER:", ground_truth)
check_df = (
    experiments[experiments['question'] == check]
    .drop(columns=["question", "ground_truth"])
    .sort_values("fmeasure", ascending=False)
)
for p in check_df['predict'].tolist():
    # Each prediction is followed by a separator line.
    print(p, "=" * 10, sep="\n")
check_df

QUESTION: What does DiscoVLA stand for?
ANSWER: Discrepancy Reduction in Vision, Language, and Alignment
DiscoVLA stands for Discrepancy Reduction in Vision, Language, and Alignment.
DiscoVLA stands for "Discrepancy Reduction in Vision, Language, and Alignment" for Parameter-Efficient Video-Text Retrieval.
DiscoVLA stands for Discrepancy Reduction in Vision, Language, and Alignment for Parameter-Efficient Video-Text Retrieval.
DiscoVLA stands for "Discrepancy Reduction in Vision, Language, and Alignment" for Parameter-Efficient Video-Text Retrieval.
DiscoVLA stands for Discrepancy Reduction in Vision, Language, and Alignment for Parameter-Efficient Video-Text Retrieval.
DiscoVLA stands for "Disc repancy Reducti o n in V ision, L anguage, and A lignment".
DiscoVLA stands for "Disc repancy Reducti o n in V ision, L anguage, and A lignment".
I couldn't find any information about "DiscoVLA." However, I found that VLA stands for Very Large Array, which is a radio astronomy observatory in Ne

Unnamed: 0,method,predict,precision,recall,fmeasure
0,simple_rag_term_explanation_context,DiscoVLA stands for Discrepancy Reduction in V...,0.7,1.0,0.823529
0,simple_rag_term_context_with_rerank,"DiscoVLA stands for ""Discrepancy Reduction in ...",0.4375,1.0,0.608696
0,rag_with_rerank,DiscoVLA stands for Discrepancy Reduction in V...,0.4375,1.0,0.608696
0,simple_rag_term_context,"DiscoVLA stands for ""Discrepancy Reduction in ...",0.4375,1.0,0.608696
0,simple_rag_term,DiscoVLA stands for Discrepancy Reduction in V...,0.4375,1.0,0.608696
0,simple_rag_term_explanation_context_with_rerank,"DiscoVLA stands for ""Disc repancy Reducti o n ...",0.125,0.285714,0.173913
0,simple_rag_score,"DiscoVLA stands for ""Disc repancy Reducti o n ...",0.125,0.285714,0.173913
0,bare_model,"I couldn't find any information about ""DiscoVL...",0.035714,0.142857,0.057143
0,simple_rag_term_explanation,"Unfortunately, the provided TERM does not expl...",0.0,0.0,0.0


In [14]:
import pandas as pd

experiments = pd.read_csv("./dataset/experiments.csv")

In [28]:
experiments.columns

Index(['method', 'question', 'ground_truth', 'predict', 'precision', 'recall',
       'fmeasure'],
      dtype='object')

In [32]:
# Samples whose 'type' field is 'acronym'.
acronyms = [obj for obj in trainset if obj['type'] == 'acronym']
# acronyms

In [40]:
# Inspect the first sample (idx is pinned to 0; stepping is disabled in this cell):
# show its question and reference answer, then every method's prediction, best F-measure first.
idx = 0
check = trainset[idx]['question']
ground_truth = trainset[idx]['answer']
print("QUESTION:", check)
print("ANSWER:", ground_truth)
check_df = (
    experiments[experiments['question'] == check]
    .drop(columns=["question", "ground_truth"])
    .sort_values("fmeasure", ascending=False)
)
for p in check_df['predict'].tolist():
    # Each prediction is followed by a separator line.
    print(p, "=" * 10, sep="\n")
check_df

QUESTION: What does DiscoVLA stand for?
ANSWER: Discrepancy Reduction in Vision, Language, and Alignment
DiscoVLA stands for Discrepancy Reduction in Vision, Language, and Alignment.
DiscoVLA stands for "Discrepancy Reduction in Vision, Language, and Alignment" for Parameter-Efficient Video-Text Retrieval.
DiscoVLA stands for Discrepancy Reduction in Vision, Language, and Alignment for Parameter-Efficient Video-Text Retrieval.
DiscoVLA stands for "Discrepancy Reduction in Vision, Language, and Alignment" for Parameter-Efficient Video-Text Retrieval.
DiscoVLA stands for Discrepancy Reduction in Vision, Language, and Alignment for Parameter-Efficient Video-Text Retrieval.
DiscoVLA stands for "Disc repancy Reducti o n in V ision, L anguage, and A lignment".
DiscoVLA stands for "Disc repancy Reducti o n in V ision, L anguage, and A lignment".
I couldn't find any information about "DiscoVLA." However, I found that VLA stands for Very Large Array, which is a radio astronomy observatory in Ne

Unnamed: 0,method,predict,precision,recall,fmeasure
231,simple_rag_term_explanation_context,DiscoVLA stands for Discrepancy Reduction in V...,0.7,1.0,0.823529
165,simple_rag_term_context_with_rerank,"DiscoVLA stands for ""Discrepancy Reduction in ...",0.4375,1.0,0.608696
66,rag_with_rerank,DiscoVLA stands for Discrepancy Reduction in V...,0.4375,1.0,0.608696
132,simple_rag_term_context,"DiscoVLA stands for ""Discrepancy Reduction in ...",0.4375,1.0,0.608696
99,simple_rag_term,DiscoVLA stands for Discrepancy Reduction in V...,0.4375,1.0,0.608696
264,simple_rag_term_explanation_context_with_rerank,"DiscoVLA stands for ""Disc repancy Reducti o n ...",0.125,0.285714,0.173913
33,simple_rag_score,"DiscoVLA stands for ""Disc repancy Reducti o n ...",0.125,0.285714,0.173913
0,bare_model,"I couldn't find any information about ""DiscoVL...",0.035714,0.142857,0.057143
198,simple_rag_term_explanation,"Unfortunately, the provided TERM does not expl...",0.0,0.0,0.0


In [36]:
# Benchmark restricted to the questions of the 'acronym' samples.
question_type = [obj for obj in trainset if obj['type'] == 'acronym']
mask = experiments['question'].isin([qt['question'] for qt in question_type])
(
    experiments[mask]
    .groupby('method')
    .agg(
        precision=('precision', 'mean'),
        recall=('recall', 'mean'),
        fmeasure=('fmeasure', 'mean'),
    )
    .sort_values("fmeasure", ascending=False)
)

Unnamed: 0_level_0,precision,recall,fmeasure
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
simple_rag_term_context,0.538474,0.977273,0.688497
simple_rag_term_context_with_rerank,0.538474,0.977273,0.688497
rag_with_rerank,0.522312,0.931818,0.664521
simple_rag_term_explanation_context,0.52018,0.977273,0.663047
simple_rag_score,0.510065,0.912338,0.648971
simple_rag_term_explanation_context_with_rerank,0.467907,0.912338,0.603991
simple_rag_term,0.454561,0.911157,0.598384
simple_rag_term_explanation,0.314397,0.721074,0.422351
bare_model,0.005719,0.043979,0.009659


In [37]:
# Benchmark restricted to the questions of every sample that is NOT of type 'acronym'.
question_type = [obj for obj in trainset if obj['type'] != 'acronym']
mask = experiments['question'].isin([qt['question'] for qt in question_type])
(
    experiments[mask]
    .groupby('method')
    .agg(
        precision=('precision', 'mean'),
        recall=('recall', 'mean'),
        fmeasure=('fmeasure', 'mean'),
    )
    .sort_values("fmeasure", ascending=False)
)

Unnamed: 0_level_0,precision,recall,fmeasure
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rag_with_rerank,0.333871,0.776153,0.446621
simple_rag_term_context_with_rerank,0.299617,0.748191,0.403068
simple_rag_term_context,0.289362,0.773937,0.393448
simple_rag_score,0.286013,0.747247,0.392362
simple_rag_term_explanation_context_with_rerank,0.293737,0.697022,0.386061
simple_rag_term_explanation_context,0.263273,0.682694,0.346371
simple_rag_term,0.293705,0.460886,0.326035
simple_rag_term_explanation,0.284122,0.385472,0.283368
bare_model,0.060662,0.262593,0.093908


In [None]:
# TODO: tune the prompts for all methods, and try a bigger model (e.g. maverick)