In [None]:
## AI Usage Log
"""
- **Tool**: ChatGPT 5.0
- **Purpose**: 
    Assistance to create the RAG pipelines and evaluation
    Assistance debugging pipeline problems
    Code Documentation (Help Developing the README)
- **Input**:
    You are a code assistant. Your task is to crate a version of the README file 
    for this project, it might include at least these items:
        setup instructions
        Inline code comments explaining complex logic 
        Configuration files with parameter explanations 
        Error handling and logging throughout
- **Output Usage**: I first read the entire code proposed by the AI, and contrast that 
    against my current implementation or the steps I followed until that point.
    Based on how accurate the code was, I incorporate those functions, one at a time,
    and then test it to ensure a correct setting.
- **Verification**: I ran tests with a low number of samples to asses how the proposed code
    was working.
"""


In [12]:
# Make project root importable from inside notebooks/
import os
import sys
import pathlib
import numpy as np
import pandas as pd
from argparse import Namespace

PROJECT_ROOT = pathlib.Path.cwd().parent              # notebooks/.. -> project root
SRC_DIR = PROJECT_ROOT / "src"
(SRC_DIR / "__init__.py").touch(exist_ok=True)        # ensure src is a package
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

import src.evaluation as evalmod
from src.utils import DEFAULT_SYSTEM_PROMPT
from src import naive_rag, utils, enhanced_rag 
import src.ragas_evaluation as ragas_evaluation 

# Quiet tokenizers warning; force Milvus Lite (avoid server env vars)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
for v in ["MILVUS_URI","MILVUS_HOST","MILVUS_ADDRESS","MILVUS_PORT"]:
    os.environ.pop(v, None)

# Define common paths (adjust if needed)
PASSAGES = PROJECT_ROOT / "data" / "processed" / "passages.csv"
QUERIES  = PROJECT_ROOT / "data" / "evaluation" / "test_dataset.csv"
OUTDIR   = PROJECT_ROOT / "results"
DBFILE   ="rag_wikipedia_mini.db"

PASSAGES, QUERIES, OUTDIR, DBFILE

(PosixPath('/Users/diegoalejandrobermudezsierra/Desktop/CMU/Courses/Third Semester/Mini 1/Application of NLX/HW/assignment2-rag/data/processed/passages.csv'),
 PosixPath('/Users/diegoalejandrobermudezsierra/Desktop/CMU/Courses/Third Semester/Mini 1/Application of NLX/HW/assignment2-rag/data/evaluation/test_dataset.csv'),
 PosixPath('/Users/diegoalejandrobermudezsierra/Desktop/CMU/Courses/Third Semester/Mini 1/Application of NLX/HW/assignment2-rag/results'),
 'rag_wikipedia_mini.db')

In [2]:
from pymilvus import MilvusClient
import os
abs_db = os.path.abspath(str(DBFILE))
for v in ("MILVUS_URI","MILVUS_HOST","MILVUS_ADDRESS","MILVUS_PORT"):
    os.environ.pop(v, None)

# should succeed on 2.6.2
c = MilvusClient("rag_wikipedia_mini.db")
print("Milvus Lite OK:", abs_db)

Milvus Lite OK: /Users/diegoalejandrobermudezsierra/Desktop/CMU/Courses/Third Semester/Mini 1/Application of NLX/HW/assignment2-rag/notebooks/rag_wikipedia_mini.db



## Step 2: Preliminary Exploration of Results


In [3]:
# Small run: top-1 context, first 5 queries
args = Namespace(
    passages_uri=str(PASSAGES),
    queries_uri=str(QUERIES),
    embed_model="sentence-transformers/all-MiniLM-L6-v2",
    gen_model="google/flan-t5-base",
    embed_batch_size=128,
    top_k=1,
    max_new_tokens=128,
    temperature=0.0,
    max_queries=100,    
    output_file="naive_results.json",
    seed=42,
    milvus_db_path=str(DBFILE),
    milvus_collection="rag_mini",
    milvus_drop=True,  
    milvus_index_type="IVF_FLAT",
    milvus_metric="IP",
    prompt=DEFAULT_SYSTEM_PROMPT
)

naive_rag.run(args)

RESULTS_PATH = OUTDIR / "naive_results.json"
print("Wrote:", RESULTS_PATH)

[2025-10-03 05:47:46,531] [INFO] Loaded passages: 3114
[2025-10-03 05:47:46,531] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

[2025-10-03 05:47:54,178] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 05:47:55,101] [INFO] Loaded queries: 100


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
100%|██████████| 100/100 [00:00<00:00, 911805.22it/s]
[2025-10-03 05:48:12,668] [INFO] Wrote results to ../results/naive_results.json


Wrote: /Users/diegoalejandrobermudezsierra/Desktop/CMU/Courses/Third Semester/Mini 1/Application of NLX/HW/assignment2-rag/results/naive_results.json


## Step 3: Evaluation I

In [4]:
args = Namespace(
    passages_uri=str(PASSAGES),
    queries_uri=str(QUERIES),
    embed_model="sentence-transformers/all-MiniLM-L6-v2",
    gen_model="google/flan-t5-base",
    embed_batch_size=128,
    top_k=1,
    max_new_tokens=128,
    temperature=0.0,
    max_queries=200,    
    output_file="naive_results_baseline.json",
    seed=42,
    milvus_db_path=str(DBFILE),
    milvus_collection="rag_mini",
    milvus_drop=True,  
    milvus_index_type="IVF_FLAT",
    milvus_metric="IP",
    prompt=DEFAULT_SYSTEM_PROMPT
)

naive_rag.run(args)

RESULTS_PATH = OUTDIR / "naive_results_baseline.json"
print("Wrote:", RESULTS_PATH)

[2025-10-03 05:48:12,803] [INFO] Loaded passages: 3114
[2025-10-03 05:48:12,803] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

[2025-10-03 05:48:19,275] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 05:48:20,162] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 1005828.30it/s]
[2025-10-03 05:48:56,073] [INFO] Wrote results to ../results/naive_results_baseline.json


Wrote: /Users/diegoalejandrobermudezsierra/Desktop/CMU/Courses/Third Semester/Mini 1/Application of NLX/HW/assignment2-rag/results/naive_results_baseline.json


In [5]:
RESULTS_PATH = OUTDIR / "naive_results_baseline.json"
records = evalmod.load_results(str(RESULTS_PATH))
base_df = pd.DataFrame(records)

base_df["top1_pid"] = base_df["context_ids"].apply(lambda x: x[0] if isinstance(x, list) and len(x)>0 else None)

passages_df = pd.read_csv(PASSAGES)[["id","passage"]].rename(columns={"id":"pid"})
df = base_df.merge(passages_df, how="left", left_on="top1_pid", right_on="pid")
df.rename(columns={"passage":"context_text"}, inplace=True)

print("Rows:", len(df))
df.head(3)

Rows: 200


Unnamed: 0,qid,question,prediction,gold,context_ids,top1_pid,pid,context_text
0,0,Was Abraham Lincoln the sixteenth President of...,[285],yes,[285],285,285.0,"Lincoln was just seven years old when, in 1816..."
1,2,Did Lincoln sign the National Banking Act of 1...,Yes.,yes,[357],357,357.0,"Lincoln's second inauguration on March 4, 1865..."
2,4,Did his mother die of pneumonia?,No.,no,[259],259,259.0,T. S. Eliot described him during this phase of...


In [6]:
prompt_instruction = (
    "Role: Grounded QA assistant.\n"
    "Policy:\n"
    "  • Use ONLY the text in <context> — do not infer or add outside facts.\n"
    "  • If the answer is not in <context>, reply exactly: I don't know.\n"
    "  • Prefer verbatim entity names, numbers, and dates from <context>.\n"
    "  • Keep answers concise (≤ 35 words) and neutral.\n"
    "Output format:\n"
    "  Answer: <your answer>\n"
    "  Citations: [<passage_ids>]\n"
)
prompt_cot = (
    "Role: Grounded QA assistant.\n"
    "Policy:\n"
    "  • Use ONLY the text in <context>. If insufficient, say: I don't know.\n"
    "  • First extract 1–2 short evidence snippets with their [id].\n"
    "  • Then give a FINAL answer in ≤ 30 words; no hedging.\n"
    "Output format:\n"
    "  Evidence:\n"
    "    - [<id>] \"<≤12-word quote>\"\n"
    "    - [<id>] \"<≤12-word quote>\" (optional)\n"
    "  Final: <your answer>\n"
    "  Citations: [<passage_ids>]\n"
)


prompt_persona = (
    "You are a senior Wikipedia fact-checker.\n"
    "Style: precise, encyclopedic, and terse; no speculation or filler.\n"
    "Grounding rules:\n"
    "  • Answer strictly from <context>; otherwise say: I don't know.\n"
    "  • Preserve canonical names/labels; include units and dates as written.\n"
    "  • Prefer a single definitive sentence; lists only when the question asks for them.\n"
    "Output format:\n"
    "  Answer: <your answer>\n"
    "  Citations: [<passage_ids>]\n"
)

prompt_strategies = {
    "instruction": prompt_instruction,
    "cot":         prompt_cot,
    "persona":     prompt_persona,
}

In [7]:
for strat in ["instruction","cot","persona"]:

    args = Namespace(
        passages_uri=str(PASSAGES),
        queries_uri=str(QUERIES),
        embed_model="sentence-transformers/all-MiniLM-L6-v2",
        gen_model="google/flan-t5-base",
        embed_batch_size=128,
        top_k=1,
        max_new_tokens=128,
        temperature=0.0,
        max_queries=200,    
        output_file=f"naive_results_{strat}.json",
        seed=42,
        milvus_db_path=str(DBFILE),
        milvus_collection="rag_mini",
        milvus_drop=True,  
        milvus_index_type="IVF_FLAT",
        milvus_metric="IP",
        prompt=prompt_strategies[strat]
    )



    naive_rag.run(args)

    evalmod.evaluate_once(
        results_path=str(OUTDIR / f"naive_results_{strat}.json"),
        tag=f"top1_{strat}"
    )
    

[2025-10-03 05:48:56,248] [INFO] Loaded passages: 3114
[2025-10-03 05:48:56,248] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

[2025-10-03 05:49:02,751] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 05:49:03,655] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 999834.09it/s]
[2025-10-03 05:49:39,853] [INFO] Wrote results to ../results/naive_results_instruction.json
[2025-10-03 05:49:39,856] [INFO] N=200 | F1=0.443[0.378,0.508] | unknown_rate=0.155
[2025-10-03 05:49:39,870] [INFO] HF SQuAD | EM=37.00% | F1=44.37%
[2025-10-03 05:49:39,872] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top1_instruction', 'n': 200, 'f1_mean': 0.4428235916748713, 'unknown_rate': 0.155, 'f1_lo': 0.3780875183444191, 'f1_hi': 0.5075596650053235, 'em_mean': 0.37, 'hf_f1': 0.443734456997173}
[2025-10-03 05:49:40,004] [INFO] Loaded passages: 3114
[2025-10-03 05:49:40,004] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

[2025-10-03 05:49:46,713] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 05:49:47,633] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 1086607.25it/s]
[2025-10-03 05:50:26,295] [INFO] Wrote results to ../results/naive_results_cot.json
[2025-10-03 05:50:26,299] [INFO] N=200 | F1=0.489[0.424,0.554] | unknown_rate=0.020
[2025-10-03 05:50:26,311] [INFO] HF SQuAD | EM=41.50% | F1=49.01%
[2025-10-03 05:50:26,315] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top1_cot', 'n': 200, 'f1_mean': 0.48915347435255335, 'unknown_rate': 0.02, 'f1_lo': 0.4241258003847635, 'f1_hi': 0.5541811483203432, 'em_mean': 0.415, 'hf_f1': 0.4900643396748817}
[2025-10-03 05:50:26,435] [INFO] Loaded passages: 3114
[2025-10-03 05:50:26,435] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

[2025-10-03 05:50:33,105] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 05:50:34,089] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 1082401.03it/s]
[2025-10-03 05:51:10,592] [INFO] Wrote results to ../results/naive_results_persona.json
[2025-10-03 05:51:10,596] [INFO] N=200 | F1=0.497[0.432,0.563] | unknown_rate=0.055
[2025-10-03 05:51:10,607] [INFO] HF SQuAD | EM=42.50% | F1=49.84%
[2025-10-03 05:51:10,609] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top1_persona', 'n': 200, 'f1_mean': 0.4974510934001717, 'unknown_rate': 0.055, 'f1_lo': 0.43215458236585164, 'f1_hi': 0.5627476044344917, 'em_mean': 0.425, 'hf_f1': 0.49836195872250066}


## Step 4: Experimentation

In [9]:
for emb_size in [256, 384, 512]:
    for ret_docs in [3, 5, 10]:
        for strat in ["instruction","cot","persona"]:

            args = Namespace(
                passages_uri=str(PASSAGES),
                queries_uri=str(QUERIES),
                embed_model="sentence-transformers/all-MiniLM-L6-v2",
                gen_model="google/flan-t5-base",
                embed_batch_size=ret_docs,
                top_k=1,
                max_new_tokens=emb_size,
                temperature=0.0,
                max_queries=200,    
                output_file=f"naive_results_{ret_docs}_emb_{emb_size}_{strat}.json",
                seed=42,
                milvus_db_path=str(DBFILE),
                milvus_collection="rag_mini",
                milvus_drop=True,  
                milvus_index_type="IVF_FLAT",
                milvus_metric="IP",
                prompt=prompt_strategies[strat]
            )


            naive_rag.run(args)

            evalmod.evaluate_once(
                results_path=str(OUTDIR / f"naive_results_{ret_docs}_emb_{emb_size}_{strat}.json"),
                tag=f"top_{ret_docs}_emb_{emb_size}_{strat}"
            )
        

[2025-10-03 05:54:24,079] [INFO] Loaded passages: 3114
[2025-10-03 05:54:24,079] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/1038 [00:00<?, ?it/s]

[2025-10-03 05:54:39,349] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 05:54:40,267] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 885808.66it/s]
[2025-10-03 05:55:18,398] [INFO] Wrote results to ../results/naive_results_3_emb_256_instruction.json
[2025-10-03 05:55:18,402] [INFO] N=200 | F1=0.443[0.378,0.508] | unknown_rate=0.155
[2025-10-03 05:55:18,416] [INFO] HF SQuAD | EM=37.00% | F1=44.37%
[2025-10-03 05:55:18,418] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_3_emb_256_instruction', 'n': 200, 'f1_mean': 0.4428235916748713, 'unknown_rate': 0.155, 'f1_lo': 0.3780875183444191, 'f1_hi': 0.5075596650053235, 'em_mean': 0.37, 'hf_f1': 0.443734456997173}
[2025-10-03 05:55:18,557] [INFO] Loaded passages: 3114
[2025-10-03 05:55:18,558] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/1038 [00:00<?, ?it/s]

[2025-10-03 05:55:33,101] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 05:55:34,021] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 947865.31it/s]
[2025-10-03 05:56:12,124] [INFO] Wrote results to ../results/naive_results_3_emb_256_cot.json
[2025-10-03 05:56:12,128] [INFO] N=200 | F1=0.489[0.424,0.554] | unknown_rate=0.020
[2025-10-03 05:56:12,142] [INFO] HF SQuAD | EM=41.50% | F1=49.01%
[2025-10-03 05:56:12,153] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_3_emb_256_cot', 'n': 200, 'f1_mean': 0.48915347435255335, 'unknown_rate': 0.02, 'f1_lo': 0.4241258003847635, 'f1_hi': 0.5541811483203432, 'em_mean': 0.415, 'hf_f1': 0.4900643396748817}
[2025-10-03 05:56:12,320] [INFO] Loaded passages: 3114
[2025-10-03 05:56:12,321] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/1038 [00:00<?, ?it/s]

[2025-10-03 05:56:26,570] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 05:56:27,535] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 916787.76it/s]
[2025-10-03 05:57:04,515] [INFO] Wrote results to ../results/naive_results_3_emb_256_persona.json
[2025-10-03 05:57:04,519] [INFO] N=200 | F1=0.497[0.432,0.563] | unknown_rate=0.055
[2025-10-03 05:57:04,531] [INFO] HF SQuAD | EM=42.50% | F1=49.84%
[2025-10-03 05:57:04,533] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_3_emb_256_persona', 'n': 200, 'f1_mean': 0.4974510934001717, 'unknown_rate': 0.055, 'f1_lo': 0.43215458236585164, 'f1_hi': 0.5627476044344917, 'em_mean': 0.425, 'hf_f1': 0.49836195872250066}
[2025-10-03 05:57:04,676] [INFO] Loaded passages: 3114
[2025-10-03 05:57:04,676] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/623 [00:00<?, ?it/s]

[2025-10-03 05:57:51,890] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 05:57:52,790] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 952168.90it/s]
[2025-10-03 05:58:30,723] [INFO] Wrote results to ../results/naive_results_5_emb_256_instruction.json
[2025-10-03 05:58:30,726] [INFO] N=200 | F1=0.443[0.378,0.508] | unknown_rate=0.155
[2025-10-03 05:58:30,739] [INFO] HF SQuAD | EM=37.00% | F1=44.37%
[2025-10-03 05:58:30,742] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_5_emb_256_instruction', 'n': 200, 'f1_mean': 0.4428235916748713, 'unknown_rate': 0.155, 'f1_lo': 0.3780875183444191, 'f1_hi': 0.5075596650053235, 'em_mean': 0.37, 'hf_f1': 0.443734456997173}
[2025-10-03 05:58:30,882] [INFO] Loaded passages: 3114
[2025-10-03 05:58:30,882] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/623 [00:00<?, ?it/s]

[2025-10-03 05:58:41,179] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 05:58:42,158] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 858608.80it/s]
[2025-10-03 05:59:20,430] [INFO] Wrote results to ../results/naive_results_5_emb_256_cot.json
[2025-10-03 05:59:20,434] [INFO] N=200 | F1=0.489[0.424,0.554] | unknown_rate=0.020
[2025-10-03 05:59:20,447] [INFO] HF SQuAD | EM=41.50% | F1=49.01%
[2025-10-03 05:59:20,450] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_5_emb_256_cot', 'n': 200, 'f1_mean': 0.48915347435255335, 'unknown_rate': 0.02, 'f1_lo': 0.4241258003847635, 'f1_hi': 0.5541811483203432, 'em_mean': 0.415, 'hf_f1': 0.4900643396748817}
[2025-10-03 05:59:20,585] [INFO] Loaded passages: 3114
[2025-10-03 05:59:20,585] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/623 [00:00<?, ?it/s]

[2025-10-03 05:59:30,890] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 05:59:31,784] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 916787.76it/s]
[2025-10-03 06:00:08,839] [INFO] Wrote results to ../results/naive_results_5_emb_256_persona.json
[2025-10-03 06:00:08,844] [INFO] N=200 | F1=0.497[0.432,0.563] | unknown_rate=0.055
[2025-10-03 06:00:08,857] [INFO] HF SQuAD | EM=42.50% | F1=49.84%
[2025-10-03 06:00:08,859] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_5_emb_256_persona', 'n': 200, 'f1_mean': 0.4974510934001717, 'unknown_rate': 0.055, 'f1_lo': 0.43215458236585164, 'f1_hi': 0.5627476044344917, 'em_mean': 0.425, 'hf_f1': 0.49836195872250066}
[2025-10-03 06:00:08,991] [INFO] Loaded passages: 3114
[2025-10-03 06:00:08,992] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/312 [00:00<?, ?it/s]

[2025-10-03 06:00:58,456] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 06:00:59,355] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 943600.45it/s]
[2025-10-03 06:01:36,681] [INFO] Wrote results to ../results/naive_results_10_emb_256_instruction.json
[2025-10-03 06:01:36,686] [INFO] N=200 | F1=0.443[0.378,0.508] | unknown_rate=0.155
[2025-10-03 06:01:36,698] [INFO] HF SQuAD | EM=37.00% | F1=44.37%
[2025-10-03 06:01:36,701] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_10_emb_256_instruction', 'n': 200, 'f1_mean': 0.4428235916748713, 'unknown_rate': 0.155, 'f1_lo': 0.3780875183444191, 'f1_hi': 0.5075596650053235, 'em_mean': 0.37, 'hf_f1': 0.443734456997173}
[2025-10-03 06:01:36,836] [INFO] Loaded passages: 3114
[2025-10-03 06:01:36,836] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/312 [00:00<?, ?it/s]

[2025-10-03 06:01:45,244] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 06:01:46,153] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 847334.14it/s]
[2025-10-03 06:02:23,999] [INFO] Wrote results to ../results/naive_results_10_emb_256_cot.json
[2025-10-03 06:02:24,004] [INFO] N=200 | F1=0.489[0.424,0.554] | unknown_rate=0.020
[2025-10-03 06:02:24,016] [INFO] HF SQuAD | EM=41.50% | F1=49.01%
[2025-10-03 06:02:24,019] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_10_emb_256_cot', 'n': 200, 'f1_mean': 0.48915347435255335, 'unknown_rate': 0.02, 'f1_lo': 0.4241258003847635, 'f1_hi': 0.5541811483203432, 'em_mean': 0.415, 'hf_f1': 0.4900643396748817}
[2025-10-03 06:02:24,160] [INFO] Loaded passages: 3114
[2025-10-03 06:02:24,160] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/312 [00:00<?, ?it/s]

[2025-10-03 06:02:32,001] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 06:02:32,908] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 930000.89it/s]
[2025-10-03 06:03:10,328] [INFO] Wrote results to ../results/naive_results_10_emb_256_persona.json
[2025-10-03 06:03:10,332] [INFO] N=200 | F1=0.497[0.432,0.563] | unknown_rate=0.055
[2025-10-03 06:03:10,345] [INFO] HF SQuAD | EM=42.50% | F1=49.84%
[2025-10-03 06:03:10,347] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_10_emb_256_persona', 'n': 200, 'f1_mean': 0.4974510934001717, 'unknown_rate': 0.055, 'f1_lo': 0.43215458236585164, 'f1_hi': 0.5627476044344917, 'em_mean': 0.425, 'hf_f1': 0.49836195872250066}
[2025-10-03 06:03:10,487] [INFO] Loaded passages: 3114
[2025-10-03 06:03:10,487] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/1038 [00:00<?, ?it/s]

[2025-10-03 06:03:25,093] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 06:03:25,996] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 916787.76it/s]
[2025-10-03 06:04:02,750] [INFO] Wrote results to ../results/naive_results_3_emb_384_instruction.json
[2025-10-03 06:04:02,753] [INFO] N=200 | F1=0.443[0.378,0.508] | unknown_rate=0.155
[2025-10-03 06:04:02,765] [INFO] HF SQuAD | EM=37.00% | F1=44.37%
[2025-10-03 06:04:02,769] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_3_emb_384_instruction', 'n': 200, 'f1_mean': 0.4428235916748713, 'unknown_rate': 0.155, 'f1_lo': 0.3780875183444191, 'f1_hi': 0.5075596650053235, 'em_mean': 0.37, 'hf_f1': 0.443734456997173}
[2025-10-03 06:04:02,900] [INFO] Loaded passages: 3114
[2025-10-03 06:04:02,901] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/1038 [00:00<?, ?it/s]

[2025-10-03 06:04:16,936] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 06:04:17,837] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 920813.17it/s]
[2025-10-03 06:04:56,353] [INFO] Wrote results to ../results/naive_results_3_emb_384_cot.json
[2025-10-03 06:04:56,357] [INFO] N=200 | F1=0.489[0.424,0.554] | unknown_rate=0.020
[2025-10-03 06:04:56,370] [INFO] HF SQuAD | EM=41.50% | F1=49.01%
[2025-10-03 06:04:56,373] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_3_emb_384_cot', 'n': 200, 'f1_mean': 0.48915347435255335, 'unknown_rate': 0.02, 'f1_lo': 0.4241258003847635, 'f1_hi': 0.5541811483203432, 'em_mean': 0.415, 'hf_f1': 0.4900643396748817}
[2025-10-03 06:04:56,512] [INFO] Loaded passages: 3114
[2025-10-03 06:04:56,512] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/1038 [00:00<?, ?it/s]

[2025-10-03 06:05:10,500] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 06:05:11,397] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 917790.81it/s]
[2025-10-03 06:05:47,650] [INFO] Wrote results to ../results/naive_results_3_emb_384_persona.json
[2025-10-03 06:05:47,654] [INFO] N=200 | F1=0.497[0.432,0.563] | unknown_rate=0.055
[2025-10-03 06:05:47,665] [INFO] HF SQuAD | EM=42.50% | F1=49.84%
[2025-10-03 06:05:47,668] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_3_emb_384_persona', 'n': 200, 'f1_mean': 0.4974510934001717, 'unknown_rate': 0.055, 'f1_lo': 0.43215458236585164, 'f1_hi': 0.5627476044344917, 'em_mean': 0.425, 'hf_f1': 0.49836195872250066}
[2025-10-03 06:05:47,805] [INFO] Loaded passages: 3114
[2025-10-03 06:05:47,805] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/623 [00:00<?, ?it/s]

[2025-10-03 06:05:58,452] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 06:05:59,364] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 900065.24it/s]
[2025-10-03 06:06:36,051] [INFO] Wrote results to ../results/naive_results_5_emb_384_instruction.json
[2025-10-03 06:06:36,055] [INFO] N=200 | F1=0.443[0.378,0.508] | unknown_rate=0.155
[2025-10-03 06:06:36,068] [INFO] HF SQuAD | EM=37.00% | F1=44.37%
[2025-10-03 06:06:36,070] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_5_emb_384_instruction', 'n': 200, 'f1_mean': 0.4428235916748713, 'unknown_rate': 0.155, 'f1_lo': 0.3780875183444191, 'f1_hi': 0.5075596650053235, 'em_mean': 0.37, 'hf_f1': 0.443734456997173}
[2025-10-03 06:06:36,213] [INFO] Loaded passages: 3114
[2025-10-03 06:06:36,213] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/623 [00:00<?, ?it/s]

[2025-10-03 06:06:46,521] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 06:06:47,424] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 932067.56it/s]
[2025-10-03 06:07:25,387] [INFO] Wrote results to ../results/naive_results_5_emb_384_cot.json
[2025-10-03 06:07:25,391] [INFO] N=200 | F1=0.489[0.424,0.554] | unknown_rate=0.020
[2025-10-03 06:07:25,404] [INFO] HF SQuAD | EM=41.50% | F1=49.01%
[2025-10-03 06:07:25,407] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_5_emb_384_cot', 'n': 200, 'f1_mean': 0.48915347435255335, 'unknown_rate': 0.02, 'f1_lo': 0.4241258003847635, 'f1_hi': 0.5541811483203432, 'em_mean': 0.415, 'hf_f1': 0.4900643396748817}
[2025-10-03 06:07:25,541] [INFO] Loaded passages: 3114
[2025-10-03 06:07:25,542] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/623 [00:00<?, ?it/s]

[2025-10-03 06:07:36,193] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 06:07:37,098] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 947865.31it/s]
[2025-10-03 06:08:13,808] [INFO] Wrote results to ../results/naive_results_5_emb_384_persona.json
[2025-10-03 06:08:13,812] [INFO] N=200 | F1=0.497[0.432,0.563] | unknown_rate=0.055
[2025-10-03 06:08:13,824] [INFO] HF SQuAD | EM=42.50% | F1=49.84%
[2025-10-03 06:08:13,826] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_5_emb_384_persona', 'n': 200, 'f1_mean': 0.4974510934001717, 'unknown_rate': 0.055, 'f1_lo': 0.43215458236585164, 'f1_hi': 0.5627476044344917, 'em_mean': 0.425, 'hf_f1': 0.49836195872250066}
[2025-10-03 06:08:13,967] [INFO] Loaded passages: 3114
[2025-10-03 06:08:13,967] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/312 [00:00<?, ?it/s]

[2025-10-03 06:08:22,050] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 06:08:22,956] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 966429.49it/s]
[2025-10-03 06:08:59,265] [INFO] Wrote results to ../results/naive_results_10_emb_384_instruction.json
[2025-10-03 06:08:59,268] [INFO] N=200 | F1=0.443[0.378,0.508] | unknown_rate=0.155
[2025-10-03 06:08:59,281] [INFO] HF SQuAD | EM=37.00% | F1=44.37%
[2025-10-03 06:08:59,283] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_10_emb_384_instruction', 'n': 200, 'f1_mean': 0.4428235916748713, 'unknown_rate': 0.155, 'f1_lo': 0.3780875183444191, 'f1_hi': 0.5075596650053235, 'em_mean': 0.37, 'hf_f1': 0.443734456997173}
[2025-10-03 06:08:59,415] [INFO] Loaded passages: 3114
[2025-10-03 06:08:59,416] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/312 [00:00<?, ?it/s]

[2025-10-03 06:09:07,283] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 06:09:08,179] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 934143.43it/s]
[2025-10-03 06:09:46,227] [INFO] Wrote results to ../results/naive_results_10_emb_384_cot.json
[2025-10-03 06:09:46,230] [INFO] N=200 | F1=0.489[0.424,0.554] | unknown_rate=0.020
[2025-10-03 06:09:46,243] [INFO] HF SQuAD | EM=41.50% | F1=49.01%
[2025-10-03 06:09:46,246] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_10_emb_384_cot', 'n': 200, 'f1_mean': 0.48915347435255335, 'unknown_rate': 0.02, 'f1_lo': 0.4241258003847635, 'f1_hi': 0.5541811483203432, 'em_mean': 0.415, 'hf_f1': 0.4900643396748817}
[2025-10-03 06:09:46,370] [INFO] Loaded passages: 3114
[2025-10-03 06:09:46,371] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/312 [00:00<?, ?it/s]

[2025-10-03 06:09:54,097] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 06:09:54,996] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 939373.80it/s]
[2025-10-03 06:10:31,750] [INFO] Wrote results to ../results/naive_results_10_emb_384_persona.json
[2025-10-03 06:10:31,754] [INFO] N=200 | F1=0.497[0.432,0.563] | unknown_rate=0.055
[2025-10-03 06:10:31,766] [INFO] HF SQuAD | EM=42.50% | F1=49.84%
[2025-10-03 06:10:31,769] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_10_emb_384_persona', 'n': 200, 'f1_mean': 0.4974510934001717, 'unknown_rate': 0.055, 'f1_lo': 0.43215458236585164, 'f1_hi': 0.5627476044344917, 'em_mean': 0.425, 'hf_f1': 0.49836195872250066}
[2025-10-03 06:10:31,904] [INFO] Loaded passages: 3114
[2025-10-03 06:10:31,905] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/1038 [00:00<?, ?it/s]

[2025-10-03 06:10:46,433] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 06:10:47,344] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 943600.45it/s]
[2025-10-03 06:11:23,806] [INFO] Wrote results to ../results/naive_results_3_emb_512_instruction.json
[2025-10-03 06:11:23,810] [INFO] N=200 | F1=0.443[0.378,0.508] | unknown_rate=0.155
[2025-10-03 06:11:23,823] [INFO] HF SQuAD | EM=37.00% | F1=44.37%
[2025-10-03 06:11:23,826] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_3_emb_512_instruction', 'n': 200, 'f1_mean': 0.4428235916748713, 'unknown_rate': 0.155, 'f1_lo': 0.3780875183444191, 'f1_hi': 0.5075596650053235, 'em_mean': 0.37, 'hf_f1': 0.443734456997173}
[2025-10-03 06:11:23,952] [INFO] Loaded passages: 3114
[2025-10-03 06:11:23,952] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/1038 [00:00<?, ?it/s]

[2025-10-03 06:11:37,956] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 06:11:38,870] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 925894.92it/s]
[2025-10-03 06:12:17,058] [INFO] Wrote results to ../results/naive_results_3_emb_512_cot.json
[2025-10-03 06:12:17,062] [INFO] N=200 | F1=0.489[0.424,0.554] | unknown_rate=0.020
[2025-10-03 06:12:17,074] [INFO] HF SQuAD | EM=41.50% | F1=49.01%
[2025-10-03 06:12:17,077] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_3_emb_512_cot', 'n': 200, 'f1_mean': 0.48915347435255335, 'unknown_rate': 0.02, 'f1_lo': 0.4241258003847635, 'f1_hi': 0.5541811483203432, 'em_mean': 0.415, 'hf_f1': 0.4900643396748817}
[2025-10-03 06:12:17,209] [INFO] Loaded passages: 3114
[2025-10-03 06:12:17,210] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/1038 [00:00<?, ?it/s]

[2025-10-03 06:12:31,215] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 06:12:32,118] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 897177.33it/s]
[2025-10-03 06:13:09,456] [INFO] Wrote results to ../results/naive_results_3_emb_512_persona.json
[2025-10-03 06:13:09,460] [INFO] N=200 | F1=0.497[0.432,0.563] | unknown_rate=0.055
[2025-10-03 06:13:09,472] [INFO] HF SQuAD | EM=42.50% | F1=49.84%
[2025-10-03 06:13:09,476] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_3_emb_512_persona', 'n': 200, 'f1_mean': 0.4974510934001717, 'unknown_rate': 0.055, 'f1_lo': 0.43215458236585164, 'f1_hi': 0.5627476044344917, 'em_mean': 0.425, 'hf_f1': 0.49836195872250066}
[2025-10-03 06:13:09,609] [INFO] Loaded passages: 3114
[2025-10-03 06:13:09,609] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/623 [00:00<?, ?it/s]

[2025-10-03 06:13:20,021] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 06:13:20,931] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 917790.81it/s]
[2025-10-03 06:13:57,120] [INFO] Wrote results to ../results/naive_results_5_emb_512_instruction.json
[2025-10-03 06:13:57,123] [INFO] N=200 | F1=0.443[0.378,0.508] | unknown_rate=0.155
[2025-10-03 06:13:57,137] [INFO] HF SQuAD | EM=37.00% | F1=44.37%
[2025-10-03 06:13:57,141] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_5_emb_512_instruction', 'n': 200, 'f1_mean': 0.4428235916748713, 'unknown_rate': 0.155, 'f1_lo': 0.3780875183444191, 'f1_hi': 0.5075596650053235, 'em_mean': 0.37, 'hf_f1': 0.443734456997173}
[2025-10-03 06:13:57,270] [INFO] Loaded passages: 3114
[2025-10-03 06:13:57,270] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/623 [00:00<?, ?it/s]

[2025-10-03 06:14:07,611] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 06:14:08,514] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 946795.49it/s]
[2025-10-03 06:14:46,526] [INFO] Wrote results to ../results/naive_results_5_emb_512_cot.json
[2025-10-03 06:14:46,530] [INFO] N=200 | F1=0.489[0.424,0.554] | unknown_rate=0.020
[2025-10-03 06:14:46,542] [INFO] HF SQuAD | EM=41.50% | F1=49.01%
[2025-10-03 06:14:46,545] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_5_emb_512_cot', 'n': 200, 'f1_mean': 0.48915347435255335, 'unknown_rate': 0.02, 'f1_lo': 0.4241258003847635, 'f1_hi': 0.5541811483203432, 'em_mean': 0.415, 'hf_f1': 0.4900643396748817}
[2025-10-03 06:14:46,671] [INFO] Loaded passages: 3114
[2025-10-03 06:14:46,671] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/623 [00:00<?, ?it/s]

[2025-10-03 06:14:56,921] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 06:14:57,818] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 893355.48it/s]
[2025-10-03 06:15:35,398] [INFO] Wrote results to ../results/naive_results_5_emb_512_persona.json
[2025-10-03 06:15:35,402] [INFO] N=200 | F1=0.497[0.432,0.563] | unknown_rate=0.055
[2025-10-03 06:15:35,416] [INFO] HF SQuAD | EM=42.50% | F1=49.84%
[2025-10-03 06:15:35,419] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_5_emb_512_persona', 'n': 200, 'f1_mean': 0.4974510934001717, 'unknown_rate': 0.055, 'f1_lo': 0.43215458236585164, 'f1_hi': 0.5627476044344917, 'em_mean': 0.425, 'hf_f1': 0.49836195872250066}
[2025-10-03 06:15:35,557] [INFO] Loaded passages: 3114
[2025-10-03 06:15:35,557] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/312 [00:00<?, ?it/s]

[2025-10-03 06:15:43,719] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 06:15:44,628] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 812849.61it/s]
[2025-10-03 06:16:21,398] [INFO] Wrote results to ../results/naive_results_10_emb_512_instruction.json
[2025-10-03 06:16:21,402] [INFO] N=200 | F1=0.443[0.378,0.508] | unknown_rate=0.155
[2025-10-03 06:16:21,417] [INFO] HF SQuAD | EM=37.00% | F1=44.37%
[2025-10-03 06:16:21,421] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_10_emb_512_instruction', 'n': 200, 'f1_mean': 0.4428235916748713, 'unknown_rate': 0.155, 'f1_lo': 0.3780875183444191, 'f1_hi': 0.5075596650053235, 'em_mean': 0.37, 'hf_f1': 0.443734456997173}
[2025-10-03 06:16:21,561] [INFO] Loaded passages: 3114
[2025-10-03 06:16:21,562] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/312 [00:00<?, ?it/s]

[2025-10-03 06:16:29,443] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 06:16:30,354] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 975419.53it/s]
[2025-10-03 06:17:08,530] [INFO] Wrote results to ../results/naive_results_10_emb_512_cot.json
[2025-10-03 06:17:08,534] [INFO] N=200 | F1=0.489[0.424,0.554] | unknown_rate=0.020
[2025-10-03 06:17:08,546] [INFO] HF SQuAD | EM=41.50% | F1=49.01%
[2025-10-03 06:17:08,548] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_10_emb_512_cot', 'n': 200, 'f1_mean': 0.48915347435255335, 'unknown_rate': 0.02, 'f1_lo': 0.4241258003847635, 'f1_hi': 0.5541811483203432, 'em_mean': 0.415, 'hf_f1': 0.4900643396748817}
[2025-10-03 06:17:08,685] [INFO] Loaded passages: 3114
[2025-10-03 06:17:08,686] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/312 [00:00<?, ?it/s]

[2025-10-03 06:17:17,716] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 06:17:18,681] [INFO] Loaded queries: 200


Entity count: 3114
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [00:00<00:00, 943600.45it/s]
[2025-10-03 06:17:55,303] [INFO] Wrote results to ../results/naive_results_10_emb_512_persona.json
[2025-10-03 06:17:55,307] [INFO] N=200 | F1=0.497[0.432,0.563] | unknown_rate=0.055
[2025-10-03 06:17:55,318] [INFO] HF SQuAD | EM=42.50% | F1=49.84%
[2025-10-03 06:17:55,321] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'top_10_emb_512_persona', 'n': 200, 'f1_mean': 0.4974510934001717, 'unknown_rate': 0.055, 'f1_lo': 0.43215458236585164, 'f1_hi': 0.5627476044344917, 'em_mean': 0.425, 'hf_f1': 0.49836195872250066}



## Step 5 & 6: RAGAs (faithfulness & retrieval quality)


In [11]:
import importlib

args_enh = Namespace(
    passages_uri=str(PASSAGES),
    queries_uri=str(QUERIES),
    embed_model="sentence-transformers/all-MiniLM-L6-v2",
    gen_model="google/flan-t5-base",
    embed_batch_size=128,
    retrieve_candidates=50,     
    top_k=5,                    
    rerank_model="BAAI/bge-reranker-base",
    rerank_max_length=512,
    min_conf=0.25,              
    min_margin=0.05,
    abstain_on_low_conf=True,
    max_new_tokens=128,
    temperature=0.0,
    max_queries=200,
    output_file="enhanced_results_baseline_top5.json",
    seed=42,
    milvus_db_path=str(DBFILE),
    milvus_collection="rag_enhanced_top5",
    milvus_drop=True,
    milvus_index_type="IVF_FLAT",
    milvus_metric="IP",
    prompt=utils.DEFAULT_SYSTEM_PROMPT
)

importlib.reload(enhanced_rag)
enhanced_rag.run(args_enh)

enhanced_path = OUTDIR / args_enh.output_file
evalmod.evaluate_once(
    results_path=str(enhanced_path),
    tag="enhanced_top5_persona_rerank"
)

[2025-10-03 06:17:55,483] [INFO] Loaded passages=3114 queries=200
[2025-10-03 06:17:55,483] [INFO] Encoding 3114 passages with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

[2025-10-03 06:18:02,635] [INFO] Embeddings shape: (3114, 384)
[2025-10-03 06:18:03,578] [INFO] Loading CrossEncoder: BAAI/bge-reranker-base


Entity count: 3114
Collection schema: {'collection_name': 'rag_enhanced_top5', 'auto_id': False, 'num_shards': 0, 'description': 'RAG passages (id, passage, embedding)', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


100%|██████████| 200/200 [07:36<00:00,  2.28s/it]
[2025-10-03 06:25:42,827] [INFO] Wrote enhanced results to ../results/enhanced_results_baseline_top5.json
[2025-10-03 06:25:42,843] [INFO] N=200 | F1=0.090[0.054,0.127] | unknown_rate=0.545
[2025-10-03 06:25:42,858] [INFO] HF SQuAD | EM=6.50% | F1=9.10%
[2025-10-03 06:25:42,861] [INFO] Appended to ../results/comparison_analysis.csv: {'tag': 'enhanced_top5_persona_rerank', 'n': 200, 'f1_mean': 0.09040029385639127, 'unknown_rate': 0.545, 'f1_lo': 0.05413691067911971, 'f1_hi': 0.12666367703366282, 'em_mean': 0.065, 'hf_f1': 0.09099614276934323}


Unnamed: 0,qid,f1,unknown
0,0,0.000000,1
1,2,0.000000,1
2,4,0.000000,1
3,6,0.000000,1
4,8,0.000000,1
...,...,...,...
195,436,0.000000,1
196,438,1.000000,0
197,440,0.000000,1
198,442,0.000000,1


In [40]:
# Map tags -> results json paths you want to score with RAGAs
runs = {
    "naive_top1_instruction": OUTDIR / "naive_results_instruction.json",
    "naive_top1_cot":         OUTDIR / "naive_results_cot.json",
    "naive_top1_persona":     OUTDIR / "naive_results_persona.json",
   "enhanced_top5_persona_rerank": OUTDIR / "enhanced_results_baseline_top5.json",
}

runs

{'naive_top1_instruction': PosixPath('/Users/diegoalejandrobermudezsierra/Desktop/CMU/Courses/Third Semester/Mini 1/Application of NLX/HW/assignment2-rag/results/naive_results_instruction.json'),
 'naive_top1_cot': PosixPath('/Users/diegoalejandrobermudezsierra/Desktop/CMU/Courses/Third Semester/Mini 1/Application of NLX/HW/assignment2-rag/results/naive_results_cot.json'),
 'naive_top1_persona': PosixPath('/Users/diegoalejandrobermudezsierra/Desktop/CMU/Courses/Third Semester/Mini 1/Application of NLX/HW/assignment2-rag/results/naive_results_persona.json'),
 'enhanced_top5_persona_rerank': PosixPath('/Users/diegoalejandrobermudezsierra/Desktop/CMU/Courses/Third Semester/Mini 1/Application of NLX/HW/assignment2-rag/results/enhanced_results_baseline_top5.json')}

In [None]:
import importlib
importlib.reload(enhanced_rag)

# Judge model (good balance of cost/latency)
RAGAS_JUDGE_MODEL = "gpt-4o-mini"

# Env knobs used by ragas_evaluation.eval_one()
import os
os.environ.update({
    "RAGAS_BATCH": "50",          
    "RAGAS_WORKERS": "2",         
    "RAGAS_CTX_CAP": "5",         
    "RAGAS_CTX_MAX_CHARS": "1800",
    "RAGAS_LLM_TIMEOUT": "60",    
    "RAGAS_LLM_RETRIES": "6",     
    "RAGAS_SAMPLE_N": "200",
})


summaries = []
for tag, path in runs.items():
    print(f"--- RAGAs evaluating: {tag} ---")
    s = ragas_evaluation.eval_one(
        results_json=str(path),
        passages_csv=str(PASSAGES),
        tag=tag,
        out_dir=str(OUTDIR),
        model_name=RAGAS_JUDGE_MODEL,
    )
    summaries.append(s)

import pandas as pd
pd.DataFrame(summaries).sort_values("tag")

--- RAGAs evaluating: naive_top1_instruction ---


Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]

Exception raised in Job[0]: TimeoutError()
Exception raised in Job[6]: TimeoutError()
Exception raised in Job[8]: TimeoutError()
Exception raised in Job[16]: TimeoutError()
Exception raised in Job[20]: TimeoutError()
Exception raised in Job[29]: TimeoutError()
Exception raised in Job[48]: TimeoutError()
Exception raised in Job[76]: TimeoutError()
Exception raised in Job[80]: TimeoutError()
Exception raised in Job[88]: TimeoutError()
Exception raised in Job[89]: TimeoutError()
Exception raised in Job[96]: TimeoutError()
Exception raised in Job[102]: TimeoutError()
Exception raised in Job[147]: IndexError(list index out of range)
Exception raised in Job[136]: TimeoutError()
Exception raised in Job[140]: TimeoutError()
Exception raised in Job[159]: IndexError(list index out of range)
Exception raised in Job[173]: TimeoutError()
Exception raised in Job[174]: TimeoutError()


Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]

Exception raised in Job[2]: TimeoutError()
Exception raised in Job[32]: TimeoutError()
Exception raised in Job[44]: TimeoutError()
Exception raised in Job[60]: TimeoutError()
Exception raised in Job[99]: IndexError(list index out of range)
Exception raised in Job[76]: TimeoutError()
Exception raised in Job[97]: TimeoutError()
Exception raised in Job[100]: TimeoutError()
Exception raised in Job[115]: TimeoutError()
Exception raised in Job[120]: TimeoutError()
Exception raised in Job[138]: TimeoutError()
Exception raised in Job[167]: TimeoutError()
Exception raised in Job[176]: TimeoutError()
Exception raised in Job[180]: TimeoutError()
Exception raised in Job[188]: TimeoutError()
Exception raised in Job[195]: TimeoutError()


Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]

Exception raised in Job[0]: TimeoutError()
Exception raised in Job[15]: TimeoutError()
Exception raised in Job[51]: IndexError(list index out of range)
Exception raised in Job[38]: TimeoutError()
Exception raised in Job[42]: TimeoutError()
Exception raised in Job[54]: TimeoutError()
Exception raised in Job[55]: TimeoutError()
Exception raised in Job[79]: IndexError(list index out of range)
Exception raised in Job[76]: TimeoutError()
Exception raised in Job[88]: TimeoutError()
Exception raised in Job[96]: TimeoutError()
Exception raised in Job[107]: IndexError(list index out of range)


KeyboardInterrupt: 

: 

In [39]:
summary_csv = OUTDIR / "ragas_summary.csv"

summary_df = pd.read_csv(summary_csv)
summary_df.sort_values(["tag"]).reset_index(drop=True)


summary_df = summary_df[((summary_df['n'] > 100) & (~pd.isnull(summary_df['ctx_char_cap'])))]
summary_df = summary_df.sort_values(['faithfulness', 'context_precision', 'context_recall', 'answer_relevancy'], ascending=False)
summary_df = summary_df.drop_duplicates('tag').reset_index(drop=True)
summary_df.to_csv(summary_csv, index=False)
summary_df = summary_df.drop(['ctx_max_chars', 'sample_n'], axis = 1)
summary_df

Unnamed: 0,tag,n,faithfulness,context_precision,context_recall,answer_relevancy,ctx_cap,batch,judge,ctx_char_cap
0,naive_top1_instruction,197,0.025079,0.023142,0.02279,0.612359,5,25,gpt-4o-mini,1800.0
1,naive_top1_persona,197,0.015881,0.025381,0.035533,0.641975,5,25,gpt-4o-mini,1800.0
2,naive_top1_cot,197,0.00705,0.032416,0.025804,0.70025,5,25,gpt-4o-mini,1800.0
3,enhanced_top5_persona_rerank,200,0.0,0.0,0.054167,0.28809,5,25,gpt-4o-mini,1800.0
