In [4]:
!wget -r -N -c -np --user jashrajm --ask-password https://physionet.org/files/archehr-qa-bionlp-task-2025/1.1/

Password for user ‘jashrajm’: ^C


In [5]:
import xml.etree.ElementTree as ET
import pandas as pd

# Load XML file
xml_file ="/content/physionet.org/files/archehr-qa-bionlp-task-2025/1.1/dev/archehr-qa.xml" # Replace with your actual file
tree = ET.parse(xml_file)
root = tree.getroot()

# List to store extracted data
data = []

# Iterate over each <case> element
for case in root.findall("case"):
    row_data = {}

    # Extracting attributes
    row_data["case_id"] = case.get("id", "N/A")

    # Extracting text fields (strip to remove unwanted spaces/newlines)
    row_data["patient_narrative"] = case.find("patient_narrative").text.strip() if case.find("patient_narrative") is not None else "N/A"
    row_data["clinician_question"] = case.find("clinician_question").text.strip() if case.find("clinician_question") is not None else "N/A"
    row_data["note_excerpt"] = case.find("note_excerpt").text.strip() if case.find("note_excerpt") is not None else "N/A"

    # Extracting phrases from <patient_question> into a dictionary
    phrases_dict = {
        phrase.get("id", "N/A"): phrase.text.strip()
        for phrase in case.findall("patient_question/phrase") if phrase.text
    }
    row_data["patient_question"] = phrases_dict

    # Extracting sentences from <note_excerpt_sentences> into a dictionary
    sentences_dict = {
        sentence.get("id", "N/A"): sentence.text.strip()
        for sentence in case.findall("note_excerpt_sentences/sentence") if sentence.text
    }
    row_data["note_excerpt_sentences"] = sentences_dict

    # Append row data
    data.append(row_data)

# Convert to DataFrame
df = pd.DataFrame(data)

# Display DataFrame
print(df)


   case_id                                  patient_narrative  \
0        1  I had severe abdomen pain and was hospitalised...   
1        2  I just wrote about my dad given multiple shots...   
2        3  my son fell and lost conciousness for a couple...   
3        4  I am 48 years old. On February 20, I passed ou...   
4        5  I overdosed October 4th on trihexyphenidyl, th...   
5        6  My dad has been killed recently by doctors tre...   
6        7  My mother n law had a surgery about 10 years a...   
7        8  I developed jaundice and had liver failure.I w...   
8        9  I 8month delivered baby twins but the operatio...   
9       10  Hi There, my best friend who is 36 yrs old hav...   
10      11  A friend went to the emergency room this past ...   
11      12  My Mom just called me complaining of stomach p...   
12      13  I have a tumor in my back and I have 3 disc bu...   
13      14  I have had for almost one month extreme pain i...   
14      15  Hi. I came to

In [6]:
df.head(1)

Unnamed: 0,case_id,patient_narrative,clinician_question,note_excerpt,patient_question,note_excerpt_sentences
0,1,I had severe abdomen pain and was hospitalised...,Why was ERCP recommended to him over continuin...,Brief Hospital Course:\n\nDuring the ERCP a pa...,{'0': 'My question is if the sludge was there ...,"{'0': 'Brief Hospital Course:', '1': 'During t..."


In [7]:
df['patient_question'][3]

{'0': 'My doctor performed a cardiac catherization.',
 '1': 'Was this invasive, risky procedure necessary.'}

In [8]:
df['note_excerpt_sentences'][2]

{'0': 'Discharge Instructions:\nYou were admitted to the hospital after sustaining a trauamtic\nbrain injury due to a fall.',
 '1': 'This injury did not require any\noperations.',
 '2': 'It was also noted on chest xray that there is an\narea of consolidation in your lung.',
 '3': 'The infectious Disease\ndoctors [**First Name (Titles) **] [**Last Name (Titles) 4221**] for recommendations on antibiotic\ntreatment.',
 '4': 'Because of your head injury you may experience some of the\nfollowing symptoms: drowsiness, headaches, dizziness,\nirritability, short term memory loss - these are all normal and\nshould decrease over the next\nseveral weeks.',
 '5': 'It is being strongly recommended that you follow\nup with Dr. [**First Name8 (NamePattern2) **] [**Last Name (NamePattern1) **] who is a doctor [**First Name (Titles) **] [**Last Name (Titles) 91506**]\nin trauamtic brain injuries.',
 '6': 'His contact information has been\nprovided to you.',
 '7': 'Please report any:',
 '8': '*fever\n*n

In [9]:
import json

# Load JSON file
json_file = "/content/physionet.org/files/archehr-qa-bionlp-task-2025/1.1/dev/archehr-qa_key.json"  # Replace with your actual file path
with open(json_file, "r") as file:
    json_data = json.load(file)

# Convert JSON into a dictionary for quick lookup
case_relevance_dict = {}
for item in json_data:  # Iterating over each case in the JSON file
    case_id = item["case_id"]
    relevance_dict = {"not-relevant": [], "supplementary": [], "essential": []}

    for answer in item["answers"]:
        sentence_id = answer["sentence_id"]
        relevance = answer["relevance"]
        if relevance in relevance_dict:
            relevance_dict[relevance].append(sentence_id)

    case_relevance_dict[case_id] = relevance_dict

# Function to get sentence lists based on case_id
def get_relevance(case_id, relevance_type):
    return case_relevance_dict.get(str(case_id), {}).get(relevance_type, [])

# Adding new columns to the DataFrame
df["not-relevant"] = df["case_id"].apply(lambda x: get_relevance(x, "not-relevant"))
df["supplementary"] = df["case_id"].apply(lambda x: get_relevance(x, "supplementary"))
df["essential"] = df["case_id"].apply(lambda x: get_relevance(x, "essential"))

# Display updated DataFrame
print(df)


   case_id                                  patient_narrative  \
0        1  I had severe abdomen pain and was hospitalised...   
1        2  I just wrote about my dad given multiple shots...   
2        3  my son fell and lost conciousness for a couple...   
3        4  I am 48 years old. On February 20, I passed ou...   
4        5  I overdosed October 4th on trihexyphenidyl, th...   
5        6  My dad has been killed recently by doctors tre...   
6        7  My mother n law had a surgery about 10 years a...   
7        8  I developed jaundice and had liver failure.I w...   
8        9  I 8month delivered baby twins but the operatio...   
9       10  Hi There, my best friend who is 36 yrs old hav...   
10      11  A friend went to the emergency room this past ...   
11      12  My Mom just called me complaining of stomach p...   
12      13  I have a tumor in my back and I have 3 disc bu...   
13      14  I have had for almost one month extreme pain i...   
14      15  Hi. I came to

In [10]:
df.head(1)

Unnamed: 0,case_id,patient_narrative,clinician_question,note_excerpt,patient_question,note_excerpt_sentences,not-relevant,supplementary,essential
0,1,I had severe abdomen pain and was hospitalised...,Why was ERCP recommended to him over continuin...,Brief Hospital Course:\n\nDuring the ERCP a pa...,{'0': 'My question is if the sludge was there ...,"{'0': 'Brief Hospital Course:', '1': 'During t...","[0, 2, 3, 4, 8]",[],"[1, 5, 6, 7]"


In [11]:
df['not-relevant']

Unnamed: 0,not-relevant
0,"[0, 2, 3, 4, 8]"
1,"[0, 2, 3, 5, 6, 8, 9, 10]"
2,"[0, 1, 2, 3, 6]"
3,"[0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
4,"[0, 1, 3, 4, 5, 6, 7, 13, 14, 19, 20, 21, 22]"
5,"[0, 2, 3, 5, 7, 8, 9, 10, 23, 24]"
6,"[0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16..."
7,"[0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14..."
8,"[4, 5, 6, 8, 9, 10, 11, 12, 16, 17, 18]"
9,"[0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1..."


In [19]:
import pandas as pd
import numpy as np

# Convert comma-separated indices to lists and calculate their lengths
df["essential_length"] = df["essential"].apply(lambda x: len(x) if x else 0)
df["supplementary_length"] = df["supplementary"].apply(lambda x: len(x) if x else 0)

# Create a combined DataFrame with statistics
stats_df = pd.DataFrame({
    "Category": ["Essential", "Supplementary"],
    "Average Length": [np.mean(df["essential_length"]), np.mean(df["supplementary_length"])],
    "Median Length": [np.median(df["essential_length"]), np.median(df["supplementary_length"])],
    "max len":[np.max(df["essential_length"]), np.max(df["supplementary_length"])],
    "min len":[np.min(df["essential_length"]), np.min(df["supplementary_length"])]
})

# Display final DataFrame
print(stats_df)


        Category  Average Length  Median Length  max len  min len
0      Essential            6.90            5.5       15        2
1  Supplementary            2.55            1.0       14        0


In [15]:
# Convert comma-separated indices to lists and calculate their lengths
df["essential_length"] = df["essential"].apply(lambda x: len(x) if x else 0)
df["supplementary_length"] = df["supplementary"].apply(lambda x: len(x) if x else 0)

# Combine both lengths into a single series
all_lengths = df["essential_length"].tolist() + df["supplementary_length"].tolist()

# Compute overall mean and median
overall_mean = np.mean(all_lengths)
overall_median = np.median(all_lengths)

# Create a final DataFrame
stats_df = pd.DataFrame({
    "Metric": ["Overall Mean", "Overall Median"],
    "Value": [overall_mean, overall_median]
})

# Display final DataFrame
print(stats_df)

           Metric  Value
0    Overall Mean  4.725
1  Overall Median  3.500


In [18]:
overall_mean = np.mean(all_lengths)
overall_median = np.median(all_lengths)
overall_max = np.max(all_lengths)
overall_min=np.min(all_lengths)

# Create a final DataFrame
stats_df = pd.DataFrame({
    "Metric": ["Overall Mean", "Overall Median", "Overall Max","Overall_min"],
    "Value": [overall_mean, overall_median, overall_max,overall_min]
})

# Display final DataFrame
print(stats_df)

           Metric   Value
0    Overall Mean   4.725
1  Overall Median   3.500
2     Overall Max  15.000
3     Overall_min   0.000


In [25]:
!pip install chromadb



In [26]:
!rm -rf chroma_db

In [None]:
import chromadb
import pandas as pd
from sentence_transformers import SentenceTransformer

# Initialize ChromaDB client
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Define embedding models
embedding_models = [
    "pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb"
]


results_df = pd.DataFrame(columns=["Model", "case_id", "Query", "Retrieved Keys", "Essential Keys", "Supplementary Keys", "Precision", "Recall", "F1-Score"])

# Iterate over embedding models
for model_name in embedding_models:
    print("model_name")
    model = SentenceTransformer(model_name)

    # Iterate over each row
    for index, row in df.iterrows():
        case_id = row["case_id"]
        query_text = row["clinician_question"]

        # Directly use the dictionary (no need for json.loads)
        note_excerpt_dict = row["note_excerpt_sentences"]
        if not isinstance(note_excerpt_dict, dict):  # Ensure it is a dictionary
            continue

        note_excerpt_pairs = list(note_excerpt_dict.items())  # Extract (key, sentence) pairs

        essential_keys = set(row["essential"])
        print(essential_keys)  # Ensure string conversion
        supplementary_keys = set(row["supplementary"])

        # Reset ChromaDB for each row
        collection_name = f"temp_collection_{case_id}"
        try:
            chroma_client.delete_collection(collection_name)
        except:
            pass  # Ignore if collection doesn't exist

        collection = chroma_client.create_collection(name=collection_name)

        # Store embeddings with keys
        for key, sentence in note_excerpt_pairs:
            embedding = model.encode(sentence).tolist()
            collection.add(
                ids=[key],  # Store the key
                embeddings=[embedding],
                metadatas=[{"key": key, "text": sentence}]
            )

        # Retrieve based on query
        query_embedding = model.encode(query_text).tolist()
        k = len(note_excerpt_pairs)//2
        retrieved_results = collection.query(query_embedding, n_results=k)

        # Extract retrieved keys
        retrieved_keys = set(res["key"] for res in retrieved_results["metadatas"][0])
        print(retrieved_keys)
        # Calculate metrics
        tp = len(retrieved_keys & (essential_keys | supplementary_keys))
        fp = len(retrieved_keys - (essential_keys | supplementary_keys))
        fn = len(essential_keys - retrieved_keys)
        print(tp,fp,fn)
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        # Store results
        results_df = pd.concat([results_df, pd.DataFrame([{
            "Model": model_name,
            "case_id": case_id,
            "Query": query_text,
            "Retrieved Keys": list(retrieved_keys),
            "Essential Keys": list(essential_keys),
            "Supplementary Keys": list(supplementary_keys),
            "Precision": precision,
            "Recall": recall,
            "F1-Score": f1
        }])], ignore_index=True)

# Save results
results_df.to_csv("retrieval_results.csv", index=False)


model_name
{'1', '6', '7', '5'}
{'1', '6', '3', '5'}
3 1 1
{'1', '4', '7'}


  results_df = pd.concat([results_df, pd.DataFrame([{


{'3', '1', '5', '6', '4'}
2 3 1
{'4', '5'}
{'0', '6', '5', '4', '7'}
3 2 0
{'15', '4', '17', '18'}
{'15', '18', '8', '11', '17', '0', '1', '6', '10', '7'}
3 7 1
{'15', '16', '18', '8', '11', '2', '12', '17', '9', '10'}
{'15', '22', '8', '11', '3', '2', '17', '9', '1', '16', '20'}
7 4 3
{'15', '22', '14', '19', '18', '11', '12', '13', '17', '21', '1', '6', '16', '4', '20'}
{'15', '22', '19', '23', '16', '12', '0', '1', '10', '4', '7', '24'}
7 5 8
{'22', '14', '19', '23', '18', '13', '2', '20'}
{'22', '15', '25', '19', '18', '8', '11', '2', '3', '12', '29', '9', '5', '10', '20'}
5 10 3
{'22', '38', '41', '23', '44', '52', '53', '48', '51', '40', '42', '43', '45', '37', '49'}
{'41', '29', '17', '53', '10', '26', '44', '46', '43', '40', '38', '22', '14', '52', '11', '2', '31', '47', '13', '16', '19', '21', '36', '1', '42', '37', '24'}
12 15 5
{'15', '14', '19', '2', '0', '1', '7', '20'}
{'18', '8', '11', '2', '3', '0', '9', '6', '10', '4'}
3 7 6
{'22', '26', '23', '30', '21', '2', '29', '2

In [None]:
import chromadb
import pandas as pd
from sentence_transformers import SentenceTransformer

# Initialize ChromaDB client
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Define embedding models
embedding_models = [
    "pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb"
]

# DataFrame to store results
results_df = pd.DataFrame(columns=[
    "Model", "case_id", "Query", "Retrieved Keys", "Essential Keys", "Supplementary Keys", "Precision", "Recall", "F1-Score"
])

# Initialize cumulative metrics
total_tp, total_fp, total_fn = 0, 0, 0

# Iterate over embedding models
for model_name in embedding_models:
    print("Processing model:", model_name)
    model = SentenceTransformer(model_name)

    # Iterate over each row
    for index, row in df.iterrows():
        case_id = row["case_id"]
        query_text = row["clinician_question"]

        # Ensure note_excerpt_sentences is a dictionary
        note_excerpt_dict = row["note_excerpt_sentences"]
        if not isinstance(note_excerpt_dict, dict):
            continue

        note_excerpt_pairs = list(note_excerpt_dict.items())  # Extract (key, sentence) pairs

        essential_keys = set(row["essential"])
        supplementary_keys = set(row["supplementary"])

        # Reset ChromaDB for each row
        collection_name = f"temp_collection_{case_id}"
        try:
            chroma_client.delete_collection(collection_name)
        except:
            pass  # Ignore if collection doesn't exist

        collection = chroma_client.create_collection(name=collection_name)

        # Store embeddings with keys
        for key, sentence in note_excerpt_pairs:
            embedding = model.encode(sentence).tolist()
            collection.add(
                ids=[key],  # Store the key
                embeddings=[embedding],
                metadatas=[{"key": key, "text": sentence}]
            )

        # Retrieve based on query
        query_embedding = model.encode(query_text).tolist()
        k = max(1, len(note_excerpt_pairs) // 2)
        retrieved_results = collection.query(query_embedding, n_results=k)

        # Extract retrieved keys
        retrieved_keys = set(res["key"] for res in retrieved_results["metadatas"][0])

        # Compute TP, FP, FN
        tp = len(retrieved_keys & (essential_keys | supplementary_keys))
        fp = len(retrieved_keys - (essential_keys | supplementary_keys))
        fn = len((essential_keys | supplementary_keys) - retrieved_keys)
        # Update cumulative metrics
        total_tp += tp
        total_fp += fp
        total_fn += fn

        # Compute precision, recall, F1-score for this query
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        # Store per-query results
        results_df = pd.concat([results_df, pd.DataFrame([{
            "Model": model_name,
            "case_id": case_id,
            "Query": query_text,
            "Retrieved Keys": list(retrieved_keys),
            "Essential Keys": list(essential_keys),
            "Supplementary Keys": list(supplementary_keys),
            "Precision": precision,
            "Recall": recall,
            "F1-Score": f1
        }])], ignore_index=True)

    # **Compute Overall Metrics**
    overall_precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
    overall_recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
    overall_f1 = 2 * (overall_precision * overall_recall) / (overall_precision + overall_recall) if (overall_precision + overall_recall) > 0 else 0

    # **Print Overall Metrics**
    print("\nOverall Model Performance:")
    print(f"Overall Precision: {overall_precision:.4f}")
    print(f"Overall Recall: {overall_recall:.4f}")
    print(f"Overall F1-Score: {overall_f1:.4f}")

    # **Store Overall Metrics in results_df**
    overall_metrics_df = pd.DataFrame([{
        "Model": model_name,
        "case_id": "Overall",
        "Query": "All",
        "Retrieved Keys": "All",
        "Essential Keys": "All",
        "Supplementary Keys": "All",
        "Precision": overall_precision,
        "Recall": overall_recall,
        "F1-Score": overall_f1
    }])

    results_df = pd.concat([results_df, overall_metrics_df], ignore_index=True)

    # Save results
results_df.to_csv("retrieval_results.csv", index=False)


Processing model: pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/691 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/412 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  results_df = pd.concat([results_df, pd.DataFrame([{



Overall Model Performance:
Overall Precision: 0.4976
Overall Recall: 0.5503
Overall F1-Score: 0.5226


In [26]:
import chromadb
import pandas as pd
from sentence_transformers import SentenceTransformer

# Initialize ChromaDB client
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Define embedding models from MedEmbed benchmark
embedding_models = [
    "abhinand/MedEmbed-base-v0.1",
    "BAAI/bge-base-en-v1.5",
    "sentence-transformers/all-MiniLM-L6-v2",
    "thenlper/gte-base"
]

# DataFrame to store results
results_df = pd.DataFrame(columns=["Model", "case_id", "Query", "Retrieved Keys", "Essential Keys", "Supplementary Keys", "Accuracy"])

# Initialize cumulative accuracy variables
total_correct = 0
total_relevant = 0

# Iterate over embedding models
for model_name in embedding_models:
    print("Processing model:", model_name)
    model = SentenceTransformer(model_name)

    # Iterate over each row
    for index, row in df.iterrows():
        case_id = row["case_id"]
        query_text = row["clinician_question"]

        # Ensure note_excerpt_sentences is a dictionary
        note_excerpt_dict = row["note_excerpt_sentences"]
        if not isinstance(note_excerpt_dict, dict):
            continue

        note_excerpt_pairs = list(note_excerpt_dict.items())  # Extract (key, sentence) pairs

        essential_keys = set(row["essential"])
        supplementary_keys = set(row["supplementary"])
        total_keys = essential_keys | supplementary_keys  # All relevant keys

        # Reset ChromaDB for each row
        collection_name = f"temp_collection_{case_id}"
        try:
            chroma_client.delete_collection(collection_name)
        except:
            pass  # Ignore if collection doesn't exist

        collection = chroma_client.create_collection(name=collection_name)

        # Store embeddings with keys
        for key, sentence in note_excerpt_pairs:
            embedding = model.encode(sentence).tolist()
            collection.add(
                ids=[key],  # Store the key
                embeddings=[embedding],
                metadatas=[{"key": key, "text": sentence}]
            )

        # Retrieve based on query
        query_embedding = model.encode(query_text).tolist()
        k = max(1, (len(note_excerpt_pairs)*2) // 3)
        retrieved_results = collection.query(query_embedding, n_results=k)

        # Extract retrieved keys
        retrieved_keys = set(res["key"] for res in retrieved_results["metadatas"][0])

        # Compute accuracy
        correct_retrieved = len(retrieved_keys & total_keys)
        total_relevant_keys = len(total_keys)

        # Avoid division by zero; if no relevant keys exist, assume accuracy is 1
        accuracy = correct_retrieved / total_relevant_keys if total_relevant_keys > 0 else 1

        # Update cumulative metrics
        total_correct += correct_retrieved
        total_relevant += total_relevant_keys

        # Store per-query results
        results_df = pd.concat([results_df, pd.DataFrame([{
            "Model": model_name,
            "case_id": case_id,
            "Query": query_text,
            "Retrieved Keys": list(retrieved_keys),
            "Essential Keys": list(essential_keys),
            "Supplementary Keys": list(supplementary_keys),
            "Accuracy": accuracy
        }])], ignore_index=True)

    # **Compute Overall Model Accuracy**
    overall_accuracy = total_correct / total_relevant if total_relevant > 0 else 1

    # **Print Overall Accuracy**
    print(f"\nOverall Accuracy for {model_name}: {overall_accuracy:.4f}")

    # **Store Overall Accuracy in results_df**
    overall_metrics_df = pd.DataFrame([{
        "Model": model_name,
        "case_id": "Overall",
        "Query": "All",
        "Retrieved Keys": "All",
        "Essential Keys": "All",
        "Supplementary Keys": "All",
        "Accuracy": overall_accuracy
    }])

    results_df = pd.concat([results_df, overall_metrics_df], ignore_index=True)

# Save results
results_df.to_csv("retrieval_results.csv", index=False)


Processing model: abhinand/MedEmbed-base-v0.1


  results_df = pd.concat([results_df, pd.DataFrame([{



Overall Accuracy for abhinand/MedEmbed-base-v0.1: 0.7354
Processing model: BAAI/bge-base-en-v1.5

Overall Accuracy for BAAI/bge-base-en-v1.5: 0.7354
Processing model: sentence-transformers/all-MiniLM-L6-v2

Overall Accuracy for sentence-transformers/all-MiniLM-L6-v2: 0.7284
Processing model: thenlper/gte-base

Overall Accuracy for thenlper/gte-base: 0.7288


In [31]:
import chromadb
import pandas as pd
from sentence_transformers import SentenceTransformer

# Initialize ChromaDB client
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Define embedding models from MedEmbed benchmark
embedding_models = [
    "abhinand/MedEmbed-base-v0.1",
    "BAAI/bge-base-en-v1.5",
    "sentence-transformers/all-MiniLM-L6-v2",
    "thenlper/gte-base"
]

# DataFrame to store results
results_df = pd.DataFrame(columns=["Model", "case_id", "Query", "Retrieved Keys", "Essential Keys", "Supplementary Keys", "Accuracy"])

# Initialize cumulative accuracy variables
total_correct = 0
total_relevant = 0

# Iterate over embedding models
for model_name in embedding_models:
    print("Processing model:", model_name)
    model = SentenceTransformer(model_name)

    # Iterate over each row
    for index, row in df.iterrows():
        case_id = row["case_id"]

        # **Combine all patient questions into a single query**
        patient_question_dict = row["patient_question"]

        # Ensure it's a dictionary
        if not isinstance(patient_question_dict, dict):
            continue

        combined_query_text = " ".join(patient_question_dict.values())  # Merge all questions


        # Ensure note_excerpt_sentences is a dictionary
        note_excerpt_dict = row["note_excerpt_sentences"]
        if not isinstance(note_excerpt_dict, dict):
            continue

        note_excerpt_pairs = list(note_excerpt_dict.items())  # Extract (key, sentence) pairs

        essential_keys = set(row["essential"]) if row["essential"] else set()
        supplementary_keys = set(row["supplementary"]) if row["supplementary"] else set()
        total_keys = essential_keys | supplementary_keys  # All relevant keys

        # Reset ChromaDB for each row
        collection_name = f"temp_collection_{case_id}"
        try:
            chroma_client.delete_collection(collection_name)
        except:
            pass  # Ignore if collection doesn't exist

        collection = chroma_client.create_collection(name=collection_name)

        # Store embeddings with keys
        for key, sentence in note_excerpt_pairs:
            embedding = model.encode(sentence).tolist()
            collection.add(
                ids=[key],  # Store the key
                embeddings=[embedding],
                metadatas=[{"key": key, "text": sentence}]
            )

        # Retrieve based on **combined** patient question query
        query_embedding = model.encode(combined_query_text).tolist()
        k = max(1, len(note_excerpt_pairs) // 2)
        retrieved_results = collection.query(query_embedding, n_results=k)

        # Extract retrieved keys
        retrieved_keys = set(res["key"] for res in retrieved_results["metadatas"][0])

        # Compute accuracy
        correct_retrieved = len(retrieved_keys & total_keys)
        total_relevant_keys = len(total_keys)

        # Avoid division by zero; if no relevant keys exist, assume accuracy is 1
        accuracy = correct_retrieved / total_relevant_keys if total_relevant_keys > 0 else 1

        # Update cumulative metrics
        total_correct += correct_retrieved
        total_relevant += total_relevant_keys

        # Store per-case results
        results_df = pd.concat([results_df, pd.DataFrame([{
            "Model": model_name,
            "case_id": case_id,
            "Query": combined_query_text,  # Store the combined query
            "Retrieved Keys": list(retrieved_keys),
            "Essential Keys": list(essential_keys),
            "Supplementary Keys": list(supplementary_keys),
            "Accuracy": accuracy
        }])], ignore_index=True)

    # **Compute Overall Model Accuracy**
    overall_accuracy = total_correct / total_relevant if total_relevant > 0 else 1

    # **Print Overall Accuracy**
    print(f"\nOverall Accuracy for {model_name}: {overall_accuracy:.4f}")

    # **Store Overall Accuracy in results_df**
    overall_metrics_df = pd.DataFrame([{
        "Model": model_name,
        "case_id": "Overall",
        "Query": "All",
        "Retrieved Keys": "All",
        "Essential Keys": "All",
        "Supplementary Keys": "All",
        "Accuracy": overall_accuracy
    }])

    results_df = pd.concat([results_df, overall_metrics_df], ignore_index=True)

# Save results
results_df.to_csv("retrieval_results.csv", index=False)


Processing model: abhinand/MedEmbed-base-v0.1


  results_df = pd.concat([results_df, pd.DataFrame([{



Overall Accuracy for abhinand/MedEmbed-base-v0.1: 0.6032
Processing model: BAAI/bge-base-en-v1.5

Overall Accuracy for BAAI/bge-base-en-v1.5: 0.5979
Processing model: sentence-transformers/all-MiniLM-L6-v2

Overall Accuracy for sentence-transformers/all-MiniLM-L6-v2: 0.6014
Processing model: thenlper/gte-base

Overall Accuracy for thenlper/gte-base: 0.6045


In [None]:
import chromadb
import pandas as pd
from sentence_transformers import SentenceTransformer

# Initialize ChromaDB client
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Define embedding models from MedEmbed benchmark
embedding_models = [
    "abhinand/MedEmbed-base-v0.1",
    "BAAI/bge-base-en-v1.5",
    "sentence-transformers/all-MiniLM-L6-v2",
    "thenlper/gte-base"
]

# DataFrame to store results
results_df = pd.DataFrame(columns=["Model", "case_id", "Query", "Retrieved Keys", "Essential Keys", "Supplementary Keys", "Accuracy"])

# Initialize cumulative accuracy variables
total_correct = 0
total_relevant = 0

# Iterate over embedding models
for model_name in embedding_models:
    print("Processing model:", model_name)
    model = SentenceTransformer(model_name)

    # Iterate over each row
    for index, row in df.iterrows():
        case_id = row["case_id"]

        # **Extract patient questions (dict) and clinical question (string)**
        patient_question_dict = row["patient_question"]
        clinical_question_text = row["clinician_question"]

        # Ensure patient_question is a dictionary
        if not isinstance(patient_question_dict, dict):
            continue

        # **Combine all patient questions with the clinical question**
        combined_query_text = " ".join(patient_question_dict.values()) + " " + clinical_question_text

        # Ensure note_excerpt_sentences is a dictionary
        note_excerpt_dict = row["note_excerpt_sentences"]
        if not isinstance(note_excerpt_dict, dict):
            continue

        note_excerpt_pairs = list(note_excerpt_dict.items())  # Extract (key, sentence) pairs

        essential_keys = set(row["essential"]) if row["essential"] else set()
        supplementary_keys = set(row["supplementary"]) if row["supplementary"] else set()
        total_keys = essential_keys | supplementary_keys  # All relevant keys

        # Reset ChromaDB for each row
        collection_name = f"temp_collection_{case_id}"
        try:
            chroma_client.delete_collection(collection_name)
        except:
            pass  # Ignore if collection doesn't exist

        collection = chroma_client.create_collection(name=collection_name)

        # Store embeddings with keys
        for key, sentence in note_excerpt_pairs:
            embedding = model.encode(sentence).tolist()
            collection.add(
                ids=[key],  # Store the key
                embeddings=[embedding],
                metadatas=[{"key": key, "text": sentence}]
            )

        # Retrieve based on **combined patient + clinical question query**
        query_embedding = model.encode(combined_query_text).tolist()
        k = max(1, len(note_excerpt_pairs) // 2)
        retrieved_results = collection.query(query_embedding, n_results=k)

        # Extract retrieved keys
        retrieved_keys = set(res["key"] for res in retrieved_results["metadatas"][0])

        # Compute accuracy
        correct_retrieved = len(retrieved_keys & total_keys)
        total_relevant_keys = len(total_keys)

        # Avoid division by zero; if no relevant keys exist, assume accuracy is 1
        accuracy = correct_retrieved / total_relevant_keys if total_relevant_keys > 0 else 1

        # Update cumulative metrics
        total_correct += correct_retrieved
        total_relevant += total_relevant_keys

        # Store per-case results
        results_df = pd.concat([results_df, pd.DataFrame([{
            "Model": model_name,
            "case_id": case_id,
            "Query": combined_query_text,  # Store the combined query
            "Retrieved Keys": list(retrieved_keys),
            "Essential Keys": list(essential_keys),
            "Supplementary Keys": list(supplementary_keys),
            "Accuracy": accuracy
        }])], ignore_index=True)

    # **Compute Overall Model Accuracy**
    overall_accuracy = total_correct / total_relevant if total_relevant > 0 else 1

    # **Print Overall Accuracy**
    print(f"\nOverall Accuracy for {model_name}: {overall_accuracy:.4f}")

    # **Store Overall Accuracy in results_df**
    overall_metrics_df = pd.DataFrame([{
        "Model": model_name,
        "case_id": "Overall",
        "Query": "All",
        "Retrieved Keys": "All",
        "Essential Keys": "All",
        "Supplementary Keys": "All",
        "Accuracy": overall_accuracy
    }])

    results_df = pd.concat([results_df, overall_metrics_df], ignore_index=True)

# Save results
results_df.to_csv("retrieval_results.csv", index=False)


In [11]:
!pip install llama_index

Collecting llama_index
  Downloading llama_index-0.12.25-py3-none-any.whl.metadata (12 kB)
Collecting llama-index-agent-openai<0.5.0,>=0.4.0 (from llama_index)
  Downloading llama_index_agent_openai-0.4.6-py3-none-any.whl.metadata (727 bytes)
Collecting llama-index-cli<0.5.0,>=0.4.1 (from llama_index)
  Downloading llama_index_cli-0.4.1-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-core<0.13.0,>=0.12.25 (from llama_index)
  Downloading llama_index_core-0.12.25-py3-none-any.whl.metadata (2.5 kB)
Collecting llama-index-embeddings-openai<0.4.0,>=0.3.0 (from llama_index)
  Downloading llama_index_embeddings_openai-0.3.1-py3-none-any.whl.metadata (684 bytes)
Collecting llama-index-indices-managed-llama-cloud>=0.4.0 (from llama_index)
  Downloading llama_index_indices_managed_llama_cloud-0.6.9-py3-none-any.whl.metadata (3.6 kB)
Collecting llama-index-llms-openai<0.4.0,>=0.3.0 (from llama_index)
  Downloading llama_index_llms_openai-0.3.26-py3-none-any.whl.metadata (3.3 kB)
Collec

In [12]:
!pip install llama-index
%pip install llama-index-vector-stores-chroma
!pip install llama-index-embeddings-huggingface
%pip install llama-index-embeddings-langchain

Collecting llama-index-vector-stores-chroma
  Downloading llama_index_vector_stores_chroma-0.4.1-py3-none-any.whl.metadata (696 bytes)
Downloading llama_index_vector_stores_chroma-0.4.1-py3-none-any.whl (5.2 kB)
Installing collected packages: llama-index-vector-stores-chroma
Successfully installed llama-index-vector-stores-chroma-0.4.1
Collecting llama-index-embeddings-huggingface
  Downloading llama_index_embeddings_huggingface-0.5.2-py3-none-any.whl.metadata (767 bytes)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=2.6.1->llama-index-embeddings-huggingface)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=2.6.1->llama-index-embeddings-huggingface)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11

In [13]:
%pip install llama-index-retrievers-bm25

Collecting llama-index-retrievers-bm25
  Downloading llama_index_retrievers_bm25-0.5.2-py3-none-any.whl.metadata (740 bytes)
Collecting bm25s<0.3.0,>=0.2.0 (from llama-index-retrievers-bm25)
  Downloading bm25s-0.2.10-py3-none-any.whl.metadata (21 kB)
Collecting pystemmer<3.0.0.0,>=2.2.0.1 (from llama-index-retrievers-bm25)
  Downloading PyStemmer-2.2.0.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Downloading llama_index_retrievers_bm25-0.5.2-py3-none-any.whl (3.7 kB)
Downloading bm25s-0.2.10-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.7/53.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyStemmer-2.2.0.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (669 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m669.3/669.3 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling colle

In [14]:
!pip install -qU langchain langchain-community
%pip install -qU langchain-qdrant
%pip install -q fastembed

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m306.7/306.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.6/61.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m324.8/324.8 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [15]:
!pip install --upgrade llama-index




In [None]:

import chromadb
import pandas as pd
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.retrievers import VectorIndexRetriever, AutoMergingRetriever
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.schema import Document
from llama_index.core.storage.docstore import SimpleDocumentStore
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters

# Initialize ChromaDB client
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Initialize document store
docstore = SimpleDocumentStore()

# Define multiple embedding models to test
models = {
    "mpnet": HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"),
    "minilm": HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
    "distilroberta": HuggingFaceEmbeddings(model_name="sentence-transformers/all-distilroberta-v1"),
    "MedEmbed": HuggingFaceEmbeddings(model_name="abhinand/MedEmbed-base-v0.1"),
    "BAAI_bge": HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5"),
    "MiniLM": HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
    "GTE_base": HuggingFaceEmbeddings(model_name="thenlper/gte-base"),
}

# Select retriever type manually (change this as needed)
retriever_type = "base"  # Options: "base", "auto_merger", "bm25"

# DataFrame to store per-case results
results_df = pd.DataFrame(columns=["Model", "case_id", "Query", "Retriever Type", "Retrieved Keys", "Essential Keys", "Supplementary Keys", "Accuracy"])

# Dictionary to store per-model accuracy
model_accuracy_scores = {}

# ---- Execution Loop ----
for model_name, embed_model in models.items():
    print(f"\nRunning for model: {model_name}...")
    existing_collections = chroma_client.list_collections()

    if "all_cases_collection" in existing_collections:
        chroma_client.delete_collection("all_cases_collection")

    chroma_collection = chroma_client.get_or_create_collection("all_cases_collection")

    # Initialize accuracy list for this model
    accuracy_scores = []

    # Iterate over each row (case) in the DataFrame
    for index, row in df.iterrows():
        case_id = row["case_id"]

        # Extract patient and clinical questions
        patient_question_dict = row["patient_question"]
        clinical_question_text = row["clinician_question"]

        if not isinstance(patient_question_dict, dict):
            continue

        # Combine patient and clinical questions
        combined_query_text = " ".join(patient_question_dict.values()) + " " + clinical_question_text

        # Extract note excerpts
        note_excerpt_dict = row["note_excerpt_sentences"]
        if not isinstance(note_excerpt_dict, dict):
            continue

        essential_keys = set(row["essential"]) if row["essential"] else set()
        supplementary_keys = set(row["supplementary"]) if row["supplementary"] else set()
        total_keys = essential_keys | supplementary_keys  # All relevant keys

        # Function to add case-specific embeddings to the shared database
        def add_case_to_index(case_id, note_excerpts, embed_model):
            vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
            storage_context = StorageContext.from_defaults(vector_store=vector_store)

            # Convert note excerpts to Document format with case_id in metadata
            docs = [Document(text=sentence, metadata={"case_id": case_id, "key": key}) for key, sentence in note_excerpts.items()]

            # Add documents to the docstore
            docstore.add_documents(docs)

            # Add documents to the shared vector store
            index = VectorStoreIndex.from_documents(docs, storage_context=storage_context, embed_model=embed_model)
            return index

        # Function to build a retriever for a specific case
        def build_retriever(index, retriever_type, case_id, top_k=5):
            # Define metadata filters
            filters = MetadataFilters(filters=[ExactMatchFilter(key="case_id", value=case_id)])

            if retriever_type == "base":
                return index.as_retriever(similarity_top_k=top_k, filters=filters)

            elif retriever_type == "auto_merger":
                base_retriever = index.as_retriever(similarity_top_k=top_k, filters=filters)
                return AutoMergingRetriever(base_retriever,storage_context=index.storage_context, verbose=True)

            elif retriever_type == "bm25":
                return BM25Retriever.from_defaults(
                    docstore=docstore,
                    similarity_top_k=top_k,
                    stemmer=stemmer.Stemmer("english"),
                    language="english",
                    filters=filters
                )

            else:
                raise ValueError("Invalid retriever_type. Choose from: 'base', 'auto_merger', 'bm25'.")

        # Function to retrieve nodes for a specific case
        def get_case_retrieved_nodes(retriever, question):
            results = retriever.retrieve(question)
            return results

        # Add the case to the shared vector index
        index = add_case_to_index(case_id, note_excerpt_dict, embed_model)

        # Build retriever for the specific case
        retriever = build_retriever(index, retriever_type, case_id, top_k=max(1, (len(note_excerpt_dict)*2) // 3))

        # Retrieve nodes only from the current case
        retrieved_results = get_case_retrieved_nodes(retriever, combined_query_text)

        # Extract retrieved keys
        retrieved_keys = set(node.metadata["key"] for node in retrieved_results)

        # Compute accuracy
        correct_retrieved = len(retrieved_keys & total_keys)
        total_relevant_keys = len(total_keys)
        accuracy = correct_retrieved / total_relevant_keys if total_relevant_keys > 0 else 1

        # Store accuracy for per-model evaluation
        accuracy_scores.append(accuracy)

        # Store per-case results
        results_df = pd.concat([results_df, pd.DataFrame([{
            "Model": model_name,
            "case_id": case_id,
            "Query": combined_query_text,
            "Retriever Type": retriever_type,
            "Retrieved Keys": list(retrieved_keys),
            "Essential Keys": list(essential_keys),
            "Supplementary Keys": list(supplementary_keys),
            "Accuracy": accuracy
        }])], ignore_index=True)

    # Compute mean accuracy for this model
    mean_accuracy = sum(accuracy_scores) / len(accuracy_scores) if accuracy_scores else 0
    model_accuracy_scores[model_name] = mean_accuracy

# ---- Overall Performance Summary ----
summary_df = pd.DataFrame([
    {"Model": model, "Retriever Type": retriever_type, "Mean Accuracy": acc}
    for model, acc in model_accuracy_scores.items()
])

# Save per-case results
results_df.to_csv("retrieval_results_case_specific.csv", index=False)

# Save overall summary
summary_df.to_csv("retrieval_overall_performance.csv", index=False)

# Print Key Insights
print("\n--- Overall Retriever Performance ---")
print(summary_df)


In [23]:
results_df

Unnamed: 0,Model,case_id,Query,Retriever Type,Retrieved Keys,Essential Keys,Supplementary Keys,Accuracy
0,mpnet,1,My question is if the sludge was there does no...,auto_merger,"[2, 1, 6, 3, 7, 5]","[5, 1, 7, 6]",[],1.000000
1,mpnet,2,dad given multiple shots of lasciks after he w...,auto_merger,"[10, 1, 6, 3, 7, 4, 5]","[7, 1, 4]",[],1.000000
2,mpnet,3,he is continously irritated and has headache w...,auto_merger,"[8, 2, 1, 4, 9, 5]","[5, 4]","[8, 9, 7]",0.800000
3,mpnet,4,My doctor performed a cardiac catherization. W...,auto_merger,"[8, 10, 17, 1, 2, 6, 13, 18, 3, 7, 15, 19, 12,...","[17, 18, 4, 15]",[],0.750000
4,mpnet,5,"I overdosed October 4th on trihexyphenidyl, th...",auto_merger,"[8, 10, 22, 17, 21, 2, 1, 12, 3, 18, 15, 19, 1...","[8, 10, 12, 17, 2, 18, 15, 11, 16, 9]",[],0.900000
...,...,...,...,...,...,...,...,...
135,GTE_base,16,The pain presented in May and has worsened to ...,auto_merger,"[18, 15, 28, 16, 2, 25, 9, 11, 20, 8, 1, 13, 2...","[18, 29, 23, 28, 27]","[21, 22, 17, 9, 20]",0.700000
136,GTE_base,17,"My palpitations are benign, I’m told. Fine, ho...",auto_merger,"[18, 15, 28, 16, 22, 32, 29, 25, 33, 20, 37, 2...","[22, 17, 32, 18, 0, 23, 34, 16, 33]","[37, 12, 9]",0.750000
137,GTE_base,18,"at hospital. I seemed fine, but now I am vomi...",auto_merger,"[22, 17, 2, 1, 6, 13, 3, 18, 15, 0, 19, 11, 16...","[17, 2, 1, 6, 13, 3, 0, 16, 12]",[22],1.000000
138,GTE_base,19,I went to ER for a bladder infection. The doc...,auto_merger,"[8, 10, 2, 1, 6, 3, 15, 11, 16, 12, 5, 14]","[6, 7, 15, 16, 5]",[],0.800000


In [None]:

import chromadb
import pandas as pd
from llama_index.core import StorageContext, VectorStoreIndex, get_response_synthesizer, PromptTemplate
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.retrievers import VectorIndexRetriever, AutoMergingRetriever
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.schema import Document
from llama_index.core.storage.docstore import SimpleDocumentStore
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.response_synthesizers import ResponseMode

# Use a writable path in Colab
chroma_client = chromadb.PersistentClient(path="/content/chroma_db")


# Initialize document store
docstore = SimpleDocumentStore()

# Define multiple embedding models to test
models = {
    # "mpnet": HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"),
    # "minilm": HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
    # "distilroberta": HuggingFaceEmbeddings(model_name="sentence-transformers/all-distilroberta-v1"),
    "MedEmbed": HuggingFaceEmbeddings(model_name="abhinand/MedEmbed-base-v0.1"),
    # "BAAI_bge": HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5"),
    # "MiniLM": HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
    # "GTE_base": HuggingFaceEmbeddings(model_name="thenlper/gte-base"),
}

# Select retriever type manually (change this as needed)
retriever_type = "base"  # Options: "base", "auto_merger", "bm25"

# DataFrame to store per-case results
results_df = pd.DataFrame(columns=["Model", "case_id", "Query", "Retriever Type", "Retrieved Keys", "Essential Keys", "Supplementary Keys", "Accuracy", "Response"])

# Dictionary to store per-model accuracy
model_accuracy_scores = {}

# ---- LLM Response Generation Function ----
def build_query_engine(retriever, response_type=ResponseMode.COMPACT, llm=None):
    """
    Builds and returns a query engine.
    """
    new_summary_tmpl_str = (
        "Context information is below.\n"
        "---------------------\n"
        "{context_str}\n"
        "---------------------\n"
        "Given the context information and not prior knowledge, "
        "answer the query in a very precise and accurate manner.\n"
        "CRITICAL: While answering the question, consider only the necessary context. "
        "There might be some irrelevant data present in the provided context information.\n"
        "Make sure to use your intelligence and provide an accurate and concise answer.\n"
        "STRICTLY answer the query based on the given context.\n"
        "Query: {query_str}\n"
        "Answer: "
    )
    new_summary_tmpl = PromptTemplate(new_summary_tmpl_str)

    response_synthesizer = get_response_synthesizer(
        response_mode=response_type, llm=llm, text_qa_template=new_summary_tmpl
    )

    return RetrieverQueryEngine(retriever=retriever, response_synthesizer=response_synthesizer)

def get_response(query_engine, query):
    """
    Gets a response from the query engine.
    """
    return query_engine.query(query)

# ---- Execution Loop ----
for model_name, embed_model in models.items():
    print(f"\nRunning for model: {model_name}...")

    existing_collections = chroma_client.list_collections()

    if "all_cases_collection" in existing_collections:
        chroma_client.delete_collection("all_cases_collection")

    chroma_collection = chroma_client.get_or_create_collection("all_cases_collection")
    # Initialize accuracy list for this model
    accuracy_scores = []

    # Iterate over each row (case) in the DataFrame
    for index, row in df.iterrows():
        case_id = row["case_id"]

        # Extract patient and clinical questions
        patient_question_dict = row["patient_question"]
        clinical_question_text = row["clinician_question"]

        if not isinstance(patient_question_dict, dict):
            continue

        # Combine patient and clinical questions
        combined_query_text = " ".join(patient_question_dict.values()) + " " + clinical_question_text

        # Extract note excerpts
        note_excerpt_dict = row["note_excerpt_sentences"]
        if not isinstance(note_excerpt_dict, dict):
            continue

        essential_keys = set(row["essential"]) if row["essential"] else set()
        supplementary_keys = set(row["supplementary"]) if row["supplementary"] else set()
        total_keys = essential_keys | supplementary_keys  # All relevant keys

        # Function to add case-specific embeddings to the shared database
        def add_case_to_index(case_id, note_excerpts, embed_model):
            vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
            storage_context = StorageContext.from_defaults(vector_store=vector_store)

            # Convert note excerpts to Document format with case_id in metadata
            docs = [Document(text=sentence, metadata={"case_id": case_id, "key": key}) for key, sentence in note_excerpts.items()]

            # Add documents to the docstore
            docstore.add_documents(docs)

            # Add documents to the shared vector store
            index = VectorStoreIndex.from_documents(docs, storage_context=storage_context, embed_model=embed_model)
            return index

        # Function to build a retriever for a specific case
        def build_retriever(index, retriever_type, case_id, top_k=5):
            # Define metadata filters
            filters = MetadataFilters(filters=[ExactMatchFilter(key="case_id", value=case_id)])

            if retriever_type == "base":
                return index.as_retriever(similarity_top_k=top_k, filters=filters)

            elif retriever_type == "auto_merger":
                base_retriever = index.as_retriever(similarity_top_k=top_k, filters=filters)
                return AutoMergingRetriever(base_retriever, storage_context=index.storage_context, verbose=True)

            elif retriever_type == "bm25":
                return BM25Retriever.from_defaults(
                    docstore=docstore,
                    similarity_top_k=top_k,
                    stemmer=stemmer.Stemmer("english"),
                    language="english",
                    filters=filters
                )

            else:
                raise ValueError("Invalid retriever_type. Choose from: 'base', 'auto_merger', 'bm25'.")

        # Add the case to the shared vector index
        index = add_case_to_index(case_id, note_excerpt_dict, embed_model)

        # Build retriever for the specific case
        retriever = build_retriever(index, retriever_type, case_id, top_k=max(1, (len(note_excerpt_dict)*2) // 3))

        # Generate LLM-based response
        query_engine = build_query_engine(retriever, llm=None)  # Replace `None` with an actual LLM if needed
        response = get_response(query_engine, combined_query_text)

        # Extract retrieved keys
        retrieved_results = retriever.retrieve(combined_query_text)
        retrieved_keys = set(node.metadata["key"] for node in retrieved_results)

        # Compute accuracy
        correct_retrieved = len(retrieved_keys & total_keys)
        total_relevant_keys = len(total_keys)
        accuracy = correct_retrieved / total_relevant_keys if total_relevant_keys > 0 else 1

        # Store results
        results_df = pd.concat([results_df, pd.DataFrame([{
            "Model": model_name,
            "case_id": case_id,
            "Query": combined_query_text,
            "Retriever Type": retriever_type,
            "Retrieved Keys": list(retrieved_keys),
            "Essential Keys": list(essential_keys),
            "Supplementary Keys": list(supplementary_keys),
            "Accuracy": accuracy,
            "Response": response
        }])], ignore_index=True)

# Save per-case results
results_df.to_csv("retrieval_results_with_responses.csv", index=False)

# Print Key Insights
print("\n--- Overall Retriever Performance ---")
print(results_df)


In [25]:
import chromadb
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

# Initialize ChromaDB client
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Define embedding models
embedding_models = [
    "abhinand/MedEmbed-base-v0.1",
    "BAAI/bge-base-en-v1.5",
    "sentence-transformers/all-MiniLM-L6-v2",
    "thenlper/gte-base"
]

# DataFrame to store results
results_df = pd.DataFrame(columns=["Model", "case_id", "Query", "Retrieved Keys", "Essential Keys", "Supplementary Keys", "Accuracy"])

# Function to determine k dynamically
def get_dynamic_k(similarities, total_pairs):
    max_similarity = similarities[0] if similarities else 0
    if max_similarity > 0.9:
        k = 2
    elif max_similarity > 0.8:
        k = 3
    elif max_similarity > 0.6:
        k = min(7, total_pairs // 2)
    else:
        k = min(10, total_pairs // 1.5)

    min_k = 4 if max_similarity <= 0.8 else 3
    max_k = 15

    k = max(k, min_k)
    k = min(k, max_k)
    k = min(k, total_pairs)

    return int(k)

# Iterate over embedding models
for model_name in embedding_models:
    print("Processing model:", model_name)
    model = SentenceTransformer(model_name)

    total_correct = 0
    total_relevant = 0

    # Iterate over each row
    for index, row in df.iterrows():
        case_id = row["case_id"]
        query_text = row["clinician_question"]

        # Ensure note_excerpt_sentences is a dictionary
        note_excerpt_dict = row["note_excerpt_sentences"]
        if not isinstance(note_excerpt_dict, dict):
            continue

        note_excerpt_pairs = list(note_excerpt_dict.items())  # Extract (key, sentence) pairs

        essential_keys = set(row["essential"])
        supplementary_keys = set(row["supplementary"])
        total_keys = essential_keys | supplementary_keys  # All relevant keys

        # Reset ChromaDB for each row
        collection_name = f"temp_collection_{case_id}"
        try:
            chroma_client.delete_collection(collection_name)
        except:
            pass  # Ignore if collection doesn't exist

        collection = chroma_client.create_collection(name=collection_name)

        # Store embeddings with keys
        for key, sentence in note_excerpt_pairs:
            embedding = model.encode(sentence).tolist()
            collection.add(
                ids=[key],  # Store the key
                embeddings=[embedding],
                metadatas=[{"key": key, "text": sentence}]
            )

        # Retrieve based on query
        query_embedding = model.encode(query_text).tolist()
        retrieved_results = collection.query(query_embedding, n_results=len(note_excerpt_pairs))

        # Extract similarity scores
        similarities = retrieved_results["distances"][0] if retrieved_results["distances"] else []

        # Get the total number of stored key-value pairs
        total_pairs = len(note_excerpt_pairs)

        # Get dynamic k
        k = get_dynamic_k(similarities, total_pairs)
        print(k,total_pairs)

        # Retrieve top-k results
        retrieved_results = collection.query(query_embedding, n_results=k)
        retrieved_keys = set(res["key"] for res in retrieved_results["metadatas"][0])

        # Compute accuracy
        correct_retrieved = len(retrieved_keys & total_keys)
        total_relevant_keys = len(total_keys)

        accuracy = correct_retrieved / total_relevant_keys if total_relevant_keys > 0 else 1

        # Update cumulative metrics
        total_correct += correct_retrieved
        total_relevant += total_relevant_keys

        # Store per-query results
        results_df = pd.concat([results_df, pd.DataFrame([{
            "Model": model_name,
            "case_id": case_id,
            "Query": query_text,
            "Retrieved Keys": list(retrieved_keys),
            "Essential Keys": list(essential_keys),
            "Supplementary Keys": list(supplementary_keys),
            "Accuracy": accuracy
        }])], ignore_index=True)

    # **Compute Overall Model Accuracy**
    overall_accuracy = total_correct / total_relevant if total_relevant > 0 else 1

    # **Print Overall Accuracy**
    print(f"\nOverall Accuracy for {model_name}: {overall_accuracy:.4f}")

    # **Store Overall Accuracy in results_df**
    overall_metrics_df = pd.DataFrame([{
        "Model": model_name,
        "case_id": "Overall",
        "Query": "All",
        "Retrieved Keys": "All",
        "Essential Keys": "All",
        "Supplementary Keys": "All",
        "Accuracy": overall_accuracy
    }])

    results_df = pd.concat([results_df, overall_metrics_df], ignore_index=True)

# Save results
results_df.to_csv("retrieval_results_dynamic_k.csv", index=False)


Processing model: abhinand/MedEmbed-base-v0.1
6 9


  results_df = pd.concat([results_df, pd.DataFrame([{


7 11
5 10
10 21
7 23
7 25
10 30
7 54
7 21
10 32
10 27
7 14
6 12
3 9
6 12
10 30
7 38
7 23
7 18
4 9

Overall Accuracy for abhinand/MedEmbed-base-v0.1: 0.4127
Processing model: BAAI/bge-base-en-v1.5
4 9
7 11
5 10
10 21
7 23
7 25
10 30
3 54
7 21
7 32
7 27
7 14
6 12
3 9
6 12
10 30
3 38
7 23
7 18
4 9

Overall Accuracy for BAAI/bge-base-en-v1.5: 0.3810
Processing model: sentence-transformers/all-MiniLM-L6-v2
3 9
5 11
3 10
3 21
3 23
3 25
7 30
3 54
3 21
3 32
3 27
3 14
3 12
3 9
3 12
3 30
3 38
3 23
3 18
3 9

Overall Accuracy for sentence-transformers/all-MiniLM-L6-v2: 0.2063
Processing model: thenlper/gte-base
6 9
7 11
6 10
10 21
10 23
10 25
10 30
10 54
10 21
10 32
10 27
9 14
8 12
6 9
8 12
10 30
10 38
10 23
10 18
6 9

Overall Accuracy for thenlper/gte-base: 0.4921


In [None]:
import pandas as pd
import glob

# Get all CSV files matching the pattern
csv_files = glob.glob("/content/retrieval_results_*.csv")

# DataFrame to store overall results
overall_results_df = pd.DataFrame(columns=["Model", "Accuracy"])

# Process each file
for file in csv_files:
    df = pd.read_csv(file)

    # Extract only the overall performance row
    overall_df = df[df["case_id"] == "Overall"]

    # Keep only Model and Accuracy columns
    overall_df = overall_df[["Model", "Accuracy"]]

    # Append to final DataFrame
    overall_results_df = pd.concat([overall_results_df, overall_df], ignore_index=True)

# Sort by Accuracy in descending order
overall_results_df = overall_results_df.sort_values(by="Accuracy", ascending=False)

# Save final result
overall_results_df.to_csv("/content/overall_results_sorted.csv", index=False)

# Display the sorted DataFrame
print(overall_results_df)


                                                Model  Accuracy
8                               BAAI/bge-base-en-v1.5  0.582011
7                         abhinand/MedEmbed-base-v0.1  0.576720
9              sentence-transformers/all-MiniLM-L6-v2  0.574956
10                                  thenlper/gte-base  0.574074
6             sentence-transformers/all-MiniLM-L12-v2  0.569161
1                              BAAI/bge-large-en-v1.5  0.568783
16            sentence-transformers/all-MiniLM-L12-v2  0.565079
5                                intfloat/e5-large-v2  0.564374
2             sentence-transformers/all-mpnet-base-v2  0.562610
0                        abhinand/MedEmbed-large-v0.1  0.560847
3                                  thenlper/gte-large  0.560847
11  microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...  0.560847
4   microsoft/BiomedNLP-PubMedBERT-large-uncased-a...  0.558730
15                               intfloat/e5-small-v2  0.556878
14                             BAAI/bge-

  overall_results_df = pd.concat([overall_results_df, overall_df], ignore_index=True)


In [None]:
# import chromadb
# import pandas as pd
# from sentence_transformers import SentenceTransformer

# # Initialize ChromaDB client
# chroma_client = chromadb.PersistentClient(path="./chroma_db")

# # Define embedding models
# embedding_models = [
#     "pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb"
# ]

# # DataFrame to store results
# results_df = pd.DataFrame(columns=[
#     "Model", "case_id", "Query", "Retrieved Keys", "Essential Keys", "Supplementary Keys", "Precision", "Recall", "F1-Score"
# ])

# # Dictionary to store embeddings in a hierarchical format
# vector_db = {}

# # Iterate over embedding models
# for model_name in embedding_models:
#     print("Processing model:", model_name)
#     model = SentenceTransformer(model_name)

#     vector_db[model_name] = {}  # Initialize model-level dictionary

#     # Iterate over each row
#     for index, row in df.iterrows():
#         case_id = row["case_id"]
#         query_text = row["clinician_question"]

#         # Ensure note_excerpt_sentences is a dictionary
#         note_excerpt_dict = row["note_excerpt_sentences"]
#         if not isinstance(note_excerpt_dict, dict):
#             continue

#         note_excerpt_pairs = list(note_excerpt_dict.items())  # Extract (key, sentence) pairs

#         essential_keys = set(row["essential"])
#         supplementary_keys = set(row["supplementary"])

#         # Reset ChromaDB for each row
#         safe_model_name = model_name.replace("/", "").replace("-", "")  # Replace invalid characters
#         collection_name = f"{safe_model_name}collection"
#         print(collection_name)
#         collection = chroma_client.get_or_create_collection(name=collection_name)

#         # Store case-level embeddings
#         vector_db[model_name][case_id] = {
#             "query_embedding": model.encode(query_text).tolist(),
#             "note_excerpt_embeddings": {}
#         }

#         # Store query embedding in ChromaDB
#         collection.add(
#             ids=[f"{case_id}_query"],
#             embeddings=[vector_db[model_name][case_id]["query_embedding"]],
#             metadatas=[{"case_id": case_id, "model": model_name, "type": "query"}]
#         )

#         # Store note excerpt embeddings
#         for key, sentence in note_excerpt_pairs:
#             embedding = model.encode(sentence).tolist()
#             vector_db[model_name][case_id]["note_excerpt_embeddings"][key] = embedding

#             collection.add(
#                 ids=[f"{case_id}_{key}"],
#                 embeddings=[embedding],
#                 metadatas=[{"case_id": case_id, "model": model_name, "key": key, "type": "note"}]
#             )

#         # Retrieve results based on query
#         k = len(note_excerpt_pairs) // 2
#         retrieved_results = collection.query(
#             query_embeddings=[vector_db[model_name][case_id]["query_embedding"]],
#             n_results=k
#         )
#         print(retrieved_results["metadatas"][0])
#         # Extract retrieved keys
#         retrieved_keys = set(res.get("key", None) for res in retrieved_results["metadatas"][0] if "key" in res)


#         # Compute TP, FP, FN
#         tp = len(retrieved_keys & (essential_keys | supplementary_keys))
#         fp = len(retrieved_keys - (essential_keys | supplementary_keys))
#         fn = len(essential_keys - retrieved_keys)

#         # Compute precision, recall, F1-score for this query
#         precision = tp / (tp + fp) if (tp + fp) > 0 else 0
#         recall = tp / (tp + fn) if (tp + fn) > 0 else 0
#         f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

#         # Store per-query results
#         results_df = pd.concat([results_df, pd.DataFrame([{
#             "Model": model_name,
#             "case_id": case_id,
#             "Query": query_text,
#             "Retrieved Keys": list(retrieved_keys),
#             "Essential Keys": list(essential_keys),
#             "Supplementary Keys": list(supplementary_keys),
#             "Precision": precision,
#             "Recall": recall,
#             "F1-Score": f1
#         }])], ignore_index=True)

#     # Save results
# results_df.to_csv("retrieval_results.csv", index=False)



Processing model: pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb




pritamdekaBioBERTmnlisnliscinliscitailmednlistsbcollection


  results_df = pd.concat([results_df, pd.DataFrame([{


[{'case_id': '1', 'model': 'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb', 'type': 'query'}, {'case_id': '1', 'key': '5', 'model': 'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb', 'type': 'note'}, {'case_id': '1', 'key': '3', 'model': 'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb', 'type': 'note'}, {'case_id': '1', 'key': '6', 'model': 'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb', 'type': 'note'}]
pritamdekaBioBERTmnlisnliscinliscitailmednlistsbcollection
[{'case_id': '2', 'model': 'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb', 'type': 'query'}, {'case_id': '2', 'key': '4', 'model': 'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb', 'type': 'note'}, {'case_id': '2', 'key': '6', 'model': 'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb', 'type': 'note'}, {'case_id': '2', 'key': '1', 'model': 'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb', 'type': 'note'}, {'case_id': '2', 'key': '3', 'model': 'pritamdeka