# LLM Classifiers + Semantic Retrieval

In [20]:
import os
import warnings

import pandas as pd
from dotenv import load_dotenv

from embeddings import ChromaVectorDB
from utils.utils import join_title_abstract

load_dotenv("../.env")

from utils import clean_dataframe, sample_test_papers

In [21]:
def _require_env(name: str) -> str:
    """Return the value of environment variable ``name``, failing fast if it is unset.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The (non-empty) string value of the variable.

    Raises:
        RuntimeError: If the variable is missing or empty. Without this check a
            missing value would flow on as ``None`` into file paths
            (e.g. ``"None/3/few_shots/..."``) or fail later with an unrelated error.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(f"Environment variable {name!r} is not set; define it in ../.env or in the shell.")
    return value


# Run configuration, read from the environment (populated by load_dotenv above).
FULL_DATASET_CSV = _require_env("FULL_DATASET_CSV")  # CSV file read into `df`
RESULTS_DIR = _require_env("RESULTS_DIR")  # root directory for per-classifier result CSVs
LABEL_COL = _require_env("LABEL_COL")  # dataset column that is renamed to "label"
RANDOM_STATE = int(_require_env("RANDOM_STATE"))  # seed for the train/test split

## 1) Initialize ChromaDB Vector Database

In [22]:
# --- vector database settings (all passed to ChromaVectorDB below) ---

# Where the database lives (db_path) and which collection to use.
CHROMA_DB_PATH = "../.vector_db/chroma_db"
COLLECTION_NAME = "papers_full"

# Embedding model name and the directory given as its embedding cache.
EMBEDDING_MODEL = "text-embedding-3-large"
CACHE_DIR_EMBEDDINGS = "../.embeddings_cache"

In [23]:
# Instantiate the vector database wrapper from the settings defined in the
# configuration cell, then report how many papers the collection holds.
vector_db = ChromaVectorDB(
    embedding_model=EMBEDDING_MODEL,
    embedding_cache_dir=CACHE_DIR_EMBEDDINGS,
    db_path=CHROMA_DB_PATH,
    collection_name=COLLECTION_NAME,
)
print(f"Papers in collection: {vector_db.get_collection_info()['count']}")

Papers in collection: 5747


## 2) Load data

In [24]:
# Read only the columns used in this notebook, rename the configured label
# column to "label", then apply the shared cleaning step.
df = (
    pd.read_csv(FULL_DATASET_CSV, usecols=["id", "title", "abstract", LABEL_COL])
    .rename(columns={LABEL_COL: "label"})
    .pipe(clean_dataframe)
)
df.shape

(5747, 4)

## 3) Populate Vector Database

Add all papers from the dataset to the vector database. This will use the cached embeddings from the previous notebooks. Only run this once at the beginning.

Populate vector database (only run once)

In [25]:
# One-off population of the vector database.
# Disabled by default so that "Restart & Run All" skips it; set the flag to
# True only when the collection has to be built for the first time.
# NOTE(review): whether add_papers de-duplicates existing ids is not visible
# here — confirm before running it against an already populated collection.
POPULATE_VECTOR_DB = False

if POPULATE_VECTOR_DB:
    vector_db.add_papers(
        df=df,
        id_col="id",
        title_col="title",
        abstract_col="abstract",
        label_col="label",
        batch_size=100,
        normalize_embeddings=True,
    )

    # check final count
    info = vector_db.get_collection_info()
    print(f"Successfully populated database with {info['count']} papers")

## 4) Test Similarity Search

Search by text query

In [26]:
# Sample query text: the title of one study followed by its abstract, as a single string.
test_query = "The Effect of a Biofeedback-Based Integrated Program on Improving Orthostatic Hypotension in Community-Dwelling Older Adults: A Pilot Study Background: Orthostatic hypotension (OH) is prevalent among community-dwelling older adults and is associated with multiple negative health outcomes. Older adults are susceptible to developing OH because aging alters autonomic nervous system function. Biofeedback is a noninvasive, nonpharmacological intervention that can modulate autonomic nervous system dysfunction in older adults. Objectives: Our aim in this study was to examine the effect of a biofeedback-based integrated program on community-dwelling older adults with OH. Methods: We conducted a controlled pilot study. Community-dwelling older adults 65 years or older who had nonneurogenic OH were eligible. Data from 51 participants, comprising 27 in the intervention group and 24 in the control group, were analyzed. Weekly biofeedback-based integrated program consisting of biofeedback training along with group education about behavioral modification, physical activities, and telephone counseling was provided for 12 weeks. Orthostatic hypotension was evaluated by measuring the drop in systolic and diastolic blood pressure after postural changes. Autonomic nervous system function was measured using heart rate variability. Results: Among the indicators of heart rate variability, total power (P =.037) and low frequency (P =.017) increased significantly, suggesting that autonomic function improved. Severity of orthostatic symptoms (P <.001) and drops in systolic (P =.003) and diastolic (P =.012) blood pressure after postural changes decreased significantly in the intervention group. Conclusion: Biofeedback-based integrated program was effective in improving autonomic nervous system function and alleviated OH. Therefore, biofeedback-based integrated program should be tested in a larger randomized controlled study with long-term follow-up."

# Text-based similarity lookup: request 3 hits for the query string, with
# metadata attached to each hit.
# NOTE(review): label="INCLUDE" presumably restricts hits to papers labelled
# INCLUDE — confirm against ChromaVectorDB.search_similar.
results = vector_db.search_similar(
    query_text=test_query,
    label="INCLUDE",
    n_results=3,
    include_self=False,
    include_metadata=True,
)

# for i in range(len(results["ids"][0])):
#     paper_id = results["ids"][0][i]
#     distance = results["distances"][0][i]
#     metadata = results["metadatas"][0][i]

#     print(f"PAPER {i+1} (ID: {paper_id}, Distance: {distance:.3f})")
#     print(f"Label: {'INCLUDE' if metadata['label'] else 'EXCLUDE'}")
#     print(f"Title: {metadata['title']}")
#     print(f"Abstract: {metadata['abstract']}...")
#     print("-" * 60)

Search by paper ID

In [27]:
# Identifier of the paper used as the query for the ID-based lookup.
sample_paper_id = 1818

# Fetch the stored record of the query paper (kept for optional inspection below).
original_paper = vector_db.get_paper_by_id(sample_paper_id)
# print(f"Original Paper (ID: {sample_paper_id})")
# print(f"Label: {'INCLUDE' if original_paper['metadata']['label'] else 'EXCLUDE'}")
# print(f"Title: {original_paper['metadata']['title']}")
# print(f"Abstract: {original_paper['metadata']['abstract']}")
# print()

# Request the 3 most similar papers for that id; include_self=False is passed
# so that the query paper is not requested back as one of its own neighbours.
results = vector_db.search_similar_by_id(
    include_self=False,
    n_results=3,
    paper_id=sample_paper_id,
)

# print(f"Found {len(results['ids'][0])} similar papers:")
# print()

# for i in range(len(results["ids"][0])):
#     paper_id = results["ids"][0][i]
#     distance = results["distances"][0][i]
#     metadata = results["metadatas"][0][i]

#     print(f"SIMILAR PAPER {i+1} (ID: {paper_id}, Distance: {distance:.3f})")
#     print(f"Label: {'INCLUDE' if metadata['label'] else 'EXCLUDE'}")
#     print(f"Title: {metadata['title']}")
#     print(f"Abstract: {metadata['abstract']}")
#     print("-" * 40)

Search random papers

In [28]:
# Randomly drawn papers: ask for 3 results with label="INCLUDE".
random_papers = vector_db.search_random(
    label="INCLUDE",
    n_results=3,
)
# random_papers

## 5) Configure LLM classifiers

In [29]:
# Provider key -> model identifier handed to that provider's classifier class.
MODELS = dict(
    openai="gpt-5-mini",
    medgemma="google/medgemma-27b-text-it",
)

In [30]:
from llm import MedGemmaClassifier, OpenAIClassifier, criterias, evaluate_llm_classifier

# load criteria
INCLUSION_CRITERIA = criterias.INCLUSION_CRITERIA
EXCLUSION_CRITERIA = criterias.EXCLUSION_CRITERIA

# Provider key (as used in MODELS) -> classifier class that implements it.
CLASSIFIER_CLASSES = {
    "openai": OpenAIClassifier,
    "medgemma": MedGemmaClassifier,
}

# Registry of configured classifiers, keyed "<provider>_<strategy>".
classifiers = {}

for model_name, model in MODELS.items():
    # Resolve the classifier class explicitly. An unknown provider must fail
    # loudly instead of silently reusing the class of the previous iteration.
    if model_name not in CLASSIFIER_CLASSES:
        raise ValueError(
            f"No classifier class registered for provider {model_name!r}; "
            f"known providers: {sorted(CLASSIFIER_CLASSES)}"
        )
    classifier_cls = CLASSIFIER_CLASSES[model_name]

    # One classifier per prompting strategy: zero-shot, 1/2-shot with the default
    # shot selection, 2-shot with random shot selection, and 4-shot without criteria.
    new_classifiers = {
        f"{model_name}_0shot": classifier_cls(model=model, use_few_shot=False),
        f"{model_name}_1shot": classifier_cls(model=model, use_few_shot=True, n_shots=1),
        f"{model_name}_2shot": classifier_cls(model=model, use_few_shot=True, n_shots=2),
        f"{model_name}_2shot_random": classifier_cls(
            model=model, use_few_shot=True, n_shots=2, shot_selection_strategy="random"
        ),
        f"{model_name}_4shot_nocriteria": classifier_cls(model=model, use_few_shot=True, n_shots=4),
    }
    classifiers.update(new_classifiers)

    # Attach the screening criteria; the "nocriteria" variant gets the
    # placeholder text "Not set" for both inclusion and exclusion criteria.
    for name, classifier_obj in new_classifiers.items():
        if name == f"{model_name}_4shot_nocriteria":
            classifier_obj.set_criteria("Not set", "Not set")
        else:
            classifier_obj.set_criteria(INCLUSION_CRITERIA, EXCLUSION_CRITERIA)

In [31]:
# Display the classifier registry: one entry per "<provider>_<strategy>" key.
classifiers

{'openai_0shot': <llm.llm_classifier.OpenAIClassifier at 0x1334383b0>,
 'openai_1shot': <llm.llm_classifier.OpenAIClassifier at 0x12549b080>,
 'openai_2shot': <llm.llm_classifier.OpenAIClassifier at 0x1341cda60>,
 'openai_2shot_random': <llm.llm_classifier.OpenAIClassifier at 0x13443bd40>,
 'openai_4shot_nocriteria': <llm.llm_classifier.OpenAIClassifier at 0x13443b170>,
 'medgemma_0shot': <llm.llm_classifier.MedGemmaClassifier at 0x13426e9f0>,
 'medgemma_1shot': <llm.llm_classifier.MedGemmaClassifier at 0x134509f40>,
 'medgemma_2shot': <llm.llm_classifier.MedGemmaClassifier at 0x13443a420>,
 'medgemma_2shot_random': <llm.llm_classifier.MedGemmaClassifier at 0x13450a390>,
 'medgemma_4shot_nocriteria': <llm.llm_classifier.MedGemmaClassifier at 0x134508410>}

## 6) Evaluation on test set

In [32]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split on the label, seeded with RANDOM_STATE.
# Intended to reproduce the same row indices as the X/y split in the other notebooks.
train_df, test_df = train_test_split(
    df,
    test_size=0.20,
    stratify=df["label"],
    random_state=RANDOM_STATE,
)

1) Pre-compute embeddings for parallel processing

In [33]:
# Embed every evaluation paper up front so that the embeddings are already
# cached when the few-shot classifiers run in parallel.
print("Pre-computing embeddings for parallel few-shot processing...")

# One text per test paper, built from test_df by join_title_abstract.
eval_texts = join_title_abstract(test_df)

# Computing through the embedding cache stores the vectors for later use.
embeddings, meta = vector_db.embedding_cache.compute(
    eval_texts,
    vector_db.embedding_model,
    normalize_embeddings=True,  # same normalization as used in similarity search
    batch_size=32,
    show_progress_bar=True,
    device=None,
)

print(f"Pre-computed {len(embeddings)} embeddings for evaluation set")

Pre-computing embeddings for parallel few-shot processing...
Pre-computed 1150 embeddings for evaluation set


2) Run parallel classification

In [None]:
# Classify the whole test set with every configured classifier and persist
# each result table as CSV so the evaluation can be reloaded without re-running.
# NOTE(review): the vector DB appears to hold the full dataset (collection count
# equals the number of rows in df), so few-shot retrieval may surface labelled
# test-set papers as examples — confirm this is intended.
N_WORKERS = 30
results = {}

# Create the output directory up front so a long run cannot fail at write time.
output_dir = f"{RESULTS_DIR}/3/few_shots"
os.makedirs(output_dir, exist_ok=True)

# `provider` is the full classifier key, e.g. "openai_2shot".
for provider, classifier in classifiers.items():
    results_df = classifier.classify_dataframe(test_df, parallel=True, n_workers=N_WORKERS)
    results[provider] = results_df

    # Papers that error out during classification may be missing from the
    # result; surface that instead of silently evaluating on fewer rows.
    if len(results_df) != len(test_df):
        warnings.warn(
            f"{provider}: got {len(results_df)} result rows for {len(test_df)} test papers; "
            "metrics and costs for this classifier will be based on a different number of papers."
        )

    results_df.to_csv(f"{output_dir}/{provider}_{len(test_df)}.csv", index=False)

Classifying papers (gpt-5-mini, 30 parallel workers):   0%|          | 0/1150 [00:00<?, ?it/s]

Classifying papers (google/medgemma-27b-text-it, 30 parallel workers):   0%|          | 0/1150 [00:00<?, ?it/s…

Error processing paper 2528: 'ec60c168-1f33-495b-a5ec-05199ffa39c1CollectionGetEvent0'
Error processing paper 4057: 'ec60c168-1f33-495b-a5ec-05199ffa39c1CollectionGetEvent0'
Error processing paper 4360: 'ec60c168-1f33-495b-a5ec-05199ffa39c1CollectionGetEvent0'
Error processing paper 5227: 'ec60c168-1f33-495b-a5ec-05199ffa39c1CollectionGetEvent0'


3) Load from saved files (alternative)

In [47]:
# Alternative to re-running the classification: reload the saved result CSV
# of every configured classifier.
results = {
    provider: pd.read_csv(f"{RESULTS_DIR}/3/few_shots/{provider}_{len(test_df)}.csv")
    for provider in classifiers
}
results.keys()

dict_keys(['openai_0shot', 'openai_1shot', 'openai_2shot', 'openai_2shot_random', 'openai_4shot_nocriteria', 'medgemma_0shot', 'medgemma_1shot', 'medgemma_2shot', 'medgemma_2shot_random', 'medgemma_4shot_nocriteria'])

In [48]:
# Decision distribution and metrics: one row per classifier.
metrics_list = []

for name, results_data in results.items():
    # treat UNCERTAIN as positive (include)
    metrics = evaluate_llm_classifier(results_data["label"], results_data["decision"], uncertain_as_positive=True)
    # split name into provider and strategy, e.g. "openai_2shot_random" -> ("openai", "2shot_random")
    provider, strategy = name.split("_", 1)
    metrics_list.append(
        {
            "provider": provider,
            "strategy": strategy,
            "recall": metrics["recall"],
            "specificity": metrics["specificity"],
            "precision": metrics["precision"],
            "accuracy": metrics["accuracy"],
            "uncertain_percentage": metrics["uncertain_rate"] * 100,
            # "tot_samples": metrics["total_samples"],
        }
    )

# Rank by recall, then specificity (both descending); rounding happens after
# sorting so ties are broken on the unrounded values.
metrics_df = (
    pd.DataFrame(metrics_list)
    .sort_values(by=["recall", "specificity"], ascending=[False, False])
    .round(2)
    .reset_index(drop=True)
)

# export for thesis (create the target directory if it does not exist yet)
metrics_path = "../results/thesis_figures_tables_generation/3/metrics.csv"
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
metrics_df.to_csv(metrics_path, index=False)
metrics_df

Unnamed: 0,provider,strategy,recall,specificity,precision,accuracy,uncertain_percentage
0,openai,4shot_nocriteria,0.92,0.76,0.28,0.78,0.17
1,medgemma,0shot,0.91,0.96,0.68,0.95,0.0
2,medgemma,4shot_nocriteria,0.88,0.52,0.16,0.55,0.0
3,medgemma,2shot,0.87,0.84,0.35,0.84,0.0
4,openai,1shot,0.86,0.97,0.75,0.96,1.74
5,openai,2shot,0.86,0.97,0.72,0.96,2.09
6,medgemma,1shot,0.86,0.91,0.49,0.91,0.0
7,openai,2shot_random,0.82,0.97,0.75,0.96,1.91
8,medgemma,2shot_random,0.82,0.9,0.44,0.89,0.0
9,openai,0shot,0.8,0.97,0.73,0.95,2.7


In [49]:
# Cost summary per classifier, computed from its result table.
all_costs = {}
for name, classifier in classifiers.items():
    costs = classifier.compute_costs(results[name])
    all_costs[name] = costs

# Transpose so that each classifier becomes a row, then expose the classifier
# key as a regular column.
# NOTE(review): the "strategy" column here holds the full classifier key
# (e.g. "openai_0shot"), unlike metrics.csv where provider and strategy are
# separate columns — confirm this is what the thesis tables expect.
costs_df = pd.DataFrame(all_costs).T.reset_index().rename(columns={"index": "strategy"})

# export for thesis (create the target directory if it does not exist yet)
costs_path = "../results/thesis_figures_tables_generation/3/costs.csv"
os.makedirs(os.path.dirname(costs_path), exist_ok=True)
costs_df.to_csv(costs_path, index=False)
costs_df

Unnamed: 0,strategy,cost_per_paper,cost_per_1k_papers,total_cost,n_papers
0,openai_0shot,0.0006,0.623,0.716,1150.0
1,openai_1shot,0.0009,0.913,1.05,1150.0
2,openai_2shot,0.0012,1.24,1.427,1150.0
3,openai_2shot_random,0.0012,1.162,1.336,1150.0
4,openai_4shot_nocriteria,0.0021,2.09,2.403,1150.0
5,medgemma_0shot,0.0,0.0,0.0,1150.0
6,medgemma_1shot,0.0,0.0,0.0,1150.0
7,medgemma_2shot,0.0,0.0,0.0,1149.0
8,medgemma_2shot_random,0.0,0.0,0.0,1146.0
9,medgemma_4shot_nocriteria,0.0,0.0,0.0,1150.0
