# LLM Classifiers

## 1) Configuration and Setup

In [None]:
import os
import sys

import pandas as pd
from dotenv import load_dotenv
from tqdm.auto import tqdm

# put ../src on the import path so the `llm` and `utils` imports below resolve
sys.path.append("../src")

from llm import GeminiClassifier, MedGemmaClassifier, OpenAIClassifier, evaluate_llm_classifier
from utils import clean_dataframe, sample_test_papers

# load settings from ../.env; the configuration cell reads them through os.getenv
load_dotenv("../.env")

In [None]:
def _require_env(name: str) -> str:
    """Return the value of environment variable *name*.

    Raises:
        RuntimeError: if the variable is unset or empty, so that a missing
            setting is reported here instead of surfacing later as
            ``int(None)`` or as a directory literally named ``None``.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(f"Environment variable {name!r} is not set (expected in ../.env)")
    return value


# All settings are read from the environment (populated by load_dotenv).
FULL_DATASET_CSV = _require_env("FULL_DATASET_CSV")  # path of the CSV passed to pd.read_csv
LABEL_COL = _require_env("LABEL_COL")  # name of the label column in that CSV
RESULTS_DIR = _require_env("RESULTS_DIR")  # root directory for result files
RANDOM_STATE = int(_require_env("RANDOM_STATE"))  # seed for the train/test split

# create results directory if it doesn't exist
os.makedirs(f"{RESULTS_DIR}/2", exist_ok=True)

## 2) Load and Clean Dataset

In [None]:
# Read only the columns used in this notebook, expose the configured label
# column under the fixed name "label", then hand the frame to the project's
# clean_dataframe helper.
df = clean_dataframe(
    pd.read_csv(
        FULL_DATASET_CSV,
        usecols=["id", "title", "abstract", LABEL_COL],
    ).rename(columns={LABEL_COL: "label"})
)

## 3) Configure LLM Classifier

In [None]:
from llm import criterias

# screening criteria shared by every classifier
INCLUSION_CRITERIA, EXCLUSION_CRITERIA = criterias.INCLUSION_CRITERIA, criterias.EXCLUSION_CRITERIA

# one classifier per model; the dict key is reused as the file-name prefix of
# the result CSVs written and read further down
classifiers = {"medgemma-27b-text-it": MedGemmaClassifier()}
classifiers.update(
    {f"openai_{model}": OpenAIClassifier(model=model) for model in ("gpt-5-mini", "gpt-5-nano")}
)
classifiers.update(
    {f"gemini_{model}": GeminiClassifier(model=model) for model in ("gemini-2.5-flash", "gemini-2.5-pro")}
)

# give every classifier the same inclusion/exclusion criteria
for configured in classifiers.values():
    configured.set_criteria(INCLUSION_CRITERIA, EXCLUSION_CRITERIA)

## 4) Evaluation on test set

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split seeded with RANDOM_STATE; the parameters are kept
# identical to the X/y split in the other notebooks so the indices match.
train_df, test_df = train_test_split(
    df,
    test_size=0.20,
    random_state=RANDOM_STATE,
    stratify=df["label"],
)

1) Compute

In [None]:
# Classify the test set with every configured model and persist each result
# table as "<RESULTS_DIR>/2/<name>_<number of test papers>.csv".
results = {}
n_test = len(test_df)

for provider, classifier in classifiers.items():
    out_path = f"{RESULTS_DIR}/2/{provider}_{n_test}.csv"
    results[provider] = classifier.classify_dataframe(test_df, parallel=True, n_workers=15)
    results[provider].to_csv(out_path, index=False)

2) Load

In [None]:
# Reload the per-model CSVs written by the "Compute" cell, so the
# classification does not have to be re-run. The files are read from
# RESULTS_DIR, the same location the compute cell writes to.
result_names = [
    "openai_gpt-5-mini",
    "openai_gpt-5-nano",
    "gemini_gemini-2.5-flash",
    "gemini_gemini-2.5-pro",
    "medgemma-27b-text-it",
]
n_test = len(test_df)  # the file names embed the number of test papers

results = {name: pd.read_csv(f"{RESULTS_DIR}/2/{name}_{n_test}.csv") for name in result_names}
results.keys()

## 6) Evaluate Performance

In [None]:
# Per-model metrics plus the share of each decision value.

metrics_by_model = {}

for name, results_data in results.items():
    # normalize=True -> shares are fractions of all decisions, not counts
    decision_shares = results_data["decision"].value_counts(normalize=True)

    # UNCERTAIN decisions are scored as positive (include)
    metrics = evaluate_llm_classifier(
        results_data["label"],
        results_data["decision"],
        uncertain_as_positive=True,
    )

    metrics_by_model[name] = {
        **{key: metrics[key] for key in ("recall", "specificity", "accuracy", "precision")},
        "no_percentage": decision_shares.get("NO", 0),
        "yes_percentage": decision_shares.get("YES", 0),
        "uncertain_percentage": decision_shares.get("UNCERTAIN", 0),
        "tot_samples": metrics["total_samples"],
    }

# one row per model, best recall first
metrics_df = pd.DataFrame.from_dict(metrics_by_model, orient="index")
metrics_df = metrics_df.sort_values(by="recall", ascending=False)
display(metrics_df.style.format("{:.2f}"))

## 7) Compute confusion matrices
Confusion matrices are plotted for openai_gpt-5-mini and medgemma-27b-text-it only.

In [None]:
# export data to generate plots for the thesis
import pickle

summary_dir = "../results/thesis_figures_tables_generation/2"
# open() does not create missing parent directories, so make sure it exists
os.makedirs(summary_dir, exist_ok=True)

# save results dictionary to pickle file
with open(f"{summary_dir}/results_summary.pkl", "wb") as f:
    pickle.dump(results, f)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

models_to_plot = ["openai_gpt-5-mini", "medgemma-27b-text-it"]
positive_decisions = ("YES", "UNCERTAIN")  # UNCERTAIN is counted as include

# one panel per model, side by side
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

for ax, model_name in zip(axes, models_to_plot):
    results_data = results[model_name]

    # Convert decision to binary (treat UNCERTAIN as positive/YES)
    y_pred = results_data["decision"].apply(lambda decision: 1 if decision in positive_decisions else 0)
    y_true = results_data["label"].astype(int)

    # labels=[0, 1] pins the matrix to 2x2 even when only one class occurs in
    # y_true/y_pred; otherwise it would not match the two display labels
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Exclude", "Include"])
    disp.plot(ax=ax, cmap="Blues", colorbar=False)
    ax.set_title(model_name)

plt.tight_layout()
plt.show()

## 8) Compute costs

In [None]:
# Cost summary per classifier, keyed by the classifier's `model` attribute
# (not by the dict key used for `results`).
all_costs = {
    classifier.model: classifier.compute_costs(results[name])
    for name, classifier in classifiers.items()
}

cost_formats = {
    "cost_per_paper": "$ {:.4f}",
    "cost_per_1k_papers": "$ {:.2f}",
    "total_cost": "$ {:.3f}",
    "n_papers": "{:.0f}",
}

# transpose so that each model becomes a row
all_costs_df = pd.DataFrame(all_costs).T
display(all_costs_df.style.format(cost_formats))

## 9) Compute standard deviation
Repeats the same classification several times to measure how much the LLM responses vary between runs (reproducibility check).

1) Compute

In [None]:
# number of repeated classification runs used for the variability analysis
N_RUNS = 10
# upper bound on how many test-set papers are classified in each run
N_PAPERS_PER_RUN = 200

In [None]:
sd_dir = f"{RESULTS_DIR}/2/sd_analysis"
os.makedirs(sd_dir, exist_ok=True)

# only run for GPT-5-mini to save costs; the selection does not depend on the
# run number, so it is done once up front
sd_classifiers = [clf for name, clf in classifiers.items() if "5-mini" in name]

for run in tqdm(range(1, N_RUNS + 1), desc="Runs"):
    for classifier in sd_classifiers:
        results_df_run = classifier.classify_dataframe(
            # first N_PAPERS_PER_RUN rows by position (fewer if the test set is
            # smaller); sliced per call so every run gets a fresh frame
            df=test_df.iloc[:N_PAPERS_PER_RUN],
            parallel=True,
            n_workers=20,
        )
        # Keep only the part after the last "/" of the model identifier.
        # Computed outside the f-string: reusing double quotes inside a
        # double-quoted f-string is a SyntaxError before Python 3.12.
        model_id = classifier.model.split("/")[-1]
        results_df_run.to_csv(f"{sd_dir}/{model_id}_run{run}.csv", index=False)

2) Load

In [None]:
# compute metrics for every saved run (means and stddev are derived from these)

sd_dir = f"{RESULTS_DIR}/2/sd_analysis"

sd_data = []
# sorted: os.listdir returns entries in arbitrary order
for file in sorted(os.listdir(sd_dir)):
    # skip anything that is not a "<model>_run<k>.csv" result file
    if "_run" not in file or not file.endswith(".csv"):
        continue

    # named run_df (not df) so the notebook's main dataset `df` is not overwritten
    run_df = pd.read_csv(os.path.join(sd_dir, file))
    metrics = evaluate_llm_classifier(run_df["label"], run_df["decision"], uncertain_as_positive=True)

    # file name layout: "<model>_run<k>.csv"
    model_name, _, run_suffix = file.partition("_run")
    run_number = int(run_suffix[: -len(".csv")])

    sd_data.append({"run": run_number, "model": model_name, **metrics})

In [None]:
# analyze/display metrics means and stddev

sd_df = pd.DataFrame(sd_data)

# number of runs actually loaded for each model; this can differ from N_RUNS
# when a run was interrupted or result files from earlier sessions are present
runs_per_model = sd_df.groupby("model").size()

# the run number is an identifier, not a metric, so keep it out of the statistics
sd_df = sd_df.drop(columns=["run"])

means = sd_df.groupby("model").mean()
means["n_total_runs"] = runs_per_model  # aligned on the "model" index

stds = sd_df.groupby("model").std()
# these columns are left out of the standard-deviation table
stds.drop(columns=["total_samples", "total_uncertain", "uncertain_rate"], inplace=True)

display(means.style.format("{:.2f}").set_caption("Mean"))
display(stds.style.format("{:.2f}").set_caption("Standard Deviation"))