# Part C

In [1]:
import json
import requests
import pandas as pd
from typing import TypedDict, Optional
from langgraph.graph import StateGraph, END

# CSV that main() reads and rewrites in place with the agent outputs.
CSV_PATH = "hitl_green_100_a3.csv"
# Model name sent in every request made by ollama().
OLLAMA_MODEL = "gemma3:4b"
# Endpoint that ollama() posts generation requests to.
OLLAMA_URL = "http://localhost:11434/api/generate"

# Instruction text placed at the start of the advocate, skeptic and judge prompts.
SYSTEM = (
    "You are classifying whether a patent claim is GREEN technology. "
    "Use ONLY the claim text. Do NOT use CPC codes or metadata. "
    "Green means explicit climate/energy/environmental benefit: emissions reduction, "
    "renewables, energy efficiency, batteries/EV powertrain, carbon capture, waste/recycling, "
    "water treatment, climate adaptation, etc. General 'efficiency' without environmental purpose is NOT enough."
)

def ollama(prompt: str, num_predict=240) -> str:
    """Send one non-streaming generation request to the Ollama endpoint.

    Args:
        prompt: Full prompt text forwarded to the model.
        num_predict: Value passed through as the ``num_predict`` option.

    Returns:
        The ``response`` field of the JSON reply, stripped of surrounding
        whitespace.

    Raises:
        Whatever ``raise_for_status()`` raises for an error HTTP status.
    """
    generation_options = {"temperature": 0.0, "num_predict": num_predict}
    reply = requests.post(
        OLLAMA_URL,
        json={
            "model": OLLAMA_MODEL,
            "prompt": prompt,
            "stream": False,
            "options": generation_options,
        },
        timeout=180,
    )
    reply.raise_for_status()
    body = reply.json()
    return body["response"].strip()

def extract_json(s: str) -> dict:
    """Parse the JSON object embedded in a model reply.

    The text from the first ``{`` to the last ``}`` (after stripping outer
    whitespace) is handed to ``json.loads``; anything around it, such as
    Markdown fences or chatter, is ignored.

    Args:
        s: Raw reply text.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: If no ``{ ... }`` span exists (the message carries the
            first 500 characters of the stripped text), or if the span is not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    text = s.strip()
    start = text.find("{")
    end = text.rfind("}")
    # A missing brace, or a closing brace that precedes the opening one,
    # means there is no candidate object to decode.
    if start == -1 or end <= start:
        raise ValueError("Could not parse JSON:\n" + text[:500])
    return json.loads(text[start:end + 1])

class DebateState(TypedDict):
    """State dictionary passed through the advocate, skeptic and judge nodes."""
    # Identifier of the row being classified (taken from the CSV "doc_id" column).
    doc_id: int
    # Claim text that every node inserts into its prompt.
    text: str
    # Reply stored by advocate_node; None until that node has run.
    advocate: Optional[str]
    # Reply stored by skeptic_node; None until that node has run.
    skeptic: Optional[str]
    # Parsed JSON verdict stored by judge_node; None until that node has run.
    judge_json: Optional[dict]

def advocate_node(state: DebateState) -> DebateState:
    """Ask the model to argue FOR a green classification of the claim.

    The reply is stored under ``state["advocate"]`` and the same (mutated)
    state dictionary is returned.
    """
    claim = state["text"]
    advocate_prompt = (
        f"{SYSTEM}\n"
        "\n"
        "Role: Advocate (argue FOR green).\n"
        "Given the claim text, argue why it could qualify as green tech. Cite exact phrases.\n"
        'Return 3-6 bullet points. If no evidence, say: "No strong green evidence found."\n'
        "\n"
        "Claim:\n"
        f'"""{claim}"""\n'
    )
    state["advocate"] = ollama(advocate_prompt)
    return state

def skeptic_node(state: DebateState) -> DebateState:
    """Ask the model to argue AGAINST a green classification of the claim.

    The reply is stored under ``state["skeptic"]`` and the same (mutated)
    state dictionary is returned.
    """
    claim = state["text"]
    skeptic_prompt = (
        f"{SYSTEM}\n"
        "\n"
        "Role: Skeptic (argue AGAINST green / detect greenwashing).\n"
        "Argue why it should NOT qualify as green tech. Cite exact phrases.\n"
        "Return 3-6 bullet points.\n"
        "\n"
        "Claim:\n"
        f'"""{claim}"""\n'
    )
    state["skeptic"] = ollama(skeptic_prompt)
    return state

def judge_node(state: DebateState) -> DebateState:
    """Ask the model for a final JSON verdict given both sides' arguments.

    The prompt contains the claim plus the stored advocate and skeptic
    replies. The model's answer is parsed with ``extract_json`` and stored
    under ``state["judge_json"]``; a parse failure propagates to the caller.
    """
    claim = state["text"]
    judge_prompt = (
        f"{SYSTEM}\n"
        "\n"
        "Role: Judge.\n"
        "Weigh the arguments and decide using ONLY the claim text.\n"
        "\n"
        "Return ONLY valid JSON with keys:\n"
        "- agent_judge_suggested: 0 or 1\n"
        '- agent_judge_confidence: "low" | "medium" | "high"\n'
        "- agent_judge_rationale: 1-3 sentences citing exact claim phrases\n"
        "\n"
        "Claim:\n"
        f'"""{claim}"""\n'
        "\n"
        "Advocate:\n"
        f"{state['advocate']}\n"
        "\n"
        "Skeptic:\n"
        f"{state['skeptic']}\n"
    )
    state["judge_json"] = extract_json(ollama(judge_prompt))
    return state

def build_graph():
    """Compile the linear advocate -> skeptic -> judge -> END graph."""
    graph = StateGraph(DebateState)

    # Register the nodes in pipeline order.
    stages = (
        ("advocate", advocate_node),
        ("skeptic", skeptic_node),
        ("judge", judge_node),
    )
    for stage_name, stage_fn in stages:
        graph.add_node(stage_name, stage_fn)

    graph.set_entry_point("advocate")
    for source, target in (("advocate", "skeptic"), ("skeptic", "judge"), ("judge", END)):
        graph.add_edge(source, target)
    return graph.compile()

def main():
    """Run the advocate/skeptic/judge debate for every unprocessed CSV row.

    Reads ``CSV_PATH``, makes sure the agent output columns exist, and invokes
    the compiled graph for each row that has no judge suggestion yet. The CSV
    is rewritten after every successful row so an interrupted run can be
    resumed; a row that raises is reported and left blank so a later run
    retries it.
    """
    df = pd.read_csv(CSV_PATH)

    text_cols = ["agent_advocate", "agent_skeptic", "agent_judge_confidence", "agent_judge_rationale"]
    for c in text_cols + ["agent_judge_suggested"]:
        if c not in df.columns:
            df[c] = ""
    # read_csv loads an entirely empty column as float64 NaN; switch the text
    # columns to object dtype so string replies can be stored in them.
    for c in text_cols:
        df[c] = df[c].astype(object)

    app = build_graph()
    total = len(df)

    for i in range(total):
        # An empty cell comes back from read_csv as NaN, and str(NaN) is
        # "nan", so NaN must be checked explicitly or every unprocessed row
        # would be skipped on a resumed run.
        existing = df.at[i, "agent_judge_suggested"]
        if not pd.isna(existing) and str(existing).strip() != "":
            continue

        state: DebateState = {
            "doc_id": int(df.at[i, "doc_id"]),
            "text": str(df.at[i, "text"]),
            "advocate": None,
            "skeptic": None,
            "judge_json": None,
        }

        # Deliberately broad: one bad model reply must not abort the batch.
        try:
            out = app.invoke(state)
            df.at[i, "agent_advocate"] = out["advocate"]
            df.at[i, "agent_skeptic"] = out["skeptic"]
            jj = out["judge_json"]
            df.at[i, "agent_judge_suggested"] = int(jj["agent_judge_suggested"])
            df.at[i, "agent_judge_confidence"] = jj["agent_judge_confidence"]
            df.at[i, "agent_judge_rationale"] = jj["agent_judge_rationale"]

            df.to_csv(CSV_PATH, index=False)
            print(f"[{i+1}/{total}] saved")
        except Exception as e:
            print(f"[{i+1}/{total}] error: {e}")

    print("DONE:", CSV_PATH)

if __name__ == "__main__":
    main()

[1/100] saved
[2/100] saved
[3/100] saved
[4/100] saved
[5/100] saved
[6/100] saved
[7/100] saved
[8/100] saved
[9/100] saved
[10/100] saved
[11/100] saved
[12/100] saved
[13/100] saved
[14/100] saved
[15/100] saved
[16/100] saved
[17/100] saved
[18/100] saved
[19/100] saved
[20/100] saved
[21/100] saved
[22/100] saved
[23/100] saved
[24/100] saved
[25/100] saved
[26/100] saved
[27/100] saved
[28/100] saved
[29/100] saved
[30/100] saved
[31/100] saved
[32/100] saved
[33/100] saved
[34/100] saved
[35/100] saved
[36/100] saved
[37/100] saved
[38/100] saved
[39/100] saved
[40/100] saved
[41/100] saved
[42/100] saved
[43/100] saved
[44/100] saved
[45/100] saved
[46/100] saved
[47/100] saved
[48/100] saved
[49/100] saved
[50/100] saved
[51/100] saved
[52/100] saved
[53/100] saved
[54/100] saved
[55/100] saved
[56/100] saved
[57/100] saved
[58/100] saved
[59/100] saved
[60/100] saved
[61/100] saved
[62/100] saved
[63/100] saved
[64/100] saved
[65/100] saved
[66/100] saved
[67/100] saved
[68/

# Part D

In [1]:
import pandas as pd

CSV_PATH = "hitl_green_100_a3-hitl.csv"

SHOW_ADV_SKEP = True   # set to True to print the advocate/skeptic bullets for every row
MAX_CHARS = 1200        # how much of the claim text is printed (the rest is still available in the CSV)

def clip(s: str, n: int) -> str:
    """Return ``s`` as text, cut to ``n`` characters with a truncation marker.

    ``None`` becomes the empty string; any other value is converted with
    ``str``. Text longer than ``n`` is cut and suffixed with
    `` ...[truncated]``.
    """
    text = str(s) if s is not None else ""
    if len(text) > n:
        return text[:n] + " ...[truncated]"
    return text

def main():
    """Interactively collect the final human label for each unlabeled row.

    Reads ``CSV_PATH``, shows the claim text together with the agent judge's
    suggestion (and optionally the advocate/skeptic replies), then prompts for
    ``0``/``1``, ``s`` (skip) or ``q`` (quit). The CSV is rewritten after every
    answer so the session can be resumed later.

    Raises:
        ValueError: If a required column is missing from the CSV.
    """
    df = pd.read_csv(CSV_PATH)

    # ensure needed columns exist
    needed = ["doc_id", "text", "agent_judge_suggested", "agent_judge_confidence", "agent_judge_rationale"]
    for c in needed:
        if c not in df.columns:
            raise ValueError(f"Missing column '{c}' in {CSV_PATH}. Found: {list(df.columns)}")

    # add human column if missing
    if "is_green_human_a3" not in df.columns:
        df["is_green_human_a3"] = ""
    if "notes_a3" not in df.columns:
        df["notes_a3"] = ""
    # read_csv loads an entirely empty column as float64 NaN; the notes column
    # must be object dtype to hold the "skipped" marker.
    df["notes_a3"] = df["notes_a3"].astype(object)

    total = len(df)

    # iterate rows
    for i in range(total):
        # skip already labeled; an empty cell comes back from read_csv as NaN
        # and str(NaN) is "nan", so NaN has to be treated as "not labeled yet"
        existing = df.at[i, "is_green_human_a3"]
        if not pd.isna(existing) and str(existing).strip() != "":
            continue

        doc_id = df.at[i, "doc_id"]
        text = str(df.at[i, "text"])
        sug = df.at[i, "agent_judge_suggested"]
        conf = df.at[i, "agent_judge_confidence"]
        rat = df.at[i, "agent_judge_rationale"]

        print("\n" + "=" * 100)
        print(f"Row {i+1}/{total} | doc_id: {doc_id}")
        print("-" * 100)
        print(clip(text, MAX_CHARS))
        print("-" * 100)
        print(f"Agent Judge suggested: {sug} | confidence: {conf}")
        print(f"Judge rationale: {rat}")

        if SHOW_ADV_SKEP:
            if "agent_advocate" in df.columns:
                print("\nAdvocate:\n", df.at[i, "agent_advocate"])
            if "agent_skeptic" in df.columns:
                print("\nSkeptic:\n", df.at[i, "agent_skeptic"])

        print("-" * 100)

        # Re-prompt until a recognised answer is given.
        while True:
            ans = input("Your FINAL label is_green_human_a3 (0/1) [s=skip, q=quit]: ").strip().lower()
            if ans in {"0", "1"}:
                df.at[i, "is_green_human_a3"] = int(ans)
                df.to_csv(CSV_PATH, index=False)
                print("Saved ✅")
                break
            if ans == "s":
                # The label stays empty, so the row is offered again next run.
                df.at[i, "notes_a3"] = "skipped"
                df.to_csv(CSV_PATH, index=False)
                print("Skipped (saved note) ✅")
                break
            if ans == "q":
                df.to_csv(CSV_PATH, index=False)
                print("Quit (progress saved).")
                return
            print("Please type 0, 1, s, or q.")

    print("\nDONE. All available rows labeled (or skipped).")
    print("Updated:", CSV_PATH)

if __name__ == "__main__":
    main()



Row 1/100 | doc_id: 8593474
----------------------------------------------------------------------------------------------------
1. A method for reading data from a tiled organized memory, comprising: in response to receiving one tiled-X read request to read two cache line size units from a tiled organized memory in the X-direction, requesting two cache lines from the tiled-X organized memory without fragmenting the received tiled-X read request into more than the one tiled-X read request; allocating the two cache lines in parallel; reading data associated with the two cache lines from a data cache in parallel; and returning, in parallel, data associated with the two requested cache lines, wherein the allocating, reading, and returning of the data is accomplished in one clock cycle.
----------------------------------------------------------------------------------------------------
Agent Judge suggested: 0 | confidence: low
Judge rationale: The claim describes a method for data readin

In [3]:
import os
import re
import numpy as np
import pandas as pd
import torch

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from sklearn.metrics import precision_recall_fscore_support


# -------------------- CONFIG --------------------
PARQUET_PATH = "patents_50k_green.parquet"
# NOTE(review): the interactive labeling cell writes human labels to
# "hitl_green_100_a3-hitl.csv" and the Part E cell reads that same name;
# confirm this path points at the file that contains the human label column.
A3_CSV_PATH = "hitl_green_100_a3.csv"

BASE_MODEL = "AI-Growth-Lab/PatentSBERTa"
OUT_DIR = "models_finetuned_patentsberta_green_a3"
MERGED_OUT = "patents_50k_green_with_gold_a3.parquet"

ID_COL = "doc_id"
TEXT_COL = "text"
SPLIT_COL = "split"
SILVER_LABEL_COL = "is_green_silver"

# A3 human label column (script will auto-detect if needed)
PREFERRED_A3_HUMAN_COL = "is_green_human_a3"  # change if you used another name

# Recommended settings
MAX_LEN = 256
EPOCHS = 1
LR = 2e-5
TRAIN_BS = 16
EVAL_BS = 32
SEED = 42


# -------------------- DEVICE --------------------
# Probe order: MPS first, then CUDA, otherwise fall back to the CPU.
DEVICE = (
    "mps"
    if torch.backends.mps.is_available()
    else "cuda"
    if torch.cuda.is_available()
    else "cpu"
)
print("Using device:", DEVICE)


def clean_binary_label(x):
    """Normalise a raw label cell to the integer 0 or 1.

    Accepts values whose stripped string form is ``0`` or ``1``, optionally
    followed by ``.0``, ``.00``, ... (for example ``1``, ``0.0``, ``' 1 '``).
    Returns ``None`` for ``None``, float NaN, empty text, or anything else.
    """
    is_missing = x is None or (isinstance(x, float) and np.isnan(x))
    if is_missing:
        return None
    token = str(x).strip()
    # An empty token cannot match, so it falls through to None as well.
    matched = re.fullmatch(r"[01](\.0+)?", token)
    return int(float(token)) if matched else None


def compute_metrics(eval_pred):
    """Compute binary precision, recall, F1 and accuracy from model logits.

    ``eval_pred`` is either an object exposing ``predictions`` and
    ``label_ids`` or a ``(logits, labels)`` pair; both forms are accepted so
    the function works across transformers versions.
    """
    if hasattr(eval_pred, "predictions"):
        logits, labels = eval_pred.predictions, eval_pred.label_ids
    else:
        logits, labels = eval_pred

    predicted = np.argmax(logits, axis=-1)
    precision, recall, f1_score, _support = precision_recall_fscore_support(
        labels, predicted, average="binary", zero_division=0
    )
    accuracy = (predicted == labels).mean()
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1_score,
        "accuracy": accuracy,
    }


def main():
    """Fine-tune BASE_MODEL on silver labels plus the human-labeled A3 rows.

    Steps: load the base parquet and the A3 CSV, build ``is_green_gold_a3``
    (silver label overridden by the human label where one exists), train for
    EPOCHS on ``train_silver`` plus the human-labeled rows, evaluate on
    ``eval_silver`` and on the human-labeled rows, then save the model,
    tokenizer and merged parquet.

    Raises:
        ValueError: If a required column is missing from either input file.
        RuntimeError: If the A3 CSV contains no usable human labels.
    """
    os.makedirs(OUT_DIR, exist_ok=True)

    # 1) Load parquet (50k base)
    df = pd.read_parquet(PARQUET_PATH)
    for c in [ID_COL, TEXT_COL, SPLIT_COL, SILVER_LABEL_COL]:
        if c not in df.columns:
            raise ValueError(f"Missing column '{c}' in {PARQUET_PATH}. Found: {list(df.columns)}")

    df[ID_COL] = pd.to_numeric(df[ID_COL], errors="coerce").astype("Int64")

    # 2) Load A3 CSV with human labels
    a3 = pd.read_csv(A3_CSV_PATH)
    if ID_COL not in a3.columns:
        raise ValueError(f"Missing '{ID_COL}' in {A3_CSV_PATH}. Found: {list(a3.columns)}")

    # Auto-detect human label column
    if PREFERRED_A3_HUMAN_COL in a3.columns:
        human_col = PREFERRED_A3_HUMAN_COL
    elif "is_green_human" in a3.columns:
        human_col = "is_green_human"
    else:
        raise ValueError(
            f"Could not find a human label column. Expected '{PREFERRED_A3_HUMAN_COL}' or 'is_green_human'. "
            f"Found columns: {list(a3.columns)}"
        )

    a3[ID_COL] = pd.to_numeric(a3[ID_COL], errors="coerce").astype("Int64")
    a3[human_col] = a3[human_col].apply(clean_binary_label)

    # Keep only rows that have both a usable id and a valid 0/1 human label.
    a3_labeled = a3.dropna(subset=[ID_COL, human_col]).copy()
    a3_labeled[human_col] = a3_labeled[human_col].astype(int)

    if len(a3_labeled) == 0:
        raise RuntimeError(
            f"No human labels found in '{human_col}' inside {A3_CSV_PATH}. "
            "Make sure you've filled 0/1 and saved the file."
        )

    if len(a3_labeled) != 100:
        print(f"WARNING: expected 100 A3 labeled rows, found {len(a3_labeled)} (continuing anyway)")

    gold_map = dict(zip(a3_labeled[ID_COL].tolist(), a3_labeled[human_col].tolist()))
    print("A3 labeled doc_ids:", len(gold_map))

    # 3) Create is_green_gold_a3 (silver overridden by human for the 100)
    df["is_green_gold_a3"] = df[SILVER_LABEL_COL].astype(int)
    mask = df[ID_COL].isin(gold_map.keys())
    df.loc[mask, "is_green_gold_a3"] = df.loc[mask, ID_COL].map(gold_map).astype(int)

    # 4) Build train/eval sets
    train_silver_df = df[df[SPLIT_COL] == "train_silver"].copy()
    gold_100_df = df[df[ID_COL].isin(gold_map.keys())].copy()

    train_df = pd.concat([train_silver_df, gold_100_df], ignore_index=True)
    train_df["label"] = train_df["is_green_gold_a3"].astype(int)

    eval_silver_df = df[df[SPLIT_COL] == "eval_silver"].copy()
    eval_silver_df["label"] = eval_silver_df[SILVER_LABEL_COL].astype(int)

    # NOTE(review): these are the same rows that were concatenated into
    # train_df above, so the gold_100 metrics are measured on training data.
    gold_eval_df = gold_100_df.copy()
    gold_eval_df["label"] = gold_eval_df["is_green_gold_a3"].astype(int)

    print("\nSizes:")
    print("train_silver:", len(train_silver_df))
    print("gold_100_a3:", len(gold_100_df))
    print("train_final:", len(train_df))
    print("eval_silver:", len(eval_silver_df))

    # 5) HF datasets
    train_ds = Dataset.from_pandas(train_df[[TEXT_COL, "label"]], preserve_index=False)
    eval_ds = Dataset.from_pandas(eval_silver_df[[TEXT_COL, "label"]], preserve_index=False)
    gold_ds = Dataset.from_pandas(gold_eval_df[[TEXT_COL, "label"]], preserve_index=False)

    # 6) Tokenize
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

    def tok(batch):
        return tokenizer(batch[TEXT_COL], truncation=True, max_length=MAX_LEN)

    train_ds = train_ds.map(tok, batched=True, remove_columns=[TEXT_COL])
    eval_ds = eval_ds.map(tok, batched=True, remove_columns=[TEXT_COL])
    gold_ds = gold_ds.map(tok, batched=True, remove_columns=[TEXT_COL])

    collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # 7) Model
    model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=2).to(DEVICE)

    # 8) TrainingArguments (old-transformers compatible)
    args = TrainingArguments(
        output_dir=OUT_DIR,
        num_train_epochs=EPOCHS,
        learning_rate=LR,
        per_device_train_batch_size=TRAIN_BS,
        per_device_eval_batch_size=EVAL_BS,
        seed=SEED,
        report_to="none",
        fp16=False,
        logging_steps=50,
        save_steps=500,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        data_collator=collator,
        compute_metrics=compute_metrics,
    )

    # 9) Train once
    trainer.train()

    # 10) Evaluate
    print("\n=== Assignment 3 Model: eval_silver (silver labels) ===")
    m_eval = trainer.evaluate(eval_dataset=eval_ds)
    print(m_eval)

    print("\n=== Assignment 3 Model: gold_100 (A3 human labels) ===")
    m_gold = trainer.evaluate(eval_dataset=gold_ds)
    print(m_gold)

    # 11) Save model + tokenizer
    trainer.save_model(OUT_DIR)
    tokenizer.save_pretrained(OUT_DIR)

    # 12) Save merged parquet
    df.to_parquet(MERGED_OUT, index=False)
    print(f"\nSaved merged dataset: {MERGED_OUT}")
    print(f"Saved fine-tuned model: {OUT_DIR}")


if __name__ == "__main__":
    main()

Using device: mps
A3 labeled doc_ids: 100

Sizes:
train_silver: 35000
gold_100_a3: 100
train_final: 35100
eval_silver: 5000


Map:   0%|          | 0/35100 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mMPNetForSequenceClassification LOAD REPORT[0m from: AI-Growth-Lab/PatentSBERTa
Key                        | Status     | 
---------------------------+------------+-
embeddings.position_ids    | UNEXPECTED | 
pooler.dense.weight        | UNEXPECTED | 
pooler.dense.bias          | UNEXPECTED | 
classifier.dense.weight    | MISSING    | 
classifier.out_proj.weight | MISSING    | 
classifier.dense.bias      | MISSING    | 
classifier.out_proj.bias   | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Step,Training Loss
50,0.649193
100,0.5296
150,0.504056
200,0.513348
250,0.474745
300,0.494384
350,0.445602
400,0.440339
450,0.470936
500,0.460302


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


=== Assignment 3 Model: eval_silver (silver labels) ===




{'eval_loss': 0.4130362272262573, 'eval_precision': 0.8235048097030531, 'eval_recall': 0.7876, 'eval_f1': 0.8051523205888367, 'eval_accuracy': 0.8094, 'eval_runtime': 177.9072, 'eval_samples_per_second': 28.105, 'eval_steps_per_second': 0.882, 'epoch': 1.0}

=== Assignment 3 Model: gold_100 (A3 human labels) ===




{'eval_loss': 0.6464775800704956, 'eval_precision': 0.6666666666666666, 'eval_recall': 0.6296296296296297, 'eval_f1': 0.6476190476190476, 'eval_accuracy': 0.63, 'eval_runtime': 3.5431, 'eval_samples_per_second': 28.224, 'eval_steps_per_second': 1.129, 'epoch': 1.0}


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


Saved merged dataset: patents_50k_green_with_gold_a3.parquet
Saved fine-tuned model: models_finetuned_patentsberta_green_a3


# Part E

In [4]:
import re
import numpy as np
import pandas as pd
import torch

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, DataCollatorWithPadding
from sklearn.metrics import precision_recall_fscore_support

# ---------- INPUTS (edit if your names differ) ----------
PARQUET_A2 = "patents_50k_green_with_gold.parquet"        # from Assignment 2
PARQUET_A3 = "patents_50k_green_with_gold_a3.parquet"     # from Assignment 3 (if you saved one)
MODEL_A3_DIR = "models_finetuned_patentsberta_green_a3"   # your A3 fine-tuned model directory

A2_CSV = "hitl_green_100_labeled.csv"   # has llm_green_suggested + is_green_human
A3_CSV = "hitl_green_100_a3-hitl.csv"        # has agent_judge_suggested + is_green_human_a3 (or equivalent)

# Baseline + A2 F1 (you already have these)
BASELINE_F1 = 0.7618
A2_F1 = 0.8033

# Column names (edit if yours are named differently)
A2_AI_COL = "llm_green_suggested"
A2_HUMAN_COL = "is_green_human"

A3_AI_COL = "agent_judge_suggested"
A3_HUMAN_COL = "is_green_human_a3"

# eval labels for eval_silver
EVAL_SPLIT_COL = "split"
EVAL_SPLIT_NAME = "eval_silver"
EVAL_LABEL_COL = "is_green_silver"

TEXT_COL = "text"
MAX_LEN = 256


# ---------- helpers ----------
def clean_binary_label(x):
    """Return 0 or 1 for a cell that encodes a binary label, else ``None``.

    A value qualifies when its stripped string form is ``0`` or ``1``,
    optionally followed by ``.`` and one or more zeros. ``None``, float NaN,
    blank text and every other value yield ``None``.
    """
    if x is None:
        return None
    if isinstance(x, float) and np.isnan(x):
        return None
    text = str(x).strip()
    # Blank text fails the pattern too, so no separate empty check is needed.
    if not re.fullmatch(r"[01](\.0+)?", text):
        return None
    return int(float(text))

def agreement(csv_path: str, ai_col: str, human_col: str):
    """Compute how often the AI suggestion equals the human label.

    Both columns are normalised with ``clean_binary_label`` and only rows
    where both are valid 0/1 values are compared.

    Returns:
        ``(rate, n)`` where ``rate`` is the matching fraction as a float and
        ``n`` the number of compared rows, or ``(None, 0)`` when a column is
        missing or no row has both labels.
    """
    table = pd.read_csv(csv_path)
    if any(col not in table.columns for col in (ai_col, human_col)):
        return None, 0

    for col in (ai_col, human_col):
        table[col] = table[col].apply(clean_binary_label)

    paired = table.dropna(subset=[ai_col, human_col])
    n_pairs = len(paired)
    if n_pairs == 0:
        return None, 0

    matches = paired[ai_col].astype(int) == paired[human_col].astype(int)
    return float(matches.mean()), int(n_pairs)

def eval_f1_model(model_dir: str, parquet_path: str):
    """Score a saved classifier on the eval split of a parquet file.

    Rows whose ``EVAL_SPLIT_COL`` equals ``EVAL_SPLIT_NAME`` are tokenized
    with the tokenizer stored in ``model_dir`` and predicted with the model
    from the same directory; ``EVAL_LABEL_COL`` supplies the reference labels.

    Returns:
        ``(f1, precision, recall, accuracy)`` as Python floats.
    """
    frame = pd.read_parquet(parquet_path)
    held_out = frame[frame[EVAL_SPLIT_COL] == EVAL_SPLIT_NAME].copy()
    held_out["label"] = held_out[EVAL_LABEL_COL].astype(int)

    dataset = Dataset.from_pandas(held_out[[TEXT_COL, "label"]], preserve_index=False)

    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    def encode(batch):
        return tokenizer(batch[TEXT_COL], truncation=True, max_length=MAX_LEN)

    dataset = dataset.map(encode, batched=True, remove_columns=[TEXT_COL])

    collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

    model = AutoModelForSequenceClassification.from_pretrained(model_dir)

    # Device: probe MPS first, then CUDA, otherwise stay on the CPU.
    device = (
        "mps"
        if torch.backends.mps.is_available()
        else "cuda"
        if torch.cuda.is_available()
        else "cpu"
    )
    model.to(device)

    trainer = Trainer(model=model, data_collator=collator)
    output = trainer.predict(dataset)

    labels = output.label_ids
    predicted = np.argmax(output.predictions, axis=1)

    precision, recall, f1_score, _support = precision_recall_fscore_support(
        labels, predicted, average="binary", zero_division=0
    )
    accuracy = (predicted == labels).mean()
    return float(f1_score), float(precision), float(recall), float(accuracy)


def main():
    """Print the A2-vs-A3 comparison: agreement rates, metrics and README text.

    Computes AI-vs-human agreement for both assignment CSVs, evaluates the A3
    model on the eval split, and prints a copy-paste README table plus a short
    reflection template whose wording depends on whether the A3 F1 is more
    than 0.005 above or below ``A2_F1``.
    """
    # Agreement % for A2 and A3
    a2_agree, a2_n = agreement(A2_CSV, A2_AI_COL, A2_HUMAN_COL)
    a3_agree, a3_n = agreement(A3_CSV, A3_AI_COL, A3_HUMAN_COL)

    # A3 F1 on eval_silver
    a3_f1, a3_p, a3_r, a3_acc = eval_f1_model(MODEL_A3_DIR, PARQUET_A3)

    print("\n=== Agreement (AI suggestion vs Human) ===")
    if a2_agree is None:
        print("Assignment 2 agreement: could not compute (missing columns or labels)")
    else:
        print(f"Assignment 2 (Simple LLM): {a2_agree*100:.1f}%  (n={a2_n})")

    if a3_agree is None:
        print("Assignment 3 agreement: could not compute (missing columns or labels)")
    else:
        print(f"Assignment 3 (Agents/QLoRA): {a3_agree*100:.1f}%  (n={a3_n})")

    print("\n=== Assignment 3 Model on eval_silver ===")
    print(f"Precision: {a3_p:.4f}")
    print(f"Recall:    {a3_r:.4f}")
    print(f"F1:        {a3_f1:.4f}")
    print(f"Accuracy:  {a3_acc:.4f}")

    print("\n=== README table (copy-paste) ===")
    print("| Model Version | Training Data Source | F1 Score (Eval Set) |")
    print("|---|---|---:|")
    print(f"| 1. Baseline | Frozen Embeddings (No Fine-tuning) | {BASELINE_F1:.4f} |")
    print(f"| 2. Assignment 2 Model | Fine-tuned on Silver + Gold (Simple LLM) | {A2_F1:.4f} |")
    print(f"| 3. Assignment 3 Model | Fine-tuned on Silver + Gold (Agents/QLoRA) | {a3_f1:.4f} |")

    print("\n=== Suggested 2–3 sentence reflection template ===")
    # Each verdict must read as a verb phrase taking "downstream model
    # performance" as its object, so the sentence stays grammatical.
    if a3_f1 > A2_F1 + 0.005:
        verdict = "improved"
    elif a3_f1 < A2_F1 - 0.005:
        verdict = "did not improve"
    else:
        verdict = "yielded similar"

    print(
        f"Compared to Assignment 2, the advanced workflow {verdict} downstream model performance on the eval set "
        f"(F1: {A2_F1:.4f} → {a3_f1:.4f}). "
        "This suggests that structured labeling can (or cannot) add value beyond a simple LLM approach, "
        "depending on how much the advanced method improves label quality on borderline cases."
    )

    print("\n(You can also include the agreement comparison above in README.)")


if __name__ == "__main__":
    main()


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]




=== Agreement (AI suggestion vs Human) ===
Assignment 2 (Simple LLM): 91.0%  (n=100)
Assignment 3 (Agents/QLoRA): 95.0%  (n=100)

=== Assignment 3 Model on eval_silver ===
Precision: 0.8235
Recall:    0.7876
F1:        0.8052
Accuracy:  0.8094

=== README table (copy-paste) ===
| Model Version | Training Data Source | F1 Score (Eval Set) |
|---|---|---:|
| 1. Baseline | Frozen Embeddings (No Fine-tuning) | 0.7618 |
| 2. Assignment 2 Model | Fine-tuned on Silver + Gold (Simple LLM) | 0.8033 |
| 3. Assignment 3 Model | Fine-tuned on Silver + Gold (Agents/QLoRA) | 0.8052 |

=== Suggested 2–3 sentence reflection template ===
Compared to Assignment 2, the advanced workflow was similar downstream model performance on the eval set (F1: 0.8033 → 0.8052). This suggests that structured labeling can (or cannot) add value beyond a simple LLM approach, depending on how much the advanced method improves label quality on borderline cases.

(You can also include the agreement comparison above in READ