# Part A

## Make parquet

In [None]:
# Create patents_50k_green.parquet from AI-Growth-Lab/patents_claims_1.5m_traim_test
# Balanced 25k green (Y02*) + 25k not green (no Y02*)
# Splits: train_silver / pool_unlabeled / eval_silver

# pip install -U datasets pandas pyarrow scikit-learn

import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split

DS_NAME = "AI-Growth-Lab/patents_claims_1.5m_traim_test"
OUT_PATH = "patents_50k_green.parquet"

SEED = 42
N_PER_CLASS = 25_000

# 1) Load dataset (train split) and only needed columns
ds = load_dataset(DS_NAME, split="train")

# Identify Y02* columns from feature names
cols = ds.column_names
y02_cols = [c for c in cols if c.startswith("Y02")]
print("Found Y02 columns:", y02_cols)

# keep minimal columns: id, date (optional), text, and all Y02*
keep_cols = ["id", "date", "text"] + y02_cols
ds_small = ds.select_columns(keep_cols)

# 2) Convert to pandas
df = ds_small.to_pandas()

# 3) Create silver label:
# green if ANY Y02* column == 1
# (works whether columns are int/bool)
df["is_green_silver"] = (df[y02_cols].sum(axis=1) > 0).astype(int)

print(df["is_green_silver"].value_counts())

# 4) Sample balanced 50k
df_green = df[df["is_green_silver"] == 1].sample(N_PER_CLASS, random_state=SEED)
df_not = df[df["is_green_silver"] == 0].sample(N_PER_CLASS, random_state=SEED)

df_50k = pd.concat([df_green, df_not], ignore_index=True).sample(frac=1, random_state=SEED)

# 5) Create splits (stratified)
train_df, temp_df = train_test_split(
    df_50k,
    test_size=0.30,
    stratify=df_50k["is_green_silver"],
    random_state=SEED
)
pool_df, eval_df = train_test_split(
    temp_df,
    test_size=1/3,  # 10% of total (because temp is 30%)
    stratify=temp_df["is_green_silver"],
    random_state=SEED
)

train_df = train_df.copy()
pool_df = pool_df.copy()
eval_df = eval_df.copy()

train_df["split"] = "train_silver"
pool_df["split"]  = "pool_unlabeled"
eval_df["split"]  = "eval_silver"

final_df = pd.concat([train_df, pool_df, eval_df], ignore_index=True)

# 6) Rename id -> doc_id to match assignment wording (optional but nice)
final_df = final_df.rename(columns={"id": "doc_id"})

# 7) Keep only columns you need downstream (recommended)
final_df = final_df[["doc_id", "date", "text", "is_green_silver", "split"]]

print(final_df["split"].value_counts())
print(final_df["is_green_silver"].value_counts())
print(final_df.head())

# 8) Save parquet
final_df.to_parquet(OUT_PATH, index=False)
print(f"Saved: {OUT_PATH}")

## Train

In [None]:
# Part A (faster on Apple Silicon): Frozen PatentSBERTa embeddings + Logistic Regression
# - Uses MPS (Apple GPU) when available
# - Caches embeddings to disk so you don't recompute in Part B
#
# pip install -U transformers torch scikit-learn pandas pyarrow tqdm joblib numpy

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, classification_report
import joblib

# ---------------- Config ----------------
PARQUET_PATH = "patents_50k_green.parquet"
MODEL_NAME = "AI-Growth-Lab/PatentSBERTa"

TEXT_COL = "text"
LABEL_COL = "is_green_silver"
SPLIT_COL = "split"
ID_COL = "doc_id"

MAX_LENGTH = 256
BATCH_SIZE = 64  # try 64 on MPS; if you get memory issues, lower to 32
OUT_DIR = "models_baseline"
CACHE_DIR = os.path.join(OUT_DIR, "emb_cache")
os.makedirs(CACHE_DIR, exist_ok=True)
os.makedirs(OUT_DIR, exist_ok=True)

# ---------------- Device (MPS first) ----------------
if torch.backends.mps.is_available():
    DEVICE = "mps"
elif torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

print("Using device:", DEVICE)


def mean_pooling(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    mask = attention_mask.unsqueeze(-1).type_as(last_hidden_state)  # [B,T,1]
    summed = (last_hidden_state * mask).sum(dim=1)                  # [B,H]
    counts = mask.sum(dim=1).clamp(min=1e-9)                        # [B,1]
    return summed / counts


@torch.no_grad()
def embed_texts(texts, tokenizer, model, batch_size=BATCH_SIZE, max_length=MAX_LENGTH, device=DEVICE):
    model.eval()
    all_embs = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding"):
        batch = texts[i:i + batch_size]

        enc = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

        # move to device
        enc = {k: v.to(device) for k, v in enc.items()}

        out = model(**enc)
        token_embs = out.last_hidden_state  # [B,T,H]
        sent_embs = mean_pooling(token_embs, enc["attention_mask"])  # [B,H]

        # L2 normalize helps linear classifier
        sent_embs = torch.nn.functional.normalize(sent_embs, p=2, dim=1)

        all_embs.append(sent_embs.detach().cpu().numpy())

    return np.vstack(all_embs)


def load_or_compute_embeddings(cache_prefix, texts, tokenizer, model):
    x_path = os.path.join(CACHE_DIR, f"{cache_prefix}_X.npy")
    if os.path.exists(x_path):
        print(f"Loading cached embeddings: {x_path}")
        return np.load(x_path)

    X = embed_texts(texts, tokenizer, model)
    np.save(x_path, X)
    print(f"Saved embeddings: {x_path}")
    return X


def prf1(y_true, y_pred):
    p, r, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
    return p, r, f1


def main():
    # 1) Load parquet
    df = pd.read_parquet(PARQUET_PATH)

    # sanity checks
    for col in [TEXT_COL, LABEL_COL, SPLIT_COL]:
        if col not in df.columns:
            raise ValueError(f"Missing column '{col}'. Found columns: {list(df.columns)}")

    train_df = df[df[SPLIT_COL] == "train_silver"].dropna(subset=[TEXT_COL, LABEL_COL]).copy()
    eval_df  = df[df[SPLIT_COL] == "eval_silver"].dropna(subset=[TEXT_COL, LABEL_COL]).copy()

    X_train_text = train_df[TEXT_COL].astype(str).tolist()
    y_train = train_df[LABEL_COL].astype(int).to_numpy()

    X_eval_text = eval_df[TEXT_COL].astype(str).tolist()
    y_eval = eval_df[LABEL_COL].astype(int).to_numpy()

    print(f"Train size: {len(train_df)} | Eval size: {len(eval_df)}")
    print("Train label balance:", train_df[LABEL_COL].value_counts(normalize=True).to_dict())
    print("Eval label balance: ", eval_df[LABEL_COL].value_counts(normalize=True).to_dict())

    # 2) Load transformer (frozen)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME)

    # IMPORTANT for Apple MPS stability: keep float32
    model = model.to(DEVICE)
    model.eval()

    # 3) Embeddings (cached)
    X_train = load_or_compute_embeddings("train_silver", X_train_text, tokenizer, model)
    X_eval  = load_or_compute_embeddings("eval_silver", X_eval_text, tokenizer, model)

    print("Embeddings shapes:", X_train.shape, X_eval.shape)

    # 4) Train baseline classifier
    clf = LogisticRegression(
        max_iter=3000,
        n_jobs=-1,
        solver="lbfgs",
    )
    clf.fit(X_train, y_train)

    # 5) Evaluate
    y_pred = clf.predict(X_eval)
    p, r, f1 = prf1(y_eval, y_pred)

    print("\n=== Part A: Baseline results on eval_silver ===")
    print(f"Precision: {p:.4f}")
    print(f"Recall:    {r:.4f}")
    print(f"F1:        {f1:.4f}")
    print("\nClassification report:")
    print(classification_report(y_eval, y_pred, digits=4, zero_division=0))

    # 6) Save baseline model + meta
    joblib.dump(clf, os.path.join(OUT_DIR, "baseline_logreg.joblib"))
    meta = {
        "transformer_model": MODEL_NAME,
        "max_length": MAX_LENGTH,
        "pooling": "mean_pooling + l2_normalize",
        "batch_size": BATCH_SIZE,
        "device_used": DEVICE,
        "features_dim": int(X_train.shape[1]),
        "train_rows": int(len(train_df)),
        "eval_rows": int(len(eval_df)),
        "parquet_path": PARQUET_PATH,
    }
    joblib.dump(meta, os.path.join(OUT_DIR, "baseline_meta.joblib"))

    print(f"\nSaved baseline classifier to: {OUT_DIR}/baseline_logreg.joblib")
    print(f"Saved meta to:               {OUT_DIR}/baseline_meta.joblib")
    print(f"Embeddings cached in:        {CACHE_DIR}/ (X.npy files)")


if __name__ == "__main__":
    main()


# Part B - Uncertainty sampling

In [2]:
# Part B: Uncertainty sampling on pool_unlabeled
# ----------------------------------------------
# Uses baseline_logreg + frozen PatentSBERTa embeddings
# Exports hitl_green_100.csv with required columns
#
# pip install -U transformers torch scikit-learn pandas pyarrow tqdm joblib numpy

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModel
import joblib

# ---------------- Config ----------------
PARQUET_PATH = "patents_50k_green.parquet"
MODEL_NAME = "AI-Growth-Lab/PatentSBERTa"

BASELINE_PATH = "models_baseline/baseline_logreg.joblib"
META_PATH = "models_baseline/baseline_meta.joblib"
CACHE_DIR = "models_baseline/emb_cache"

OUT_CSV = "hitl_green_100_unlabeled.csv"

TEXT_COL = "text"
SPLIT_COL = "split"
ID_COL = "doc_id"

MAX_LENGTH = 256
BATCH_SIZE = 64  # good on MPS; lower if memory issues

# ---------------- Device (MPS first) ----------------
if torch.backends.mps.is_available():
    DEVICE = "mps"
elif torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("Using device:", DEVICE)


def mean_pooling(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    mask = attention_mask.unsqueeze(-1).type_as(last_hidden_state)
    summed = (last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts


@torch.no_grad()
def embed_texts(texts, tokenizer, model, batch_size=BATCH_SIZE, max_length=MAX_LENGTH, device=DEVICE):
    model.eval()
    all_embs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding pool_unlabeled"):
        batch = texts[i:i + batch_size]
        enc = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )
        enc = {k: v.to(device) for k, v in enc.items()}
        out = model(**enc)
        sent_embs = mean_pooling(out.last_hidden_state, enc["attention_mask"])
        sent_embs = torch.nn.functional.normalize(sent_embs, p=2, dim=1)
        all_embs.append(sent_embs.cpu().numpy())
    return np.vstack(all_embs)


def load_or_compute_pool_embeddings(pool_texts, tokenizer, model):
    x_path = os.path.join(CACHE_DIR, "pool_unlabeled_X.npy")
    if os.path.exists(x_path):
        print(f"Loading cached pool embeddings: {x_path}")
        return np.load(x_path)
    X = embed_texts(pool_texts, tokenizer, model)
    os.makedirs(CACHE_DIR, exist_ok=True)
    np.save(x_path, X)
    print(f"Saved pool embeddings: {x_path}")
    return X


def main():
    # 1) Load data
    df = pd.read_parquet(PARQUET_PATH)
    pool_df = df[df[SPLIT_COL] == "pool_unlabeled"].dropna(subset=[TEXT_COL]).copy()

    if ID_COL not in pool_df.columns:
        raise ValueError(f"Expected '{ID_COL}' in parquet. Found: {list(pool_df.columns)}")

    pool_texts = pool_df[TEXT_COL].astype(str).tolist()
    print("Pool size:", len(pool_df))

    # 2) Load baseline model
    clf = joblib.load(BASELINE_PATH)
    meta = joblib.load(META_PATH)
    print("Loaded baseline + meta:", meta.get("transformer_model", "n/a"))

    # 3) Load transformer for embeddings (frozen)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
    model.eval()

    # 4) Get embeddings for pool_unlabeled (cached)
    X_pool = load_or_compute_pool_embeddings(pool_texts, tokenizer, model)

    # 5) Predict probabilities
    # predict_proba returns [p(class0), p(class1)] where class1 should be "green=1"
    proba = clf.predict_proba(X_pool)
    p_green = proba[:, 1]

    # 6) Uncertainty score u = 1 - 2*|p-0.5|
    u = 1.0 - 2.0 * np.abs(p_green - 0.5)

    pool_df["p_green"] = p_green
    pool_df["u"] = u

    # 7) Select top 100 most uncertain
    top100 = pool_df.sort_values("u", ascending=False).head(100).copy()

    # 8) Prepare HITL CSV with required + empty labeling cols
    out = pd.DataFrame({
        "doc_id": top100[ID_COL].values,
        "text": top100[TEXT_COL].values,
        "p_green": top100["p_green"].values,
        "u": top100["u"].values,
        "llm_green_suggested": ["" for _ in range(len(top100))],
        "llm_confidence": ["" for _ in range(len(top100))],
        "llm_rationale": ["" for _ in range(len(top100))],
        "is_green_human": ["" for _ in range(len(top100))],
        "notes": ["" for _ in range(len(top100))],
    })

    out.to_csv(OUT_CSV, index=False)
    print(f"Saved HITL file: {OUT_CSV}")
    print(out.head(3))


if __name__ == "__main__":
    main()


Using device: mps
Pool size: 10000
Loaded baseline + meta: AI-Growth-Lab/PatentSBERTa


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mMPNetModel LOAD REPORT[0m from: AI-Growth-Lab/PatentSBERTa
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading cached pool embeddings: models_baseline/emb_cache/pool_unlabeled_X.npy
Saved HITL file: hitl_green_100_unlabeled.csv
    doc_id                                               text   p_green  \
0  8593474  1. A method for reading data from a tiled orga...  0.500015   
1  9617453  1. A solvent-free aqueous polyurethane dispers...  0.499975   
2  8618928  1. A method for wireless health monitoring of ...  0.499939   

          u llm_green_suggested llm_confidence llm_rationale is_green_human  \
0  0.999971                                                                   
1  0.999949                                                                   
2  0.999878                                                                   

  notes  
0        
1        
2        


  ret = a @ b
  ret = a @ b
  ret = a @ b


# Part C - LLM and Human-in-the-loop

In [4]:
import json
import requests
import pandas as pd

CSV_PATH = "hitl_green_100_labeled.csv"
OLLAMA_MODEL = "gemma3:4b"

OLLAMA_URL = "http://localhost:11434/api/generate"

SYSTEM = (
    "You are a strict patent claim classifier for GREEN technology. "
    "You must use ONLY the claim text. Do NOT use CPC, metadata, or external knowledge. "
    "Green = technology that directly mitigates climate change, reduces emissions, improves energy efficiency, "
    "enables renewable energy, low-carbon transport, carbon capture, waste reduction/recycling, water treatment, "
    "or similar environmental benefits."
)

PROMPT_TEMPLATE = """{system}

Claim text:
\"\"\"{text}\"\"\"

Return ONLY valid JSON (no markdown, no backticks) with exactly these keys:
- llm_green_suggested: 0 or 1
- llm_confidence: "low" | "medium" | "high"
- llm_rationale: 1-3 sentences, cite exact phrases from the claim text
"""


def ollama_generate(prompt: str) -> str:
    payload = {
        "model": OLLAMA_MODEL,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": 0.0,
            "num_predict": 200,   # hard cap so it can’t ramble forever
        }
    }
    r = requests.post(OLLAMA_URL, json=payload, timeout=120)
    r.raise_for_status()
    return r.json()["response"].strip()


def extract_json(text: str) -> dict:
    text = text.strip()
    if text.startswith("{") and text.endswith("}"):
        return json.loads(text)

    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end != -1 and end > start:
        return json.loads(text[start:end+1])

    raise ValueError(f"Could not parse JSON. Output was:\n{text[:500]}")


def prompt_human(doc_id, claim_text, llm_suggested, llm_confidence, llm_rationale):
    print("\n" + "=" * 90)
    print(f"doc_id: {doc_id}")
    print("-" * 90)
    print(claim_text)
    print("-" * 90)
    print(f"LLM suggested: {llm_suggested} | confidence: {llm_confidence}")
    print(f"LLM rationale: {llm_rationale}")
    print("-" * 90)

    while True:
        ans = input("Your FINAL label is_green_human (0/1) [or 's' to skip]: ").strip().lower()
        if ans in {"0", "1"}:
            return int(ans), ""
        if ans == "s":
            return None, "skipped"
        print("Please type 0, 1, or s.")


def main():
    df = pd.read_csv(CSV_PATH)

    required = [
        "doc_id", "text", "p_green", "u",
        "llm_green_suggested", "llm_confidence", "llm_rationale",
        "is_green_human", "notes"
    ]
    for c in required:
        if c not in df.columns:
            raise ValueError(f"Missing column '{c}' in {CSV_PATH}")

    for i in range(len(df)):
        row = df.loc[i]

        # LLM step
        if pd.isna(row["llm_green_suggested"]) or str(row["llm_green_suggested"]).strip() == "":
            prompt = PROMPT_TEMPLATE.format(system=SYSTEM, text=str(row["text"]))
            try:
                raw = ollama_generate(prompt)
                out = extract_json(raw)

                df.at[i, "llm_green_suggested"] = int(out["llm_green_suggested"])
                df.at[i, "llm_confidence"] = out["llm_confidence"]
                df.at[i, "llm_rationale"] = out["llm_rationale"]
                df.to_csv(CSV_PATH, index=False)
                print(f"[{i+1}/{len(df)}] LLM done + saved.")
            except Exception as e:
                df.at[i, "notes"] = f"LLM error: {e}"
                df.to_csv(CSV_PATH, index=False)
                print(f"[{i+1}/{len(df)}] LLM error saved, continuing.")
                continue

        # Human step
        if pd.isna(row["is_green_human"]) or str(row["is_green_human"]).strip() == "":
            human_label, note = prompt_human(
                doc_id=row["doc_id"],
                claim_text=str(row["text"]),
                llm_suggested=df.at[i, "llm_green_suggested"],
                llm_confidence=df.at[i, "llm_confidence"],
                llm_rationale=df.at[i, "llm_rationale"],
            )
            if human_label is not None:
                df.at[i, "is_green_human"] = int(human_label)
            if note:
                df.at[i, "notes"] = note

            df.to_csv(CSV_PATH, index=False)
            print(f"[{i+1}/{len(df)}] Human label saved.")

    # Override stats for README
    labeled = df.copy()
    labeled = labeled[~labeled["is_green_human"].isna()]
    labeled = labeled[labeled["is_green_human"].astype(str).str.strip() != ""]
    if len(labeled) > 0:
        labeled["llm_green_suggested"] = labeled["llm_green_suggested"].astype(int)
        labeled["is_green_human"] = labeled["is_green_human"].astype(int)
        overrides = (labeled["llm_green_suggested"] != labeled["is_green_human"]).sum()
        print(f"\nOverride count: {overrides} / {len(labeled)}")
    else:
        print("\nNo human labels found yet.")

    print("\nDONE. Updated CSV:", CSV_PATH)


if __name__ == "__main__":
    main()

[1/100] LLM done + saved.

doc_id: 8593474
------------------------------------------------------------------------------------------
1. A method for reading data from a tiled organized memory, comprising: in response to receiving one tiled-X read request to read two cache line size units from a tiled organized memory in the X-direction, requesting two cache lines from the tiled-X organized memory without fragmenting the received tiled-X read request into more than the one tiled-X read request; allocating the two cache lines in parallel; reading data associated with the two cache lines from a data cache in parallel; and returning, in parallel, data associated with the two requested cache lines, wherein the allocating, reading, and returning of the data is accomplished in one clock cycle.
------------------------------------------------------------------------------------------
LLM suggested: 0.0 | confidence: low
LLM rationale: The claim describes a method for reading data from memory,

  df.at[i, "llm_confidence"] = out["llm_confidence"]
  df.at[i, "llm_rationale"] = out["llm_rationale"]


[1/100] Human label saved.
[2/100] LLM done + saved.

doc_id: 9617453
------------------------------------------------------------------------------------------
1. A solvent-free aqueous polyurethane dispersion comprising non-crystalline polyurethane polymer comprising the reaction product of: (A) an organic solvent-free isocyanate-terminated prepolymer having a bulk viscosity between 500 centipoise to 15,000 centipoise at 80° C. and comprising the reaction product of: (i) at least one polyisocyanate selected from the group consisting of aliphatic and cycloaliphatic polyisocyanates and mixtures thereof; (ii) at least one polyol, wherein said at least one polyol is selected from the group consisting of polyether polyols, polycarbonate polyols and mixtures thereof; (iii) at least one isocyanate-reactive compound comprising one or more ionic groups or potential ionic groups per molecule; and (iv) at least one isocyanate chain terminating agent comprising at least one carboxylic acid or su

# Part D - Final model

In [7]:
import os
import numpy as np
import pandas as pd

import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from sklearn.metrics import precision_recall_fscore_support

# ---------------- Config ----------------
PARQUET_PATH = "patents_50k_green.parquet"
HITL_CSV_PATH = "hitl_green_100_labeled.csv"

BASE_MODEL = "AI-Growth-Lab/PatentSBERTa"
OUT_DIR = "models_finetuned_patentsberta_green"

TEXT_COL = "text"
ID_COL = "doc_id"
SPLIT_COL = "split"
SILVER_LABEL_COL = "is_green_silver"
HUMAN_LABEL_COL = "is_green_human"

MAX_LEN = 256
EPOCHS = 1
LR = 2e-5

# Batch sizes: try 16 on MPS; if memory issues, set to 8
TRAIN_BS = 16
EVAL_BS = 32

SEED = 42

# ---------------- Device (MPS first) ----------------
if torch.backends.mps.is_available():
    DEVICE = "mps"
elif torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("Using device:", DEVICE)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    p, r, f1, _ = precision_recall_fscore_support(labels, preds, average="binary", zero_division=0)
    acc = (preds == labels).mean()
    return {"precision": p, "recall": r, "f1": f1, "accuracy": acc}


def main():
    os.makedirs(OUT_DIR, exist_ok=True)

    # 1) Load base dataset (50k parquet)
    df = pd.read_parquet(PARQUET_PATH)

    for c in [ID_COL, TEXT_COL, SPLIT_COL, SILVER_LABEL_COL]:
        if c not in df.columns:
            raise ValueError(f"Missing column {c} in parquet. Found: {list(df.columns)}")

    # 2) Load HITL CSV (100 rows)
    hitl = pd.read_csv(HITL_CSV_PATH)

    for c in [ID_COL, HUMAN_LABEL_COL]:
        if c not in hitl.columns:
            raise ValueError(f"Missing column {c} in HITL CSV. Found: {list(hitl.columns)}")

    # Ensure human labels are present (no blanks)
    hitl_labeled = hitl.copy()
    hitl_labeled = hitl_labeled[~hitl_labeled[HUMAN_LABEL_COL].isna()]
    hitl_labeled = hitl_labeled[hitl_labeled[HUMAN_LABEL_COL].astype(str).str.strip() != ""].copy()
    hitl_labeled[HUMAN_LABEL_COL] = hitl_labeled[HUMAN_LABEL_COL].astype(int)

    if len(hitl_labeled) != 100:
        print(f"WARNING: Expected 100 human-labeled rows, found {len(hitl_labeled)}. (Continuing anyway)")

    # 3) Create is_green_gold in full dataset: default silver, override for HITL doc_ids
    df["is_green_gold"] = df[SILVER_LABEL_COL].astype(int)

    hitl_map = dict(zip(hitl_labeled[ID_COL].astype(int), hitl_labeled[HUMAN_LABEL_COL].astype(int)))
    df.loc[df[ID_COL].isin(hitl_map.keys()), "is_green_gold"] = df.loc[
        df[ID_COL].isin(hitl_map.keys()), ID_COL
    ].map(hitl_map).astype(int)

    # 4) Build training data: train_silver + gold_100
    train_silver_df = df[df[SPLIT_COL] == "train_silver"].copy()

    # Pull the 100 HITL rows from df (they should be in pool_unlabeled originally)
    gold_100_df = df[df[ID_COL].isin(hitl_map.keys())].copy()

    # Train set = train_silver + gold_100 (append)
    train_df = pd.concat([train_silver_df, gold_100_df], ignore_index=True)

    # Eval set (silver): eval_silver with silver labels (as assignment states)
    eval_silver_df = df[df[SPLIT_COL] == "eval_silver"].copy()
    eval_silver_df["label"] = eval_silver_df[SILVER_LABEL_COL].astype(int)

    # Gold eval set: the 100 with human labels
    gold_100_eval_df = gold_100_df.copy()
    gold_100_eval_df["label"] = gold_100_eval_df["is_green_gold"].astype(int)

    # Train labels use gold
    train_df["label"] = train_df["is_green_gold"].astype(int)

    print("\nSizes:")
    print("train_silver:", len(train_silver_df))
    print("gold_100:", len(gold_100_df))
    print("train_final (train_silver + gold_100):", len(train_df))
    print("eval_silver:", len(eval_silver_df))

    # 5) Convert to HF Datasets
    train_ds = Dataset.from_pandas(train_df[[ID_COL, TEXT_COL, "label"]], preserve_index=False)
    eval_ds = Dataset.from_pandas(eval_silver_df[[ID_COL, TEXT_COL, "label"]], preserve_index=False)
    gold_eval_ds = Dataset.from_pandas(gold_100_eval_df[[ID_COL, TEXT_COL, "label"]], preserve_index=False)

    # 6) Tokenize
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

    def tokenize_fn(batch):
        return tokenizer(batch[TEXT_COL], truncation=True, max_length=MAX_LEN)

    train_ds = train_ds.map(tokenize_fn, batched=True, remove_columns=[TEXT_COL])
    eval_ds = eval_ds.map(tokenize_fn, batched=True, remove_columns=[TEXT_COL])
    gold_eval_ds = gold_eval_ds.map(tokenize_fn, batched=True, remove_columns=[TEXT_COL])

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # 7) Model
    model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=2)

    # 8) Training args (1 epoch, small LR)
    args = TrainingArguments(
        output_dir=OUT_DIR,
        num_train_epochs=EPOCHS,
        learning_rate=LR,
        per_device_train_batch_size=TRAIN_BS,
        per_device_eval_batch_size=EVAL_BS,
        seed=SEED,
        report_to="none",
        fp16=False,
        logging_steps=50,
        save_steps=500,   # valgfrit: gem ind imellem
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # 9) Train once
    trainer.train()

    # 10) Evaluate on eval_silver
    print("\n=== Evaluation on eval_silver (silver labels) ===")
    eval_metrics = trainer.evaluate(eval_dataset=eval_ds)
    print(eval_metrics)

    # 11) Evaluate on gold_100
    print("\n=== Evaluation on gold_100 (human labels) ===")
    gold_metrics = trainer.evaluate(eval_dataset=gold_eval_ds)
    print(gold_metrics)

    # 12) Save final model + tokenizer
    trainer.save_model(OUT_DIR)
    tokenizer.save_pretrained(OUT_DIR)

    # 13) Save the merged dataset with is_green_gold for HF upload later
    merged_out = "patents_50k_green_with_gold.parquet"
    df.to_parquet(merged_out, index=False)
    print(f"\nSaved merged dataset with is_green_gold to: {merged_out}")
    print(f"Saved fine-tuned model to: {OUT_DIR}")


if __name__ == "__main__":
    main()


Using device: mps

Sizes:
train_silver: 35000
gold_100: 100
train_final (train_silver + gold_100): 35100
eval_silver: 5000


Map:   0%|          | 0/35100 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mMPNetForSequenceClassification LOAD REPORT[0m from: AI-Growth-Lab/PatentSBERTa
Key                        | Status     | 
---------------------------+------------+-
pooler.dense.weight        | UNEXPECTED | 
embeddings.position_ids    | UNEXPECTED | 
pooler.dense.bias          | UNEXPECTED | 
classifier.dense.bias      | MISSING    | 
classifier.out_proj.bias   | MISSING    | 
classifier.dense.weight    | MISSING    | 
classifier.out_proj.weight | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Step,Training Loss
50,0.644013
100,0.528508
150,0.499182
200,0.510852
250,0.466442
300,0.490231
350,0.44861
400,0.436779
450,0.466524
500,0.457045


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


=== Evaluation on eval_silver (silver labels) ===




{'eval_loss': 0.41351768374443054, 'eval_precision': 0.8173913043478261, 'eval_recall': 0.7896, 'eval_f1': 0.8032553407934894, 'eval_accuracy': 0.8066, 'eval_runtime': 2371.2933, 'eval_samples_per_second': 2.109, 'eval_steps_per_second': 0.066, 'epoch': 1.0}

=== Evaluation on gold_100 (human labels) ===




{'eval_loss': 0.6443789005279541, 'eval_precision': 0.6666666666666666, 'eval_recall': 0.6666666666666666, 'eval_f1': 0.6666666666666666, 'eval_accuracy': 0.64, 'eval_runtime': 6.482, 'eval_samples_per_second': 15.427, 'eval_steps_per_second': 0.617, 'epoch': 1.0}


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


Saved merged dataset with is_green_gold to: patents_50k_green_with_gold.parquet
Saved fine-tuned model to: models_finetuned_patentsberta_green
