In [1]:
import random, time, statistics
import numpy as np
import pandas as pd
from collections import Counter, defaultdict

# Use your best tuned params
# Maximum number of codes returned by retrieve_codes_with_beta.
TOP_CODES = 15
# Upper bound on how many of the highest-scoring documents vote for codes.
K_NEIGHBORS = 40
# Blend weight of the `word_X` similarity; the `char_X` similarity gets 1 - ALPHA.
ALPHA = 0.65

def _norm_code(value) -> str:
    """Return *value* as a stripped string; ``None`` becomes ``""``."""
    return "" if value is None else str(value).strip()


def retrieve_codes_with_beta(symptoms: str, beta: float) -> list[str]:
    """Rank codes for a free-text query by rank-discounted neighbour voting.

    The query is normalised, vectorised with ``word_vec`` and ``char_vec``
    and scored against ``word_X`` / ``char_X``; the two score vectors are
    blended with ``ALPHA``.  The ``K_NEIGHBORS`` best-scoring documents then
    vote: each adds ``score / rank`` to its ``doc_gt`` code and
    ``beta * score / rank`` to every other code in its ``doc_valid`` entry.

    NOTE(review): ``normalize_text``, ``word_vec``, ``char_vec``, ``word_X``,
    ``char_X``, ``doc_gt`` and ``doc_valid`` are defined outside this block;
    they are presumably fitted vectorizers, sparse document matrices and
    per-document label lists aligned by row index - confirm.

    Args:
        symptoms: Free-text query.
        beta: Multiplier applied to the vote of a neighbour's non-primary
            codes (``0`` means only ``doc_gt`` codes receive votes).

    Returns:
        Up to ``TOP_CODES`` code strings, best first.  For a blank query the
        most frequent ``doc_gt`` codes are returned instead.  An empty list
        is returned when there are no documents to score.
    """
    t = normalize_text(symptoms)
    if not t.strip():
        # Normalise and drop blank labels *before* truncating, so empty
        # entries cannot occupy slots in the top-TOP_CODES fallback list.
        freq = Counter(code for code in map(_norm_code, doc_gt) if code)
        return [c for c, _ in freq.most_common(TOP_CODES)]

    qw = word_vec.transform([t])
    qc = char_vec.transform([t])

    sw = (word_X @ qw.T).toarray().ravel()
    sc = (char_X @ qc.T).toarray().ravel()
    s = ALPHA * sw + (1.0 - ALPHA) * sc

    k = min(K_NEIGHBORS, len(s))
    if k <= 0:
        # argpartition needs a valid kth index, which an empty score vector
        # (or a non-positive K_NEIGHBORS) cannot provide.
        return []

    # Partial selection of the k best scores, then order just those k.
    idx = np.argpartition(-s, k - 1)[:k]
    idx = idx[np.argsort(-s[idx])]

    code_score = defaultdict(float)
    for rank, i in enumerate(idx, start=1):
        w = float(s[i]) / rank  # similarity discounted by neighbour rank

        gt = _norm_code(doc_gt[i])
        if gt:
            code_score[gt] += w

        for code in doc_valid[i]:
            code = _norm_code(code)
            # The primary code already received the full weight above.
            if code and code != gt:
                code_score[code] += beta * w

    best = sorted(code_score.items(), key=lambda x: x[1], reverse=True)
    return [c for c, _ in best[:TOP_CODES]]

def eval_beta(subset, beta: float) -> dict:
    """Score ``retrieve_codes_with_beta`` on *subset* for one ``beta`` value.

    Args:
        subset: Sized collection of dict-like records.  Each record is read
            with ``.get``: ``"query"`` (text passed to the retriever),
            ``"gt"`` (the expected top code) and ``"icd_codes"`` (every
            acceptable code).
        beta: Forwarded unchanged to ``retrieve_codes_with_beta``.

    Returns:
        A dict with keys ``beta``, ``n``, ``acc1_%`` (top prediction equals
        ``gt``), ``recall3_%`` (any of the top three predictions is an
        acceptable code), ``lat_avg_ms`` and ``lat_p95_ms``.  Both latency
        values are ``None`` when *subset* is empty.
    """
    acc1 = 0
    rec3 = 0
    lat = []

    for r in subset:
        q = r.get("query", "")
        # The retriever emits stripped strings, so normalise the labels the
        # same way before comparing.
        gt = str(r.get("gt", "")).strip()
        valid = {str(c).strip() for c in r.get("icd_codes", [])}

        # Time only the retrieval call itself.
        t0 = time.perf_counter()
        preds = retrieve_codes_with_beta(q, beta)
        lat.append(time.perf_counter() - t0)

        top3 = preds[:3]
        if top3 and top3[0] == gt:
            acc1 += 1
        if any(c in valid for c in top3):
            rec3 += 1

    n = len(subset) or 1  # avoid dividing by zero on an empty subset
    lat_sorted = sorted(lat)
    p95 = lat_sorted[int(0.95 * (len(lat_sorted) - 1))] if lat_sorted else None
    # statistics.mean raises on empty data; report None instead.
    avg = statistics.mean(lat) if lat else None

    return {
        "beta": beta,
        "n": len(subset),
        "acc1_%": round(100 * acc1 / n, 2),
        "recall3_%": round(100 * rec3 / n, 2),
        "lat_avg_ms": round(1000 * avg, 2) if avg is not None else None,
        "lat_p95_ms": round(1000 * p95, 2) if p95 is not None else None,
    }

# Random subset
# Sweep `beta` over a small, reproducible random sample of `records` and
# rank the settings by recall@3, then top-1 accuracy, then mean latency.
SUBSET_SIZE = 30
seed = 123

# `records` is not created in this cell; fail with an actionable message
# instead of a bare NameError when the cell that builds it has not been run.
try:
    subset = records[:]
except NameError as err:
    raise NameError(
        "name 'records' is not defined - run the cell that loads the "
        "evaluation records before this one"
    ) from err

rng = random.Random(seed)
rng.shuffle(subset)
# Slicing clamps to the list length, so no explicit min() is needed.
subset = subset[:SUBSET_SIZE]
print("Subset size:", len(subset), "seed:", seed)

betas = [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4]
rows = [eval_beta(subset, b) for b in betas]

# Best setting first: higher recall/accuracy wins, lower latency breaks ties.
df_beta = pd.DataFrame(rows).sort_values(
    by=["recall3_%", "acc1_%", "lat_avg_ms"],
    ascending=[False, False, True],
).reset_index(drop=True)

df_beta

NameError: name 'records' is not defined