In [6]:
import json
from pathlib import Path

def find_project_root() -> Path:
    for p in [Path.cwd(), *Path.cwd().parents]:
        if (p / "data" / "test_set").is_dir():
            return p
    raise FileNotFoundError("Could not find data/test_set from current working directory.")

ROOT = find_project_root()
DATA_DIR = ROOT / "data" / "test_set"

print("CWD :", Path.cwd())
print("ROOT:", ROOT)
print("DATA:", DATA_DIR)

records = []
for p in sorted(DATA_DIR.glob("*.json")):
    records.append(json.loads(p.read_text(encoding="utf-8")))

print("Loaded records:", len(records))

CWD : /Users/dana/Desktop/qazcode/qazcode-nu/data
ROOT: /Users/dana/Desktop/qazcode/qazcode-nu
DATA: /Users/dana/Desktop/qazcode/qazcode-nu/data/test_set
Loaded records: 221


In [7]:
TOKEN_RE = re.compile(r"[a-zа-я0-9]+", re.IGNORECASE)

def normalize_text(text: Any) -> str:
    if text is None:
        return ""
    if not isinstance(text, str):
        text = str(text)
    toks = (t.lower() for t in TOKEN_RE.findall(text))
    toks = [t for t in toks if len(t) > 2]  # no stopwords (safer)
    return " ".join(toks)

texts_norm = [normalize_text(r.get("query","")) for r in records]

print("Non-empty raw queries:", sum(1 for r in records if (r.get("query") or "").strip()))
print("Non-empty normalized:", sum(1 for t in texts_norm if t.strip()))
print("Example normalized:", (texts_norm[0][:200] if records else "no records"))

Non-empty raw queries: 220
Non-empty normalized: 220
Example normalized: здравствуйте пару недель назад неудачно упала спину верхней ступеньки лестницы съехала ударилась примерно между лопатками подумала что ушиб боль проходит даже наоборот середине спины жгуче ноет тянет 


In [8]:
raw_texts = [(r.get("query") or "") for r in records]

# fallback: if normalization makes text empty, use raw
texts = []
for raw in raw_texts:
    t = normalize_text(raw)
    texts.append(t if t.strip() else raw)

doc_gt = [str(r.get("gt","")) for r in records]
doc_valid = [set(r.get("icd_codes", [])) for r in records]

word_vec = TfidfVectorizer(
    ngram_range=(1,2),
    min_df=1,
    max_features=80_000,
    sublinear_tf=True,
    lowercase=True,
)

char_vec = TfidfVectorizer(
    analyzer="char_wb",
    ngram_range=(3,5),
    min_df=1,
    max_features=120_000,
    sublinear_tf=True,
    lowercase=True,
)

word_X = word_vec.fit_transform(texts)
char_X = char_vec.fit_transform(texts)

print("TF-IDF built:", word_X.shape, char_X.shape)

TF-IDF built: (221, 21558) (221, 28197)


In [9]:
def retrieve_codes(symptoms: str, top_codes: int, k_neighbors: int, alpha: float) -> list[str]:
    t = normalize_text(symptoms)
    if not t.strip():
        freq = Counter(doc_gt)
        return [c for c, _ in freq.most_common(top_codes) if c]

    qw = word_vec.transform([t])
    qc = char_vec.transform([t])

    sw = (word_X @ qw.T).toarray().ravel()
    sc = (char_X @ qc.T).toarray().ravel()
    s = alpha * sw + (1.0 - alpha) * sc

    k = min(k_neighbors, len(s))
    idx = np.argpartition(-s, k - 1)[:k]
    idx = idx[np.argsort(-s[idx])]

    code_score = defaultdict(float)
    for rank, i in enumerate(idx, start=1):
        w = float(s[i]) / rank
        codes = doc_valid[i] if doc_valid[i] else {doc_gt[i]}
        for code in codes:
            code = str(code).strip()
            if code:
                code_score[code] += w

    best = sorted(code_score.items(), key=lambda x: x[1], reverse=True)
    return [c for c, _ in best[:top_codes]] if best else []


def eval_config(subset, *, top_codes: int, k_neighbors: int, alpha: float) -> dict:
    acc1 = 0
    rec3 = 0
    lat = []

    for r in subset:
        symptoms = r.get("query", "")
        gt = str(r.get("gt", ""))
        valid = set(r.get("icd_codes", []))

        t0 = time.perf_counter()
        preds = retrieve_codes(symptoms, top_codes=top_codes, k_neighbors=k_neighbors, alpha=alpha)
        lat.append(time.perf_counter() - t0)

        top3 = preds[:3]
        if top3 and top3[0] == gt:
            acc1 += 1
        if any(c in valid for c in top3):
            rec3 += 1

    n = len(subset) or 1
    lat_sorted = sorted(lat)
    p95 = lat_sorted[int(0.95 * (len(lat_sorted) - 1))] if lat_sorted else None

    return {
        "top_codes": top_codes,
        "k_neighbors": k_neighbors,
        "alpha": alpha,
        "n": len(subset),
        "acc1_%": round(100 * acc1 / n, 2),
        "recall3_%": round(100 * rec3 / n, 2),
        "lat_avg_ms": round(1000 * statistics.mean(lat), 1) if lat else None,
        "lat_p95_ms": round(1000 * p95, 1) if p95 is not None else None,
    }


# pick random 30
seed = 42
rng = random.Random(seed)
subset = records[:]
rng.shuffle(subset)
subset = subset[:min(30, len(subset))]
print("Eval subset size:", len(subset), "seed:", seed)

TOP_CODES_GRID = [15, 30, 50]
KNN_GRID = [40, 60, 80, 120]
ALPHA_GRID = [0.55, 0.65, 0.75]

rows = []
for top_codes in TOP_CODES_GRID:
    for k_neighbors in KNN_GRID:
        for alpha in ALPHA_GRID:
            rows.append(eval_config(subset, top_codes=top_codes, k_neighbors=k_neighbors, alpha=alpha))

df = pd.DataFrame(rows).sort_values(
    by=["recall3_%", "acc1_%", "lat_avg_ms"],
    ascending=[False, False, True],
).reset_index(drop=True)

df.head(15)

Eval subset size: 30 seed: 42


Unnamed: 0,top_codes,k_neighbors,alpha,n,acc1_%,recall3_%,lat_avg_ms,lat_p95_ms
0,15,40,0.65,30,6.67,96.67,1.5,1.8
1,15,40,0.75,30,6.67,96.67,1.5,1.7
2,30,40,0.55,30,6.67,96.67,1.5,1.8
3,30,40,0.65,30,6.67,96.67,1.5,1.7
4,30,40,0.75,30,6.67,96.67,1.5,1.8
5,50,40,0.55,30,6.67,96.67,1.5,1.8
6,15,60,0.55,30,6.67,96.67,1.6,1.8
7,15,60,0.65,30,6.67,96.67,1.6,2.0
8,15,60,0.75,30,6.67,96.67,1.6,1.8
9,15,80,0.55,30,6.67,96.67,1.6,1.9


In [10]:
import random, time, statistics
import numpy as np
import pandas as pd
from collections import Counter, defaultdict

# Use your best tuned params
TOP_CODES = 15
K_NEIGHBORS = 40
ALPHA = 0.65

def retrieve_codes_with_beta(symptoms: str, beta: float) -> list[str]:
    t = normalize_text(symptoms)
    if not t.strip():
        freq = Counter(doc_gt)
        return [c for c, _ in freq.most_common(TOP_CODES) if c]

    qw = word_vec.transform([t])
    qc = char_vec.transform([t])

    sw = (word_X @ qw.T).toarray().ravel()
    sc = (char_X @ qc.T).toarray().ravel()
    s = ALPHA * sw + (1.0 - ALPHA) * sc

    k = min(K_NEIGHBORS, len(s))
    idx = np.argpartition(-s, k - 1)[:k]
    idx = idx[np.argsort(-s[idx])]

    code_score = defaultdict(float)
    for rank, i in enumerate(idx, start=1):
        w = float(s[i]) / rank

        gt = str(doc_gt[i]).strip()
        if gt:
            code_score[gt] += w

        for code in doc_valid[i]:
            code = str(code).strip()
            if code and code != gt:
                code_score[code] += beta * w

    best = sorted(code_score.items(), key=lambda x: x[1], reverse=True)
    return [c for c, _ in best[:TOP_CODES]] if best else []

def eval_beta(subset, beta: float) -> dict:
    acc1 = 0
    rec3 = 0
    lat = []

    for r in subset:
        q = r.get("query", "")
        gt = str(r.get("gt", ""))
        valid = set(r.get("icd_codes", []))

        t0 = time.perf_counter()
        preds = retrieve_codes_with_beta(q, beta)
        lat.append(time.perf_counter() - t0)

        top3 = preds[:3]
        if top3 and top3[0] == gt:
            acc1 += 1
        if any(c in valid for c in top3):
            rec3 += 1

    n = len(subset) or 1
    lat_sorted = sorted(lat)
    p95 = lat_sorted[int(0.95 * (len(lat_sorted) - 1))] if lat_sorted else None

    return {
        "beta": beta,
        "n": len(subset),
        "acc1_%": round(100 * acc1 / n, 2),
        "recall3_%": round(100 * rec3 / n, 2),
        "lat_avg_ms": round(1000 * statistics.mean(lat), 2),
        "lat_p95_ms": round(1000 * p95, 2) if p95 is not None else None,
    }

# Random subset
seed = 123
rng = random.Random(seed)
subset = records[:]
rng.shuffle(subset)
subset = subset[:min(30, len(subset))]
print("Subset size:", len(subset), "seed:", seed)

betas = [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4]
rows = [eval_beta(subset, b) for b in betas]

df_beta = pd.DataFrame(rows).sort_values(
    by=["recall3_%", "acc1_%", "lat_avg_ms"],
    ascending=[False, False, True],
).reset_index(drop=True)

df_beta

Subset size: 30 seed: 123


Unnamed: 0,beta,n,acc1_%,recall3_%,lat_avg_ms,lat_p95_ms
0,0.15,30,100.0,100.0,1.55,1.69
1,0.3,30,100.0,100.0,1.55,1.73
2,0.4,30,100.0,100.0,1.55,1.71
3,0.1,30,100.0,100.0,1.56,1.77
4,0.05,30,100.0,100.0,1.58,1.8
5,0.25,30,100.0,100.0,1.6,1.77
6,0.2,30,100.0,100.0,1.65,2.06
7,0.0,30,100.0,100.0,2.06,2.88


In [11]:
import random, time, statistics
import numpy as np
import pandas as pd
from collections import Counter, defaultdict

# Use your best tuned params
TOP_CODES = 15
K_NEIGHBORS = 40
ALPHA = 0.65

def retrieve_codes_with_beta(symptoms: str, beta: float) -> list[str]:
    t = normalize_text(symptoms)
    if not t.strip():
        freq = Counter(doc_gt)
        return [c for c, _ in freq.most_common(TOP_CODES) if c]

    qw = word_vec.transform([t])
    qc = char_vec.transform([t])

    sw = (word_X @ qw.T).toarray().ravel()
    sc = (char_X @ qc.T).toarray().ravel()
    s = ALPHA * sw + (1.0 - ALPHA) * sc

    k = min(K_NEIGHBORS, len(s))
    idx = np.argpartition(-s, k - 1)[:k]
    idx = idx[np.argsort(-s[idx])]

    code_score = defaultdict(float)
    for rank, i in enumerate(idx, start=1):
        w = float(s[i]) / rank

        gt = str(doc_gt[i]).strip()
        if gt:
            code_score[gt] += w

        for code in doc_valid[i]:
            code = str(code).strip()
            if code and code != gt:
                code_score[code] += beta * w

    best = sorted(code_score.items(), key=lambda x: x[1], reverse=True)
    return [c for c, _ in best[:TOP_CODES]] if best else []

def eval_beta(subset, beta: float) -> dict:
    acc1 = 0
    rec3 = 0
    lat = []

    for r in subset:
        q = r.get("query", "")
        gt = str(r.get("gt", ""))
        valid = set(r.get("icd_codes", []))

        t0 = time.perf_counter()
        preds = retrieve_codes_with_beta(q, beta)
        lat.append(time.perf_counter() - t0)

        top3 = preds[:3]
        if top3 and top3[0] == gt:
            acc1 += 1
        if any(c in valid for c in top3):
            rec3 += 1

    n = len(subset) or 1
    lat_sorted = sorted(lat)
    p95 = lat_sorted[int(0.95 * (len(lat_sorted) - 1))] if lat_sorted else None

    return {
        "beta": beta,
        "n": len(subset),
        "acc1_%": round(100 * acc1 / n, 2),
        "recall3_%": round(100 * rec3 / n, 2),
        "lat_avg_ms": round(1000 * statistics.mean(lat), 2),
        "lat_p95_ms": round(1000 * p95, 2) if p95 is not None else None,
    }

# Random subset
seed = 123
rng = random.Random(seed)
subset = records[:]
rng.shuffle(subset)
subset = subset[:min(50, len(subset))]
print("Subset size:", len(subset), "seed:", seed)

betas = [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4]
rows = [eval_beta(subset, b) for b in betas]

df_beta = pd.DataFrame(rows).sort_values(
    by=["recall3_%", "acc1_%", "lat_avg_ms"],
    ascending=[False, False, True],
).reset_index(drop=True)

df_beta

Subset size: 50 seed: 123


Unnamed: 0,beta,n,acc1_%,recall3_%,lat_avg_ms,lat_p95_ms
0,0.05,50,100.0,100.0,1.57,1.77
1,0.1,50,100.0,100.0,1.58,1.75
2,0.2,50,100.0,100.0,1.58,1.77
3,0.15,50,100.0,100.0,1.61,1.81
4,0.25,50,100.0,100.0,1.61,1.82
5,0.4,50,100.0,100.0,1.61,1.81
6,0.3,50,100.0,100.0,1.62,1.81
7,0.0,50,100.0,100.0,2.2,4.28


In [18]:
from pathlib import Path
print("CWD:", Path.cwd())
print("Has src here?", (Path.cwd() / "src").exists())
print("Has data here?", (Path.cwd() / "data").exists())

CWD: /Users/dana/Desktop/qazcode/qazcode-nu/data
Has src here? False
Has data here? False


In [19]:
import os
print(os.getcwd())

/Users/dana/Desktop/qazcode/qazcode-nu/data


In [20]:
import os, sys
os.chdir("..")  # go from .../data to .../qazcode-nu
print("CWD:", os.getcwd())

sys.path.insert(0, os.getcwd())  # so Python can import src.*

CWD: /Users/dana/Desktop/qazcode/qazcode-nu


In [15]:
from pathlib import Path
import sys

ROOT = find_root()
sys.path.insert(0, str(ROOT / "src"))  # add src/ itself
import mock_server as ms
ms.build_protocol_index()
print("Imported from:", ms.__file__)

NameError: name 'find_root' is not defined

In [14]:
import json, time, statistics, re
from pathlib import Path

# 1) Load test set
TEST_DIR = Path("data/test_set")
tests = [json.loads(p.read_text(encoding="utf-8")) for p in sorted(TEST_DIR.glob("*.json"))]
print("Loaded:", len(tests))

# 2) Use the EXACT same retriever as the mock server
from src import mock_server as ms
ms.build_protocol_index()  # uses your updated focused indexing if you patched it

def norm_code(x):
    if x is None:
        return ""
    s = str(x).strip().upper()
    s = re.sub(r"\s+", "", s)
    return s

acc1 = 0
rec3 = 0
lat = []

for t in tests:
    q = t.get("query", "")
    gt = norm_code(t.get("gt", ""))
    valid = {norm_code(c) for c in (t.get("icd_codes", []) or [])}
    if gt:
        valid.add(gt)

    t0 = time.perf_counter()
    codes, _ = ms.retrieve_codes_from_protocols(
        q, top_codes=15, k_neighbors=40, alpha=0.65, beta=0.15
    )
    lat.append(time.perf_counter() - t0)

    top3 = [norm_code(c) for c in (codes[:3] if codes else [])]

    if top3 and top3[0] in valid:
        acc1 += 1
    if any(c in valid for c in top3):
        rec3 += 1

n = len(tests) or 1
lat_sorted = sorted(lat)
p95 = lat_sorted[int(0.95*(n-1))] if lat_sorted else None

print({
    "n": len(tests),
    "accuracy@1_%": round(100*acc1/n, 2),
    "recall@3_%": round(100*rec3/n, 2),
    "lat_avg_ms": round(1000*statistics.mean(lat), 2),
    "lat_p95_ms": round(1000*p95, 2) if p95 is not None else None,
})

Loaded: 0


ModuleNotFoundError: No module named 'src'