In [1]:
!pip install chromadb sentence-transformers datasets tqdm scikit-learn numpy seaborn pandas torch

Collecting chromadb
  Downloading chromadb-1.0.8-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-4.0.1-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.21.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.32.1-py3-none-any.whl.metadata (2.5 kB)
Collecting opentelemetry-instrumentation-fastapi>=0.41b0 (from chromadb)
  Downloading opentelemetry_instrumentation

In [2]:
!pip install -qU faiss-cpu

import numpy as np, faiss, torch, platform
print("FAISS:", faiss.__version__, "| NumPy:", np.__version__,
      "| Python:", platform.python_version())


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[?25hFAISS: 1.11.0 | NumPy: 2.0.2 | Python: 3.11.12


In [24]:
# ================================================================
# Intent Detection – FAISS-CPU (HNSW) benchmark multiparâmetro
# ================================================================
# • IDs sequenciais → idx.add(embs)                                  #
# • Compatível com MODELS em formato rico (provider / name)          #
# • Suporta listas de modelos, k e limiares (τ)                      #
# ================================================================

import os, time, json
from pathlib import Path
from collections import Counter

import numpy as np
import torch, faiss
from datasets import load_dataset
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from sentence_transformers import SentenceTransformer
from pprint import pprint

# ---------------- Config -----------------------------------------
# Directory where index_dataset caches each model's FAISS index and label list.
DATA_DIR = "./faiss_indices"

# NOTE(review): BATCH is not read by any code visible in this notebook section
# (index_dataset passes batch_size=256 to encode) — confirm whether it is still needed.
BATCH = 1024

# Run the encoders on the GPU when torch can see one, otherwise on the CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

# Benchmark key -> {"provider", "name"}; "name" is the checkpoint string that
# get_model_name() hands to SentenceTransformer.
MODELS = {
    key: {"provider": "sentence-transformers", "name": checkpoint}
    for key, checkpoint in (
        ("BGE",               "BAAI/bge-base-en-v1.5"),
        ("BGE-Large",         "BAAI/bge-large-en-v1.5"),
        ("MPNet",             "sentence-transformers/all-mpnet-base-v2"),
        ("GTR-T5-Base",       "sentence-transformers/gtr-t5-base"),
        ("GTR-T5-Large",      "sentence-transformers/gtr-t5-large"),
        ("LaBSE",             "sentence-transformers/LaBSE"),
        # NOTE(review): the key says "DistilRoBERTa" but the checkpoint is a
        # multilingual MiniLM model — results reported under this key describe
        # that checkpoint. Confirm which of the two was intended.
        ("DistilRoBERTa",     "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"),
        ("All-MiniLM-L12-v2", "sentence-transformers/all-MiniLM-L12-v2"),
        ("All-MiniLM-L6-v2",  "sentence-transformers/all-MiniLM-L6-v2"),
    )
}

# -------- helper: extrai string do checkpoint --------------------
def get_model_name(model_key: str) -> str:
    """Return the checkpoint string registered in ``MODELS`` under ``model_key``.

    A ``MODELS`` entry may be a dict (its ``"name"`` item is returned) or,
    for backward compatibility, any non-dict value, which is returned as is.

    Raises:
        KeyError: if ``model_key`` is not a key of ``MODELS``.
    """
    entry = MODELS[model_key]
    if not isinstance(entry, dict):
        # Legacy flat format: the entry itself is the checkpoint.
        return entry
    return entry["name"]

# ------------- FAISS helpers -------------
def build_hnsw(dim, m=32, efC=400, efS=128):
    """Create an empty CPU ``faiss.IndexHNSWFlat`` using the inner-product metric.

    Args:
        dim: Dimensionality of the vectors that will be added.
        m: Second argument of ``faiss.IndexHNSWFlat`` (HNSW graph connectivity).
        efC: Value assigned to ``idx.hnsw.efConstruction``.
        efS: Value assigned to ``idx.hnsw.efSearch``. Defaults to 128, the
            value that used to be hard-coded, so existing callers are unaffected.

    Returns:
        The configured, still empty, index.
    """
    idx = faiss.IndexHNSWFlat(dim, m, faiss.METRIC_INNER_PRODUCT)
    idx.hnsw.efConstruction = efC
    idx.hnsw.efSearch       = efS
    return idx              # CPU version

def save_idx(idx, path):
    """Write ``idx`` to ``path`` with ``faiss.write_index``.

    ``path`` may be a ``str`` or any ``os.PathLike`` (e.g. ``pathlib.Path``);
    it is converted with ``os.fspath`` before being handed to FAISS.
    """
    faiss.write_index(idx, os.fspath(path))
def load_idx(path):
    """Return the index that ``faiss.read_index`` loads from ``path``.

    ``path`` may be a ``str`` or any ``os.PathLike`` (e.g. ``pathlib.Path``);
    it is converted with ``os.fspath`` before being handed to FAISS.
    """
    return faiss.read_index(os.fspath(path))

# --------- Indexação (IDs seq.) ----------
def index_dataset(model_key):
    """Return ``(index, labels)`` for ``model_key``, building and caching them if needed.

    The index is stored at ``{DATA_DIR}/{model_key}.faiss`` and the labels at
    ``{DATA_DIR}/{model_key}_labels.json``. When both files exist they are
    loaded and returned. Otherwise the CLINC-OOS ("plus") train split is
    embedded with the model registered under ``model_key``, added to a fresh
    HNSW index, and both artefacts are written to disk.

    Args:
        model_key: Key into ``MODELS``; also used as the cache file stem.

    Returns:
        Tuple ``(idx, lbls)``: the FAISS index and the list of intent names,
        where ``lbls[i]`` is the label of the vector with id ``i``.
    """
    idx_path = f"{DATA_DIR}/{model_key}.faiss"
    lbl_path = f"{DATA_DIR}/{model_key}_labels.json"

    # Reuse the cache only when BOTH artefacts are present. A lone index file
    # (e.g. left by an interrupted run) is rebuilt instead of crashing on the
    # missing labels file.
    if os.path.exists(idx_path) and os.path.exists(lbl_path):
        idx = load_idx(idx_path)
        with open(lbl_path, encoding="utf-8") as fh:
            return idx, json.load(fh)

    os.makedirs(DATA_DIR, exist_ok=True)
    model_name = get_model_name(model_key)
    model      = SentenceTransformer(model_name, device=DEVICE)

    train = load_dataset("clinc_oos", "plus")["train"]
    names = train.features["intent"].names      # looked up once, not per row
    text  = train["text"]
    lbls  = [names[i] for i in train["intent"]]

    embs = model.encode(text, device=DEVICE,
                        batch_size=256, show_progress_bar=True,
                        normalize_embeddings=True).astype("float32")

    idx  = build_hnsw(embs.shape[1])
    idx.add(embs)                       # ids = row position 0..N-1
    save_idx(idx, idx_path)
    # Context manager so the labels file is flushed and closed deterministically.
    with open(lbl_path, "w", encoding="utf-8") as fh:
        json.dump(lbls, fh)

    return idx, lbls

# ------------- Predição ------------------
def predict_intent(intents, sims, thr):
    """Choose an intent from retrieved neighbours by majority vote, or ``"oos"``.

    Args:
        intents: Labels of the retrieved neighbours, in the same order as ``sims``.
        sims: Similarity of each neighbour to the query; only ``sims[0]`` is
            compared against ``thr``.
        thr: Minimum ``sims[0]`` for the query to be treated as in scope.

    Returns:
        ``"oos"`` when there are no neighbours or ``sims[0] < thr``. Otherwise
        the most frequent label; ties on count go to the label with the highest
        mean similarity, and remaining ties to the label seen first.
    """
    # Decide out-of-scope first: no neighbours at all, or the first neighbour
    # is below the threshold. This also avoids max() on an empty vote.
    if len(intents) == 0 or sims[0] < thr:
        return "oos"

    votes = Counter(intents)
    top   = max(votes.values())
    cands = [c for c, v in votes.items() if v == top]
    if len(cands) == 1:
        return cands[0]

    # Several labels share the top vote count: prefer the one whose neighbours
    # are, on average, most similar to the query.
    return max(cands,
               key=lambda c: np.mean([s for lab, s in zip(intents, sims) if lab == c]))

# -------- Avaliação multiparâmetro -------
def evaluate_grid(idx, lbls, model_key, k_list, thr_list):
    """Score one model on the CLINC-OOS ("plus") test split over a k x threshold grid.

    Every test utterance is embedded and searched once with ``max(k_list)``
    neighbours; each ``(k, threshold)`` pair is then scored from a prefix of
    that single result list via ``predict_intent``.

    Args:
        idx: FAISS index to query.
        lbls: Intent label of each indexed vector, addressed by vector id.
        model_key: Key into ``MODELS`` selecting the encoder for the queries.
        k_list: Neighbour counts to evaluate.
        thr_list: Similarity thresholds passed to ``predict_intent``.

    Returns:
        dict with two entries:
          "metrics": ``{k: {thr: {"accuracy", "macro_f1", "oos_acerto"}}}``,
              each rounded to 4 decimals. "oos_acerto" is the fraction of gold
              "oos" utterances predicted as "oos" (NaN if gold contains none).
          "latency": avg / p50 / p95 / p99 / max of the per-query
              encode + search time in milliseconds, rounded to 2 decimals.
    """
    ds    = load_dataset("clinc_oos", "plus")["test"]
    names = ds.features["intent"].names          # looked up once, not per row
    gold  = [names[i] for i in ds["intent"]]
    txts  = ds["text"]

    model = SentenceTransformer(get_model_name(model_key), device=DEVICE)
    k_max = max(k_list)

    preds = {(k, t): [] for k in k_list for t in thr_list}
    lat   = []

    for text in tqdm(txts, desc=f"[{model_key}] avaliando"):
        # perf_counter is monotonic and high resolution, unlike time.time().
        t0 = time.perf_counter()
        Q  = model.encode([text], device=DEVICE,
                          normalize_embeddings=True).astype("float32")
        sims, ids = idx.search(Q, k_max)
        lat.append((time.perf_counter() - t0) * 1_000)

        # FAISS reports id -1 for result slots it could not fill; drop them so
        # lbls[-1] is never mistaken for a real neighbour.
        valid   = ids[0] >= 0
        intents = [lbls[j] for j in ids[0][valid]]
        dists   = sims[0][valid]

        for k in k_list:
            sub_i = intents[:k]
            sub_s = dists[:k]
            for t in thr_list:
                # With no usable neighbour there is nothing to vote on.
                pred = predict_intent(sub_i, sub_s, t) if sub_i else "oos"
                preds[(k, t)].append(pred)

    # metrics
    metric_table = {k: {} for k in k_list}
    true_oos = [g == "oos" for g in gold]
    n_oos    = sum(true_oos)

    for k in k_list:
        for t in thr_list:
            pr = preds[(k, t)]
            acc = round(accuracy_score(gold, pr), 4)
            f1  = round(f1_score(gold, pr, average="macro"), 4)

            acertos_oos = sum(1 for g, p in zip(true_oos, pr) if g and p == "oos")
            # Undefined (NaN) rather than ZeroDivisionError when gold has no "oos".
            oos_acerto  = round(acertos_oos / n_oos, 4) if n_oos else float("nan")

            metric_table[k][t] = {
                "accuracy"   : acc,
                "macro_f1"   : f1,
                "oos_acerto" : oos_acerto
            }

    latency = {
        "avg_ms": round(np.mean(lat), 2),
        "p50_ms": round(np.percentile(lat, 50), 2),
        "p95_ms": round(np.percentile(lat, 95), 2),
        "p99_ms": round(np.percentile(lat, 99), 2),
        "max_ms": round(np.max(lat), 2)
    }

    return {
        "metrics": metric_table,
        "latency": latency
    }


# --------------------------- MAIN ------------------------------------------
if __name__ == "__main__":
    # Benchmark grid: model keys (from MODELS) x neighbour counts x thresholds.
    model_list = ["BGE", "BGE-Large", "MPNet",
                  "GTR-T5-Base", "GTR-T5-Large", "LaBSE",
                  "DistilRoBERTa", "All-MiniLM-L12-v2", "All-MiniLM-L6-v2"]
    k_values   = [1, 3, 5, 10]
    thr_values = [0.40, 0.55, 0.70, 0.75]

    # Build (or load from cache) each model's index, then evaluate it.
    all_results = {}
    for m in model_list:
        idx, lbls = index_dataset(m)
        res = evaluate_grid(idx, lbls, m, k_values, thr_values)
        all_results[m] = res

    # Formatted report: latency summary, then the best-F1 threshold for each k.
    for model, result in all_results.items():
        print(f"\n=== {model} ===")

        print(">> Latência (ms):")
        for k, v in result["latency"].items():
            print(f"   {k}: {v}")

        print(">> Melhores F1 por k:")
        for k, row in result["metrics"].items():
            # Threshold with the highest macro-F1 for this k (first one wins ties).
            thr = max(row, key=lambda t: row[t]["macro_f1"])
            scores = row[thr]
            print(f"   k={k} | thr={thr} | F1={scores['macro_f1']} | Acc={scores['accuracy']} | OOS-Rec={scores['oos_acerto']}")



[BGE] avaliando: 100%|██████████| 5500/5500 [01:09<00:00, 79.43it/s]
[BGE-Large] avaliando: 100%|██████████| 5500/5500 [02:03<00:00, 44.51it/s]
[MPNet] avaliando: 100%|██████████| 5500/5500 [01:16<00:00, 71.69it/s]
[GTR-T5-Base] avaliando: 100%|██████████| 5500/5500 [01:24<00:00, 65.30it/s]
[GTR-T5-Large] avaliando: 100%|██████████| 5500/5500 [02:31<00:00, 36.31it/s]
[LaBSE] avaliando: 100%|██████████| 5500/5500 [01:08<00:00, 79.77it/s]
[DistilRoBERTa] avaliando: 100%|██████████| 5500/5500 [01:05<00:00, 84.14it/s]
[All-MiniLM-L12-v2] avaliando: 100%|██████████| 5500/5500 [01:06<00:00, 82.92it/s]
[All-MiniLM-L6-v2] avaliando: 100%|██████████| 5500/5500 [00:39<00:00, 138.34it/s]



=== BGE ===
>> Latência (ms):
   avg_ms: 12.08
   p50_ms: 10.96
   p95_ms: 17.73
   p99_ms: 21.84
   max_ms: 77.85
>> Melhores F1 por k:
   k=1 | thr=0.75 | F1=0.8842 | Acc=0.8742 | OOS-Rec=0.811
   k=3 | thr=0.75 | F1=0.8903 | Acc=0.8782 | OOS-Rec=0.8
   k=5 | thr=0.75 | F1=0.8919 | Acc=0.8784 | OOS-Rec=0.789
   k=10 | thr=0.75 | F1=0.8935 | Acc=0.8785 | OOS-Rec=0.78

=== BGE-Large ===
>> Latência (ms):
   avg_ms: 21.91
   p50_ms: 20.3
   p95_ms: 32.17
   p99_ms: 36.95
   max_ms: 51.38
>> Melhores F1 por k:
   k=1 | thr=0.75 | F1=0.9002 | Acc=0.8893 | OOS-Rec=0.796
   k=3 | thr=0.75 | F1=0.9082 | Acc=0.8947 | OOS-Rec=0.782
   k=5 | thr=0.75 | F1=0.9098 | Acc=0.8942 | OOS-Rec=0.766
   k=10 | thr=0.75 | F1=0.908 | Acc=0.8922 | OOS-Rec=0.761

=== MPNet ===
>> Latência (ms):
   avg_ms: 13.44
   p50_ms: 12.46
   p95_ms: 19.67
   p99_ms: 22.19
   max_ms: 32.23
>> Melhores F1 por k:
   k=1 | thr=0.55 | F1=0.8781 | Acc=0.8709 | OOS-Rec=0.805
   k=3 | thr=0.55 | F1=0.8857 | Acc=0.8765 | OOS-R

In [31]:
# Rank every (model, k, threshold) row by accuracy, best first.
# NOTE(review): `df` is not defined in the cells visible here — confirm that an
# earlier cell builds it, so the notebook survives Restart & Run All.
df.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model,k,threshold,accuracy,macro_f1,oos_acerto,latency_avg_ms,latency_p50_ms,latency_p95_ms,latency_p99_ms,latency_max_ms,f1_latency_ratio
23,BGE-Large,3,0.75,0.8947,0.9082,0.782,23.01,21.28,33.45,38.41,47.03,0.039470
27,BGE-Large,5,0.75,0.8942,0.9098,0.766,23.01,21.28,33.45,38.41,47.03,0.039539
31,BGE-Large,10,0.75,0.8922,0.9080,0.761,23.01,21.28,33.45,38.41,47.03,0.039461
19,BGE-Large,1,0.75,0.8893,0.9002,0.796,23.01,21.28,33.45,38.41,47.03,0.039122
41,MPNet,5,0.55,0.8791,0.8900,0.784,14.09,12.93,20.87,23.91,31.63,0.063165
...,...,...,...,...,...,...,...,...,...,...,...,...
127,All-MiniLM-L12-v2,10,0.75,0.7227,0.7436,0.981,11.99,10.79,18.23,20.73,30.09,0.062018
131,All-MiniLM-L6-v2,1,0.75,0.7195,0.7397,0.985,7.03,6.39,10.34,11.97,22.86,0.105220
143,All-MiniLM-L6-v2,10,0.75,0.7193,0.7399,0.981,7.03,6.39,10.34,11.97,22.86,0.105249
135,All-MiniLM-L6-v2,3,0.75,0.7185,0.7378,0.985,7.03,6.39,10.34,11.97,22.86,0.104950


In [32]:
# Rank every (model, k, threshold) row by macro-F1, best first.
# NOTE(review): `df` is not defined in the cells visible here — confirm that an
# earlier cell builds it, so the notebook survives Restart & Run All.
df.sort_values(by="macro_f1", ascending=False)

Unnamed: 0,model,k,threshold,accuracy,macro_f1,oos_acerto,latency_avg_ms,latency_p50_ms,latency_p95_ms,latency_p99_ms,latency_max_ms,f1_latency_ratio
27,BGE-Large,5,0.75,0.8942,0.9098,0.766,23.01,21.28,33.45,38.41,47.03,0.039539
23,BGE-Large,3,0.75,0.8947,0.9082,0.782,23.01,21.28,33.45,38.41,47.03,0.039470
31,BGE-Large,10,0.75,0.8922,0.9080,0.761,23.01,21.28,33.45,38.41,47.03,0.039461
26,BGE-Large,5,0.70,0.8725,0.9009,0.567,23.01,21.28,33.45,38.41,47.03,0.039153
19,BGE-Large,1,0.75,0.8893,0.9002,0.796,23.01,21.28,33.45,38.41,47.03,0.039122
...,...,...,...,...,...,...,...,...,...,...,...,...
127,All-MiniLM-L12-v2,10,0.75,0.7227,0.7436,0.981,11.99,10.79,18.23,20.73,30.09,0.062018
143,All-MiniLM-L6-v2,10,0.75,0.7193,0.7399,0.981,7.03,6.39,10.34,11.97,22.86,0.105249
131,All-MiniLM-L6-v2,1,0.75,0.7195,0.7397,0.985,7.03,6.39,10.34,11.97,22.86,0.105220
135,All-MiniLM-L6-v2,3,0.75,0.7185,0.7378,0.985,7.03,6.39,10.34,11.97,22.86,0.104950
