## inference data
- CIFAR-10 test 데이터
- test : 10,000장
- resize = 128 (128×128)

In [None]:
!pip install onnxruntime

In [None]:
!pip install onnx

## CPU

In [3]:
import os, time, math, csv
import numpy as np
import onnxruntime as ort
import torch
from torchvision import datasets, transforms

# =========================
# Config
# =========================
# ONNX model files to benchmark, in reporting order.
MODEL_PATHS = [
    "mission16_vgg11bn_fp32.onnx",                 # FP32
    "mission16_vgg11bn_int8_dynamic.onnx",         # PTQ INT8 (dynamic)
    "mission16_vgg11bn_int8_static.onnx",          # PTQ INT8 (static)
    "mission_16_vgg11bn_qat_int8_fp32match.onnx",  # QAT INT8
]

IMG_SIZE    = 128  # images are resized to IMG_SIZE x IMG_SIZE before inference
BATCH_SIZE  = 32
# Leave two cores free for the main process, but never use fewer than 2 workers.
NUM_WORKERS = max(2, (os.cpu_count() or 8) - 2)
# NOTE(review): batches are converted to host NumPy arrays before being fed to
# ORT, so pinned memory likely brings no benefit here — confirm.
PIN_MEMORY  = True
NORM_MEAN   = (0.4914, 0.4822, 0.4465)   # CIFAR-10
NORM_STD    = (0.2470, 0.2435, 0.2616)
WARMUP      = 3  # default number of untimed warm-up batches for evaluate_onnx
CSV_OUT     = "onnx_benchmark_results.csv"

# Pin to "CUDA" or "CPU" for a fair comparison on a single EP
# (None means: prefer GPU, fall back to CPU).
FORCE_PROVIDER = None  # "CUDA" | "CPU" | None

print("ORT available providers:", ort.get_available_providers())

# =========================
# DataLoader (fixed 128 resize + Normalize)
# =========================
tfms = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE), interpolation=transforms.InterpolationMode.BILINEAR),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
])
# CIFAR-10 test split; downloaded into ./data if it is not already there.
test_set = datasets.CIFAR10(root="./data", train=False, download=True, transform=tfms)
# shuffle=False keeps the sample order identical for every model.
test_loader = torch.utils.data.DataLoader(
    test_set, batch_size=BATCH_SIZE, shuffle=False,
    num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY
)

# =========================
# ORT session helpers
# =========================
def build_session(onnx_path: str):
    """Create an ONNX Runtime inference session for ``onnx_path``.

    Candidate provider lists are tried in order until one yields a session.
    CUDA (with CPU behind it) is tried first when ``FORCE_PROVIDER`` is
    ``"CUDA"``, or when it is not ``"CPU"`` and ORT reports the CUDA EP as
    available; a CPU-only list is always the last candidate.

    Args:
        onnx_path: Path to the ``.onnx`` model file.

    Returns:
        ``(sess, input_name, output_name, providers)`` where the names belong
        to the model's first input and first output, and ``providers`` is the
        list reported by ``sess.get_providers()``, i.e. what the session
        actually registered rather than what was requested.

    Raises:
        RuntimeError: If no candidate provider list produced a session. The
            last underlying error is attached as ``__cause__``.
    """
    so = ort.SessionOptions()
    so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    # os.cpu_count() may return None; treat that as 2 so we still get 1 thread.
    so.intra_op_num_threads = max(1, (os.cpu_count() or 2) // 2)

    cpu_only = ["CPUExecutionProvider"]
    cuda_then_cpu = ["CUDAExecutionProvider", "CPUExecutionProvider"]
    if FORCE_PROVIDER == "CPU":
        tries = [cpu_only]
    elif FORCE_PROVIDER == "CUDA" or "CUDAExecutionProvider" in ort.get_available_providers():
        tries = [cuda_then_cpu, cpu_only]
    else:
        tries = [cpu_only]

    last_err = None
    for providers in tries:
        try:
            sess = ort.InferenceSession(onnx_path, sess_options=so, providers=providers)
            inp = sess.get_inputs()[0].name
            out = sess.get_outputs()[0].name
        except Exception as e:  # ORT raises several unrelated exception types
            last_err = e
            continue
        # Report the providers the session really registered: a requested EP
        # may have been dropped while the session was being created.
        return sess, inp, out, sess.get_providers()
    raise RuntimeError(f"Failed to create ORT session for {onnx_path}: {last_err}") from last_err

def onnx_predict_batch(sess, input_name, output_name, x: torch.Tensor) -> np.ndarray:
    """Run one batch through ``sess`` and return the requested output array.

    The tensor is detached, moved to the CPU and handed to the session as a
    C-contiguous float32 NumPy array under ``input_name``; only
    ``output_name`` is fetched.
    """
    host_array = x.detach().cpu().numpy()
    as_float32 = host_array.astype(np.float32, copy=False)
    feed = {input_name: np.ascontiguousarray(as_float32)}
    outputs = sess.run([output_name], feed)
    return outputs[0]

@torch.no_grad()
def evaluate_onnx(sess, input_name, output_name, loader, warmup=WARMUP):
    """Measure top-1 accuracy and latency of an ORT session over ``loader``.

    Up to ``warmup`` batches are run untimed first. The timed pass then
    iterates the whole loader; only the ``onnx_predict_batch`` call is timed
    (tensor-to-NumPy conversion plus ``sess.run``), not data loading or the
    accuracy bookkeeping.

    Args:
        sess: ONNX Runtime inference session.
        input_name: Name of the model input to feed.
        output_name: Name of the model output holding the logits.
        loader: Iterable yielding ``(images, labels)`` batches.
        warmup: Maximum number of untimed warm-up batches.

    Returns:
        ``(acc, per_image_ms, ips, avg_batch_ms, n_batches, total)`` as plain
        Python numbers: accuracy in [0, 1], mean milliseconds per image,
        images per second, mean milliseconds per batch, number of timed
        batches and number of evaluated samples. Timing values are NaN when
        the loader yields nothing.
    """
    # Warm-up: let the session finish lazy initialisation before timing.
    warm_iter = iter(loader)
    for _ in range(warmup):
        try:
            xb, _ = next(warm_iter)
        except StopIteration:
            break
        onnx_predict_batch(sess, input_name, output_name, xb)
    # Drop the warm-up iterator before timing. With num_workers > 0 a
    # DataLoader iterator owns its worker processes, so keeping it referenced
    # would leave a second worker pool alive during the timed pass.
    del warm_iter

    total, correct, times = 0, 0, []
    for xb, yb in loader:
        t0 = time.perf_counter()
        logits = onnx_predict_batch(sess, input_name, output_name, xb)
        t1 = time.perf_counter()
        times.append(t1 - t0)

        y_np = yb.detach().cpu().numpy()
        # int(...) keeps the counters as Python ints instead of NumPy scalars.
        correct += int((logits.argmax(axis=1) == y_np).sum())
        total += int(y_np.shape[0])

    n_batches = len(times)
    acc = correct / total if total else 0.0
    avg_batch_ms = float(np.mean(times)) * 1000.0 if times else math.nan
    # Dividing by the mean batch size accounts for a smaller final batch.
    mean_bs = total / n_batches if n_batches else 0
    per_image_ms = avg_batch_ms / mean_bs if mean_bs else math.nan
    ips = 1000.0 / per_image_ms if per_image_ms and per_image_ms > 0 else math.nan
    return acc, per_image_ms, ips, avg_batch_ms, n_batches, total

def file_size_mb(path: str) -> float:
    """Return the size of ``path`` in MiB, or NaN if the file cannot be stat'ed."""
    bytes_per_mib = 1024 * 1024
    try:
        n_bytes = os.stat(path).st_size
    except OSError:
        return math.nan
    return n_bytes / bytes_per_mib

# =========================
# Run & collect
# =========================
# Benchmark every model in MODEL_PATHS and collect one result row per model.
rows = []
for path in MODEL_PATHS:
    # Missing model files are reported and skipped rather than aborting the run.
    if not os.path.exists(path):
        print(f"[SKIP] {path} (파일 없음)")
        continue

    size_mb = file_size_mb(path)
    try:
        sess, in_name, out_name, providers = build_session(path)
    except Exception as e:
        # A model that cannot be loaded is reported and skipped.
        print(f"[FAIL] {path}: {e}")
        continue

    acc, per_img_ms, ips, avg_batch_ms, n_batches, total = evaluate_onnx(
        sess, in_name, out_name, test_loader
    )

    print(f"\n=== {os.path.basename(path)} ===")
    # `providers` is the provider list returned by build_session.
    print(f"Providers(in use): {providers}")
    print(f"size: {size_mb:.2f} MB | acc: {acc*100:.2f}% | per_image: {per_img_ms:.3f} ms | "
          f"img/s: {ips:.1f} | avg_batch: {avg_batch_ms:.2f} ms | batches: {n_batches} | samples: {total}")

    # Rounded values are what end up in both the summary table and the CSV.
    rows.append({
        "model": os.path.basename(path),
        "size_mb": round(size_mb, 2),
        "acc_pct": round(acc*100, 2),
        "per_image_ms": round(per_img_ms, 3),
        "images_per_sec": round(ips, 1),
        "avg_batch_ms": round(avg_batch_ms, 2),
        "provider": providers[0] if providers else "unknown",
    })

# Summary table
if rows:
    print("\n# 비교 요약 (size / speed / accuracy)")
    # NOTE: the model column is 38 wide and the EP column 10 wide; longer
    # values (e.g. "CPUExecutionProvider") are not truncated, so rows may not
    # line up with the header.
    hdr = f"{'model':38s}  {'size(MB)':>9s}  {'acc(%)':>8s}  {'per_img(ms)':>12s}  {'img/s':>8s}  {'EP':>10s}"
    print(hdr)
    print("-" * len(hdr))
    for r in rows:
        print(f"{r['model']:38s}  {r['size_mb']:9.2f}  {r['acc_pct']:8.2f}  "
              f"{r['per_image_ms']:12.3f}  {r['images_per_sec']:8.1f}  {r['provider']:>10s}")

    # Save to CSV; the columns follow the key order of the first row.
    with open(CSV_OUT, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
        writer.writeheader()
        writer.writerows(rows)
    print(f"\nSaved: {CSV_OUT}")
else:
    print("평가할 ONNX 모델이 없습니다.")

ORT available providers: ['AzureExecutionProvider', 'CPUExecutionProvider']


100%|██████████| 170M/170M [00:13<00:00, 12.7MB/s]



=== mission16_vgg11bn_fp32.onnx ===
Providers(in use): ['CPUExecutionProvider']
size: 227.37 MB | acc: 92.28% | per_image: 63.161 ms | img/s: 15.8 | avg_batch: 2017.92 ms | batches: 313 | samples: 10000

=== mission16_vgg11bn_int8_dynamic.onnx ===
Providers(in use): ['CPUExecutionProvider']
size: 83.25 MB | acc: 92.30% | per_image: 62.848 ms | img/s: 15.9 | avg_batch: 2007.92 ms | batches: 313 | samples: 10000

=== mission16_vgg11bn_int8_static.onnx ===
Providers(in use): ['CPUExecutionProvider']
size: 57.03 MB | acc: 91.72% | per_image: 40.329 ms | img/s: 24.8 | avg_batch: 1288.47 ms | batches: 313 | samples: 10000

=== mission_16_vgg11bn_qat_int8_fp32match.onnx ===
Providers(in use): ['CPUExecutionProvider']
size: 56.99 MB | acc: 91.31% | per_image: 37.867 ms | img/s: 26.4 | avg_batch: 1209.82 ms | batches: 313 | samples: 10000

# 비교 요약 (size / speed / accuracy)
model                                    size(MB)    acc(%)   per_img(ms)     img/s          EP
--------------------------

## CUDA

In [None]:
pip install -U onnxruntime-gpu

In [8]:
# onnx_benchmark_gpu_only.py
# ------------------------------------------------------------
# pip uninstall -y onnxruntime
# pip install -U onnxruntime-gpu onnx torchvision torch
# ------------------------------------------------------------
import os, time, math, csv, json, gzip
import numpy as np
import onnxruntime as ort
import torch
from torchvision import datasets, transforms

# =========================
# Config
# =========================
# ONNX model files to benchmark, in reporting order.
MODEL_PATHS = [
    "mission16_vgg11bn_fp32.onnx",                 # FP32
    "mission16_vgg11bn_int8_dynamic.onnx",         # PTQ INT8 (dynamic; assumed to quantize FC layers only)
    "mission16_vgg11bn_int8_static.onnx",          # PTQ INT8 (static)
    "mission_16_vgg11bn_qat_int8_fp32match.onnx",  # QAT INT8
]

IMG_SIZE    = 128  # images are resized to IMG_SIZE x IMG_SIZE before inference
BATCH_SIZE  = 32
# Leave two cores free for the main process, but never use fewer than 2 workers.
NUM_WORKERS = max(2, (os.cpu_count() or 8) - 2)
# NOTE(review): batches are converted to host NumPy arrays before being fed to
# ORT, so pinned memory likely brings no benefit here — confirm.
PIN_MEMORY  = True
NORM_MEAN   = (0.4914, 0.4822, 0.4465)   # CIFAR-10
NORM_STD    = (0.2470, 0.2435, 0.2616)
WARMUP      = 3  # default number of untimed warm-up batches for evaluate_onnx
CSV_OUT     = "onnx_benchmark_results_gpu.csv"

# ---- GPU-only policy ----
# Only CUDA is allowed (no fallback). To also allow TensorRT, add
# 'TensorrtExecutionProvider' to ALLOWED_EPS.
ALLOWED_EPS = {"CUDAExecutionProvider"}
AVAILABLE_EPS = ort.get_available_providers()
print("ORT available providers:", AVAILABLE_EPS)
# Fail fast when the installed onnxruntime build has no CUDA execution provider.
assert "CUDAExecutionProvider" in AVAILABLE_EPS, \
    "CUDAExecutionProvider가 없습니다. 같은 가상환경에서 `pip install -U onnxruntime-gpu` 하세요."

# =========================
# DataLoader
# =========================
tfms = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE), interpolation=transforms.InterpolationMode.BILINEAR),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
])
# CIFAR-10 test split; downloaded into ./data if it is not already there.
test_set = datasets.CIFAR10(root="./data", train=False, download=True, transform=tfms)
# shuffle=False keeps the sample order identical for every model.
test_loader = torch.utils.data.DataLoader(
    test_set, batch_size=BATCH_SIZE, shuffle=False,
    num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY
)

# =========================
# Helpers
# =========================
def build_session(onnx_path: str, device_id: int = 0):
    """Create a profiling-enabled ORT session that requests only the CUDA EP.

    Only ``CUDAExecutionProvider`` is requested, but that alone does not keep
    every node on the GPU: the benchmark output recorded with this notebook
    shows ``CPUExecutionProvider`` nodes executing for the INT8 models even
    though the CPU EP is never listed here. Which EPs really ran must
    therefore be checked from the profile the session writes.

    NOTE(review): profiling stays enabled while the caller times inference,
    which likely adds some overhead to the measured latency — confirm.

    Args:
        onnx_path: Path to the ``.onnx`` model file.
        device_id: CUDA device index passed to the CUDA EP.

    Returns:
        ``(sess, input_name, output_name)`` where the names belong to the
        model's first input and first output.

    Raises:
        RuntimeError: If the created session did not register
            ``CUDAExecutionProvider``.
    """
    so = ort.SessionOptions()
    so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    # os.cpu_count() may return None; treat that as 2 so we still get 1 thread.
    so.intra_op_num_threads = max(1, (os.cpu_count() or 2) // 2)
    so.enable_profiling = True  # used afterwards to see which EPs executed nodes

    cuda_opts = {"device_id": device_id, "arena_extend_strategy": "kNextPowerOfTwo",
                 "cudnn_conv_algo_search": "DEFAULT", "do_copy_in_default_stream": True}
    # The CPU EP is deliberately not listed here.
    providers = [("CUDAExecutionProvider", cuda_opts)]
    sess = ort.InferenceSession(onnx_path, sess_options=so, providers=providers)

    # Make a session without the CUDA EP a hard error: a "GPU" benchmark must
    # not silently run on the CPU.
    registered = sess.get_providers()
    if "CUDAExecutionProvider" not in registered:
        raise RuntimeError(
            f"CUDAExecutionProvider was not registered for {onnx_path}; "
            f"session providers: {registered}"
        )

    in_name  = sess.get_inputs()[0].name
    out_name = sess.get_outputs()[0].name
    return sess, in_name, out_name

def _load_profile_any(path: str):
    """Load an ORT profile file, tolerating several on-disk layouts.

    Handles plain or gzip-compressed UTF-8 text containing either one JSON
    document (dict or list) or one JSON object per line. In the per-line
    fallback a trailing comma is tolerated, so a truncated JSON array whose
    elements are written one per line still yields its events.

    Args:
        path: Path to the profile file.

    Returns:
        The parsed JSON document, or for the per-line fallback a list of the
        objects that could be parsed (possibly empty).

    Raises:
        OSError: If the file cannot be read or the gzip stream is corrupt.
        UnicodeDecodeError: If the (decompressed) content is not UTF-8.
    """
    with open(path, "rb") as f:
        raw = f.read()
    # Detect gzip by its magic number instead of relying on a decode failure.
    if raw[:2] == b"\x1f\x8b":
        raw = gzip.decompress(raw)
    text = raw.decode("utf-8")

    try:
        return json.loads(text)  # dict or list
    except json.JSONDecodeError:
        pass  # not a single JSON document; fall back to line-by-line parsing

    items = []
    for line in text.splitlines():
        # Drop the element separator left over from an enclosing JSON array.
        line = line.strip().rstrip(",")
        if not line.startswith("{"):
            continue
        try:
            items.append(json.loads(line))
        except json.JSONDecodeError:
            # Best-effort: skip lines that are not complete JSON objects.
            continue
    return items

def collect_used_eps_from_profile(profile_path: str):
    """Collect the execution providers recorded in an ORT profile.

    The profile is read with ``_load_profile_any``. Events are taken from the
    ``traceEvents`` or ``events`` key of a dict, or from the list itself; every
    truthy ``args["provider"]`` value found on a dict event is collected.

    NOTE: when no event names a provider, the result is assumed to be
    CUDA-only, so an empty or unreadable profile does not trigger a warning.

    Returns:
        ``(used_eps, disallowed_eps)``: the set of provider names seen and the
        subset of them that is not in ``ALLOWED_EPS``.
    """
    profile = _load_profile_any(profile_path)
    if isinstance(profile, list):
        events = profile
    elif isinstance(profile, dict):
        events = profile.get("traceEvents") or profile.get("events") or []
    else:
        events = []

    reported = (
        (event.get("args") or {}).get("provider")
        for event in events
        if isinstance(event, dict)
    )
    used = {name for name in reported if name}

    # Some environments do not record the provider at all; treat that case as
    # "CUDA only".
    if not used:
        used = {"CUDAExecutionProvider"}
    disallowed = used.difference(ALLOWED_EPS)
    return used, disallowed

def onnx_predict_batch(sess, input_name, output_name, x: torch.Tensor) -> np.ndarray:
    """Run one batch through ``sess`` and return the requested output array.

    The input is handed over as a CPU NumPy array (C-contiguous float32);
    per the original author's note, ORT uploads it to the GPU internally.
    Only ``output_name`` is fetched.
    """
    host_array = x.detach().cpu().numpy()
    as_float32 = host_array.astype(np.float32, copy=False)
    feed = {input_name: np.ascontiguousarray(as_float32)}
    outputs = sess.run([output_name], feed)
    return outputs[0]

@torch.no_grad()
def evaluate_onnx(sess, input_name, output_name, loader, warmup=WARMUP):
    """Measure top-1 accuracy and latency of an ORT session over ``loader``.

    Up to ``warmup`` batches are run untimed first. The timed pass then
    iterates the whole loader; only the ``onnx_predict_batch`` call is timed
    (tensor-to-NumPy conversion plus ``sess.run``), not data loading or the
    accuracy bookkeeping.

    Args:
        sess: ONNX Runtime inference session.
        input_name: Name of the model input to feed.
        output_name: Name of the model output holding the logits.
        loader: Iterable yielding ``(images, labels)`` batches.
        warmup: Maximum number of untimed warm-up batches.

    Returns:
        ``(acc, per_image_ms, ips, avg_batch_ms, n_batches, total)`` as plain
        Python numbers: accuracy in [0, 1], mean milliseconds per image,
        images per second, mean milliseconds per batch, number of timed
        batches and number of evaluated samples. Timing values are NaN when
        the loader yields nothing.
    """
    # Warm-up: let the session finish lazy initialisation before timing.
    warm_iter = iter(loader)
    for _ in range(warmup):
        try:
            xb, _ = next(warm_iter)
        except StopIteration:
            break
        onnx_predict_batch(sess, input_name, output_name, xb)
    # Drop the warm-up iterator before timing. With num_workers > 0 a
    # DataLoader iterator owns its worker processes, so keeping it referenced
    # would leave a second worker pool alive during the timed pass.
    del warm_iter

    total, correct, times = 0, 0, []
    for xb, yb in loader:
        t0 = time.perf_counter()
        logits = onnx_predict_batch(sess, input_name, output_name, xb)
        t1 = time.perf_counter()
        times.append(t1 - t0)

        y_np = yb.detach().cpu().numpy()
        # int(...) keeps the counters as Python ints instead of NumPy scalars.
        correct += int((logits.argmax(axis=1) == y_np).sum())
        total += int(y_np.shape[0])

    n_batches = len(times)
    acc = correct / total if total else 0.0
    avg_batch_ms = float(np.mean(times)) * 1000.0 if times else math.nan
    # Dividing by the mean batch size accounts for a smaller final batch.
    mean_bs = total / n_batches if n_batches else 0
    per_image_ms = avg_batch_ms / mean_bs if mean_bs else math.nan
    ips = 1000.0 / per_image_ms if per_image_ms and per_image_ms > 0 else math.nan
    return acc, per_image_ms, ips, avg_batch_ms, n_batches, total

def file_size_mb(path: str) -> float:
    """Return the size of ``path`` in MiB, or NaN if the file cannot be stat'ed."""
    bytes_per_mib = 1024 * 1024
    try:
        n_bytes = os.stat(path).st_size
    except OSError:
        return math.nan
    return n_bytes / bytes_per_mib

# =========================
# Run & collect
# =========================
# Benchmark every model in MODEL_PATHS and collect one result row per model.
rows = []
for path in MODEL_PATHS:
    # Missing model files are reported and skipped rather than aborting the run.
    if not os.path.exists(path):
        print(f"[SKIP] {path} (파일 없음)")
        continue

    size_mb = file_size_mb(path)
    try:
        sess, in_name, out_name = build_session(path)
    except Exception as e:
        # A model that cannot be loaded is reported and skipped.
        print(f"[FAIL] {path}: 세션 생성 실패 -> {e}")
        continue

    acc, per_img_ms, ips, avg_batch_ms, n_batches, total = evaluate_onnx(
        sess, in_name, out_name, test_loader
    )

    # Check which EPs actually executed nodes, using the session's profile.
    profile_path = sess.end_profiling()
    used_eps, bad_eps = collect_used_eps_from_profile(profile_path)

    print(f"\n=== {os.path.basename(path)} ===")
    # The warning is printed below the header so that it reads as part of this
    # model's section of the log rather than the previous model's.
    if bad_eps:
        print(f"[WARN] {os.path.basename(path)}: 비허용 EP 사용 감지 -> {sorted(bad_eps)} (all used: {sorted(used_eps)})")
    print(f"used EPs (from profile): {sorted(used_eps)}")
    print(f"size: {size_mb:.2f} MB | acc: {acc*100:.2f}% | per_image: {per_img_ms:.3f} ms | "
          f"img/s: {ips:.1f} | avg_batch: {avg_batch_ms:.2f} ms | batches: {n_batches} | samples: {total}")

    # "provider" is the EP that was requested; "used_eps" records what the
    # profile says actually ran.
    rows.append({
        "model": os.path.basename(path),
        "size_mb": round(size_mb, 2),
        "acc_pct": round(acc*100, 2),
        "per_image_ms": round(per_img_ms, 3),
        "images_per_sec": round(ips, 1),
        "avg_batch_ms": round(avg_batch_ms, 2),
        "provider": "CUDAExecutionProvider",
        "used_eps": ";".join(sorted(used_eps)),
    })

# Summary table
if rows:
    print("\n# 비교 요약 (size / speed / accuracy)")
    hdr = f"{'model':38s}  {'size(MB)':>9s}  {'acc(%)':>8s}  {'per_img(ms)':>12s}  {'img/s':>8s}  {'EP':>10s}"
    print(hdr)
    print("-" * len(hdr))
    for r in rows:
        print(f"{r['model']:38s}  {r['size_mb']:9.2f}  {r['acc_pct']:8.2f}  "
              f"{r['per_image_ms']:12.3f}  {r['images_per_sec']:8.1f}  {'CUDA':>10s}")

    # Save to CSV; the columns follow the key order of the first row.
    with open(CSV_OUT, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
        writer.writeheader()
        writer.writerows(rows)
    print(f"\nSaved: {CSV_OUT}")
else:
    print("평가할 ONNX 모델이 없습니다.")

ORT available providers: ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']

=== mission16_vgg11bn_fp32.onnx ===
used EPs (from profile): ['CUDAExecutionProvider']
size: 227.37 MB | acc: 92.28% | per_image: 1.937 ms | img/s: 516.4 | avg_batch: 61.87 ms | batches: 313 | samples: 10000
[WARN] mission16_vgg11bn_int8_dynamic.onnx: 비허용 EP 사용 감지 -> ['CPUExecutionProvider'] (all used: ['CPUExecutionProvider', 'CUDAExecutionProvider'])

=== mission16_vgg11bn_int8_dynamic.onnx ===
used EPs (from profile): ['CPUExecutionProvider', 'CUDAExecutionProvider']
size: 83.25 MB | acc: 92.30% | per_image: 2.801 ms | img/s: 357.0 | avg_batch: 89.48 ms | batches: 313 | samples: 10000
[WARN] mission16_vgg11bn_int8_static.onnx: 비허용 EP 사용 감지 -> ['CPUExecutionProvider'] (all used: ['CPUExecutionProvider', 'CUDAExecutionProvider'])

=== mission16_vgg11bn_int8_static.onnx ===
used EPs (from profile): ['CPUExecutionProvider', 'CUDAExecutionProvider']
size: 57.03 MB | acc: 92.24% | per_