# Inference Optimizations
Note: Google Colab was used for GPU access because of time constraints. \
In a real pipeline, this step would run on a GPU instance after training.

## Pre-process Data

In [1]:
# Upload kaggle.json
# Colab-only step: prompts for a file upload; the later `cp` expects the file
# to be named kaggle.json in the current working directory.
from google.colab import files
files.upload()

# Move to the correct path
!mkdir -p /content/.kaggle
!cp kaggle.json /content/.kaggle/
# Restrict the credentials file to owner read/write.
!chmod 600 /content/.kaggle/kaggle.json

# Set the environment variable so the API knows where to look
import os
os.environ['KAGGLE_CONFIG_DIR'] = "/content/.kaggle"

# Test: run a Kaggle CLI command that uses the credentials configured above
!kaggle datasets list

Saving kaggle.json to kaggle.json
ref                                                          title                                                      size  lastUpdated                 downloadCount  voteCount  usabilityRating  
-----------------------------------------------------------  --------------------------------------------------  -----------  --------------------------  -------------  ---------  ---------------  
jayaantanaath/student-habits-vs-academic-performance         Student Habits vs Academic Performance                    19512  2025-04-12 10:49:08.663000          23391        402  1.0              
adilshamim8/cost-of-international-education                  Cost of International Education                           18950  2025-05-07 15:41:53.213000           4695         79  1.0              
adilshamim8/social-media-addiction-vs-relationships          Students' Social Media Addiction                           7851  2025-05-10 14:38:02.713000           2176       

In [2]:
import argparse
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import zipfile

# os.environ["KAGGLE_CONFIG_DIR"] = os.path.abspath(".kaggle") # Use local .kaggle directory
from kaggle.api.kaggle_api_extended import KaggleApi

In [3]:
def download_jigsaw(kaggle_dir):
    """Download the Jigsaw competition archive into `kaggle_dir` and unpack it there.

    Args:
        kaggle_dir: Target directory; created if it does not exist.

    Authenticates through `KaggleApi`, downloads the competition files, then
    extracts the downloaded zip archive in place.
    """
    competition = "jigsaw-unintended-bias-in-toxicity-classification"

    os.makedirs(kaggle_dir, exist_ok=True)

    # Authenticate, then fetch the competition archive.
    client = KaggleApi()
    client.authenticate()
    client.competition_download_files(competition, path=kaggle_dir)

    # The archive is named after the competition slug.
    archive_path = os.path.join(kaggle_dir, competition + ".zip")
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(kaggle_dir)

    print("Downloaded and extracted Jigsaw dataset.")


def preprocess(kaggle_dir, output_dir, split_ratio=0.2):
    """Split the raw Jigsaw training file into train/validation CSVs.

    Args:
        kaggle_dir: Directory that must already contain the raw `train.csv`.
        output_dir: Directory that receives `train.csv` and `val.csv`;
            created if missing.
        split_ratio: Fraction of rows assigned to the validation split.

    Raises:
        FileNotFoundError: If `train.csv` is not present in `kaggle_dir`.

    Only the `comment_text` and `target` columns are kept, and rows with a
    missing `comment_text` are dropped. The target is left as-is; it is
    binarized later by the dataset class.
    """
    # Validate the input first so a missing download fails without leaving
    # freshly created (empty) directories behind.
    input_path = os.path.join(kaggle_dir, "train.csv")
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"train.csv not found in {kaggle_dir}")

    # Read only the columns that are kept; the raw file is large and the
    # remaining columns would be discarded immediately anyway.
    needed_columns = ["comment_text", "target"]
    df = pd.read_csv(input_path, usecols=needed_columns).dropna(subset=["comment_text"])
    df = df[needed_columns]

    # Binarize target (optional: uncomment if needed)
    # df["target"] = (df["target"] >= 0.5).astype(int)

    # Split (fixed seed so the split is reproducible)
    train_df, val_df = train_test_split(df, test_size=split_ratio, random_state=42)

    os.makedirs(output_dir, exist_ok=True)
    train_df.to_csv(os.path.join(output_dir, "train.csv"), index=False)
    val_df.to_csv(os.path.join(output_dir, "val.csv"), index=False)

    print(f"Saved {len(train_df)} training and {len(val_df)} validation samples to {output_dir}")

In [4]:
# Raw download location and destination for the processed train/val CSVs.
kaggle_dir = "data/jigsaw/raw/"
output_dir = "data/jigsaw/processed/"
# Fraction of rows held out for validation.
val_split = 0.2

# Download and unzip the competition data, then write train.csv / val.csv.
download_jigsaw(kaggle_dir)
preprocess(kaggle_dir, output_dir, val_split)

Downloaded and extracted Jigsaw dataset.
Saved 1443896 training and 360975 validation samples to data/jigsaw/processed/


## Training

In [7]:
import os
import time
import argparse
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

In [8]:
# Hyperparameters for the training section.
# Read by main(): total_epochs, patience, batch_size, lr, max_len, model_name.
# NOTE(review): initial_epochs, fine_tune_lr and dropout_probability are not
# read anywhere in the visible notebook code — confirm whether they are needed.
# NOTE(review): initial_epochs (2) is larger than total_epochs (1) — verify intent.
config = {
    "initial_epochs": 2,
    "total_epochs": 1,
    "patience": 2,
    "batch_size": 128,
    "lr": 2e-5,
    "fine_tune_lr": 1e-5,
    "max_len": 128,
    "dropout_probability": 0.3,
    "model_name": "distilbert-base-uncased"
}

In [9]:
# ---------------------------
# Dataset
# ---------------------------
class JigsawDataset(Dataset):
    """Map-style dataset that tokenizes Jigsaw comments on access.

    Args:
        df: DataFrame with a `comment_text` column and a numeric `target` column.
        tokenizer: Callable tokenizer; invoked once per item with
            `return_tensors="pt"`.
        max_len: Length every example is padded/truncated to.

    A comment is labelled 1 when its `target` is at least 0.5, otherwise 0.
    """

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = df["comment_text"].tolist()
        # Threshold the target score into a binary label.
        is_positive = df["target"] >= 0.5
        self.labels = is_positive.astype(int).tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        # The tokenizer output carries a leading batch axis of size 1; drop it
        # so the DataLoader can stack examples itself.
        sample = {
            key: encoded[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [8]:
# ---------------------------
# Training + Evaluation Functions
# ---------------------------
from tqdm import tqdm

def train_epoch(model, loader, criterion, optimizer, device, portion=0.01):
    """Train on the leading `portion` of `loader` and report loss and accuracy.

    Args:
        model: Model called as `model(**batch)`; its output must expose
            `.loss` and `.logits`.
        loader: Sized iterable of batches (dicts of tensors, including "labels").
        criterion: Unused; the loss comes from the model output. Kept so
            existing callers do not break.
        optimizer: Optimizer stepped once per processed batch.
        device: Device every tensor of a batch is moved to.
        portion: Fraction of the loader's batches to process.

    Returns:
        (avg_loss, avg_acc) over the processed batches.

    Raises:
        ValueError: If `loader` is empty.
    """
    loader_len = len(loader)
    if loader_len == 0:
        raise ValueError("train_epoch received an empty loader")

    model.train()
    total_loss, correct, total = 0, 0, 0

    # Doing part of the training because my part is inference and monitoring.
    # Always run at least one batch: for small loaders (e.g. a dry run) the
    # product rounds down to zero, which previously trained on nothing and then
    # divided by zero when averaging.
    num_batches = min(loader_len, max(1, int(portion * loader_len)))
    print(f"Training for {num_batches} batches")

    batches_run = 0
    for i, batch in enumerate(tqdm(loader, desc="Training", leave=False)):
        if i >= num_batches:
            break

        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = outputs.logits.argmax(dim=1)
        correct += (preds == batch["labels"]).sum().item()
        total += batch["labels"].size(0)
        batches_run += 1

    # Average over what was actually processed, guarding against a loader
    # that yields fewer batches than it reports.
    avg_loss = total_loss / batches_run if batches_run else 0.0
    avg_acc = correct / total if total else 0.0
    print(f"Partial Epoch Summary - Avg Loss: {avg_loss:.4f}, Avg Accuracy: {avg_acc:.4f}\n")

    return avg_loss, avg_acc

def evaluate(model, loader, criterion, device, portion=0.01):
    """Evaluate on the leading `portion` of `loader` without gradient tracking.

    Args:
        model: Model called as `model(**batch)`; its output must expose
            `.loss` and `.logits`.
        loader: Sized iterable of batches (dicts of tensors, including "labels").
        criterion: Unused; the loss comes from the model output. Kept so
            existing callers do not break.
        device: Device every tensor of a batch is moved to.
        portion: Fraction of the loader's batches to process.

    Returns:
        (avg_loss, avg_acc) over the processed batches.

    Raises:
        ValueError: If `loader` is empty.
    """
    loader_len = len(loader)
    if loader_len == 0:
        raise ValueError("evaluate received an empty loader")

    model.eval()
    total_loss, correct, total = 0, 0, 0

    # Always score at least one batch: for small loaders the product rounds
    # down to zero, which previously evaluated nothing and then divided by zero.
    num_batches = min(loader_len, max(1, int(portion * loader_len)))
    print(f"Evaluating for {num_batches} batches")

    batches_run = 0
    with torch.no_grad():
        for i, batch in enumerate(tqdm(loader, desc="Evaluating", leave=False)):
            if i >= num_batches:
                break

            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss

            total_loss += loss.item()
            preds = outputs.logits.argmax(dim=1)
            correct += (preds == batch["labels"]).sum().item()
            total += batch["labels"].size(0)
            batches_run += 1

    # Average over what was actually processed.
    avg_loss = total_loss / batches_run if batches_run else 0.0
    avg_acc = correct / total if total else 0.0
    print(f"Eval Summary - Avg Loss: {avg_loss:.4f}, Accuracy: {avg_acc:.4f}\n")

    return avg_loss, avg_acc

In [9]:
# ---------------------------
# Main Training Pipeline
# ---------------------------
def main(args):
    """Fine-tune DistilBERT on the processed Jigsaw split and save the best epoch.

    Originally written as a command-line entry point; `args` is any object
    exposing:
        data_dir  -- directory with train.csv and val.csv
        save_path -- path the best model state_dict is written to
        dry_run   -- if truthy, run a quick test on a 32-row sample

    Hyperparameters are read from the notebook-level `config` dict
    (model_name, max_len, batch_size, lr, total_epochs, patience). The
    state_dict is saved whenever validation loss improves; training stops
    early after `patience` epochs without improvement.
    """
    # dirname is "" for a bare filename, and os.makedirs("") raises.
    save_dir = os.path.dirname(args.save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    device = torch.device("cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu"))
    print(f"Using device: {device}")

    tokenizer = DistilBertTokenizer.from_pretrained(config["model_name"])
    train_df = pd.read_csv(os.path.join(args.data_dir, "train.csv"))
    val_df = pd.read_csv(os.path.join(args.data_dir, "val.csv"))
    if args.dry_run:
        # Cap the sample size so tiny fixture files do not make sample() fail.
        train_df = train_df.sample(n=min(32, len(train_df)), random_state=42)
        val_df = val_df.sample(n=min(32, len(val_df)), random_state=42)

    train_loader = DataLoader(JigsawDataset(train_df, tokenizer, config["max_len"]),
                              batch_size=config["batch_size"], shuffle=True)
    val_loader = DataLoader(JigsawDataset(val_df, tokenizer, config["max_len"]),
                            batch_size=config["batch_size"])

    model = DistilBertForSequenceClassification.from_pretrained(config["model_name"])
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=config["lr"])

    # A dry run has only a handful of batches; the default 1% portion of the
    # train/eval helpers would round to zero batches, so use the whole loader.
    portion_kwargs = {"portion": 1.0} if args.dry_run else {}

    best_val_loss = float("inf")
    patience_counter = 0

    for epoch in range(config["total_epochs"]):
        start = time.time()

        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device, **portion_kwargs)
        val_loss, val_acc = evaluate(model, val_loader, criterion, device, **portion_kwargs)

        print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f} Acc={train_acc:.4f} | Val Loss={val_loss:.4f} Acc={val_acc:.4f} | Time={time.time() - start:.2f}s")

        if val_loss < best_val_loss:
            # New best epoch: checkpoint the weights and reset the patience counter.
            best_val_loss = val_loss
            torch.save(model.state_dict(), args.save_path)
            patience_counter = 0
            print("  Validation loss improved. Model saved.")
        else:
            patience_counter += 1
            print(f"  No improvement. Patience: {patience_counter}")
            if patience_counter >= config["patience"]:
                print("  Early stopping.")
                break

In [10]:
# simulate arguments
# Stand-in for the command-line arguments main() was originally written for;
# main() reads data_dir, save_path and dry_run as attributes.
class args:
  data_dir = "data/jigsaw/processed/"
  save_path = "models/model.pth"
  dry_run = False

# Runs the training pipeline with the settings above.
main(args)

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training for 112 batches




Partial Epoch Summary - Avg Loss: 0.2503, Avg Accuracy: 0.9166

Evaluating for 28 batches




Eval Summary - Avg Loss: 0.1531, Accuracy: 0.9481

Epoch 1: Train Loss=0.2503 Acc=0.9166 | Val Loss=0.1531 Acc=0.9481 | Time=53.61s
  Validation loss improved. Model saved.


## Inference Optimization

In [11]:
!pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [10]:
import os
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torchinfo import summary
import time
import numpy as np
import pandas as pd

In [11]:
class JigsawDataset(Dataset):
    """Map-style dataset that tokenizes Jigsaw comments on access.

    Same definition as the class in the training section, repeated here so the
    inference section has it in scope.

    Args:
        df: DataFrame with a `comment_text` column and a numeric `target` column.
        tokenizer: Callable tokenizer; invoked once per item with
            `return_tensors="pt"`.
        max_len: Length every example is padded/truncated to.

    A comment is labelled 1 when its `target` is at least 0.5, otherwise 0.
    """

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = df["comment_text"].tolist()
        # Threshold the target score into a binary label.
        is_positive = df["target"] >= 0.5
        self.labels = is_positive.astype(int).tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        # The tokenizer output carries a leading batch axis of size 1; drop it
        # so the DataLoader can stack examples itself.
        sample = {
            key: encoded[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [12]:
# Settings for the inference benchmarks; values match the training `config`.
# NOTE(review): `batch_size` is reassigned by later cells in this notebook, so
# its value depends on which cells have already run.
batch_size = 128
max_len = 128
model_name = "distilbert-base-uncased"
# DATA_DIR environment variable overrides the default processed-data location.
dataset_dir = os.getenv("DATA_DIR", "data/jigsaw/processed")
# PyTorch state_dict written by the training section.
model_path = "models/model.pth"

In [13]:
# The validation split doubles as the benchmark/test set for this section.
val_df = pd.read_csv(os.path.join(dataset_dir, "val.csv"))

tokenizer = DistilBertTokenizer.from_pretrained(model_name)
# shuffle=False keeps the batch order fixed, so every benchmark sees the same batches.
test_loader = DataLoader(JigsawDataset(val_df, tokenizer, max_len), batch_size=batch_size, shuffle=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### Measure inference performance of PyTorch model on CPU

In [33]:
# Load the fine-tuned weights onto the CPU for the PyTorch baseline.
device = torch.device("cpu")
model = DistilBertForSequenceClassification.from_pretrained(model_name)
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state_dict holds, so a tampered checkpoint cannot run arbitrary code.
state_dict = torch.load(model_path, map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.compile() # Test Compile mode
model.eval()
summary(model)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Layer (type:depth-idx)                                  Param #
DistilBertForSequenceClassification                     --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              23,440,896
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           590,592
├─Linear: 1-3                                           1,538
├─Dropout: 1-4                                          --
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0

In [34]:
# Size of the saved state_dict file, reported in MB (bytes / 1e6).
model_size = os.path.getsize(model_path)
print(f"Model Size on Disk: {model_size/ (1e6) :.2f} MB")

Model Size on Disk: 267.85 MB


In [35]:
def evaluate_test(model, loader, device, portion=0.01):
    """Count correct predictions on the leading `portion` of `loader`.

    Args:
        model: Model called as `model(**batch)`; its output must expose `.logits`.
        loader: Sized iterable of batches (dicts of tensors, including "labels").
        device: Device every tensor of a batch is moved to.
        portion: Fraction of the loader's batches to score.

    Returns:
        (correct, total): number of correct predictions and number of examples
        scored. Both are 0 only when `loader` is empty.
    """
    model.eval()
    correct, total = 0, 0

    # Score at least one batch when the loader is non-empty: for small loaders
    # the product rounds down to zero and callers then divide by total == 0.
    loader_len = len(loader)
    num_batches = min(loader_len, max(1, int(portion * loader_len)))
    print(f"Evaluating for {num_batches} batches")

    with torch.no_grad():
        for i, batch in enumerate(tqdm(loader, desc="Evaluating", leave=False)):
            if i >= num_batches:
                break

            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)

            preds = outputs.logits.argmax(dim=1)
            correct += (preds == batch["labels"]).sum().item()
            total += batch["labels"].size(0)

    return correct, total

In [36]:
# Accuracy on the first 1% of test batches, expressed as a percentage.
correct, total = evaluate_test(model, test_loader, device, portion=0.01)
accuracy = (correct / total) * 100
print(f"Accuracy: {accuracy:.2f}% ({correct}/{total} correct)")

Evaluating for 28 batches


                                                               

Accuracy: 94.81% (3398/3584 correct)




#### Inference Latency

In [38]:
# Number of timed forward passes for the single-sample latency measurement.
num_trials = 100

# 1) get one batch as a dict
batch = next(iter(test_loader))
# 2) extract the first example and move to device
# unsqueeze(0) restores the batch axis so the model sees a batch of one.
input_ids      = batch["input_ids"][0].unsqueeze(0).to(device)
attention_mask = batch["attention_mask"][0].unsqueeze(0).to(device)

model.eval()
# 3) warm-up
# One untimed call so one-off first-call costs are not included in the timings.
with torch.no_grad():
    _ = model(input_ids=input_ids, attention_mask=attention_mask)

# 4) timed runs
# Each entry of `latencies` is the wall-clock duration of one forward pass, in seconds.
latencies = []
for _ in range(num_trials):
    start = time.perf_counter()
    with torch.no_grad():
        _ = model(input_ids=input_ids, attention_mask=attention_mask)
    latencies.append(time.perf_counter() - start)

In [39]:
# Latencies are stored in seconds; multiply by 1000 to report milliseconds.
print(f"Inference Latency (single sample, median): {np.percentile(latencies, 50) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 95th percentile): {np.percentile(latencies, 95) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 99th percentile): {np.percentile(latencies, 99) * 1000:.2f} ms")
# Throughput = number of timed runs / total time spent in them.
print(f"Inference Throughput (single sample): {num_trials/np.sum(latencies):.2f} FPS")

Inference Latency (single sample, median): 31.67 ms
Inference Latency (single sample, 95th percentile): 44.55 ms
Inference Latency (single sample, 99th percentile): 45.59 ms
Inference Throughput (single sample): 29.51 FPS


#### Batch throughput

In [40]:
# The same batch is run repeatedly; `num_batches` is the number of timed repetitions.
num_batches = 10  # Number of trials

# 1) Grab one batch (a dict) and move to device, dropping labels
batch = next(iter(test_loader))
batch = {k: v.to(device) for k, v in batch.items() if k != "labels"}

model.eval()

# 2) Warm-up
with torch.no_grad():
    model(**batch)

# 3) Timed runs
# Each entry of `batch_times` is the duration of one full-batch forward pass, in seconds.
batch_times = []
for _ in range(num_batches):
    start = time.perf_counter()
    with torch.no_grad():
        model(**batch)
    batch_times.append(time.perf_counter() - start)

In [41]:
# assume `batch` is the dict you moved to device and `batch_times` is your list of durations
# NOTE(review): this reassigns the notebook-level `batch_size` setting.
batch_size    = batch["input_ids"].shape[0]
total_samples = batch_size * num_batches
# Samples processed per second across all timed repetitions.
batch_fps     = total_samples / np.sum(batch_times)

print(f"Batch Throughput: {batch_fps:.2f} FPS")

Batch Throughput: 52.82 FPS


#### Summary

In [42]:
# Recap of the PyTorch CPU metrics computed in the cells above.
print(f"Model Size on Disk: {model_size/ (1e6) :.2f} MB")
print(f"Accuracy: {accuracy:.2f}% ({correct}/{total} correct)")
print(f"Inference Latency (single sample, median): {np.percentile(latencies, 50) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 95th percentile): {np.percentile(latencies, 95) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 99th percentile): {np.percentile(latencies, 99) * 1000:.2f} ms")
print(f"Inference Throughput (single sample): {num_trials/np.sum(latencies):.2f} FPS")
print(f"Batch Throughput: {batch_fps:.2f} FPS")

Model Size on Disk: 267.85 MB
Accuracy: 94.81% (3398/3584 correct)
Inference Latency (single sample, median): 31.67 ms
Inference Latency (single sample, 95th percentile): 44.55 ms
Inference Latency (single sample, 99th percentile): 45.59 ms
Inference Throughput (single sample): 29.51 FPS
Batch Throughput: 52.82 FPS


#### **Eager mode Summary**
Model Size on Disk: 267.85 MB \
Accuracy: 94.81% (3398/3584 correct) \
Inference Latency (single sample, median): 35.32 ms \
Inference Latency (single sample, 95th percentile): 55.24 ms \
Inference Latency (single sample, 99th percentile): 56.10 ms \
Inference Throughput (single sample): 26.91 FPS \
Batch Throughput: 40.85 FPS \

#### **Compiled Summary**
Model Size on Disk: 267.85 MB \
Accuracy: 94.81% (3398/3584 correct) \
Inference Latency (single sample, median): 31.67 ms \
Inference Latency (single sample, 95th percentile): 44.55 ms \
Inference Latency (single sample, 99th percentile): 45.59 ms \
Inference Throughput (single sample): 29.51 FPS \
Batch Throughput: 52.82 FPS \

### Measure inference performance of ONNX model on CPU

In [43]:
# NOTE(review): the saved output of this cell shows the CPU `onnxruntime` wheel
# being installed, not `onnxruntime-gpu` — re-run and pin versions once confirmed.
# The CUDA section of this notebook requests CUDAExecutionProvider.
!pip install onnx onnxruntime-gpu

Collecting onnx
  Downloading onnx-1.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting onnxruntime
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnx-1.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m92.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m88.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 

In [1]:
import onnx
import onnxruntime as ort

In [46]:
# Reload a fresh model with the fine-tuned weights for the ONNX export.
device = torch.device("cpu")
model = DistilBertForSequenceClassification.from_pretrained(model_name)
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state_dict holds, so a tampered checkpoint cannot run arbitrary code.
state_dict = torch.load(model_path, map_location=device, weights_only=True)
model.load_state_dict(state_dict)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [47]:
onnx_model_path = "models/model.onnx"

# dummy input - used to clarify the input shape
# Export-specific names are used so the notebook-level `batch_size` setting is
# not overwritten with 1 as a side effect of this cell.
export_batch_size = 1
export_seq_len = max_len
dummy_input_ids = torch.randint(
    low=0,
    high=tokenizer.vocab_size,
    size=(export_batch_size, export_seq_len),
    dtype=torch.long,
    device=model.device
)
dummy_attention_mask = torch.ones(
    (export_batch_size, export_seq_len),
    dtype=torch.long,
    device=model.device
)

# export
# dynamic_axes marks the batch and sequence dimensions as variable, so the
# exported graph is not tied to the dummy input's shape.
torch.onnx.export(
    model,
    (dummy_input_ids, dummy_attention_mask),
    onnx_model_path,
    export_params=True,
    opset_version=14,
    do_constant_folding=True,
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    dynamic_axes={
        "input_ids":       {0: "batch_size", 1: "seq_len"},
        "attention_mask":  {0: "batch_size", 1: "seq_len"},
        "logits":          {0: "batch_size"}
    }
)

# sanity check
onnx_model = onnx.load(onnx_model_path)
onnx.checker.check_model(onnx_model)

In [49]:
# Size of the exported ONNX file, reported in MB (bytes / 1e6).
# Reuses the `model_size` name from the PyTorch section.
model_size = os.path.getsize(onnx_model_path)
print(f"Model Size on Disk: {model_size/ (1e6) :.2f} MB")

Model Size on Disk: 267.96 MB


#### Create inference session

In [57]:
# CPU-only session for the baseline ONNX measurements.
ort_session = ort.InferenceSession(onnx_model_path, providers=['CPUExecutionProvider'])
# Displays the providers the session actually ended up with.
ort_session.get_providers()

['CPUExecutionProvider']

In [58]:
correct = 0
total = 0

# Score roughly 0.01% of the test set, but always at least one sample so the
# accuracy below is never computed from zero examples.
num_samples = max(1, int(0.0001 * len(test_loader.dataset)))

for batch in test_loader:
    if total >= num_samples:
        break

    input_ids = batch["input_ids"].numpy()
    attention_mask = batch["attention_mask"].numpy()
    labels = batch["labels"].numpy()

    # The exported graph has a single output ("logits"); take it from the result list.
    outputs = ort_session.run(None, {
        "input_ids": input_ids,
        "attention_mask": attention_mask
    })[0]

    predicted = np.argmax(outputs, axis=1)
    correct += (predicted == labels).sum()
    # Count via labels.shape rather than reassigning the notebook-level `batch_size`.
    total += labels.shape[0]

# An empty loader yields NaN instead of a ZeroDivisionError.
accuracy = (correct / total) * 100 if total else float("nan")

In [59]:
# Accuracy of the ONNX session on the sampled batches.
print(f"Accuracy: {accuracy:.2f}% ({correct}/{total} correct)")

Accuracy: 96.09% (123/128 correct)


#### Inference Latency

In [60]:
# Prepare a single tokenized sample
model_input = tokenizer("This is a sample.", return_tensors="np", max_length=max_len, padding="max_length", truncation=True)
single_input_ids = model_input["input_ids"]
single_attention_mask = model_input["attention_mask"]

# Setup ONNX Runtime session
ort_session = ort.InferenceSession("models/model.onnx")

# Warm-up
ort_session.run(None, {
    "input_ids": single_input_ids,
    "attention_mask": single_attention_mask
})

# Timing
latencies = []
for _ in range(100):
    start = time.time()
    ort_session.run(None, {
        "input_ids": single_input_ids,
        "attention_mask": single_attention_mask
    })
    latencies.append(time.time() - start)

In [61]:
# Latencies are stored in seconds; multiply by 1000 to report milliseconds.
print(f"Inference Latency (single sample, median): {np.percentile(latencies, 50) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 95th percentile): {np.percentile(latencies, 95) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 99th percentile): {np.percentile(latencies, 99) * 1000:.2f} ms")
# Use the number of recorded runs rather than a counter defined in another
# section, so the figure always matches the `latencies` being summed.
print(f"Inference Throughput (single sample): {len(latencies)/np.sum(latencies):.2f} FPS")

Inference Latency (single sample, median): 40.11 ms
Inference Latency (single sample, 95th percentile): 40.53 ms
Inference Latency (single sample, 99th percentile): 41.30 ms
Inference Throughput (single sample): 24.92 FPS


#### Batch Throughput

In [62]:
# The same batch is run repeatedly; `num_batches` is the number of timed repetitions.
num_batches = 50

# Get a batch from the test data
batch = next(iter(test_loader))
input_ids = batch["input_ids"].numpy()
attention_mask = batch["attention_mask"].numpy()

# Warm-up
ort_session.run(None, {
    "input_ids": input_ids,
    "attention_mask": attention_mask
})

# Each entry of `batch_times` is the duration of one full-batch run, in seconds.
batch_times = []
for _ in range(num_batches):
    start_time = time.time()
    ort_session.run(None, {
        "input_ids": input_ids,
        "attention_mask": attention_mask
    })
    batch_times.append(time.time() - start_time)

In [63]:
# Samples per second: (samples per batch * repetitions) / total timed seconds.
batch_fps = (input_ids.shape[0] * num_batches) / np.sum(batch_times)
print(f"Batch Throughput: {batch_fps:.2f} FPS")

Batch Throughput: 41.60 FPS


#### Summary

In [64]:
# Recap of the ONNX CPU metrics computed in the cells above.
print(f"Accuracy: {accuracy:.2f}% ({correct}/{total} correct)")
print(f"Model Size on Disk: {model_size/ (1e6) :.2f} MB")
print(f"Inference Latency (single sample, median): {np.percentile(latencies, 50) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 95th percentile): {np.percentile(latencies, 95) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 99th percentile): {np.percentile(latencies, 99) * 1000:.2f} ms")
# Use the number of recorded runs rather than a counter defined in another
# section, so the figure always matches the `latencies` being summed.
print(f"Inference Throughput (single sample): {len(latencies)/np.sum(latencies):.2f} FPS")
print(f"Batch Throughput: {batch_fps:.2f} FPS")

Accuracy: 96.09% (123/128 correct)
Model Size on Disk: 267.96 MB
Inference Latency (single sample, median): 40.11 ms
Inference Latency (single sample, 95th percentile): 40.53 ms
Inference Latency (single sample, 99th percentile): 41.30 ms
Inference Throughput (single sample): 24.92 FPS
Batch Throughput: 41.60 FPS


### Apply optimizations to ONNX model

In [4]:
def benchmark_session(ort_session, loader=None, sample_fraction=0.0001,
                      num_trials=100, num_batches=50):
    """Print accuracy, single-sample latency and batch throughput for a session.

    Args:
        ort_session: Object exposing `get_providers()` and
            `run(output_names, feeds)`; the first returned output is
            treated as the logits.
        loader: Iterable of batches (dicts with "input_ids", "attention_mask"
            and "labels" tensors) that also exposes `.dataset`. Defaults to the
            notebook-level `test_loader`.
        sample_fraction: Fraction of the dataset used for the accuracy check.
        num_trials: Number of timed single-sample runs.
        num_batches: Number of timed full-batch runs.

    Returns:
        None; all results are printed.

    Raises:
        ValueError: If `loader` yields no batches.
    """
    if loader is None:
        loader = test_loader  # notebook-level DataLoader

    def _feeds(batch, limit=None):
        """Build the session input dict from a batch, keeping the first `limit` rows."""
        return {
            "input_ids": batch["input_ids"][:limit].numpy(),
            "attention_mask": batch["attention_mask"][:limit].numpy(),
        }

    def _timed_runs(feeds, repeats):
        """Do one untimed warm-up run, then return `repeats` run durations in seconds."""
        ort_session.run(None, feeds)
        durations = []
        for _ in range(repeats):
            start = time.perf_counter()
            ort_session.run(None, feeds)
            durations.append(time.perf_counter() - start)
        return durations

    print(f"Execution provider: {ort_session.get_providers()}")

    ## Benchmark accuracy (sample_fraction of the test set)
    # Always score at least one sample: for small datasets the product rounds
    # down to zero and the accuracy division would fail.
    num_samples = max(1, int(sample_fraction * len(loader.dataset)))
    correct = 0
    total = 0

    for batch in loader:
        if total >= num_samples:
            break

        labels = batch["labels"].numpy()
        outputs = ort_session.run(None, _feeds(batch))[0]

        predicted = np.argmax(outputs, axis=1)
        correct += (predicted == labels).sum()
        total += labels.shape[0]

    if total == 0:
        raise ValueError("benchmark_session: loader produced no batches")

    accuracy = (correct / total) * 100
    print(f"Accuracy ({sample_fraction * 100:g}% sampled): {accuracy:.2f}% ({correct}/{total} correct)")

    ## Benchmark inference latency for single sample
    single_batch = next(iter(loader))
    latencies = _timed_runs(_feeds(single_batch, limit=1), num_trials)

    print(f"Inference Latency (single sample, median): {np.percentile(latencies, 50) * 1000:.2f} ms")
    print(f"Inference Latency (single sample, 95th percentile): {np.percentile(latencies, 95) * 1000:.2f} ms")
    print(f"Inference Latency (single sample, 99th percentile): {np.percentile(latencies, 99) * 1000:.2f} ms")
    print(f"Inference Throughput (single sample): {num_trials / np.sum(latencies):.2f} FPS")

    ## Benchmark batch throughput
    batch_feeds = _feeds(single_batch)
    batch_times = _timed_runs(batch_feeds, num_batches)

    batch_fps = (batch_feeds["input_ids"].shape[0] * num_batches) / np.sum(batch_times)
    print(f"Batch Throughput: {batch_fps:.2f} FPS")

#### Apply basic graph optimizations

In [67]:
onnx_model_path = "models/model.onnx"
optimized_model_path = "models/model_optimized.onnx"

# Request ORT's "extended" graph-optimization level and have the optimized
# graph written to `optimized_model_path` via `optimized_model_filepath`.
session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
session_options.optimized_model_filepath = optimized_model_path

# This session is created for the side effect configured above (producing the
# optimized model file); the benchmark in the next cell loads that file.
ort_session = ort.InferenceSession(
    onnx_model_path,
    sess_options=session_options,
    providers=["CPUExecutionProvider"]
)

In [68]:
# NOTE(review): `onnx_model_path` now points at the optimized model until a
# later cell reassigns it.
onnx_model_path = "models/model_optimized.onnx"
ort_session = ort.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"])
benchmark_session(ort_session)

Execution provider: ['CPUExecutionProvider']
Accuracy (0.01% sampled): 96.09% (123/128 correct)
Inference Latency (single sample, median): 24.31 ms
Inference Latency (single sample, 95th percentile): 25.66 ms
Inference Latency (single sample, 99th percentile): 32.61 ms
Inference Throughput (single sample): 40.47 FPS
Batch Throughput: 35.77 FPS


#### Dynamic quantization

In [70]:
!pip install neural-compressor

Collecting neural-compressor
  Downloading neural_compressor-3.3.1-py3-none-any.whl.metadata (15 kB)
Collecting deprecated>=1.2.13 (from neural-compressor)
  Downloading Deprecated-1.2.18-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting numpy<2.0 (from neural-compressor)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting schema (from neural-compressor)
  Downloading schema-0.7.7-py2.py3-none-any.whl.metadata (34 kB)
Downloading neural_compressor-3.3.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Deprecated-1.2.18-py2.py3-none-any.whl (10.0 kB)
Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [71]:
import neural_compressor
from neural_compressor import quantization

In [72]:
# Load ONNX model
# A dedicated name is used so the notebook-level `model_path` (the PyTorch
# checkpoint path that torch.load reads) is not overwritten with an ONNX path.
fp32_onnx_path = "models/model.onnx"
fp32_model = neural_compressor.model.onnx_model.ONNXModel(fp32_onnx_path)

# Configure dynamic quantization
config_ptq = neural_compressor.PostTrainingQuantConfig(
    approach="dynamic"
)

# Quantize
# No eval function or dataloader is supplied, so the default configuration is
# applied without accuracy-driven tuning (see the log output of this cell).
q_model = quantization.fit(
    model=fp32_model,
    conf=config_ptq
)

2025-05-16 02:09:09 [INFO] Start auto tuning.
2025-05-16 02:09:09 [INFO] Quantize model without tuning!
2025-05-16 02:09:09 [INFO] Quantize the model with default configuration without evaluating the model.                To perform the tuning process, please either provide an eval_func or provide an                    eval_dataloader an eval_metric.
2025-05-16 02:09:09 [INFO] Adaptor has 5 recipes.
2025-05-16 02:09:09 [INFO] 0 recipes specified by user.
2025-05-16 02:09:09 [INFO] 3 recipes require future tuning.
2025-05-16 02:09:10 [INFO] *** Initialize auto tuning
2025-05-16 02:09:10 [INFO] {
2025-05-16 02:09:10 [INFO]     'PostTrainingQuantConfig': {
2025-05-16 02:09:10 [INFO]         'AccuracyCriterion': {
2025-05-16 02:09:10 [INFO]             'criterion': 'relative',
2025-05-16 02:09:10 [INFO]             'higher_is_better': True,
2025-05-16 02:09:10 [INFO]             'tolerable_loss': 0.01,
2025-05-16 02:09:10 [INFO]             'absolute': None,
2025-05-16 02:09:10 [INFO]     

In [73]:
# Save quantized model
# Written next to the other model artifacts under models/.
quant_model_path = "models/model_quantized_dynamic.onnx"
q_model.save_model_to_file(quant_model_path)

In [74]:
# Print model size
# Size of the quantized ONNX file in MB (bytes / 1e6); reuses the `model_size` name.
model_size = os.path.getsize(quant_model_path)
print(f"Model Size on Disk: {model_size / 1e6:.2f} MB")

Model Size on Disk: 69.24 MB


In [75]:
# Benchmark the dynamically quantized model on the CPU provider.
ort_session = ort.InferenceSession(quant_model_path, providers=["CPUExecutionProvider"])
benchmark_session(ort_session)

Execution provider: ['CPUExecutionProvider']
Accuracy (0.01% sampled): 93.75% (120/128 correct)
Inference Latency (single sample, median): 19.46 ms
Inference Latency (single sample, 95th percentile): 19.91 ms
Inference Latency (single sample, 99th percentile): 20.16 ms
Inference Throughput (single sample): 51.27 FPS
Batch Throughput: 61.72 FPS


### Try different execution providers

#### CPU

In [80]:
# Back to the unoptimized FP32 export, run on the CPU provider.
onnx_model_path = "models/model.onnx"
ort_session = ort.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"])
benchmark_session(ort_session)

Execution provider: ['CPUExecutionProvider']
Accuracy (0.01% sampled): 96.09% (123/128 correct)
Inference Latency (single sample, median): 40.51 ms
Inference Latency (single sample, 95th percentile): 41.10 ms
Inference Latency (single sample, 99th percentile): 41.29 ms
Inference Throughput (single sample): 24.64 FPS
Batch Throughput: 25.55 FPS


#### CUDA

In [14]:
# Same FP32 export, this time requesting the CUDA provider.
# The saved output lists CPUExecutionProvider after CUDAExecutionProvider even
# though only CUDA is requested here.
onnx_model_path = "models/model.onnx"
ort_session = ort.InferenceSession(onnx_model_path, providers=["CUDAExecutionProvider"])
benchmark_session(ort_session)
# Displays the device ONNX Runtime reports for this build.
ort.get_device()

Execution provider: ['CUDAExecutionProvider', 'CPUExecutionProvider']
Accuracy (0.01% sampled): 96.09% (123/128 correct)
Inference Latency (single sample, median): 1.69 ms
Inference Latency (single sample, 95th percentile): 1.87 ms
Inference Latency (single sample, 99th percentile): 1.93 ms
Inference Throughput (single sample): 581.60 FPS
Batch Throughput: 5005.71 FPS


'GPU'