# Convert GLiNER Model To ONNX Format And Quantize

In [1]:
import json
import warnings
from typing import Any, Literal

import numpy as np

# import pandas as pd
# import polars as pl
from rich.console import Console
from rich.theme import Theme

# Style name -> hex colour used by the shared rich console.
_theme_styles: dict[str, str] = {
    "white": "#FFFFFF",  # bright white
    "info": "#00FF00",  # bright green
    "warning": "#FFD700",  # bright gold
    "error": "#FF1493",  # deep pink
    "success": "#00FFFF",  # cyan
    "highlight": "#FF4500",  # orange-red
}
custom_theme = Theme(_theme_styles)

# Console that resolves the style names above when printing.
console = Console(theme=custom_theme)

# Visualization
# import matplotlib.pyplot as plt

# NumPy settings: print floating-point values with a precision of 4
np.set_printoptions(precision=4)

# Pandas settings (disabled: the pandas import above is commented out)
# pd.options.display.max_rows = 1_000
# pd.options.display.max_columns = 1_000
# pd.options.display.max_colwidth = 600

# # Polars settings (disabled: the polars import above is commented out)
# pl.Config.set_fmt_str_lengths(1_000)
# pl.Config.set_tbl_cols(n=1_000)
# pl.Config.set_tbl_rows(n=200)

# Suppress every warning for the rest of the session to keep cell output short.
# NOTE(review): this is a blanket filter, so warnings raised by the export and
# quantization cells below are hidden too; consider narrowing it.
warnings.filterwarnings("ignore")

# Black code formatter (optional): loads the `lab_black` IPython extension
# NOTE(review): presumably this line errors when `lab_black` is not installed;
# confirm, or drop it if formatting is not needed.
%load_ext lab_black

# Auto-reload imported modules (autoreload mode 2)
%load_ext autoreload
%autoreload 2

In [2]:
def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """Put an ancestor of the working directory at the front of ``sys.path``.

    The target is found by climbing ``go_up`` levels from the current working
    directory. Its absolute path becomes the first entry of ``sys.path`` (so
    modules located there can be imported) and is printed.

    Params:
    -------
    go_up: int, default=1
        Number of directory levels to climb from the current working
        directory. ``0`` or a negative value selects the working directory
        itself.

    Returns:
    --------
    None
    """
    import os
    import sys

    # Climb `go_up` levels from the working directory. The previous version
    # started from `os.path.dirname(__name__)`, which is always "" for a module
    # name, so the working directory was already the effective base.
    levels = [os.pardir] * max(go_up, 0)
    abs_path_prev_directory = os.path.abspath(os.path.join(os.getcwd(), *levels))

    # Notebook cells get re-run: drop earlier copies of the path before putting
    # it first, so `sys.path` does not accumulate duplicates.
    sys.path[:] = [entry for entry in sys.path if entry != abs_path_prev_directory]
    sys.path.insert(0, abs_path_prev_directory)
    print(abs_path_prev_directory)


# Demo: reference `Any`, `Literal` and `json` once so that ruff does not remove
# their imports as unused. The two annotations bind no value; the parsed dict
# on the last line is what the cell displays.
name: Any
category: Literal["A", "B", "C"]
json.loads('{"name": "Bike Rental Prediction", "category": "A"}')

{'name': 'Bike Rental Prediction', 'category': 'A'}

In [3]:
import torch

In [4]:
from gliner import GLiNER

# Load the pretrained GLiNER checkpoint "urchade/gliner_small-v2.1".
# NOTE: `model` is rebound to the quantized ONNX model near the end of the
# notebook, so cells are order-sensitive with respect to this name.
model = GLiNER.from_pretrained("urchade/gliner_small-v2.1")

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

## Save And Load The Model (Locally)

In [5]:
# Round-trip the checkpoint through a local folder: write it out, then read it
# back (tokenizer included) as `gliner_model`.
LOCAL_CHECKPOINT_DIR = "gliner_small_v_2_p_1_local"

model.save_pretrained(LOCAL_CHECKPOINT_DIR)
gliner_model = GLiNER.from_pretrained(LOCAL_CHECKPOINT_DIR, load_tokenizer=True)

config.json not found in /Users/mac/Desktop/Projects/gliner_test/gliner_small_v_2_p_1_local


In [9]:
from pathlib import Path

# Destination file for the exported (un-quantized) ONNX graph.
# NOTE(review): this sits inside the PyTorch checkpoint folder, whereas the
# config/tokenizer copies and the quantized model are written to
# "gliner_small_v_2_p_1_local_onnx" — confirm the split is intended.
onnx_savepath: Path = Path("gliner_small_v_2_p_1_local") / "model.onnx"

### Prepare The Data

In [None]:
# Sample sentence and candidate labels used to trace the model during export.
text: str = (
    "ONNX is an open-source format designed to enable the interoperability of AI models across various frameworks and tools."
)
labels: list[str] = ["format", "model", "tool", "cat"]

# Only the first element of the returned pair (the model inputs) is needed.
inputs, _ = gliner_model.prepare_model_inputs([text], labels)

# Guarantee that `text_lengths` is a tensor. The export cell declares two
# dynamic axes for it ({0: "batch_size", 1: "value"}), so the fallback must be
# 2-D with shape (batch_size, 1) rather than a flat (batch_size,) vector.
batch_size, seq_len = inputs["input_ids"].shape[:2]
if "text_lengths" not in inputs or not isinstance(inputs["text_lengths"], torch.Tensor):
    # NOTE(review): the fill value is the width of `input_ids`, not a per-text
    # length — confirm this is an acceptable stand-in if the fallback ever fires.
    inputs["text_lengths"] = torch.full((batch_size, 1), seq_len, dtype=torch.long)

### Export The PyTorch Model To ONNX Format

In [11]:
class GLiNERExportWrapper(torch.nn.Module):
    def __init__(self, backend_model: torch.nn.Module, span_mode: str) -> None:
        super().__init__()
        self.backend_model = backend_model
        self.span_mode = span_mode

    def forward(  # type: ignore[override]
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        words_mask: torch.Tensor,
        text_lengths: torch.Tensor,
        span_idx: torch.Tensor | None = None,
        span_mask: torch.Tensor | None = None,
    ) -> torch.Tensor:
        kwargs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "words_mask": words_mask,
            "text_lengths": text_lengths,
        }
        if self.span_mode != "token_level":
            kwargs["span_idx"] = span_idx
            kwargs["span_mask"] = span_mask
        return self.backend_model(**kwargs)


# Wrap the backend so the exporter can feed it positional tensors.
export_wrapper = GLiNERExportWrapper(gliner_model.model, gliner_model.config.span_mode)

# Inputs common to both span modes, with the names of their dynamic axes.
input_names = ["input_ids", "attention_mask", "words_mask", "text_lengths"]
dynamic_axes = {
    "input_ids": {0: "batch_size", 1: "sequence_length"},
    "attention_mask": {0: "batch_size", 1: "sequence_length"},
    "words_mask": {0: "batch_size", 1: "sequence_length"},
    "text_lengths": {0: "batch_size", 1: "value"},
}

if gliner_model.config.span_mode == "token_level":
    # Token-level mode: no span inputs, only the output axes are added.
    dynamic_axes["logits"] = {
        0: "position",
        1: "batch_size",
        2: "sequence_length",
        3: "num_classes",
    }
else:
    # Span modes additionally take the span index and span mask tensors.
    input_names += ["span_idx", "span_mask"]
    dynamic_axes["span_idx"] = {0: "batch_size", 1: "num_spans", 2: "idx"}
    dynamic_axes["span_mask"] = {0: "batch_size", 1: "num_spans"}
    dynamic_axes["logits"] = {
        0: "batch_size",
        1: "sequence_length",
        2: "num_spans",
        3: "num_classes",
    }

# Example tensors, in the same order as `input_names`.
all_inputs = tuple(inputs[name] for name in input_names)

print("Converting the model...")

torch.onnx.export(
    export_wrapper,
    all_inputs,
    f=onnx_savepath,
    input_names=input_names,
    output_names=["logits"],
    dynamic_axes=dynamic_axes,
    opset_version=14,
    dynamo=False,
)

Converting the model...




### Copy Necessary Model Files



In [13]:
import shutil
from pathlib import Path

# Source checkpoint folder and the folder that collects the ONNX bundle.
local_dir: Path = Path("gliner_small_v_2_p_1_local")
onnx_dir: Path = Path("gliner_small_v_2_p_1_local_onnx")

# Create the target folder if needed (a no-op when it is already there).
onnx_dir.mkdir(exist_ok=True)

# The GLiNER config is copied unconditionally.
config_name = "gliner_config.json"
shutil.copy(local_dir / config_name, onnx_dir / config_name)

# Tokenizer artefacts: each one is copied only when present in the source folder.
tokenizer_files: list[str] = [
    "added_tokens.json",
    "special_tokens_map.json",
    "spm.model",
    "tokenizer_config.json",
    "tokenizer.json",
]
for file in tokenizer_files:
    src: Path = local_dir / file
    dst: Path = onnx_dir / file
    if not src.exists():
        continue
    shutil.copy(src, dst)

In [None]:
# Test the exported (un-quantized) ONNX model
text = (
    "ONNX is an open-source format designed to enable the interoperability of AI models across various frameworks and tools."
)
labels = ["format", "model", "tool", "cat"]

# `model` still refers to the original PyTorch checkpoint at this point, so
# predicting with it would not exercise the export at all. Load the ONNX file
# written by the export cell instead, from the folder that also holds the
# config and tokenizer.
onnx_model = GLiNER.from_pretrained(
    str(onnx_savepath.parent),
    load_onnx_model=True,
    load_tokenizer=True,
    onnx_model_file=onnx_savepath.name,
)

entities = onnx_model.predict_entities(text, labels)
print("Predicted entities:", entities)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Predicted entities: [{'start': 0, 'end': 4, 'text': 'ONNX', 'label': 'format', 'score': 0.9161803722381592}, {'start': 73, 'end': 82, 'text': 'AI models', 'label': 'model', 'score': 0.768403172492981}]


## Quantize The Model

In [15]:
# quantize model
from onnxruntime.quantization import QuantType, quantize_dynamic

# The quantized graph is written into the ONNX bundle folder.
quantized_save_path = Path("gliner_small_v_2_p_1_local_onnx").joinpath(
    "model_quantized.onnx"
)

print("Quantizing the model...")
# Arguments: exported ONNX file (input), quantized file (output); weights are
# quantized to 8-bit integers (QUInt8).
quantize_dynamic(onnx_savepath, quantized_save_path, weight_type=QuantType.QUInt8)

Quantizing the model...


  elem_type: 7
  shape {
    dim {
      dim_value: 2
    }
    dim {
      dim_param: "unk__404"
    }
  }
}
.
  elem_type: 7
  shape {
    dim {
      dim_value: 2
    }
    dim {
      dim_param: "unk__405"
    }
  }
}
.
  elem_type: 7
  shape {
    dim {
      dim_value: 2
    }
    dim {
      dim_param: "unk__419"
    }
  }
}
.


In [16]:
# Load the quantized ONNX model, together with its tokenizer, from the ONNX
# bundle folder.
# NOTE: this rebinds `model`, which referred to the PyTorch checkpoint until now.
model = GLiNER.from_pretrained(
    "./gliner_small_v_2_p_1_local_onnx/",
    load_onnx_model=True,
    load_tokenizer=True,
    onnx_model_file="model_quantized.onnx",
)

config.json not found in /Users/mac/Desktop/Projects/gliner_test/gliner_small_v_2_p_1_local_onnx


In [17]:
# Sample input for the quantized model (the literal keeps its leading and
# trailing newline).
text: str = """
Transfer from Adebayo Davies to Emeka Jonathan via Opay
"""

# Candidate entity types to look for in the text.
labels = ["person", "bank", "location"]

# NOTE(review): `threshold` is presumably the minimum score an entity needs to
# be returned — confirm against the GLiNER documentation.
entities = model.predict_entities(text, labels, threshold=0.4)

# One line per entity: matched text, predicted label, score to 4 decimals.
for entity in entities:
    print(entity["text"], "=>", entity["label"], f"(score: {entity['score']:.4f})")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Adebayo Davies => person (score: 0.8450)
Emeka Jonathan => person (score: 0.7873)
