In [20]:
import onnxruntime as ort
import numpy as np

from onnx import TensorProto, save, load
from onnx.helper import (
    make_model, make_node, make_tensor, make_graph,
    make_tensor_value_info, make_opsetid)
from onnx.checker import check_model

# Numericalizer


In [21]:
# Preprocessing parameters
SEPARATORS=[" "]
ACTION="LOWER"
STOPWORDS=["the"]

# Model parameters
OUTPUT_LENGTH = 15
VOCAB = ["do", "not", "some"]
INT_MAP = [i+1 for i in range(len(VOCAB))]
OOV_INT = 0

In [22]:
# Input string tensor:
# ["MY CLEAN QUERY oov stopword"]
string_input = make_tensor_value_info('string_input', TensorProto.STRING, [1])
# [["MY", "CLEAN", "QUERY", "oov", "stopword"]]
string_split = make_tensor_value_info('string_split', TensorProto.STRING, [1, None])

# Intermediate tensor after normalization:
# [["my", "clean", "query", "oov"]]
string_normalized = make_tensor_value_info('string_normalized', TensorProto.STRING, [1, None])

# Output tensor
numeric_output = make_tensor_value_info('numeric_output', TensorProto.INT64, [1, None])

# Pad inputs:
pads_data = make_tensor("pads_data", TensorProto.INT64, [4], [0, 0, 0, OUTPUT_LENGTH])  # Constant value of [0, OUTPUT_LENGTH]
# Pad output:
# [[1, 2, 3, -1, 0, 0, 0]] (pad = 3)
numeric_output_padded = make_tensor_value_info("numeric_output_padded", TensorProto.INT64, [1, None])

# Slice inputs:
slice_start_data = make_tensor("slice_start_data", TensorProto.INT64, [2], [0, 0])  # Constant value of 0
slice_end_data = make_tensor("slice_end_data", TensorProto.INT64, [2], [1, OUTPUT_LENGTH])  # Constant value of OUTPUT_LENGTH
# Slice output:
# [[1, 2, 3]] (start=0, end=3)
numeric_output_sliced = make_tensor_value_info("numeric_output_sliced", TensorProto.INT64, [1, OUTPUT_LENGTH])

# String Split node
split_node = make_node(
    op_type="Tokenizer",
    inputs=["string_input"],
    outputs=["string_split"],
    mark=0,  # Mark the beginning/end character
    mincharnum=1,  # Minimum number of characters allowed
    pad_value="",  # Padding value
    separators=SEPARATORS,  # List of separators (space)
    domain="com.microsoft"
)

# String Normalizer node
normalizer_node = make_node(
    "StringNormalizer",
    inputs=["string_split"],
    outputs=["string_normalized"],
    case_change_action=ACTION,
    stopwords=STOPWORDS # HERE WE DEFINE THE STOP WORDS
)

# CategoryMapper node
mapper_node = make_node(
    'CategoryMapper',
    ['string_normalized'],
    ['numeric_output'],
    cats_strings=["do", "not"],  # Vocabulary for mapping
    cats_int64s=[1, 2],  # Integer mapping of
    default_int64=0, # Default oov token
    domain='ai.onnx.ml'
)

# Constant node for pads
pads_constant_node = make_node(
    "Constant",
    inputs=[],
    outputs=["pads"],
    value=pads_data
)
# Padding node
padding_node = make_node(
    "Pad",
    inputs=["numeric_output", "pads"],
    outputs=["numeric_output_padded"],
    mode="constant"
)

# Constant for slice_start
slice_start_constant_node = make_node(
    "Constant",
    inputs=[],
    outputs=["slice_start"],
    value=slice_start_data
)
# Constant for slice_end
slice_end_constant_node = make_node(
    "Constant",
    inputs=[],
    outputs=["slice_end"],
    value=slice_end_data
)
# Slice node
slice_node = make_node(
    "Slice",
    inputs=["numeric_output_padded", "slice_start", "slice_end"],
    outputs=["numeric_output_sliced"],
)

# Create the graph with the new nodes
graph = make_graph(
    [
      split_node,
      normalizer_node,
      mapper_node,
      pads_constant_node,
      padding_node,
      slice_start_constant_node,
      slice_end_constant_node,
      slice_node,
    ],
    'numericalizer',
    inputs=[string_input],
    outputs=[numeric_output_sliced]
)

# Specify opset versions
onnx_model = make_model(
    graph,
    opset_imports=[
      make_opsetid('ai.onnx.ml', 1),
      make_opsetid('com.microsoft', 1),
      make_opsetid('', 17)
    ]
)

# Check the model consistency
check_model(onnx_model)

# Check the IR version
ir_version = onnx_model.ir_version
print("IR Version:", ir_version)

# Check the OpsSet version
ops_set_version = onnx_model.opset_import[2].version 
print("OpsSet Version:", ops_set_version)

# Save the model to a file
with open("tokenizer.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

IR Version: 8
OpsSet Version: 17


In [23]:
ort_sess = ort.InferenceSession('tokenizer.onnx')

x = np.array(["NOT not do SOME the oov"])
print(ort_sess.run(None, {'string_input': x}))

x = np.array(["NOT not do SOME the oov, and this is a longer string with more than 15 chars"])
print(ort_sess.run(None, {'string_input': x}))

x = np.array(["NOT not NOT not NOT not NOT not NOT not NOT not NOT not NOT"])
print(ort_sess.run(None, {'string_input': x}))

x = np.array(["NOT"])
print(ort_sess.run(None, {'string_input': x}))

outputs = ort_sess.run(None, {'string_input': x})

[array([[2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)]
[array([[2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)]
[array([[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]], dtype=int64)]
[array([[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)]


# Embedding

In [24]:
numeric_vector=outputs

In [25]:
import torch
import torch.nn as nn
import numpy as np

# Define the model
class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(EmbeddingModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        pooled = torch.mean(embedded, dim=1)  # Global average pooling
        return pooled

# Example vocabulary
vocab_size = 10000  # Adjust this according to your vocabulary size
embedding_dim = 100  # Length of the embedding vector

# Initialize the model
model = EmbeddingModel(vocab_size, embedding_dim)

# Load the numeric vector into a PyTorch tensor
input_tensor = torch.tensor(outputs[0])
print(input_tensor)

# Perform inference
output = model(input_tensor)
print("Inference Output:")
print(output)

# Export the model to ONNX format
torch.onnx.export(model,
                  input_tensor,
                  "embedding_model.onnx",
                  input_names=["numeric_input_tensor"],
                  output_names=["output_embedding"],
                  opset_version=17)


tensor([[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
Inference Output:
tensor([[ 0.9169,  1.1620, -1.0315,  0.0393,  1.0036, -0.0645, -1.3321, -0.6720,
          0.4619, -2.3518,  1.4597,  1.4690, -0.0885,  0.3078,  0.1892,  0.9687,
          1.0791,  1.4366, -0.4015,  0.5671,  0.1021, -0.2049, -0.8865, -0.6974,
         -0.2545, -0.3683,  0.2314, -0.0735, -1.0602,  0.1250,  0.2749, -0.4889,
         -0.6295, -0.2636,  0.5858, -0.0387, -2.1346, -1.4208,  0.2145, -0.3342,
         -0.3430, -0.6142, -0.2990,  0.0913, -0.0588, -1.1512,  1.4431,  0.6062,
         -0.5437,  0.2913,  1.8397, -1.1921, -0.2544, -0.7286, -0.3283, -1.0158,
          0.2401, -0.5097,  0.8891, -1.6871,  0.8605, -0.0974,  1.0748, -1.6056,
         -2.0465,  0.7030,  0.1599,  0.7558,  0.9531, -1.1229,  1.2865,  0.2936,
          0.0306,  1.5168,  0.3206,  0.9903, -2.4394,  1.3872,  1.0919,  0.7916,
         -0.7218,  1.3455,  0.5363, -0.9169, -0.2500,  0.8704,  0.0632,  0.9085,
         -1.3063, -0.4256, -0.2826,

In [26]:
# Check the IR version
model = load("embedding_model.onnx")
ir_version = model.ir_version
print("IR Version:", ir_version)

# Check the OpsSet version
ops_set_version = model.opset_import[0].version if model.opset_import else None
print("OpsSet Version:", ops_set_version)

IR Version: 8
OpsSet Version: 17


In [27]:
ort_sess = ort.InferenceSession('embedding_model.onnx')

# String input
x = np.array([[2, 2, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

outputs = ort_sess.run(None, {'numeric_input_tensor': x})
outputs

[array([[ 0.7538846 ,  0.6315539 , -0.858148  , -0.31353578,  0.61638606,
          0.03712944, -0.95248246, -0.6610899 ,  0.44543356, -1.9777925 ,
          1.114134  ,  1.0663089 , -0.0475119 ,  0.3085864 ,  0.05627583,
          0.8581572 ,  0.8341553 ,  1.1332772 , -0.35968867,  0.4783103 ,
          0.1064309 , -0.21061324, -0.6963769 , -0.55464876, -0.36087307,
         -0.29060012,  0.03030721, -0.17939591, -0.8041696 , -0.01990776,
          0.2336085 , -0.4513607 , -0.462214  , -0.04942515,  0.6072999 ,
         -0.02932008, -1.7095354 , -1.0576055 ,  0.26565388, -0.44676393,
         -0.4330632 , -0.6291286 , -0.18799762,  0.0549604 , -0.24652083,
         -1.0596744 ,  1.0244541 ,  0.36720684, -0.57796216,  0.10362174,
          1.557532  , -0.99402636, -0.04505811, -0.8164367 , -0.19240957,
         -0.7551031 , -0.02828829, -0.39015922,  0.7850735 , -1.3096771 ,
          0.62475246, -0.23281808,  0.9300837 , -1.4388709 , -1.731892  ,
          0.49255267,  0.24945937,  0.

# Version Converter
https://github.com/onnx/tutorials/blob/main/tutorials/VersionConversion.md

In [28]:
# from onnx import version_converter
# # Load the model
# model = load("embedding_model.onnx")

# # Check that the IR is well formed
# check_model(model)

# # Convert to version 8
# converted_model = version_converter.convert_version(onnx_model, 9)

# # Save model
# save(converted_model, "embedding_model_v8.onnx")

# ONNX Compose
https://onnx.ai/onnx/api/compose.html

In [29]:
from onnx.compose import merge_models

# Load the two ONNX models
model1 = load("tokenizer.onnx")
model2 = load("embedding_model.onnx")

# Merge the models
io_map = [("numeric_output_sliced", "numeric_input_tensor")]
merged_model = merge_models(m1=model1, m2=model2, io_map=io_map)

# Save the merged model
save(merged_model, "encoder.onnx")


In [30]:
ort_sess = ort.InferenceSession('encoder.onnx')

# String input
x = np.array(["NOT not do SOME the oov"])

outputs = ort_sess.run(None, {'string_input': x})
outputs

[array([[ 0.80201274,  0.82813585, -0.87482804, -0.22561213,  0.8256478 ,
          0.01852857, -0.9869802 , -0.70129794,  0.41962573, -2.1136203 ,
          1.2512188 ,  1.280487  , -0.07217049,  0.39154813,  0.10505456,
          0.82849044,  0.91709095,  1.2231282 , -0.35748798,  0.4764018 ,
          0.05872453, -0.24836732, -0.7213394 , -0.56056064, -0.3087985 ,
         -0.3926673 ,  0.1158082 , -0.06045493, -0.8403996 , -0.04947696,
          0.27079332, -0.3402849 , -0.54239565, -0.10022059,  0.5104051 ,
         -0.03707116, -1.9227523 , -1.2315072 ,  0.25324383, -0.30595896,
         -0.44820789, -0.6083864 , -0.1130712 ,  0.0343467 , -0.15431458,
         -1.062029  ,  1.1480559 ,  0.4773775 , -0.6061598 ,  0.07696819,
          1.6609509 , -1.0828117 , -0.06846457, -0.9380718 , -0.19315866,
         -0.89049876,  0.02822874, -0.4400327 ,  0.7650407 , -1.3940212 ,
          0.7116711 , -0.29284608,  1.0068922 , -1.528401  , -1.8572272 ,
          0.5608603 ,  0.1534393 ,  0.