In [2]:
import onnxruntime as ort
import numpy as np

from onnx import TensorProto, save, load
from onnx.helper import (
    make_model, make_node, make_tensor, make_graph,
    make_tensor_value_info, make_opsetid)
from onnx.checker import check_model

# StringSplit
https://github.com/onnx/onnx/blob/main/docs/Operators.md#StringSplit

In [None]:
# Graph I/O: a 1-D batch of strings goes in, a 2-D string tensor
# (one row of substrings per input string) comes out.
string_input = make_tensor_value_info(
    'string_input', TensorProto.STRING, [None])
string_split = make_tensor_value_info(
    'string_split', TensorProto.STRING, [None, None])

# StringSplit produces two outputs. Only the first one is exposed as a graph
# output; the second is given a throwaway name ('unused1') and ignored.
split_node = make_node(
    'StringSplit',
    inputs=['string_input'],
    outputs=['string_split', 'unused1'],
    delimiter="",   # empty delimiter: split on whitespace (see StringSplit spec)
    maxsplit=None,  # no limit on the number of splits
)

# Wrap the single node in a graph, then in a model targeting opset 20
# of the default ONNX domain.
split_graph = make_graph(
    nodes=[split_node],
    name='splitter_model',
    inputs=[string_input],
    outputs=[string_split],
)
split_model = make_model(
    split_graph,
    opset_imports=[make_opsetid('', 20)],
)

# Validate, then persist the model for the inference cell
check_model(split_model)
save(split_model, "split_node.onnx")


In [None]:
# Other inputs worth trying:
#   ["NOT not do SOME the oov"]
#   ["I love computer science !"]
input_strings = np.array(["Hello World", "I love computer science !"])

# Run the saved StringSplit model; passing None requests every graph output
ort_sess = ort.InferenceSession('split_node.onnx')
feeds = {'string_input': input_strings}
outputs = ort_sess.run(None, feeds)
outputs

# Tokenizer
https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosofttokenizer

In [50]:
# Tokenizer is an onnxruntime contrib operator, so the node must carry the
# "com.microsoft" domain explicitly.
node = make_node(
    op_type="Tokenizer",
    inputs=["x"],
    outputs=["y"],
    mark=0,  # Do not add begin/end marker characters to the tokens
    mincharnum=1,  # Minimum number of characters a token must have to be kept
    pad_value="",  # Filler for rows that have fewer tokens than the longest row
    separators=[" "],  # List of separators (space)
    domain="com.microsoft"  # Specify the domain
)

# Create ONNX graph.
# For a [C] input the Tokenizer returns a [C, D] tensor (one row of tokens per
# input string). Both output dimensions are therefore left dynamic; pinning
# the first one to 1 would contradict the variable-length input declaration.
tokenizer_graph = make_graph(
    [node],
    "tokenizer_graph",
    inputs=[
        make_tensor_value_info("x", TensorProto.STRING, [None])  # Input shape [None]: any number of strings
    ],
    outputs=[
        make_tensor_value_info("y", TensorProto.STRING, [None, None])  # Output shape [None, None]
    ]
)

# Create ONNX model (note: despite its name, `tokenizer_node` is the model)
tokenizer_node = make_model(
    tokenizer_graph,
    opset_imports=[
        make_opsetid('com.microsoft', 1),
    ]
)

# Verify the model
check_model(tokenizer_node)

# Save the model to a file
with open("tokenizer_node.onnx", "wb") as f:
    f.write(tokenizer_node.SerializeToString())

In [51]:
# A single sentence, cast to object dtype before being fed to the session
input_strings = np.array(["Hello World"])
feeds = {"x": input_strings.astype(object)}

# Run the saved Tokenizer model and fetch every graph output
sess = ort.InferenceSession("tokenizer_node.onnx")
output = sess.run(None, feeds)

# The first (and only) output holds the tokens
print(output[0])

[['Hello' 'World']]


# Normalizer
https://github.com/onnx/onnx/blob/main/docs/Operators.md#StringNormalizer
Batched input is not supported: the input must have shape [C] or [1, C].

In [48]:
# Graph I/O: a single row of tokens in, a 2-D string tensor out. The output
# length is dynamic because stop words are dropped.
string_input = make_tensor_value_info(
    'string_input', TensorProto.STRING, [1, None])
string_normalized = make_tensor_value_info(
    'string_normalized', TensorProto.STRING, [None, None])

# StringNormalizer: lower-case every token and remove the stop word "the"
normalizer_node = make_node(
    "StringNormalizer",
    inputs=["string_input"],
    outputs=["string_normalized"],
    case_change_action="LOWER",
    stopwords=["the"],
)

# Single-node graph, wrapped in a model targeting opset 10 of the default domain
normalizer_graph = make_graph(
    nodes=[normalizer_node],
    name='normalizer_model',
    inputs=[string_input],
    outputs=[string_normalized],
)
normalizer_model = make_model(
    normalizer_graph,
    opset_imports=[make_opsetid('', 10)],
)

# Validate, then persist the model for the inference cell
check_model(normalizer_model)
save(normalizer_model, "normalizer_node.onnx")


In [49]:
# One row of mixed-case tokens, including the stop word 'the'
x = [['NOT', 'not', 'do', 'SOME', 'the', 'oov']]

# Run the saved StringNormalizer model and fetch every graph output
ort_sess = ort.InferenceSession('normalizer_node.onnx')
feeds = {'string_input': x}
outputs = ort_sess.run(None, feeds)
outputs

[array([['not', 'not', 'do', 'some', 'oov']], dtype=object)]

# Mapper
https://github.com/onnx/onnx/blob/main/docs/Operators-ml.md#ai.onnx.ml.CategoryMapper

In [52]:
# Graph I/O: a single row of tokens in, a same-shaped row of int64 ids out
string_input = make_tensor_value_info(
    'string_input', TensorProto.STRING, [1, None])
numeric_output = make_tensor_value_info(
    'numeric_output', TensorProto.INT64, [1, None])

# CategoryMapper belongs to the ai.onnx.ml domain. cats_strings[i] is paired
# with cats_int64s[i]; strings outside the vocabulary map to default_int64.
mapper_node = make_node(
    'CategoryMapper',
    inputs=['string_input'],
    outputs=['numeric_output'],
    domain='ai.onnx.ml',
    cats_strings=["do", "not"],  # vocabulary
    cats_int64s=[1, 2],          # integer id of each vocabulary entry
    default_int64=0,             # id used for out-of-vocabulary tokens
)

# Single-node graph, wrapped in a model importing ai.onnx.ml opset 1
mapper_graph = make_graph(
    nodes=[mapper_node],
    name='mapper_model',
    inputs=[string_input],
    outputs=[numeric_output],
)
mapper_model = make_model(
    mapper_graph,
    opset_imports=[make_opsetid('ai.onnx.ml', 1)],
)

# Validate, then persist the model for the inference cell
check_model(mapper_model)
save(mapper_model, "mapper_node.onnx")


In [54]:
# 'some' and 'oov' are not in the mapper vocabulary
x = [['not', 'not', 'do', 'some', 'oov']]

# Run the saved CategoryMapper model and fetch every graph output
ort_sess = ort.InferenceSession('mapper_node.onnx')
feeds = {'string_input': x}
outputs = ort_sess.run(None, feeds)
outputs

[array([[2, 2, 1, 0, 0]], dtype=int64)]

# Squeeze
https://github.com/onnx/onnx/blob/main/docs/Operators.md#Squeeze

In [71]:
# Graph I/O: a [1, N] int64 matrix in, an [N] int64 vector out
input_tensor = make_tensor_value_info(
    'input_tensor', TensorProto.INT64, [1, None])
squeezed_tensor = make_tensor_value_info(
    'squeezed_tensor', TensorProto.INT64, [None])

# Squeeze is given no axes input here
squeeze_node = make_node(
    "Squeeze",
    inputs=['input_tensor'],
    outputs=['squeezed_tensor'],
)

# Single-node graph, wrapped in a model targeting opset 13 of the default domain
squeeze_graph = make_graph(
    nodes=[squeeze_node],
    name='squeeze_model',
    inputs=[input_tensor],
    outputs=[squeezed_tensor],
)
squeeze_model = make_model(
    squeeze_graph,
    opset_imports=[make_opsetid('', 13)],
)

# Validate, then persist the model for the inference cell
check_model(squeeze_model)
save(squeeze_model, "squeezed_node.onnx")

In [72]:
# A single row of ids, shape (1, 5)
input_data = np.array([[ 2,  2,  1,  3, -1]], dtype=np.int64)

# Run the saved Squeeze model and fetch every graph output
ort_sess = ort.InferenceSession('squeezed_node.onnx')
feeds = {'input_tensor': input_data}
outputs = ort_sess.run(None, feeds)
outputs

[array([ 2,  2,  1,  3, -1], dtype=int64)]

# Unsqueeze
https://github.com/onnx/onnx/blob/main/docs/Operators.md#Unsqueeze

In [73]:
# Graph I/O: an [N] int64 vector in, a [1, N] int64 matrix out
squeezed_tensor = make_tensor_value_info(
    'squeezed_tensor', TensorProto.INT64, [None])
unsqueeze_tensor = make_tensor_value_info(
    'unsqueeze_tensor', TensorProto.INT64, [1, None])

# The axes to insert are supplied at run time as a second graph input
axes_tensor = make_tensor_value_info(
    'axes_tensor', TensorProto.INT64, [1])

unsqueeze_node = make_node(
    'Unsqueeze',
    inputs=['squeezed_tensor', 'axes_tensor'],
    outputs=['unsqueeze_tensor'],
)

# Single-node graph, wrapped in a model targeting opset 13 of the default domain
unsqueeze_graph = make_graph(
    nodes=[unsqueeze_node],
    name='unsqueeze_model',
    inputs=[squeezed_tensor, axes_tensor],
    outputs=[unsqueeze_tensor],
)
unsqueeze_model = make_model(
    unsqueeze_graph,
    opset_imports=[make_opsetid('', 13)],
)

# Validate, then persist the model for the inference cell
check_model(unsqueeze_model)
save(unsqueeze_model, "unsqueeze_node.onnx")

In [74]:
# Flat vector of ids plus the axis (0) at which to insert a new dimension
input_data = np.array([ 2,  2,  1,  3, -1], dtype=np.int64)
axes = np.array([0], dtype=np.int64)

# Run the saved Unsqueeze model and fetch every graph output
ort_sess = ort.InferenceSession('unsqueeze_node.onnx')
feeds = {'squeezed_tensor': input_data, 'axes_tensor': axes}
outputs = ort_sess.run(None, feeds)
outputs

[array([[ 2,  2,  1,  3, -1]], dtype=int64)]

# Flatten
https://github.com/onnx/onnx/blob/main/docs/Operators.md#Flatten

In [75]:
# Graph I/O: a [1, N] int64 matrix in, an [N] int64 vector out
input_tensor = make_tensor_value_info(
    'input_tensor', TensorProto.INT64, [1, None])
output_tensor = make_tensor_value_info(
    'output_tensor', TensorProto.INT64, [None])

# Describes the constant target shape. It is declared for reference only and
# is not passed to make_graph below.
shape_tensor = make_tensor_value_info(
    'shape_tensor', TensorProto.INT64, [1])

# Constant node emitting the target shape [-1] (a single inferred dimension)
target_shape = make_tensor(
    name="const_tensor",
    data_type=TensorProto.INT64,
    dims=[1],
    vals=[-1],
)
const_shape_node = make_node(
    "Constant",
    inputs=[],
    outputs=["shape_tensor"],
    value=target_shape,
)

# Reshape with that shape collapses the 2-D input into 1-D
reshape_node = make_node(
    "Reshape",
    inputs=["input_tensor", "shape_tensor"],
    outputs=["output_tensor"],
)

# Two-node graph, wrapped in a model targeting opset 13 of the default domain
flatten_graph = make_graph(
    nodes=[const_shape_node, reshape_node],
    name='flatten_model',
    inputs=[input_tensor],
    outputs=[output_tensor],
)
flatten_model = make_model(
    flatten_graph,
    opset_imports=[make_opsetid('', 13)],
)

# Validate, then persist the model for the inference cell
check_model(flatten_model)
save(flatten_model, "flatten_node.onnx")


In [76]:
# A single row of ids, shape (1, 5)
input_data = np.array([[ 2,  2,  1,  3, -1]], dtype=np.int64)

# Run the saved Constant+Reshape model and fetch every graph output
ort_sess = ort.InferenceSession('flatten_node.onnx')
feeds = {'input_tensor': input_data}
outputs = ort_sess.run(None, feeds)
outputs

[array([ 2,  2,  1,  3, -1], dtype=int64)]

# Slice
https://github.com/onnx/onnx/blob/main/docs/Operators.md#Slice

In [119]:
# Graph I/O: the data tensor plus per-axis start/end indices (one per axis of
# the 2-D input), all supplied at run time.
slice_inputs = [
    make_tensor_value_info("x", TensorProto.INT64, [1, None]),
    make_tensor_value_info("starts", TensorProto.INT64, [2]),
    make_tensor_value_info("ends", TensorProto.INT64, [2]),
]
slice_outputs = [
    make_tensor_value_info("y", TensorProto.INT64, [1, None]),
]

# Slice node: only the data, starts and ends inputs are wired
slice_node = make_node(
    "Slice",
    inputs=["x", "starts", "ends"],
    outputs=["y"],
)

# Single-node graph, wrapped in a model targeting opset 11 of the default domain
slice_graph = make_graph(
    [slice_node],
    "slice_model",
    inputs=slice_inputs,
    outputs=slice_outputs,
)
slice_model = make_model(
    slice_graph,
    opset_imports=[make_opsetid('', 11)],
)

# Validate, then persist the slice model for the inference cell
check_model(slice_model)
save(slice_model, "slice_node.onnx")

In [120]:
# Input row plus slice bounds: keep row 0 and columns 0..2
x = np.array([[1, 1, 0, -1, -1]], dtype=np.int64)
starts = np.array([0, 0], dtype=np.int64)
ends = np.array([1, 3], dtype=np.int64)
feeds = {"x": x, "starts": starts, "ends": ends}

# Run the saved Slice model and fetch every graph output
ort_sess = ort.InferenceSession('slice_node.onnx')
output = ort_sess.run(None, feeds)

# Report the shape and contents of the sliced tensor
sliced = output[0]
print("Output shape:", sliced.shape)
print("Output:", sliced)

Output shape: (1, 3)
Output: [[1 1 0]]


# Padding
https://github.com/onnx/onnx/blob/main/docs/Operators.md#Pad

In [55]:
# Graph I/O: a [1, N] int64 row plus a runtime `pads` vector of length 4
# (two entries per axis of the 2-D input); the padded row comes out.
padd_inputs = [
    make_tensor_value_info("x", TensorProto.INT64, [1, None]),
    make_tensor_value_info("pads", TensorProto.INT64, [4]),
]
padd_outputs = [
    make_tensor_value_info("y", TensorProto.INT64, [1, None]),
]

# Pad node in constant mode; no explicit fill-value input is wired
padd_node = make_node(
    "Pad",
    inputs=["x", "pads"],
    outputs=["y"],
    mode="constant",
)

# Single-node graph, wrapped in a model targeting opset 13 of the default domain
padd_graph = make_graph(
    [padd_node],
    "padd_model",
    inputs=padd_inputs,
    outputs=padd_outputs,
)
padd_model = make_model(
    padd_graph,
    opset_imports=[make_opsetid('', 13)],
)

# Validate, then persist the model for the inference cell
check_model(padd_model)
save(padd_model, "padding_node.onnx")

In [56]:
# One row of five ids; pad only the end of axis 1, by 10 elements
input_data = np.array([[1, 2, 3, 4, 5]], dtype=np.int64)
pads = np.array([0, 0, 0, 10], dtype=np.int64)
feeds = {"x": input_data, "pads": pads}

# Run the saved Pad model, requesting output "y" by name
session = ort.InferenceSession("padding_node.onnx")
output = session.run(["y"], feeds)
output

[array([[1, 2, 3, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)]

# Compose


### Opset version 20, IR 9


In [15]:
# Input tensor: a single sentence
string_input = make_tensor_value_info('string_input', TensorProto.STRING, [1])
# Intermediate tensors. These two value infos are documentation only; they are
# not attached to the graph. The Tokenizer turns a [C] input into a [C, D]
# output, so both intermediates are rank 2.
# After tokenizing
string_split = make_tensor_value_info('string_split', TensorProto.STRING, [1, None])
# After lowercasing / stop-word removal
string_normalized = make_tensor_value_info('string_normalized', TensorProto.STRING, [1, None])
# Output tensor
numeric_output = make_tensor_value_info('numeric_output', TensorProto.INT64, [1, None])

# Tokenizer node (onnxruntime contrib op) that splits the sentence on spaces.
# A StringSplit node used to be built here and was immediately overwritten by
# this one; that dead definition has been removed.
split_node = make_node(
    op_type="Tokenizer",
    inputs=["string_input"],
    outputs=["string_split"],
    mark=0,  # Do not add begin/end marker characters to the tokens
    mincharnum=1,  # Minimum number of characters a token must have to be kept
    pad_value="",  # Padding value
    separators=[" "],  # List of separators (space)
    domain="com.microsoft"  # Specify the domain
)

# String Normalizer node: lower-case and drop the stop words
normalizer_node = make_node(
    "StringNormalizer",
    inputs=["string_split"],
    outputs=["string_normalized"],
    case_change_action="LOWER",
    stopwords=["the"]
)
# Category Mapper node. No default_int64 is given, so out-of-vocabulary
# tokens get the operator's own default.
mapper_node = make_node(
    'CategoryMapper',
    inputs=['string_normalized'],
    outputs=['numeric_output'],
    cats_strings=["do", "not", "some"],  # Vocabulary for mapping
    cats_int64s=[1, 2, 3],  # Integer mapping of vocabulary
    domain='ai.onnx.ml'
)

# Create the graph with the new nodes
graph = make_graph([split_node, normalizer_node, mapper_node],
                   'numericalizer',
                   [string_input],
                   [numeric_output])

# Specify opset versions and pin the IR version. Without ir_version,
# make_model stamps whatever IR version the installed onnx package defaults
# to, which would not necessarily match the "IR9" in the file name.
onnx_model = make_model(
    graph,
    opset_imports=[
      make_opsetid('ai.onnx.ml', 1),
      make_opsetid('com.microsoft', 1),
      make_opsetid('', 20)
    ],
    ir_version=9
)

# Check the model consistency
check_model(onnx_model)

# Save the model to a file
with open("test_numericalizer_IR9.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

In [None]:
# Load the model written by the build cell of this section. The previous
# path, 'test_numericalizer_1.onnx', is never created by this notebook.
ort_sess = ort.InferenceSession('test_numericalizer_IR9.onnx')

# Single sentence in; row of vocabulary ids out
x = np.array(["NOT not do SOME the oov"])
outputs = ort_sess.run(None, {'string_input': x})
outputs

### Opset version 17, IR 8

In [84]:
# Only string_input, pads, slice_start, slice_end and numeric_output_sliced
# are attached to the graph below. The other value infos document the
# intermediate tensors and are not passed to make_graph.

# Input string tensor, e.g.:
# ["NOT not do SOME the oov"]
string_input = make_tensor_value_info('string_input', TensorProto.STRING, [1])
# Intermediate tensor after splitting (the Tokenizer maps [C] -> [C, D]), e.g.:
# [["NOT", "not", "do", "SOME", "the", "oov"]]
string_split = make_tensor_value_info('string_split', TensorProto.STRING, [1, None])
# Intermediate tensor after normalization (lower-cased, stop words removed), e.g.:
# [["not", "not", "do", "some", "oov"]]
string_normalized = make_tensor_value_info('string_normalized', TensorProto.STRING, [1, None])
# Mapper tensor (out-of-vocabulary tokens map to default_int64 = 0), e.g.:
# [[2, 2, 1, 3, 0]]
numeric_output = make_tensor_value_info('numeric_output', TensorProto.INT64, [1, None])
# Flatten, e.g.:
# [2, 2, 1, 3, 0]
shape_tensor = make_tensor_value_info('shape_tensor', TensorProto.INT64, [1])
numeric_output_flatten = make_tensor_value_info('numeric_output_flatten', TensorProto.INT64, [None])
# Pad inputs:
pads = make_tensor_value_info("pads", TensorProto.INT64, [2]) # Pads shape [2] for a 1D input
# Pad output, e.g. with pads = [0, 3]:
# [2, 2, 1, 3, 0, 0, 0, 0]
numeric_output_padded = make_tensor_value_info("numeric_output_padded", TensorProto.INT64, [None])
# Slice inputs:
slice_start = make_tensor_value_info("slice_start", TensorProto.INT64, [1])
slice_end = make_tensor_value_info("slice_end", TensorProto.INT64, [1])
# Slice output, e.g. with start=0, end=3:
# [2, 2, 1]
numeric_output_sliced = make_tensor_value_info("numeric_output_sliced", TensorProto.INT64, [None])

# Tokenizer node (onnxruntime contrib op): splits the sentence on spaces
split_node = make_node(
    op_type="Tokenizer",
    inputs=["string_input"],
    outputs=["string_split"],
    mark=0,  # Do not add begin/end marker characters to the tokens
    mincharnum=1,  # Minimum number of characters a token must have to be kept
    pad_value="",  # Padding value
    separators=[" "],  # List of separators (space)
    domain="com.microsoft"  # Specify the domain
)
# String Normalizer node
normalizer_node = make_node(
    "StringNormalizer",
    inputs=["string_split"],
    outputs=["string_normalized"],
    case_change_action="LOWER",
    stopwords=["the"] # HERE WE DEFINE THE STOP WORDS
)
# Category Mapper node
mapper_node = make_node(
    'CategoryMapper',
    inputs=['string_normalized'],
    outputs=['numeric_output'],
    cats_strings=["do", "not", "some"], # VOCAB FOR MAPPING
    cats_int64s=[1, 2, 3], # INTEGER MAPPING OF VOCAB
    default_int64=0, # Default oov token
    domain='ai.onnx.ml'
)
# Constant Shape node: emits the target shape [-1] used by Reshape
const_shape_node = make_node(
    "Constant",
    inputs=[],
    outputs=["shape_tensor"],
    value=make_tensor(name="const_tensor",
                             data_type=TensorProto.INT64,
                             dims=[1],
                             vals=[-1])
)
# Reshape node: flatten the 2D array into a 1D array
reshape_node = make_node(
    "Reshape",
    inputs=["numeric_output", "shape_tensor"],
    outputs=["numeric_output_flatten"]
)
# Padding node: constant-mode padding, amounts supplied at run time via `pads`
padding_node = make_node(
    "Pad",
    inputs=["numeric_output_flatten", "pads"],
    outputs=["numeric_output_padded"],
    mode="constant"
)
# Slice node: start/end indices supplied at run time
slice_node = make_node(
    "Slice",
    inputs=["numeric_output_padded", "slice_start", "slice_end"],
    outputs=["numeric_output_sliced"],
)
# Create the graph with the new nodes
graph = make_graph([split_node, normalizer_node, mapper_node, const_shape_node, reshape_node, padding_node, slice_node],
                   'numericalizer',
                   inputs=[string_input, pads, slice_start, slice_end],
                   outputs=[numeric_output_sliced])

# Specify opset versions and pin the IR version. Without ir_version,
# make_model stamps whatever IR version the installed onnx package defaults
# to, so the "IR8" artifact would silently change with the environment.
onnx_model = make_model(
    graph,
    opset_imports=[
      make_opsetid('ai.onnx.ml', 1),
      make_opsetid('com.microsoft', 1),
      make_opsetid('', 17)
    ],
    ir_version=8
)

# Check the model consistency
check_model(onnx_model)

# Check the IR version
ir_version = onnx_model.ir_version
print("IR Version:", ir_version)

# Save the model to a file
with open("test_numericalizer_IR8.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

IR Version: 8
