# Create a trivial linear-model ONNX model

In [86]:
import onnx
from onnx import TensorProto
from onnx.helper import (
    make_model, make_node, make_graph,
    make_tensor_value_info)
from onnx.checker import check_model


In [2]:
# Build the graph Y = MatMul(X, A) + B, with A and B stored as initializers.
feature_dim = 5

# Graph interface: X is declared (batch_size, feature_dim) and Y is declared (batch_size,).
X = make_tensor_value_info(
    name='X', elem_type=TensorProto.FLOAT, shape=["batch_size", feature_dim])
Y = make_tensor_value_info(
    name='Y', elem_type=TensorProto.FLOAT, shape=["batch_size"])

# Constant weights: a 1-D vector A (one entry per feature) and a scalar B (empty dims).
A_initializer = onnx.helper.make_tensor(
    name='A',
    data_type=TensorProto.FLOAT,
    dims=[feature_dim],
    vals=[100.0, 10.0, 1.0, 0.1, 0.01])
B_initializer = onnx.helper.make_tensor(
    name='B', data_type=TensorProto.FLOAT, dims=[], vals=[7000.0])

# Two nodes chained through the intermediate value 'XA'.
node1 = make_node('MatMul', inputs=['X', 'A'], outputs=['XA'], name='XA')
node2 = make_node('Add', inputs=['XA', 'B'], outputs=['Y'], name='Y')

graph = make_graph(
    nodes=[node1, node2],
    name='lr',
    inputs=[X],
    outputs=[Y],
    initializer=[A_initializer, B_initializer])
onnx_model = make_model(graph)

# Validate the model before writing it to disk.
check_model(onnx_model)
with open("linear_regression.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

In [3]:
import onnxruntime as ort
import numpy as np

# Two rows holding the values 1..10 as float32, the element type declared for input X.
x = (np.arange(10, dtype=np.float32) + 1).reshape(2, 5)
print(x)

# Load the model saved to disk and evaluate its single output 'Y'.
ort_sess = ort.InferenceSession('linear_regression.onnx')
outputs = ort_sess.run(output_names=['Y'], input_feed={'X': x})
print(outputs[0])

[[ 1.  2.  3.  4.  5.]
 [ 6.  7.  8.  9. 10.]]
[7123.45 7679.  ]


### MatMul WAT: How does it work on the edge cases?

In [12]:
import numpy

# Broadcasting experiment: multiply a rank-3 stack of matrices by a rank-4 one.
# NOTE: despite the names, `rhs` is passed as the LEFT operand of the product and
# `lhs` as the RIGHT one, so the result is (12, 32, 7) @ (2, 1, 7, 32).
lhs = ((np.arange(2 * 1 * 7 * 32, dtype=np.float32) + 1) / 1000.0).reshape(2, 1, 7, 32)
print(lhs.shape)

rhs = ((np.arange(12 * 7 * 32, dtype=np.float32) + 1) / 1000.0).reshape(12, 32, 7)
print(rhs.shape)

# Leading (batch) dimensions broadcast; the last two dimensions are matrix-multiplied.
res = rhs @ lhs
print(res.shape)


(2, 1, 7, 32)
(12, 32, 7)
(2, 12, 32, 32)


# Experimenting with model [`sentence-transformers/all-MiniLM-L6-v2`](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)

Normalization formulation ($L_p$ normalization, as applied by `F.normalize`; here $p = 2$):

$$
v = \frac{v}{\max(\lVert v \rVert_p, \epsilon)}.
$$

## Imports

In [6]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import onnxruntime as ort
import numpy as np

### Create `tokenizer` and `model`

In [7]:
# Load model from HuggingFace Hub
# The checkpoint id is kept in one place so the tokenizer and the model
# are always loaded from the same checkpoint.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

2024-10-31 07:39:20.623591: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-31 07:39:20.752880: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-31 07:39:20.795165: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-31 07:39:20.811348: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-31 07:39:20.914578: I tensorflow/core/platform/cpu_feature_guar

### Sentences and tokens

In [63]:
# Inputs we want sentence embeddings for
sentences = [
    'This is an example sentence',
    'Each sentence is converted',
]

# Tokenize both sentences as one batch (padding and truncation enabled, 'pt' tensors)
encoded_input = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
print("Encoded input:", encoded_input, sep="\n")

Encoded input:
{'input_ids': tensor([[ 101, 2023, 2003, 2019, 2742, 6251,  102],
        [ 101, 2169, 6251, 2003, 4991,  102,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0]])}


### Inference with ONNX

In [64]:
# Run the ONNX model stored in 'model.onnx' on the tokenized batch.
ort_sess = ort.InferenceSession('model.onnx')
outputKey = 'last_hidden_state'

# Convert every tokenizer tensor to a NumPy array, keyed by its original name.
inputs = dict(
    (key, value.numpy()) for key, value in encoded_input.data.items()
)

# Only one output is requested, so take the first (and only) entry of the result list.
modelOutput = ort_sess.run(output_names=[outputKey], input_feed=inputs)[0]
print(modelOutput.shape)
print(modelOutput)

(2, 7, 384)
[[[ 0.03656479 -0.01616146  0.1682453  ...  0.05540764 -0.16443957
   -0.29669833]
  [ 0.7239094   0.6399461   0.18878399 ...  0.5945502   0.6205655
    0.489683  ]
  [ 0.00637847  0.02030473  0.04475658 ...  0.34638238  1.3169885
   -0.16695468]
  ...
  [ 0.1479177  -0.06426162  0.14569402 ...  0.8837387  -0.33155778
    0.2975315 ]
  [ 0.52124625  0.6562965   0.5607001  ... -0.03988977  0.04121367
   -1.4035654 ]
  [ 1.0824106   0.7140344   0.39859214 ... -0.23005268  0.32431406
   -1.0312778 ]]

 [[ 0.2802185   0.11647302 -0.04178832 ...  0.27105364 -0.16846775
   -0.29611403]
  [ 0.87294626  0.4544794  -0.10909736 ...  0.13654931  0.45797268
   -0.20415133]
  [ 0.4751616   0.5731077   0.63044137 ...  0.6525696   0.5612419
   -1.3268433 ]
  ...
  [ 0.61133045  0.79203445 -0.4684846  ...  0.08543227  1.0591549
   -0.2983293 ]
  [ 0.4115055   1.0945691   0.23854384 ...  0.8983636   0.3683571
   -0.733289  ]
  [ 0.13744976  0.55554354  0.26777348 ...  0.5426259   0.46651605

In [136]:
# List the names of the outputs declared by the ONNX graph in 'model.onnx'.
# The file is loaded explicitly because the notebook-level name `model` refers to
# the HuggingFace model, and the original dangling `model.graph.output.` was a syntax error.
[graph_output.name for graph_output in onnx.load("model.onnx").graph.output]

['last_hidden_state']

In [162]:
def probeOnnxNodeOutput(node_output_name, inputs,
                        model_path="model.onnx",
                        probe_path="modified_model.onnx"):
    """Return the value of one named tensor inside an ONNX model.

    The model at `model_path` is loaded, its declared graph outputs are replaced
    by a single output named `node_output_name`, the modified model is saved to
    `probe_path`, and that file is run with ONNX Runtime.

    Args:
        node_output_name: Name of the tensor to expose as the only graph output.
        inputs: Input feed passed unchanged to `InferenceSession.run`.
        model_path: Path of the ONNX model to probe (defaults to "model.onnx").
        probe_path: Path the modified model is written to; overwritten on every
            call (defaults to "modified_model.onnx").

    Returns:
        The first (and only) entry of the list returned by `InferenceSession.run`.
    """
    # Local names deliberately differ from the notebook-level `model` / `ort_sess`.
    probe_model = onnx.load(model_path)

    # Drop every declared output and expose only the requested tensor.
    del probe_model.graph.output[:]
    probe_model.graph.output.append(onnx.ValueInfoProto(name=node_output_name))
    assert len(probe_model.graph.output) == 1

    onnx.save(probe_model, probe_path)
    probe_sess = ort.InferenceSession(probe_path)
    return probe_sess.run([node_output_name], inputs)[0]

def p(node_output_name):
    """Probe one tensor of the ONNX model and print its name, shape and values.

    Uses the notebook-level `inputs` as the input feed.

    Args:
        node_output_name: Name of the tensor to probe with `probeOnnxNodeOutput`.
    """
    output = probeOnnxNodeOutput(node_output_name, inputs)
    # Fix: the format string used to contain a stray literal "f" before the
    # shape placeholder, so the header printed e.g. "f(2, 7, 384)".
    print(f"\n{node_output_name}: {output.shape}")
    print(output)

In [151]:
# Write "<name>\t<dtype>\t<shape>" for every node output of the ONNX model.
# The graph is loaded explicitly: the notebook-level name `model` is bound to the
# HuggingFace model loaded earlier, so `model.graph` is not the ONNX graph.
onnx_graph = onnx.load("model.onnx").graph

# The file is opened in append mode, so re-running this cell adds to earlier results.
with open('model_shapes.txt', 'a') as f:
    for node in onnx_graph.node:
        for node_output_name in node.output:
            output = probeOnnxNodeOutput(node_output_name, inputs)
            print(f"{node_output_name}\t{output.dtype}\t{output.shape}", file=f)
            # Flush after each line so partial results are on disk if a later probe fails.
            f.flush()
        

In [167]:
# Probe a single tensor of the ONNX model and print it.
# The commented-out calls below are alternative probe targets; only the last call runs.
# p("/embeddings/Slice_output_0")
# p("/embeddings/position_embeddings/Gather_output_0")
#p("token_type_ids")
#p("embeddings.token_type_embeddings.weight")
#p("/embeddings/token_type_embeddings/Gather_output_0")
# p("/embeddings/Add_output_0")
p("/embeddings/Add_1_output_0")


/embeddings/Add_1_output_0: f(2, 7, 384)
[[[-0.08855709 -0.03675481  0.01803644 ...  0.02607179  0.09117168
   -0.01518174]
  [-0.02002142 -0.00136943 -0.01765827 ...  0.02036703  0.05219622
    0.19905484]
  [-0.01959006 -0.03363657 -0.03186595 ...  0.02031087  0.07087033
    0.06444595]
  ...
  [-0.02530987  0.04081389  0.01253615 ... -0.02695212  0.03774461
    0.11325061]
  [-0.01395568 -0.02749825  0.07956143 ... -0.07483339  0.07742585
   -0.06570429]
  [ 0.03182676 -0.00320992 -0.02103326 ...  0.03869266  0.01906986
   -0.00592621]]

 [[-0.08855709 -0.03675481  0.01803644 ...  0.02607179  0.09117168
   -0.01518174]
  [ 0.03040212  0.05308453 -0.02380589 ... -0.10111795  0.02182422
    0.0473295 ]
  [-0.00270701 -0.05080456  0.08054851 ... -0.07771945  0.08808091
   -0.05600649]
  ...
  [ 0.0927911   0.01653565 -0.09761265 ...  0.04492704  0.03896102
   -0.01817189]
  [ 0.02310666  0.00902908 -0.02130682 ...  0.02319211  0.01912827
   -0.00660186]
  [-0.02132826  0.00192266  0.0

### Model Inference with HuggingFace/PyTorch version

In [32]:
# Mean pooling: average token embeddings, counting only positions the attention mask keeps
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, weighted by the attention mask.

    Args:
        model_output: Indexable whose element 0 holds the token embeddings.
        attention_mask: Mask tensor; it is given a trailing axis and expanded to
            the size of the token embeddings.

    Returns:
        Sum over dimension 1 of the masked embeddings, divided by the mask sum
        over dimension 1 (clamped to at least 1e-9 to avoid division by zero).
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = torch.sum(hidden_states * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / mask_total

# Forward pass through the PyTorch model with gradient tracking disabled
with torch.no_grad():
    model_output = model(**encoded_input)

print(model_output.last_hidden_state.shape)
print(model_output.last_hidden_state)

if False:
    # Disabled for now: pool the token embeddings into one vector per sentence,
    # then L2-normalize each vector along dim 1.
    sentence_embeddings = F.normalize(
        mean_pooling(model_output, encoded_input['attention_mask']),
        p=2,
        dim=1,
    )

    print(f"Sentence embeddings: {sentence_embeddings.shape}")
    print(sentence_embeddings)


torch.Size([2, 7, 384])
tensor([[[ 0.0366, -0.0162,  0.1682,  ...,  0.0554, -0.1644, -0.2967],
         [ 0.7239,  0.6399,  0.1888,  ...,  0.5946,  0.6206,  0.4897],
         [ 0.0064,  0.0203,  0.0448,  ...,  0.3464,  1.3170, -0.1670],
         ...,
         [ 0.1479, -0.0643,  0.1457,  ...,  0.8837, -0.3316,  0.2975],
         [ 0.5212,  0.6563,  0.5607,  ..., -0.0399,  0.0412, -1.4036],
         [ 1.0824,  0.7140,  0.3986,  ..., -0.2301,  0.3243, -1.0313]],

        [[ 0.2802,  0.1165, -0.0418,  ...,  0.2711, -0.1685, -0.2961],
         [ 0.8729,  0.4545, -0.1091,  ...,  0.1365,  0.4580, -0.2042],
         [ 0.4752,  0.5731,  0.6304,  ...,  0.6526,  0.5612, -1.3268],
         ...,
         [ 0.6113,  0.7920, -0.4685,  ...,  0.0854,  1.0592, -0.2983],
         [ 0.4115,  1.0946,  0.2385,  ...,  0.8984,  0.3684, -0.7333],
         [ 0.1374,  0.5555,  0.2678,  ...,  0.5426,  0.4665, -0.5284]]])
