In [1]:
# Pin the symbolic `batch_size` dimension of textual.onnx to 3 and write the result to
# textual_shape1.onnx. NOTE: despite the `_shape1` suffix the batch is fixed at 3 here;
# the suffix is the one GetSession appends to the model name on its NPU path.
!python -m onnxruntime.tools.make_dynamic_shape_fixed --dim_param batch_size --dim_value 3 "textual.onnx" "textual_shape1.onnx"

In [2]:
# Pin the symbolic `batch_size` dimension of visual.onnx to 1 and write the result to
# visual_shape1.onnx, the file GetSession loads for the 'visual' model on its NPU path.
!python -m onnxruntime.tools.make_dynamic_shape_fixed --dim_param batch_size --dim_value 1 "visual.onnx" "visual_shape1.onnx"

In [1]:
import onnxruntime

def GetSession(model_file, npu = True):
    """Create an ONNX Runtime inference session for ``model_file``.

    Args:
        model_file: Model path without extension. ``'_shape1.onnx'`` is appended
            when ``npu`` is truthy, ``'.onnx'`` otherwise.
        npu: When truthy, load the model with the QNN execution provider using the
            ``QnnHtp.dll`` backend; otherwise load it with the CPU execution provider.

    Returns:
        The created ``onnxruntime.InferenceSession``.

    Side effects:
        Prints the names of the first input and first output, followed by the
        session's providers and provider options.
    """
    if not npu:
        ort_session = onnxruntime.InferenceSession(
            model_file + '.onnx',
            providers=['CPUExecutionProvider'],
        )
    else:
        session_opts = onnxruntime.SessionOptions()
        # (Optional) With CPU fallback disabled, an exception is raised if the model
        # can't be run entirely on the QNN HTP backend.
        session_opts.add_session_config_entry("session.disable_cpu_ep_fallback", "1")
        ort_session = onnxruntime.InferenceSession(
            model_file + '_shape1.onnx',
            sess_options=session_opts,
            providers=["QNNExecutionProvider"],
            provider_options=[{"backend_path": "QnnHtp.dll"}],
        )

    # Report the I/O tensor names and the provider setup of the new session.
    first_input = ort_session.get_inputs()[0].name
    first_output = ort_session.get_outputs()[0].name
    print(first_input, first_output)
    print("Available providers:", ort_session.get_providers())
    print("Current provider:", ort_session.get_provider_options())
    return ort_session

In [2]:
import onnxruntime
# Session for the image encoder, created through GetSession's NPU (QNN) path.
visual_session = GetSession('visual', npu=True)


: 

In [None]:
import onnxruntime
# Session for the text encoder, created through GetSession's CPU path.
textual_session = GetSession('textual', npu=False)

In [6]:
import clip
from PIL import Image
import numpy as np

# Load CLIP on the CPU: onnx cannot export with cuda.
model, preprocess = clip.load("ViT-B/32", device="cpu", jit=False)

# Image input, batch first. The context manager closes the image file as soon as
# preprocessing has produced the tensor, instead of leaving the handle open.
with Image.open("CLIP.png") as pil_image:
    image = preprocess(pil_image).unsqueeze(0)  # [1, 3, 224, 224]
# NumPy copy of the image tensor, cast to float32, for the ONNX session.
image_onnx = image.detach().cpu().numpy().astype(np.float32)

# Text input, batch first: one tokenized row per candidate label.
text = clip.tokenize(["a diagram", "a dog", "a cat"])  # [3, 77]
# NumPy copy of the token ids, cast to int64, for the ONNX session.
text_onnx = text.detach().cpu().numpy().astype(np.int64)

In [7]:
import torch

# Factor applied to the cosine similarities in OnnxModel before the softmax.
logit_scale = 100.0
def visual_run(onnx_image):
    """Run ``visual_session`` on ``onnx_image`` and return its single output.

    The array is fed under the name of the session's first input. Unpacking
    fails if the session does not return exactly one output.
    """
    feed_name = visual_session.get_inputs()[0].name
    [visual_output] = visual_session.run(None, {feed_name: onnx_image})
    return visual_output

def textual_run(onnx_text):
    """Run ``textual_session`` on ``onnx_text`` and return its single output.

    The array is fed under the name of the session's first input. Unpacking
    fails if the session does not return exactly one output.
    """
    feed_name = textual_session.get_inputs()[0].name
    [textual_output] = textual_session.run(None, {feed_name: onnx_text})
    return textual_output

def OnnxModel(image, text, device: str = "cpu"):
    """Compute scaled cosine-similarity logits between image and text inputs.

    Args:
        image: Input passed to ``visual_run``.
        text: Input passed to ``textual_run``.
        device: Torch device the encoder outputs are moved to.

    Returns:
        ``(logits_per_image, logits_per_text)``, where the second is the
        transpose of the first.
    """
    # Encode with the ONNX sessions (image first, then text) and wrap as tensors.
    img_emb = torch.from_numpy(visual_run(image)).to(device)
    txt_emb = torch.from_numpy(textual_run(text)).to(device)

    # Scale every embedding to unit length along the last dimension.
    img_unit = img_emb / img_emb.norm(dim=-1, keepdim=True)
    txt_unit = txt_emb / txt_emb.norm(dim=-1, keepdim=True)

    # Cosine similarity as logits; the scale is applied to the image side
    # before the matrix product, as in the original expression.
    scaled_img = logit_scale * img_unit
    logits_per_image = scaled_img @ txt_unit.t()

    # shape = [global_batch_size, global_batch_size]
    return logits_per_image, logits_per_image.t()

In [None]:
# Raw encoder outputs (image first, then text), kept as standalone variables.
image_features, text_features = visual_run(image_onnx), textual_run(text_onnx)

# Logits for the image against every prompt, turned into probabilities.
logits_per_image, logits_per_text = OnnxModel(image=image_onnx, text=text_onnx)
probs = (
    logits_per_image
    .softmax(dim=-1)
    .detach()
    .cpu()
    .numpy()
)

# Value recorded in the original notebook: [[0.9927937  0.00421067 0.00299571]]
print("Label probs:", probs)