In [None]:
pip install --upgrade onnxruntime

<font color='skyblue' size='6em'>#1 LOAD ONNX AND JSON FILES</font>

In [None]:
import json
import onnxruntime
import numpy as np
from PIL import Image

# Directory that holds the ONNX graphs and their JSON configuration files.
# Change this single constant when the files live somewhere else.
MODEL_DIR = '/content/pix2struct_onnx'


def _load_json(path):
    """Parse one JSON file and return its content.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes, instead of staying open until garbage collection.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Load configuration files
tokenizer_config = _load_json(f'{MODEL_DIR}/tokenizer_config.json')
preprocessor_config = _load_json(f'{MODEL_DIR}/preprocessor_config.json')
# preprocess_image unpacks this entry as (width, height) when resizing.
preprocessor_config["target_size"] = (64, 64)

# Load ONNX models (one inference session per exported graph)
encoder_model = onnxruntime.InferenceSession(f'{MODEL_DIR}/encoder_model.onnx')
decoder_with_past_model = onnxruntime.InferenceSession(f'{MODEL_DIR}/decoder_with_past_model.onnx')
decoder_model = onnxruntime.InferenceSession(f'{MODEL_DIR}/decoder_model.onnx')

In [None]:
<font color='skyblue' size='6em'>#2 PREPROCESS THE IMAGE</font>

In [None]:
def preprocess_image(image_path_or_url):
    """Load an image, resize it and scale its pixels to the range [0, 1].

    Args:
        image_path_or_url: Either a local file path or a string starting with
            "http", in which case the image is downloaded first.

    Returns:
        A float32 NumPy array of shape (height, width, 3), where width and
        height come from ``preprocessor_config["target_size"]``.

    Raises:
        requests.HTTPError: If the download answers with an error status.
        OSError: If the image cannot be opened or decoded.
    """
    if image_path_or_url.startswith("http"):
        # Imported locally because the notebook's import cell provides neither
        # module; without these the URL branch failed with NameError.
        import io
        import requests

        # A timeout keeps the cell from hanging forever on a stalled server.
        response = requests.get(image_path_or_url, timeout=30)
        response.raise_for_status()
        image = Image.open(io.BytesIO(response.content))
    else:
        image = Image.open(image_path_or_url)

    # PNGs may be RGBA, palette-based or greyscale; forcing RGB guarantees a
    # three-channel array regardless of the source file.
    image = image.convert("RGB")

    # Resize the image to the target size
    width, height = preprocessor_config["target_size"]
    image = image.resize((width, height))

    # Normalize the image from 0..255 integers to 0..1 floats
    image = np.array(image) / 255.0
    image = image.astype(np.float32)

    return image

<font color='skyblue' size='6em'>#3 RUN INFERENCE ON THREE SUBMODELS</font>

In [None]:

# Load and preprocess the image (provide either local path or image URL)
image_path_or_url = "https://cdn-clekk.nitrocdn.com/tkvYXMZryjYrSVhxKeFTeXElceKUYHeV/assets/images/optimized/rev-82dec77/wp-content/uploads/2021/05/machine-learning-types-infographics_1-1536x695.png"  # or image URL
preprocessed_image = preprocess_image(image_path_or_url)

# ONNX Runtime reports the element type of every graph input as a string such
# as "tensor(float)"; this table maps those strings to NumPy dtypes.
_ORT_TYPE_TO_NUMPY = {
    'tensor(float)': np.float32,
    'tensor(float16)': np.float16,
    'tensor(double)': np.float64,
    'tensor(int64)': np.int64,
    'tensor(int32)': np.int32,
    'tensor(int8)': np.int8,
    'tensor(uint8)': np.uint8,
    'tensor(bool)': np.bool_,
}


def _cast_feed_to_declared_dtypes(session, feed):
    """Return a copy of ``feed`` whose arrays use the dtypes ``session`` declares.

    Only NumPy arrays whose key matches a declared input name with a known
    element type are converted; every other entry is passed through untouched
    so ONNX Runtime can still report unknown or missing inputs itself.
    """
    declared = {node.name: _ORT_TYPE_TO_NUMPY.get(node.type) for node in session.get_inputs()}
    converted = {}
    for name, value in feed.items():
        dtype = declared.get(name)
        if dtype is not None and isinstance(value, np.ndarray):
            value = value.astype(dtype, copy=False)
        converted[name] = value
    return converted


# Encode the image: add a leading batch axis and build an all-ones mask.
# NOTE(review): both shapes are derived from the resized image; confirm they
# match the shapes reported by encoder_model.get_inputs().
flattened_patches = np.expand_dims(preprocessed_image, axis=0)
attention_mask = np.ones((1, preprocessed_image.shape[0], preprocessed_image.shape[1], 1))

# Run inference on the encoder model.
# The pixel values are floats in [0, 1]; casting them to int64 unconditionally
# truncated almost every value to 0. Each input is therefore converted to the
# dtype the encoder itself declares (this also fixes the float64 mask that
# np.ones produces by default).
encoder_input = _cast_feed_to_declared_dtypes(
    encoder_model,
    {'flattened_patches': flattened_patches, 'attention_mask': attention_mask},
)
encoder_output = encoder_model.run(None, encoder_input)


# Run inference on the decoder with past model
# NOTE(review): `encoder_output` is the full list of encoder outputs and the
# feed key 'encoder_output' is not checked against the graph -- confirm the
# expected names with decoder_with_past_model.get_inputs().
decoder_with_past_input = {'encoder_output': encoder_output}
decoder_with_past_output = decoder_with_past_model.run(None, decoder_with_past_input)

# Run inference on the decoder model
# NOTE(review): same concern as above for the 'past_key_values' key; also
# confirm the intended order of the two decoder stages.
decoder_input = {'past_key_values': decoder_with_past_output}
decoder_output = decoder_model.run(None, decoder_input)




<font color='skyblue' size='6em'>#4 DISPLAY GENERATED TEXT</font>

In [None]:
# Turn the decoder result into text: take the position of the largest value in
# the first decoder output and use it as an index into the tokenizer lookup.
# NOTE(review): assumes tokenizer_config contains a "token_to_text" entry that
# can be indexed by that position -- confirm against the JSON file.
# NOTE(review): np.argmax without an axis works on the flattened array, so a
# single position is produced for the whole output -- confirm this is intended.
token_lookup = tokenizer_config["token_to_text"]
generated_text = token_lookup[np.argmax(decoder_output[0])]

# Show the result to the user
print("Generated Text:", generated_text)