In [None]:
import cv2
import numpy as np
from openvino.runtime import Core
from yaspin import yaspin
from pathlib import Path
import fitz as pdf
import matplotlib.pyplot as plt

from openvino.runtime import Core


In [None]:
# OpenVINO runtime entry point; the cells below use it to read and compile models.
ie = Core()

# Names of the two models to fetch and the precision variant to request.
detection_model = "horizontal-text-detection-0001"
recognition_model = "text-recognition-0016"
precision = "FP16"

# Per-user directories; the leading "~" is expanded to the home directory.
base_model_dir = Path("~/open_model_zoo_models").expanduser()
omz_cache_dir = Path("~/open_model_zoo_cache").expanduser()

# Local download target. It is created up front; an existing directory is fine.
model_dir = Path("model")
model_dir.mkdir(exist_ok=True)

In [None]:
# Build one omz_downloader invocation that requests both models at the configured
# precision, writing into model_dir and using omz_cache_dir as the download cache.
download_command = f"omz_downloader --name {detection_model},{recognition_model} --output_dir {model_dir} --cache_dir {omz_cache_dir} --precision {precision}"
# yaspin shows a spinner while the command runs.
with yaspin(text=f"Downloading {detection_model}, {recognition_model}") as sp:
    # IPython shell escape: "$download_command" expands the Python string above and
    # the command's output lines are captured in download_result.
    # NOTE(review): the captured output is not inspected here, so a failed download
    # still ends with the "Finished" message below.
    download_result = !$download_command
    sp.text = f"Finished downloading {detection_model}, {recognition_model}"
    sp.ok("✔")

In [None]:
# Extension-less base paths of the downloaded model files. The detection model sits
# directly under its precision folder; the recognition model is split into an
# "-encoder" and a "-decoder" part, each with its own precision folder.
text_detection_path = Path(f"{model_dir}/intel/{detection_model}/{precision}/{detection_model}")
text_recognition_encoder_path = Path(f"{model_dir}/intel/{recognition_model}/{recognition_model}-encoder/{precision}/{recognition_model}-encoder")
text_recognition_decoder_path = Path(f"{model_dir}/intel/{recognition_model}/{recognition_model}-decoder/{precision}/{recognition_model}-decoder")

# Fail early, with a message naming the missing piece, if any .xml file is absent.
# with_suffix() already returns a Path, so no extra Path(...) wrapper is needed.
assert text_detection_path.with_suffix(".xml").is_file(), "Text detection model is not downloaded"
assert text_recognition_encoder_path.with_suffix(".xml").is_file(), "Text recognition encoder is not downloaded"
assert text_recognition_decoder_path.with_suffix(".xml").is_file(), "Text recognition decoder is not downloaded"

In [None]:
# Read the three model descriptions (.xml) into memory; compilation for a device
# happens separately.
detection_model_file = ie.read_model(
    text_detection_path.with_suffix(".xml")
)
recognition_encoder_model_file = ie.read_model(
    text_recognition_encoder_path.with_suffix(".xml")
)
recognition_decoder_model_file = ie.read_model(
    text_recognition_decoder_path.with_suffix(".xml")
)

In [None]:
# Load the input image from disk.
image_file = "outfile.jpeg"
image = cv2.imread(image_file)
# cv2.imread does not raise when the file is missing or unreadable; it returns None.
# Check here so the failure names the file instead of surfacing as an AttributeError
# on the next line.
assert image is not None, f"Could not read image file: {image_file}"
ih, iw, c = image.shape

# Both regions are squares with a side of one fifth of the image height, cut from
# the bottom rows of the image:
#   - UUID_side starts one fifth of the image width from the left edge;
#   - PID_side is the bottom-right corner.
UUID_side = image[-int(ih/5):, int(iw/5):int(iw/5)+int(ih/5)]
PID_side = image[-int(ih/5):, -int(ih/5):]

In [None]:
def adjust_image_to_detection(image, shape):
    """Resize an image to a network input size and lay it out as a one-item batch.

    Args:
        image: Array indexed as (height, width, channels).
        shape: Four-element shape; only its last two entries (height, width)
            are used.

    Returns:
        The resized image with channels moved to the front and a leading axis
        of length 1 added, i.e. indexed as (1, channels, height, width).
    """
    _batch, _channels, height, width = shape
    # cv2.resize takes the target size as (width, height).
    resized = cv2.resize(image, (width, height))
    channels_first = np.transpose(resized, (2, 0, 1))
    return channels_first[np.newaxis, ...]
    

# Shape expected by the detection model's first input.
detection_input_layer_shape = detection_model_file.input(0).shape

# Bring both crops to that shape.
PID_side_resized = adjust_image_to_detection(PID_side, detection_input_layer_shape)
UUID_side_resized = adjust_image_to_detection(UUID_side, detection_input_layer_shape)

# Both prepared arrays must match each other and the model's input shape.
assert (
    UUID_side_resized.shape == PID_side_resized.shape
    and PID_side_resized.shape == tuple(detection_input_layer_shape)
), "Invalid input shape"

In [None]:
# Compile the detection model for CPU execution.
detection_model_compiled = ie.compile_model(
    detection_model_file,
    device_name="CPU",
)
# Handle of the "boxes" output, used to pick that result out of an inference call.
output_key = detection_model_compiled.output("boxes")

In [None]:
# Run detection once per crop and keep only the "boxes" output of each run.
PID_boxes = detection_model_compiled(
    [PID_side_resized]
)[output_key]
UUID_boxes = detection_model_compiled(
    [UUID_side_resized]
)[output_key]

In [None]:
def remove_empty(box):
    """Drop the rows of a 2-D array whose entries are all zero.

    Args:
        box: 2-D NumPy array with one row per candidate.

    Returns:
        An array containing only the rows of ``box`` that have at least one
        non-zero entry, in their original order.
    """
    # np.all(..., axis=1) marks the all-zero rows; "~" inverts that into a keep-mask.
    return box[~np.all(box == 0, axis=1)]
# Discard the all-zero rows of each detection result.
PID_boxes_detected = remove_empty(PID_boxes)
UUID_boxes_detected = remove_empty(UUID_boxes)

# Each crop must have produced at least one remaining box.
assert len(UUID_boxes_detected) > 0, "No UUID detected"
assert len(PID_boxes_detected) > 0, "No PID detected"

In [None]:
# Undo the channels-first layout so the PID input can be sliced as (rows, cols, channels).
PID_image_transposed = np.transpose(PID_side_resized[0], (1, 2, 0))
# The first remaining box supplies the crop rectangle. Only the four coordinates are
# converted to int; a negative minimum is clamped to 0 because a negative slice
# start would be interpreted as an offset from the end of the axis.
x_min, y_min = (max(int(value), 0) for value in PID_boxes_detected[0][:2])
x_max, y_max = (int(value) for value in PID_boxes_detected[0][2:4])
# Fifth value of the box, kept as a float so it is not truncated to an integer.
# NOTE(review): presumably the detection confidence - confirm against the model docs.
prob = float(PID_boxes_detected[0][4])
PID_cropped = PID_image_transposed[y_min:y_max, x_min:x_max]
# The pixels originate from cv2.imread (BGR order) while imshow interprets
# 3-channel data as RGB, so convert a copy for display only; PID_cropped is unchanged.
plt.imshow(cv2.cvtColor(PID_cropped, cv2.COLOR_BGR2RGB))

In [None]:
# Undo the channels-first layout so the UUID input can be sliced as (rows, cols, channels).
UUID_image_transposed = np.transpose(UUID_side_resized[0], (1, 2, 0))
# The first remaining box supplies the crop rectangle. Only the four coordinates are
# converted to int; a negative minimum is clamped to 0 because a negative slice
# start would be interpreted as an offset from the end of the axis.
x_min, y_min = (max(int(value), 0) for value in UUID_boxes_detected[0][:2])
x_max, y_max = (int(value) for value in UUID_boxes_detected[0][2:4])
# Fifth value of the box, kept as a float so it is not truncated to an integer.
# NOTE(review): presumably the detection confidence - confirm against the model docs.
prob = float(UUID_boxes_detected[0][4])
UUID_cropped = UUID_image_transposed[y_min:y_max, x_min:x_max]
# The pixels originate from cv2.imread (BGR order) while imshow interprets
# 3-channel data as RGB, so convert a copy for display only; UUID_cropped is unchanged.
plt.imshow(cv2.cvtColor(UUID_cropped, cv2.COLOR_BGR2RGB))

In [None]:
# The decoder is compiled as loaded; it does not depend on the reshape below.
decoder_model_compiled = ie.compile_model(
    recognition_decoder_model_file,
    device_name="CPU",
)

# Mark the last dimension of the encoder's first input as dynamic (-1) before
# compiling, so inputs of differing size along that axis are accepted.
encoder_input_layer = recognition_encoder_model_file.input(0)
partial_shape_encoder_input_layer = encoder_input_layer.partial_shape
partial_shape_encoder_input_layer[3] = -1
recognition_encoder_model_file.reshape(
    {encoder_input_layer: partial_shape_encoder_input_layer}
)

encoder_model_compiled = ie.compile_model(
    recognition_encoder_model_file,
    device_name="CPU",
)

In [None]:
def resize_image_to_encoder(image, model_shape):
    """Convert a colour crop to a grayscale encoder input of the model's height.

    The image is scaled uniformly so that its height matches the model's height
    dimension; the width follows from the same ratio, preserving aspect ratio.

    Args:
        image: 3-D array indexed as (height, width, channels), in BGR channel
            order (the order cv2.imread produces, which is where this notebook's
            crops come from).
        model_shape: Four-element partial shape; the third entry must provide
            ``get_length()`` and gives the target height.

    Returns:
        The grayscale, resized image with two leading axes of length 1 added.
    """
    _, _, H, _ = model_shape
    image_height, _, _ = image.shape
    scale_ratio = H.get_length() / image_height
    # BGR2GRAY, not RGB2GRAY: the RGB variant would apply the red and blue
    # luminance weights to the wrong channels of a cv2.imread-derived image.
    grayscale_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    resized_image = cv2.resize(
        grayscale_image, None, fx=scale_ratio, fy=scale_ratio, interpolation=cv2.INTER_AREA
    )
    return resized_image[None, None]
    

# Handles of the two encoder outputs that are read after inference.
output_key_encoder_hidden = encoder_model_compiled.output("decoder_hidden")
output_key_encoder_features = encoder_model_compiled.output("features")

# Prepare the UUID crop for the encoder's (partially dynamic) input shape.
encoder_input_model_shape = encoder_model_compiled.input(0).partial_shape
input_image_for_encoder = resize_image_to_encoder(
    UUID_cropped, encoder_input_model_shape
)

# One encoder pass; both outputs are picked out of the same result.
encoder_output = encoder_model_compiled([input_image_for_encoder])

encoder_output_hidden = encoder_output[output_key_encoder_hidden]
encoder_output_features = encoder_output[output_key_encoder_features]

In [None]:
# Handles of the decoder's three named inputs, used as keys when calling the model.
decoder_input_hidden = decoder_model_compiled.input("hidden")
decoder_input_features = decoder_model_compiled.input("features")
decoder_input_previous = decoder_model_compiled.input("decoder_input")

In [None]:
# First decoder step: symbol index 0 as the previous symbol, together with the
# encoder's features and its initial hidden state.
first_decoder_input_dict = {
    decoder_input_hidden: encoder_output_hidden,
    decoder_input_features: encoder_output_features,
    decoder_input_previous: [0],
}

first_decoder_output = decoder_model_compiled(
    first_decoder_input_dict
)

In [None]:
# Handles of the two decoder outputs: the per-symbol output values and the hidden state.
decoder_output_value_key = decoder_model_compiled.output("decoder_output")
decoder_output_hidden_key = decoder_model_compiled.output("decoder_hidden")

decoder_output_value = first_decoder_output[decoder_output_value_key]
decoder_output_hidden = first_decoder_output[decoder_output_hidden_key]

# Symbol table: four special entries (indices 0-3) followed by the digits and the
# lowercase letters.
alphabet = ["[START]", "[END]", " ", "[NOT SURE]"] + list("0123456789abcdefghijklmnopqrstuvwxyz")

answer = ""
letter_idx = np.argmax(decoder_output_value)
# Greedy decoding: take the highest-valued symbol, append it, then feed it back
# together with the latest hidden state. Stops on index 1 ("[END]") or 2 (" "),
# or once the collected text is at least 10 characters long.
while not (letter_idx in (1, 2) or len(answer) >= 10):
    answer = answer + alphabet[letter_idx]
    decoder_input_dict = {
        decoder_input_hidden: decoder_output_hidden,
        decoder_input_features: encoder_output_features,
        decoder_input_previous: [letter_idx],
    }
    decoder_output = decoder_model_compiled(decoder_input_dict)
    decoder_output_value = decoder_output[decoder_output_value_key]
    decoder_output_hidden = decoder_output[decoder_output_hidden_key]
    letter_idx = np.argmax(decoder_output_value)

print(answer)
