## Imports

In [None]:
#installing tensorflow version 2.2.0 in Colab
!pip install -U --pre tensorflow=="2.2.0"

In [None]:
import os
import pathlib

# Clone the tensorflow models repository
if "models" in pathlib.Path.cwd().parts:
  while "models" in pathlib.Path.cwd().parts:
    os.chdir('..')
elif not pathlib.Path('models').exists():
  !git clone --depth 1 https://github.com/tensorflow/models

In [None]:
%%bash
# Install TensorFlow's Object Detection API.
# (`%%bash` must be the very first line of the cell to be recognised as a cell
# magic, so the explanatory comment lives below it as a shell comment.)
cd models/research/
# Generate the Python modules for the protobuf definitions.
protoc object_detection/protos/*.proto --python_out=.
# Use the TF2 packaging script as this package's setup.py, then install it.
cp object_detection/packages/tf2/setup.py .
python -m pip install .

In [None]:
import matplotlib
import matplotlib.pyplot as plt

import zipfile

import os
import random
import io
import imageio
import glob
import scipy.misc
import numpy as np
from six import BytesIO
from PIL import Image, ImageDraw, ImageFont
from IPython.display import display, Javascript
from IPython.display import Image as IPyImage

import tensorflow as tf

from object_detection.utils import label_map_util
from object_detection.utils import config_util
from object_detection.utils import visualization_utils as viz_utils
from object_detection.utils import colab_utils
from object_detection.builders import model_builder

%matplotlib inline

# Utilities

In [None]:
def load_image_into_numpy_array(path):
  """Load an image from file into a numpy array.

  Puts image into numpy array to feed into tensorflow graph.

  Args:
    path: a file path.

  Returns:
    uint8 numpy array with shape (img_height, img_width, 3)
  """
  # Read the raw bytes through GFile; the context manager makes sure the
  # handle is closed again even if reading fails.
  with tf.io.gfile.GFile(path, 'rb') as image_file:
    img_data = image_file.read()
  image = Image.open(BytesIO(img_data))

  # Normalise every input mode to three RGB channels. For RGBA PNGs this drops
  # the alpha channel; palette and grayscale images, which cannot be sliced
  # with [:, :3] on their pixel data, are expanded to RGB as well.
  return np.array(image.convert('RGB'), dtype=np.uint8)

def plot_detections(image_np,
                    boxes,
                    classes,
                    scores,
                    category_index,
                    figsize=(12, 16),
                    image_name=None,
                    min_score_thresh=0.5):
  """Wrapper function to visualize detection boxes.

  Draws the boxes onto a copy of the image (the input array is not modified)
  and either saves the result to `image_name` or shows it with `plt.imshow`
  on the current matplotlib axes. Does nothing when `boxes` is None.

  Args:
    image_np: numpy array with shape (img_height, img_width, 3)
    boxes: a numpy array of shape [N, 4] in normalized coordinates, or None.
    classes: a numpy array of shape [N]. Note that class indices are 1-based,
      and match the keys in the label map.
    scores: a numpy array of shape [N] or None. Passed through unchanged to
      the visualization utility; per the original notes, scores=None makes it
      treat the boxes as groundtruth and plot them all as black with no
      classes or scores.
    category_index: a dict containing category dictionaries (each holding
      category index `id` and category name `name`) keyed by category indices.
    figsize: size for the figure. Accepted for compatibility with existing
      callers but currently not used by this function.
    image_name: a name for the image file. When given, the annotated image is
      written there instead of being displayed.
    min_score_thresh: boxes whose score is not above this value are not
      drawn. Defaults to 0.5, the value that used to be hard-coded.
  """
  # Nothing to draw when no boxes were supplied at all.
  if boxes is None:
    return

  # Annotate a copy so that the caller's image stays untouched.
  image_np_with_annotations = image_np.copy()
  # Object Detection API helper that draws boxes and labels in place.
  viz_utils.visualize_boxes_and_labels_on_image_array(
      image_np_with_annotations,
      boxes,
      classes,
      scores,
      category_index,
      use_normalized_coordinates=True,
      min_score_thresh=min_score_thresh)
  if image_name:
    plt.imsave(image_name, image_np_with_annotations)
  else:
    plt.imshow(image_np_with_annotations)


# Face masks data

We will start with a limited amount of data: just 25 images of people wearing face masks. The [coco](https://cocodataset.org/#explore) dataset contains 91 classes but no specific class for people's faces with masks (it does contain a `person` class, though), so this is a novel class.

In [None]:
# Mount Google Drive at /content/drive so that files stored there can be read
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the dataset archive on the mounted drive (a zip file, despite
# the variable name).
data_dir = "/content/drive/MyDrive/data.zip"

# Unpack the whole archive into /content/.
with zipfile.ZipFile(data_dir, mode='r') as zip_ref:
  zip_ref.extractall(path='/content/')


In [None]:
# Load the training images and show them in a grid.
train_image_dir = '/content/imagesFaceMask'

# Only the files maksssksksss0.png .. maksssksksss24.png (25 images) are used.
train_images_np = [
    load_image_into_numpy_array(
        os.path.join(train_image_dir, 'maksssksksss' + str(i) + '.png'))
    for i in range(0, 25)
]

# Plot styling: no grid, no ticks, no tick labels, wide default figure.
for rc_key, rc_value in {
    'axes.grid': False,
    'xtick.labelsize': False,
    'ytick.labelsize': False,
    'xtick.top': False,
    'xtick.bottom': False,
    'ytick.left': False,
    'ytick.right': False,
    'figure.figsize': [14, 7],
}.items():
  plt.rcParams[rc_key] = rc_value

# One subplot per image on a 6 x 5 grid.
for idx, train_image_np in enumerate(train_images_np):
  plt.subplot(6, 5, idx+1)
  plt.imshow(train_image_np)
plt.show()

# Annotate images with bounding boxes

In this cell I will annotate the face-mask images by drawing a box around each mask in every image.




In [None]:
# Container for the ground-truth boxes. The interactive annotation call below
# is disabled, so the list stays empty in this cell:
gt_boxes = []
#colab_utils.annotate(train_images_np, box_storage_pointer=gt_boxes)

In [None]:
# Show the current annotations (empty while the annotation call is disabled).
print(gt_boxes)

[]


In [None]:
# Second name for the same list object as gt_boxes (an alias, not a copy).
ground_truths = gt_boxes


In [None]:
# Stored ground-truth coordinates, so that the images do not have to be
# annotated again from scratch whenever the notebook is restarted.
# One array per training image (25 in total), one row of four normalized
# coordinates per box (presumably [ymin, xmin, ymax, xmax] -- TODO confirm).
# NOTE(review): the 15th entry is an all-zero placeholder box, and the first
# row of the last entry has identical 2nd and 4th coordinates (zero extent) --
# confirm that both are intended.
gt_boxes = [np.array([[0.28940104, 0.34960938, 0.40273438, 0.44726562]]), np.array([[0.30273438, 0.55      , 0.47273438, 0.645     ],
       [0.29606771, 0.7875    , 0.43440104, 0.87      ],
       [0.50940104, 0.0375    , 0.58773438, 0.105     ]]), np.array([[0.17773438, 0.175     , 0.23940104, 0.25      ],
       [0.18606771, 0.3725    , 0.25606771, 0.435     ],
       [0.15940104, 0.585     , 0.23773437, 0.665     ],
       [0.16106771, 0.8075    , 0.23106771, 0.9225    ]]), np.array([[0.21108333, 0.12      , 0.27275   , 0.1775    ],
       [0.21275   , 0.18      , 0.26275   , 0.2225    ],
       [0.23608333, 0.385     , 0.29275   , 0.4375    ],
       [0.24441667, 0.4625    , 0.28608333, 0.5375    ],
       [0.22775   , 0.5775    , 0.28441667, 0.6475    ],
       [0.23608333, 0.71      , 0.29441667, 0.7725    ],
       [0.26608333, 0.78      , 0.32275   , 0.8275    ]]), np.array([[0.54275   , 0.2192691 , 0.79941667, 0.59136213]]), np.array([[0.40608333, 0.595     , 0.51275   , 0.745     ]]), np.array([[0.33608333, 0.36333333, 0.63608333, 0.64666667]]), np.array([[0.50275   , 0.31      , 0.64108333, 0.4325    ],
       [0.40608333, 0.38      , 0.58108333, 0.5575    ],
       [0.46108333, 0.61      , 0.69275   , 0.8525    ]]), np.array([[0.70608333, 0.2192691 , 0.95275   , 0.49169435]]), np.array([[0.22941667, 0.54307116, 0.33775   , 0.72284644]]), np.array([[0.72608333, 0.30232558, 0.94941667, 0.64451827]]), np.array([[0.50108333, 0.0325    , 0.64108333, 0.1675    ],
       [0.64941667, 0.3575    , 0.78775   , 0.4925    ],
       [0.77275   , 0.5925    , 0.92608333, 0.725     ],
       [0.72941667, 0.76      , 0.84108333, 0.8675    ],
       [0.39441667, 0.6775    , 0.47275   , 0.7525    ],
       [0.42608333, 0.6175    , 0.51275   , 0.6875    ],
       [0.30608333, 0.5325    , 0.35775   , 0.58      ],
       [0.35941667, 0.485     , 0.41775   , 0.535     ],
       [0.41441667, 0.445     , 0.46608333, 0.5075    ],
       [0.11608333, 0.7175    , 0.14608333, 0.7575    ],
       [0.19941667, 0.57      , 0.24608333, 0.6175    ],
       [0.18941667, 0.51      , 0.23275   , 0.5525    ]]), np.array([[0.29941667, 0.405     , 0.36108333, 0.4575    ],
       [0.27108333, 0.29      , 0.32108333, 0.3425    ],
       [0.27441667, 0.1775    , 0.32275   , 0.2275    ],
       [0.23775   , 0.5775    , 0.28608333, 0.6175    ],
       [0.22775   , 0.6775    , 0.27441667, 0.7275    ],
       [0.26941667, 0.8275    , 0.32275   , 0.8875    ],
       [0.21441667, 0.295     , 0.24275   , 0.3275    ]]), np.array([[0.29775   , 0.3975    , 0.39108333, 0.49      ],
       [0.28275   , 0.725     , 0.34941667, 0.78      ],
       [0.25608333, 0.875     , 0.31275   , 0.9175    ]]), np.array([[0   , 0    , 0, 0      ]]), np.array([[0.84108333, 0.40932642, 0.91775   , 0.60621762]]), np.array([[0.63108333, 0.11627907, 0.83275   , 0.48172757]]), np.array([[0.50275   , 0.22259136, 0.72941667, 0.57475083]]), np.array([[0.38608333, 0.32      , 0.54941667, 0.4825    ],
       [0.65775   , 0.7525    , 0.79608333, 0.885     ]]), np.array([[0.54608333, 0.255     , 0.73941667, 0.4375    ],
       [0.41441667, 0.7725    , 0.57441667, 0.9275    ]]), np.array([[0.18941667, 0.33707865, 0.30941667, 0.57677903]]), np.array([[0.68108333, 0.095     , 0.89441667, 0.3       ],
       [0.40775   , 0.465     , 0.61108333, 0.6075    ],
       [0.24941667, 0.775     , 0.42775   , 0.885     ],
       [0.31941667, 0.1275    , 0.47941667, 0.27      ]]), np.array([[0.45941667, 0.445     , 0.68775   , 0.6325    ]]), np.array([[0.28775   , 0.47      , 0.43941667, 0.5975    ]]), np.array([[0.28941667, 0.59550562, 0.29108333, 0.59550562],
       [0.22441667, 0.35955056, 0.29275   , 0.61048689]])]

# Prepare data for training

Below I add the class annotations. There is only one class to detect here: the mask-presence class. Everything is converted to the format that the training
loop below expects (e.g., everything converted to tensors, classes converted to one-hot representations, etc.).

In [None]:

# Non-background classes are numbered from 1 by convention. With a single
# class to predict, that class gets id 1.
mask = 1
num_classes = 1

# Label map: class id -> {'id', 'name'} for every class that is present.
category_index = {mask: {'id': mask, 'name': 'mask'}}

# `label_id_offset` is subtracted from the class ids before one-hot encoding,
# so that the model receives labels in which the first non-background class
# sits at index zero.
label_id_offset = 1

# Per-image tensors consumed by the training loop.
train_image_tensors = []
gt_classes_one_hot_tensors = []
gt_box_tensors = []

for train_image_np, gt_box_np in zip(train_images_np, gt_boxes):
  # Skip entries for which either the image or its boxes are missing.
  if train_image_np is None or gt_box_np is None:
    continue

  # Image: float32 tensor with a leading batch axis of size one.
  image_tensor = tf.convert_to_tensor(train_image_np, dtype=tf.float32)
  train_image_tensors.append(tf.expand_dims(image_tensor, axis=0))

  # Boxes: float32 tensor, one row per box.
  gt_box_tensors.append(tf.convert_to_tensor(gt_box_np, dtype=tf.float32))

  # Classes: every box belongs to class id 1; shift by the offset and
  # one-hot encode.
  num_boxes = gt_box_np.shape[0]
  zero_indexed_groundtruth_classes = tf.convert_to_tensor(
      np.ones(shape=[num_boxes], dtype=np.int32) - label_id_offset)
  gt_classes_one_hot_tensors.append(
      tf.one_hot(zero_indexed_groundtruth_classes, num_classes))

print('Done prepping data.')


Done prepping data.


# Let's visualize the masks from the annotated ground truth classes


In [None]:
# Draw the annotated ground-truth boxes on top of the training images.
plt.figure(figsize=(30, 15))

# Show at most the first 25 annotated images, but never more than are
# actually available, so that a shorter image or annotation list does not
# raise an IndexError.
num_to_show = min(25, len(train_images_np), len(gt_boxes))
for idx in range(num_to_show):
  plt.subplot(6, 5, idx+1)
  if gt_boxes[idx] is not None:
    num_boxes = gt_boxes[idx].shape[0]
    # Give every box of this image a score of 100% so that all of them pass
    # the plotting threshold. Sized per image instead of a fixed length of 25,
    # so images with more boxes than that are handled as well.
    dummy_scores = np.ones(shape=[num_boxes], dtype=np.float32)
    plot_detections(
        train_images_np[idx],
        gt_boxes[idx],
        np.ones(shape=[num_boxes], dtype=np.int32),
        dummy_scores, category_index)
plt.show()

# Create model and restore weights for all but last layer

In this cell I build a single stage detection architecture (RetinaNet) and restore all but the classification layer at the top (which will be automatically randomly initialized).

For simplicity, a number of things in this colab have been preset for the specific RetinaNet architecture at hand (including assuming that the image size will always be 640x640).

In [None]:
# Download the pre-trained ssd_resnet50_v1_fpn_640x640_coco17_tpu-8 archive,
# unpack it, and move its `checkpoint` directory into
# models/research/object_detection/test_data/.

!wget http://download.tensorflow.org/models/object_detection/tf2/20200711/ssd_resnet50_v1_fpn_640x640_coco17_tpu-8.tar.gz
!tar -xf ssd_resnet50_v1_fpn_640x640_coco17_tpu-8.tar.gz
!mv ssd_resnet50_v1_fpn_640x640_coco17_tpu-8/checkpoint models/research/object_detection/test_data/

In [None]:
# Clear the Keras session so that the cell can be re-run from a clean state.
tf.keras.backend.clear_session()

print('Building model and restoring weights for fine-tuning...', flush=True)
num_classes = 1
# Pipeline configuration file describing the model.
pipeline_config = 'models/research/object_detection/configs/tf2/ssd_resnet50_v1_fpn_640x640_coco17_tpu-8.config'
# Checkpoint prefix of the pre-trained weights.
checkpoint_path = 'models/research/object_detection/test_data/checkpoint/ckpt-0'

# Load pipeline config and build a detection model.
#
# The COCO configuration predicts 90 class slots by default; the
# `num_classes` field is overridden here to be just one. According to the
# original notes the COCO model was trained with 118K images and brings many
# learned features that are very useful for few-shot detection training.

# Read the pipeline configuration.
configs = config_util.get_configs_from_pipeline_file(pipeline_config)
model_config = configs['model']
# Change the number of classes.
model_config.ssd.num_classes = num_classes
# Freeze batch normalization within the model.
model_config.ssd.freeze_batchnorm = True
# Build the model with the settings above.
detection_model = model_builder.build(
      model_config=model_config, is_training=True)

# Set up object-based checkpoint restore --- RetinaNet has two prediction
# `heads` --- one for classification, the other for box regression. Only the
# attributes listed in the checkpoint objects below are matched against the
# stored checkpoint. The classification head is deliberately NOT listed, so
# it keeps its fresh initialization, because there is an entirely new class
# to predict.

fake_box_predictor = tf.compat.v2.train.Checkpoint(
    # Tower layers in front of the heads (presumably shared by both heads --
    # TODO confirm): restored.
    _base_tower_layers_for_heads=detection_model._box_predictor._base_tower_layers_for_heads,
    # Box regression head: restored.
    _box_prediction_head=detection_model._box_predictor._box_prediction_head,
    )

fake_model = tf.compat.v2.train.Checkpoint(
          # Feature extractor of the model: restored.
          _feature_extractor=detection_model._feature_extractor,
          # The partial box predictor defined above.
          _box_predictor=fake_box_predictor)
# Checkpoint object that mirrors the structure stored under `model`.
ckpt = tf.compat.v2.train.Checkpoint(model=fake_model)
# `expect_partial()` is used because only part of the stored checkpoint is
# restored here.
ckpt.restore(checkpoint_path).expect_partial()

# Run the model through a dummy image so that its variables are created.
image, shapes = detection_model.preprocess(tf.zeros([1, 640, 640, 3]))
prediction_dict = detection_model.predict(image, shapes)
# `postprocess` turns the raw prediction dictionary into the final detection
# output; the result itself is not needed here.
_ = detection_model.postprocess(prediction_dict, shapes)
print('Weights restored!')

Building model and restoring weights for fine-tuning...
Weights restored!


In [None]:

# Print index, name, shape and dtype of every trainable variable of
# detection_model.
for i,v in enumerate(detection_model.trainable_variables):
    print(f"i: {i} \t name: {v.name} \t shape:{v.shape} \t dtype={v.dtype}")
    

As we can see from the listing above, our pretrained detection model has 268 trainable variables in total (belonging to its convolution, batch-normalization and fully connected layers). It is a state-of-the-art detection model that performs well at detecting many kinds of objects.

## Custom training loop



In [None]:

# Set the model's `trainable` flag for the fine-tuning phase.
detection_model.trainable = True

# Tunable hyperparameters. The training set holds only 25 images, so a much
# larger batch size would not make sense (more examples would fit in memory,
# though). Training for more than 100 batches is possible as well.
batch_size = 19
learning_rate = 0.01
num_batches = 100

# Only the two top layers -- box regression head and classification head --
# are fine-tuned; every other variable is left as it is.
trainable_variables = detection_model.trainable_variables
prefixes_to_train = [
  'WeightSharedConvolutionalBoxPredictor/WeightSharedConvolutionalBoxHead',
  'WeightSharedConvolutionalBoxPredictor/WeightSharedConvolutionalClassHead']

# Keep the variables whose name starts with one of the prefixes above.
to_fine_tune = [
    var for var in trainable_variables
    if var.name.startswith(tuple(prefixes_to_train))
]

# Set up forward + backward pass for a single train step.
def get_model_train_step_function(model, optimizer, vars_to_fine_tune):
  """Build the function that runs one training step.

  The returned function is wrapped in `tf.function`, i.e. it is converted to
  a TensorFlow graph to increase the running speed.

  Args:
    model: the detection model to train. All preprocessing, prediction and
      loss computation go through this object.
    optimizer: the optimizer whose `apply_gradients` performs the update.
    vars_to_fine_tune: the variables that are updated; gradients are computed
      with respect to these only.

  Returns:
    A function `train_step_fn(image_tensors, groundtruth_boxes_list,
    groundtruth_classes_list)` returning the total loss of the batch.
  """

  @tf.function
  def train_step_fn(image_tensors,
                    groundtruth_boxes_list,
                    groundtruth_classes_list):
    """A single training iteration.

    Args:
      image_tensors: A list of [1, height, width, 3] Tensor of type tf.float32.
        Note that the height and width can vary across images, as they are
        reshaped within this function to be 640x640.
      groundtruth_boxes_list: A list of Tensors of shape [N_i, 4] with type
        tf.float32 representing groundtruth boxes for each image in the batch.
      groundtruth_classes_list: A list of Tensors of shape [N_i, num_classes]
        with type tf.float32 representing groundtruth classes for each image
        in the batch.

    Returns:
      A scalar tensor representing the total loss for the input batch.
    """
    # One [height, width, channels] row per image. The row count follows the
    # batch that was actually passed in rather than a global batch size, so
    # the step also works for batches of a different size.
    shapes = tf.constant(
        len(image_tensors) * [[640, 640, 3]], dtype=tf.int32)
    # Hand the ground truth of this batch to the model for the loss.
    model.provide_groundtruth(
        groundtruth_boxes_list=groundtruth_boxes_list,
        groundtruth_classes_list=groundtruth_classes_list)

    # Record only the forward pass and the loss on the tape.
    with tf.GradientTape() as tape:
      # Preprocess every image with the model that is being trained (the
      # `model` argument, not a global) and stack them into one batch.
      preprocessed_images = tf.concat(
          [model.preprocess(image_tensor)[0]
           for image_tensor in image_tensors], axis=0)

      # Raw predictions of the model for the batch.
      prediction_dict = model.predict(preprocessed_images, shapes)

      # The model reports its losses in a dictionary.
      losses_dict = model.loss(prediction_dict, shapes)

      # Total loss = box localization loss + classification loss.
      total_loss = (losses_dict['Loss/localization_loss'] +
                    losses_dict['Loss/classification_loss'])

    # Gradients and the weight update are computed after leaving the tape
    # context, so that they are not recorded themselves.
    gradients = tape.gradient(total_loss, vars_to_fine_tune)
    optimizer.apply_gradients(zip(gradients, vars_to_fine_tune))

    return total_loss

  return train_step_fn

# SGD with momentum as the gradient optimizer.
optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)
# Build the (graph-compiled) training step for the selected variables.
train_step_fn = get_model_train_step_function(
    detection_model, optimizer, to_fine_tune)

print('Start fine-tuning!', flush=True)

# Keys index the prepared tensor lists, so their range is taken from those
# lists (not from the raw image list, which can be longer if an entry was
# skipped during data preparation). Built once and reshuffled per batch.
all_keys = list(range(len(train_image_tensors)))

for idx in range(num_batches):
  # Grab keys for a random subset of examples.
  random.shuffle(all_keys)
  example_keys = all_keys[:batch_size]

  # Note that we do not do data augmentation in this demo.  If you want a
  # a fun exercise, we recommend experimenting with random horizontal flipping
  # and random cropping

  # Ground-truth boxes, ground-truth classes and images of the chosen examples.
  gt_boxes_list = [gt_box_tensors[key] for key in example_keys]
  gt_classes_list = [gt_classes_one_hot_tensors[key] for key in example_keys]
  image_tensors = [train_image_tensors[key] for key in example_keys]

  # Training step (forward pass + backwards pass) as defined above
  total_loss = train_step_fn(image_tensors, gt_boxes_list, gt_classes_list)

  # Report the loss every tenth batch.
  if idx % 10 == 0:
    print('batch ' + str(idx) + ' of ' + str(num_batches)
    + ', loss=' +  str(total_loss.numpy()), flush=True)

print('Done fine-tuning!')

Start fine-tuning!
batch 0 of 100, loss=1.7639313
batch 10 of 100, loss=1.0625103
batch 20 of 100, loss=0.80171895
batch 30 of 100, loss=0.67687976
batch 40 of 100, loss=0.5348338
batch 50 of 100, loss=0.52442133
batch 60 of 100, loss=0.5032982
batch 70 of 100, loss=0.3376547
batch 80 of 100, loss=0.3898978
batch 90 of 100, loss=0.3391522
Done fine-tuning!


# Let's Test some images on the trained model

In [None]:
test_image_dir = '/content/imagesFaceMask'

# Test on the files maksssksksss50.png .. maksssksksss74.png. Every image is
# loaded as a numpy array and gets a leading batch axis of size one.
test_images_np = [
    np.expand_dims(
        load_image_into_numpy_array(
            os.path.join(test_image_dir, 'maksssksksss' + str(i) + '.png')),
        axis=0)
    for i in range(50, 75)
]

# Runs eagerly as written; the @tf.function decorator (graph mode, for speed)
# is currently disabled.
#@tf.function
def detect(input_tensor):
  """Run detection on an input image.

  Args:
    input_tensor: A [1, height, width, 3] Tensor of type tf.float32.
      The height and width can be anything since the image will be
      immediately resized according to the needs of the model within this
      function.

  Returns:
    The dictionary produced by the model's `postprocess` step; callers read
    `detection_boxes`, `detection_classes` and `detection_scores` from it.
  """
  # Bring the image into the form the model expects.
  model_input, true_shapes = detection_model.preprocess(input_tensor)
  # Raw predictions, then the final detections.
  raw_predictions = detection_model.predict(model_input, true_shapes)
  return detection_model.postprocess(raw_predictions, true_shapes)



In [None]:
# Shift added to the detected class indices so that they match the 1-based
# ids of the label map (accounts for the background class).
label_id_offset = 1

# Number of images tested.
n = len(test_images_np)

for i in range(n):
  batched_image_np = test_images_np[i]

  # The model expects a float32 tensor.
  detections = detect(
      tf.convert_to_tensor(batched_image_np, dtype=tf.float32))

  # Take the results for the single image in the batch.
  boxes = detections['detection_boxes'][0].numpy()
  classes = (detections['detection_classes'][0].numpy().astype(np.uint32)
             + label_id_offset)
  scores = detections['detection_scores'][0].numpy()

  # Draw the detections on the image and show the figure.
  plot_detections(
      batched_image_np[0], boxes, classes, scores,
      category_index, figsize=(30, 16))
  plt.show()

As we can see, the model has learned useful features and detects the presence of masks in many cases (this also depends on the detection threshold), but sometimes it fails to detect masks. This is the result of two factors:
1. The image quality and size differ from image to image, as do background, lighting, hue and so on, which makes learning features much more difficult. In other words, the images do not come from a standardized distribution, and in particular they differ from the images the pretrained COCO model was trained on.

2. We need to increase the number of training images fed into the model during the training phase.

The model has good predictions for those cases where similar structural images are used, which means the feature detection with a pretrained model is doing a good job but needs to be extended to more images or the images dataset needs to be harmonized and cleaned to a certain standard.

## Automation of input images from XML annotations

Initially I tried to automate the process of reading the annotations directly from the XML files of the images through the following script, but for some reason the coordinate annotations from the XML files were not compatible with the Colab processing. It might be that the images are PNGs with 4 channels instead of the 3 that .jpg images have and the functions here are optimized for the JPG format, or something could be wrong in my calculation of the coordinates. The more images we can feed into the fine-tuning, the more accurate our detection could be. If we fed 600 images into fine-tuning, the accuracy of detection would be state of the art, but the dataset of images also needs to be standardised.


In [None]:
import xml.etree.ElementTree as ET

def read_content(xml_file: str):
    """Read the `with_mask` boxes from one Pascal-VOC style annotation file.

    Objects of any other class (e.g. `without_mask`) are ignored.

    Args:
        xml_file: path of the XML annotation file.

    Returns:
        float numpy array of shape (N, 4) with one row
        [ymin, xmin, ymax, xmax] per `with_mask` object, normalized by the
        image height (y values) and width (x values) given in the file's
        `<size>` element. N is 0 -- shape (0, 4) -- when the file contains no
        `with_mask` object.
    """
    root = ET.parse(xml_file).getroot()

    # Look the image size up by tag name instead of by child position: in the
    # VOC layout `<size>` lists width first and height second, so positional
    # access mixes the two up.
    size = root.find("size")
    width = int(size.find("width").text)
    height = int(size.find("height").text)

    list_with_all_boxes = []

    for member in root.findall('object'):
        # Only objects labelled `with_mask` contribute a box.
        if member.find("name").text != 'with_mask':
            continue

        ymin = int(member.find("bndbox/ymin").text)
        xmin = int(member.find("bndbox/xmin").text)
        ymax = int(member.find("bndbox/ymax").text)
        xmax = int(member.find("bndbox/xmax").text)

        # Normalized [ymin, xmin, ymax, xmax]: y values are divided by the
        # image height, x values by the image width.
        list_with_all_boxes.append(
            [ymin / height, xmin / width, ymax / height, xmax / width])

    # reshape keeps the (N, 4) layout even when no box was found.
    return np.array(list_with_all_boxes, dtype=np.float64).reshape(-1, 4)

# Try the parser on the annotation file of the first image.
read_content('/content/annotationsFaceMask/maksssksksss0.xml')

array([[0.50546448, 0.1953125 , 0.61748634, 0.28125   ]])

In [None]:
# Build the ground-truth boxes of the first 25 training images from their XML
# annotation files (maksssksksss0.xml .. maksssksksss24.xml).
# The result is kept as a plain list with one array per image: the number of
# boxes differs from image to image, so the arrays cannot be stacked into one
# rectangular numpy array.
gt_boxes = [
    read_content(
        os.path.join('/content/annotationsFaceMask',
                     'maksssksksss' + str(i) + '.xml'))
    for i in range(0, 25)
]


In [None]:
# Draw the ground-truth boxes on top of the training images.
plt.figure(figsize=(30, 15))

# Show at most the first 25 annotated images, but never more than are
# actually available, so that a shorter image or annotation list does not
# raise an IndexError.
num_to_show = min(25, len(train_images_np), len(gt_boxes))
for idx in range(num_to_show):
  plt.subplot(6, 5, idx+1)
  if gt_boxes[idx] is not None:
    num_boxes = gt_boxes[idx].shape[0]
    # Give every box of this image a score of 100% so that all of them pass
    # the plotting threshold. Sized per image instead of a fixed length of 25,
    # so images with more boxes than that are handled as well.
    dummy_scores = np.ones(shape=[num_boxes], dtype=np.float32)
    plot_detections(
        train_images_np[idx],
        gt_boxes[idx],
        np.ones(shape=[num_boxes], dtype=np.int32),
        dummy_scores, category_index)
plt.show()