Image input requirements

include_top = whether to include the fully connected layer at the top of the network

weights = None (random initialization), 'imagenet' (pre-training on ImageNet), or the path to the weights file to be loaded

input_tensor = optional Keras tensor (i.e. the output of layers.Input()) to use as image input for the model

input_shape = optional shape tuple, to be specified only if include_top is False (otherwise the input shape has to be (224, 224, 3) with channels_last or (3, 224, 224) with channels_first). It must have exactly 3 channels, and width and height should be no smaller than 32

pooling = mode for feature extracting when include_top is False:
    None -> output of the model will be the 4D tensor output of the last convolutional block

    avg -> global average pooling will be applied to the output of the last convolutional block, so the output of the model will be a 2D tensor

    max -> global max pooling will be applied

classes = number of classes to classify images into; only to be specified if include_top is True and no weights argument is specified

classifier_activation = activation function to use on the top layer; ignored unless include_top = True. Set classifier_activation = None to return the logits of the top layer

In [1]:
import tensorflow as tf


#Headless ResNet50 with ImageNet weights, used as an image feature extractor.
#classes and classifier_activation are intentionally not passed: they only
#configure the classification head, which include_top=False removes.
#input_tensor is left at its default (None) so the model creates its own input.
model = tf.keras.applications.ResNet50(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
    #Using pooling='avg' to end up with a fixed size vector (batch, 2048) to later combine with the text encoder
    pooling='avg',
)


  if not hasattr(np, "object"):


Load and preprocess one image

In [2]:
import tensorflow as tf

#Single sample image used to check the preprocessing pipeline end to end
image_path = '../data/images/flickr8k_images/images/667626_18933d713e.jpg'

#Read the file as an RGB PIL image resized to 224x224 (aspect ratio not preserved)
loaded_img = tf.keras.utils.load_img(
    image_path,
    color_mode='rgb',
    target_size=(224, 224),
    interpolation='nearest',  #the default, spelled out explicitly
    keep_aspect_ratio=False,
)

#PIL image -> array
img_array = tf.keras.utils.img_to_array(loaded_img)

#Prepend the batch axis the model expects
added_batch = tf.expand_dims(img_array, axis=0)

#Apply the ResNet input preprocessing to the batched array
pre_processed_array = tf.keras.applications.resnet.preprocess_input(
    added_batch, data_format='channels_last'
)

#Shape on the first line, runtime type on the second
print(pre_processed_array.shape, type(pre_processed_array), sep='\n')


(1, 224, 224, 3)
<class 'tensorflow.python.framework.ops.EagerTensor'>


Inspecting the output for an image after passing it through ResNet

In [3]:

#Freezing the model so its weights are not updated
model.trainable = False

#Passing the preprocessed array through the model.
#training=False is spelled out so the BatchNormalization layers run in
#inference mode regardless of the context this call is made from.
features = model(pre_processed_array, training=False)

#One image in, one pooled 2048-dimensional feature vector out
assert tuple(features.shape) == (1, 2048), features.shape

print(features.shape)
print(type(features))


(1, 2048)
<class 'tensorflow.python.framework.ops.EagerTensor'>


Batching multiple images and checking the shape

In [4]:


#Directory holding the Flickr8k images
IMAGE_DIR = '../data/images/flickr8k_images/images'

#Five arbitrary, hard-coded images from the dataset
image_paths = [
    f'{IMAGE_DIR}/12830823_87d2654e31.jpg',
    f'{IMAGE_DIR}/17273391_55cfc7d3d4.jpg',
    f'{IMAGE_DIR}/27782020_4dab210360.jpg',
    f'{IMAGE_DIR}/35506150_cbdb630f4f.jpg',
    f'{IMAGE_DIR}/41999070_838089137e.jpg',
]


def load_preprocessed_image(path, target_size=(224, 224)):
    """Load one image file and return it as a ResNet-preprocessed array.

    Args:
        path: Path of the image file to read.
        target_size: (height, width) the image is resized to on load.

    Returns:
        The preprocessed image array of shape (height, width, 3),
        without a batch dimension.
    """
    pil_image = tf.keras.utils.load_img(
        path,
        color_mode='rgb',
        target_size=target_size,
        interpolation='nearest',  #the default, spelled out explicitly
        keep_aspect_ratio=False,
    )
    image_array = tf.keras.utils.img_to_array(pil_image)
    return tf.keras.applications.resnet.preprocess_input(
        image_array, data_format='channels_last'
    )


images = [load_preprocessed_image(path) for path in image_paths]

#Stack the per-image arrays along a new leading batch axis
batch = tf.stack(images, axis=0, name='stack')

#Every image must have been resized to the same 224x224 RGB shape
assert tuple(batch.shape) == (len(image_paths), 224, 224, 3), batch.shape

print(batch.shape)



(5, 224, 224, 3)
