## Overview
The purpose of this notebook is to transform images of book covers into embeddings. We use an off-the-shelf model from TensorFlow Hub (EfficientNetV2-B0) to convert each image into a vector of dimension 1280.

## Config

In [1]:
import tensorflow as tf
import numpy as np
import tensorflow_hub as hub 
from tqdm import tqdm
import os
import re
import cloudpickle

In [2]:
# TF Hub handle of the pretrained feature-vector model used to embed the covers.
model_handle = "https://tfhub.dev/google/imagenet/efficientnet_v2_imagenet1k_b0/feature_vector/2"
# TODO: replace both placeholders with real locations before running the notebook.
data_path = "YOUR_DATA_PATH"  # root of the directory tree that holds the cover .jpg files
output_path = "YOUR_OUTPUT_PATH"  # existing directory that receives embedding_dictionary.pkl
pixels = 224  # side length, in pixels, of the square model input
do_fine_tuning = False  # forwarded as `trainable` to the hub layer
IMAGE_SIZE = (pixels, pixels)

## Model loading

In [4]:
print("Building model with", model_handle)
model = tf.keras.Sequential()
# Declare the input shape up front so that the model can be properly
# loaded by the TFLiteConverter.
model.add(tf.keras.layers.InputLayer(input_shape=(*IMAGE_SIZE, 3)))
# Pretrained layer loaded from `model_handle`; its weights are trainable
# only when `do_fine_tuning` is set.
model.add(hub.KerasLayer(model_handle, trainable=do_fine_tuning))
# Leading None leaves the batch dimension unspecified.
model.build((None, *IMAGE_SIZE, 3))
model.summary()

Building model with https://tfhub.dev/google/imagenet/efficientnet_v2_imagenet1k_b0/feature_vector/2
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 1280)              5919312   
Total params: 5,919,312
Trainable params: 0
Non-trainable params: 5,919,312
_________________________________________________________________


## Embedding dictionary generation

In [5]:
def build_dataset(data_dir):
  """Create an image dataset from the directory tree rooted at `data_dir`.

  Images are resized to `IMAGE_SIZE` and served in batches of one.

  Args:
    data_dir: Directory handed to `image_dataset_from_directory`.

  Returns:
    Whatever `tf.keras.preprocessing.image_dataset_from_directory` returns
    for these settings.
  """
  load_from_directory = tf.keras.preprocessing.image_dataset_from_directory
  options = dict(
      # A fixed seed keeps any shuffling stable across runs. No
      # validation_split is passed here, so the whole directory is used.
      seed=123,
      image_size=IMAGE_SIZE,
      batch_size=1,
  )
  return load_from_directory(data_dir, **options)

In [6]:
# Collect the full path of every file found anywhere under data_path.
listOfFiles = [
    os.path.join(dirpath, filename)
    for dirpath, _dirnames, filenames in os.walk(data_path)
    for filename in filenames
]
# Keep only JPEG files; the extension check is case-insensitive.
imgpaths = [path for path in listOfFiles if path.lower().endswith('.jpg')]
len(imgpaths)

1169

In [7]:
# Maps the lower-cased image file name (without extension) to its embedding,
# stored as a flat list of floats.
EMBEDDING_DICT = {}
for imgpath in tqdm(imgpaths):
    # Strip only the trailing extension. The previous re.sub('.jpg', '', ...)
    # used an unescaped, unanchored pattern and could also delete a
    # "<char>jpg" sequence from the middle of a file name.
    isbn = os.path.splitext(os.path.basename(imgpath).lower())[0]
    if isbn in EMBEDDING_DICT:
        # Several files can share the same key; only the first one is embedded.
        continue
    # `grayscale` is deprecated in favour of `color_mode` and is therefore omitted.
    input_arr = tf.keras.preprocessing.image.load_img(
        imgpath, color_mode='rgb', target_size=IMAGE_SIZE,
        interpolation='nearest'
    )
    input_arr = tf.keras.preprocessing.image.img_to_array(input_arr)
    # The TF Hub EfficientNetV2 feature-vector models expect colour values in
    # [0, 1], while img_to_array yields values in [0, 255].
    # NOTE(review): any other code that embeds images for comparison against
    # this dictionary must apply the same scaling.
    input_arr = input_arr / 255.0
    input_arr = np.array([input_arr])  # Convert single image to a batch.
    # verbose=0 keeps per-call progress output from flooding the tqdm bar.
    features = model.predict(input_arr, verbose=0)
    EMBEDDING_DICT[isbn] = features.reshape(-1).tolist()

# The context manager guarantees the pickle file is flushed and closed.
with open(f'{output_path}/embedding_dictionary.pkl', 'wb') as output_file:
    cloudpickle.dump(EMBEDDING_DICT, output_file)

100%|██████████| 1169/1169 [00:51<00:00, 22.88it/s]
