In [1]:
import os
import string
import numpy as np
import pandas as pd
import random
import pickle
from keras.applications import InceptionResNetV2, EfficientNetB7
from keras.applications import inception_resnet_v2, efficientnet_v2
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from concurrent.futures import ThreadPoolExecutor
from notebooks.utils import logger, timestamp, file_with_timestamp
from notebooks.config import ModelType, FEATURES_DIRECTORY, IMAGES_DIRECTORY, VALIDATED_DATASET_DIRECTORY, SPLIT_DIRECTORY, IMAGE_SHAPE, START_TOKEN, END_TOKEN, PADDING_TOKEN, TRAIN_SIZE, TEST_SIZE, VOCABULARY_SIZE
from notebooks.utils import latest_file

2024-05-09 15:50:16.281530: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-09 15:50:16.352093: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-09 15:50:16.352151: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-09 15:50:16.355104: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-09 15:50:16.370585: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Notebook configuration and discovery of the image ids to work on.
# NOTE(review): this rebinds the imported `logger` factory to the object it returns, so a second
# run of this cell in the same kernel would call that object instead of the factory.
logger = logger()
THREAD_NUMBER = 8
model_type = ModelType.INCEPTION_RES_NET_V2
PATH_TO_FEATURES = os.path.join(FEATURES_DIRECTORY, model_type.name)

# exist_ok=True makes directory creation idempotent without a separate existence check.
os.makedirs(PATH_TO_FEATURES, exist_ok=True)
os.makedirs(SPLIT_DIRECTORY, exist_ok=True)

# Ids are taken from the file names by dropping the 4-character extension ('.npy' / '.jpg').
features = {int(name[:-4]) for name in os.listdir(PATH_TO_FEATURES)}
# The former `if i not in features` clause compared file-name strings against the integer ids in
# `features`, so it never excluded anything. It is dropped instead of "repaired": `images` is also
# used to filter the caption dataset (`dataset.index.isin(images)`), so it has to stay the complete
# list of image ids, and get_feature_vector already skips ids whose .npy file exists.
images = [int(name[:-4]) for name in os.listdir(IMAGES_DIRECTORY)]

In [4]:
def preprocess_input(image):
    """Apply the input preprocessing that belongs to the selected backbone.

    Args:
        image: Image data to preprocess; it is forwarded unchanged to the Keras helper.

    Returns:
        Whatever the backbone-specific ``preprocess_input`` helper returns.

    Raises:
        ValueError: If ``model_type`` is not one of the supported backbones.
    """
    if model_type == ModelType.INCEPTION_RES_NET_V2:
        return inception_resnet_v2.preprocess_input(image)
    if model_type == ModelType.EFFICIENT_NET_B7:
        # NOTE(review): EfficientNetB7 is paired with the efficientnet_v2 helper here; Keras
        # documents both EfficientNet helpers as pass-throughs, so this is presumably harmless —
        # confirm against the installed Keras version.
        return efficientnet_v2.preprocess_input(image)
    # Previously this fell through and returned None, which only failed later inside the
    # image pipeline with an unrelated-looking error.
    raise ValueError("Unsupported model type: {}".format(model_type))

def model():
    """Build the headless, ImageNet-pretrained backbone selected by ``model_type``.

    Returns:
        The Keras application model created with ``input_shape=IMAGE_SHAPE``,
        ``include_top=False`` and ``weights='imagenet'``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported backbones.
    """
    if model_type == ModelType.INCEPTION_RES_NET_V2:
        return InceptionResNetV2(input_shape=IMAGE_SHAPE, include_top=False, weights='imagenet')
    if model_type == ModelType.EFFICIENT_NET_B7:
        return EfficientNetB7(input_shape=IMAGE_SHAPE, include_top=False, weights='imagenet')
    # Previously this fell through and returned None; the result is later bound to `model`,
    # so the failure only surfaced inside the worker threads that call `model.predict`.
    raise ValueError("Unsupported model type: {}".format(model_type))

def preprocess_image(image):
    """Load one image from IMAGES_DIRECTORY and turn it into a single-item model batch.

    Args:
        image: Image identifier; the file ``<image>.jpg`` is read from IMAGES_DIRECTORY.

    Returns:
        The decoded, resized and backbone-preprocessed image with a leading batch axis of size 1.
    """
    img = tf.io.read_file(os.path.join(IMAGES_DIRECTORY, str(image) + '.jpg'))
    img = tf.image.decode_image(img, channels=3)
    # Resize to the spatial size the backbone is built with (IMAGE_SHAPE is passed to it as
    # input_shape) instead of the former hard-coded (224, 244), which looks like a typo and could
    # drift from the configured shape. Assumes IMAGE_SHAPE is (height, width, channels).
    img = tf.image.resize(img, IMAGE_SHAPE[:2])
    img = preprocess_input(img)
    return tf.expand_dims(img, axis=0)

# Build the backbone once so every get_feature_vector call shares it. This rebinds the name
# `model` from the factory function defined above to the object it returns.
model = model()

def get_feature_vector(index, image):
    """Extract and store the backbone features of one image unless they already exist.

    Args:
        index: Position of the image in the work list; only used for progress logging.
        image: Image identifier; features are written to ``<PATH_TO_FEATURES>/<image>.npy``.

    Returns:
        None. The result is the saved ``.npy`` file.
    """
    target = os.path.join(PATH_TO_FEATURES, str(image) + '.npy')
    if not os.path.exists(target):
        activations = model.predict(preprocess_image(image), verbose=False)
        # Merge all axes between the batch axis and the last axis into one, then keep the
        # single batch item.
        batch_size, channels = activations.shape[0], activations.shape[3]
        flattened = tf.reshape(activations, (batch_size, -1, channels))
        np.save(target, flattened[0])
        # Progress is reported on every 1000th index, but never for index 0.
        if index != 0 and index % 1000 == 0:
            logger.info("{} images have been processed.".format(index))

# Extract features for all images on a pool of worker threads.
with ThreadPoolExecutor(max_workers=THREAD_NUMBER) as executor:
    futures = [executor.submit(get_feature_vector, index, image) for index, image in enumerate(images)]

# Leaving the with-block waits for every submitted task. The futures were never inspected before,
# so any exception raised in a worker (missing file, decode error, shape mismatch, ...) was silently
# discarded. Report failures instead of hiding them; the run itself stays best-effort.
# NOTE(review): assumes the project logger offers `.error` like a standard logging.Logger — confirm.
for image, future in zip(images, futures):
    error = future.exception()
    if error is not None:
        logger.error("Feature extraction failed for image {}: {!r}".format(image, error))

2024-04-08 10:55:14.618812: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-08 10:55:14.636934: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


KeyboardInterrupt: 

In [5]:
def process_caption(caption: str):
    """Normalise a raw caption and wrap it in the start/end tokens.

    Every character in ``string.punctuation`` is removed, whitespace runs are collapsed to
    single spaces and the text is lower-cased.

    Args:
        caption: Raw caption text.

    Returns:
        ``START_TOKEN``, the cleaned caption and ``END_TOKEN`` joined by single spaces.
    """
    strip_table = str.maketrans('', '', string.punctuation)
    words = caption.translate(strip_table).split()
    cleaned = ' '.join(words).lower()
    return ' '.join([START_TOKEN, cleaned, END_TOKEN])

# Build (image id, caption) pairs, split them into train/test/val and fit a tokenizer on train.
# NOTE(review): a later cell repeats these steps with num_words=VOCABULARY_SIZE and also saves the
# results; consider keeping only one of the two cells.
path_to_dataset = latest_file(VALIDATED_DATASET_DIRECTORY)
dataset = pd.read_csv(path_to_dataset, sep='\t', index_col=0, header=None)
dataset = dataset[dataset.index.isin(images)]

# The frame index is the image id and column 1 holds the caption; iterate over that column
# directly instead of materialising a Series per row with iterrows().
data = [(image_id, process_caption(caption)) for image_id, caption in dataset[1].items()]

# NOTE: the shuffle is unseeded, so every run produces a different split.
random.shuffle(data)
train_index = int(len(data) * TRAIN_SIZE)
test_index = train_index + int(len(data) * TEST_SIZE)

train = data[:train_index]
test = data[train_index:test_index]
val = data[test_index:]

# The filter set does not contain '<' or '>', so tokens such as '<unk>' stay intact.
# The backslash is written as '\\' because '\]' is an invalid escape sequence in a normal string
# literal; the resulting filter string is unchanged.
tokenizer = Tokenizer(oov_token='<unk>', filters='!"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ')
tokenizer.fit_on_texts([caption for _, caption in train])


In [46]:
# Spot-check: print one random (image id, caption) pair from the training split.
# NOTE(review): `random` is already imported in the first cell, so this import is redundant.
import random
print(random.choice(train))

(360398, '<start> oldest remains of a wall as unexpected archaeological discovery <end>')


In [14]:
# Inspect the fitted tokenizer's word -> integer index mapping.
# NOTE(review): this prints the entire mapping; consider showing only a small sample.
print(tokenizer.word_index)



In [3]:
def process_caption(caption: str):
    """Clean a caption and surround it with the sequence delimiter tokens.

    Punctuation (as listed in ``string.punctuation``) is dropped, consecutive whitespace is
    squeezed into single spaces and the result is lower-cased.

    Args:
        caption: Raw caption text.

    Returns:
        The string ``START_TOKEN + ' ' + cleaned caption + ' ' + END_TOKEN``.
    """
    without_punctuation = ''.join(ch for ch in caption if ch not in string.punctuation)
    normalised = ' '.join(without_punctuation.split()).lower()
    return "{} {} {}".format(START_TOKEN, normalised, END_TOKEN)

# Build (image id, caption) pairs, split them into train/test/val, fit a size-limited tokenizer on
# the training captions and persist the splits and the tokenizer under a shared timestamp.
path_to_dataset = latest_file(VALIDATED_DATASET_DIRECTORY)
dataset = pd.read_csv(path_to_dataset, sep='\t', index_col=0, header=None)
dataset = dataset[dataset.index.isin(images)]

# The frame index is the image id and column 1 holds the caption; iterate over that column
# directly instead of materialising a Series per row with iterrows().
data = [(image_id, process_caption(caption)) for image_id, caption in dataset[1].items()]

# NOTE: the shuffle is unseeded, so every run produces a different split.
random.shuffle(data)
train_index = int(len(data) * TRAIN_SIZE)
test_index = train_index + int(len(data) * TEST_SIZE)

train = data[:train_index]
test = data[train_index:test_index]
val = data[test_index:]

# The filter set does not contain '<' or '>', so tokens such as '<unk>' stay intact.
# The backslash is written as '\\' because '\]' is an invalid escape sequence in a normal string
# literal; the resulting filter string is unchanged.
tokenizer = Tokenizer(num_words=VOCABULARY_SIZE, oov_token='<unk>', filters='!"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ')
tokenizer.fit_on_texts([caption for _, caption in train])
# Register the padding token under index 0 in both lookup directions.
tokenizer.word_index[PADDING_TOKEN] = 0
tokenizer.index_word[0] = PADDING_TOKEN

# NOTE(review): this rebinds the imported `timestamp` factory to its return value, so a second run
# of this cell in the same kernel would call that value instead of the factory.
timestamp = timestamp()
# NOTE(review): np.save turns each list of (id, caption) tuples into one array, so the ids are
# presumably stored as strings next to the captions — confirm the loaders expect that.
np.save(os.path.join(SPLIT_DIRECTORY, file_with_timestamp("train.npy", timestamp)), train)
np.save(os.path.join(SPLIT_DIRECTORY, file_with_timestamp("test.npy", timestamp)), test)
np.save(os.path.join(SPLIT_DIRECTORY, file_with_timestamp("val.npy", timestamp)), val)

with open(os.path.join(SPLIT_DIRECTORY, file_with_timestamp('tokenizer.pickle', timestamp)), 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)