In [None]:
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
from numpy import array
import pandas as pd
import cv2
from glob import glob
import PIL
import time
from tqdm import tqdm
import os

In [None]:
# Root folder of the Flickr8k dataset; the image, caption and pickle paths below are built from it.
dataset_path = "../datasets/Flickr8k/"

In [None]:
# Collect the paths of every .jpg file directly under <dataset_path>/Images/.
image_path = dataset_path + "Images/"
images = glob(image_path + "*.jpg")
# Cell output: number of image files found.
len(images)

In [None]:
# Display three randomly chosen images as a visual sanity check.
# The indices are drawn without a fixed seed, so a different sample is shown on each run.
for i in np.random.randint(0, len(images), size=3):
    plt.figure()
    image = cv2.imread(images[i])
    # Reorder the channels from BGR to RGB before handing the array to matplotlib.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    plt.imshow(image)

In [None]:
def load(filename, encoding="utf-8"):
    """Read a whole text file and return its contents as a single string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's default locale.

    Returns:
        The full file contents.
    """
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

# Read the raw caption file into one string.
# NOTE(review): the file name suggests the first and last lines of the original
# captions file were removed by hand - confirm how this file was produced.
file = dataset_path + "captions_removed_first_and_last.txt"
info = load(file)

In [None]:
# Peek at the first three raw lines of the caption file.
for line in info.split("\n")[:3]:
    print(line)

In [None]:
def load_captions(info):
    """Group raw caption lines by image id.

    Each useful line has the form ``<image id>.jpg,<caption>``. The text before
    the first ``.jpg,`` becomes the dictionary key (so keys carry no file
    extension) and the rest of the line is stored as one caption.

    Blank lines (for example the one produced by a trailing newline) and lines
    without the ``.jpg,`` separator (for example a header row) are skipped
    instead of raising.

    Args:
        info: Full text of the caption file.

    Returns:
        dict mapping image id -> list of caption strings, in file order.
    """
    data = dict()
    for line in info.split('\n'):
        # Tolerate Windows line endings without touching the caption text itself.
        line = line.rstrip('\r')
        if not line.strip():
            continue
        # partition splits on the FIRST separator only, so a caption that itself
        # contains ".jpg," stays intact.
        img, separator, caption = line.partition('.jpg,')
        if not separator:
            continue
        data.setdefault(img, []).append(caption)

    return data

# Raw captions grouped per image; keys are the text before ".jpg," on each line.
data = load_captions(info)

In [None]:
# Spot-check one (image id, captions) entry of the raw caption dictionary.
print(list(data.items())[100])

In [None]:
def cleanse_data(data):
    """Return a cleaned copy of the caption dictionary.

    Every caption is split on whitespace, tokens of length one are dropped,
    the remaining tokens are lower-cased and re-joined with single spaces.
    Images whose caption list is empty do not appear in the result.

    Args:
        data: dict mapping image id -> list of raw caption strings.

    Returns:
        New dict mapping image id -> list of cleaned caption strings.
    """
    def _normalise(caption):
        # The length test is applied to the original token, before lower-casing.
        return " ".join(token.lower() for token in caption.split() if len(token) > 1)

    return {
        img: [_normalise(caption) for caption in captions]
        for img, captions in data.items()
        if captions
    }

# Lower-cased captions with single-character tokens removed, keyed by image id.
clean_data = cleanse_data(data)

In [None]:
# Spot-check one entry of the cleaned caption dictionary.
print(list(clean_data.items())[50])

In [None]:
def get_all_words(data):
    """Collect the distinct whitespace-separated tokens used in any caption.

    Args:
        data: dict mapping image id -> list of caption strings.

    Returns:
        set of every token that appears in at least one caption.
    """
    return {
        token
        for captions in data.values()
        for caption in captions
        for token in caption.split()
    }

# summarize vocabulary
# Number of distinct tokens across all cleaned captions, before any frequency filtering.
all_words = get_all_words(clean_data)
print(len(all_words))

In [None]:
def save_data(data, filename, encoding="utf-8"):
    """Write the caption dictionary to a text file, one caption per line.

    Each output line is ``<image id> <caption>``. Lines are joined with
    newline characters and no trailing newline is written.

    Args:
        data: dict mapping image id -> list of caption strings.
        filename: Path of the file to create or overwrite.
        encoding: Text encoding of the output file. Defaults to UTF-8 so the
            file is written identically regardless of the platform's locale.
    """
    lines = [
        img_key + ' ' + caption
        for img_key, captions in data.items()
        for caption in captions
    ]

    with open(filename, 'w', encoding=encoding) as file:
        file.write('\n'.join(lines))

# Persist the cleaned captions, one "<image id> <caption>" line per caption.
save_data(clean_data, dataset_path + 'captions_clean.txt')

In [None]:
# NOTE: `images` is rebound here from the list of image paths created near the top
# of the notebook to the image directory prefix (a string).
images = dataset_path + 'Images/'
# Paths of all .jpg files that will be passed through the feature extractor.
img_paths = glob(images + '*.jpg')
print(len(img_paths))

In [None]:
def preprocess_img(image_path):
    """Load an image file and turn it into a single-image batch for InceptionV3.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The array produced by ``inception_v3.preprocess_input`` for the image
        resized to 299x299, with one leading batch axis added.
    """
    image_utils = keras.preprocessing.image
    # Resize on load to the 299x299 input size used for the Inception v3 model.
    pil_image = image_utils.load_img(image_path, target_size=(299, 299))
    # PIL image -> numpy array, then prepend the batch axis.
    batch = np.expand_dims(image_utils.img_to_array(pil_image), axis=0)
    # Apply the preprocessing function shipped with the inception_v3 module.
    return keras.applications.inception_v3.preprocess_input(batch)

In [None]:
# Load the inception v3 model
input1 = InceptionV3(weights='imagenet')

# Create a new model, by removing the last layer (output layer) from the inception v3
# NOTE: the name `model` is reused further down for the captioning network, so this
# feature extractor is only reachable through `model` until that later cell runs.
model = Model(input1.input, input1.layers[-2].output)

model.summary()

In [None]:
# Function to encode a given image into a vector of size (2048, )
def encode(image):
    """Encode one image file as a flat feature vector.

    Uses the module-level ``model``. NOTE: the notebook later rebinds ``model``
    to the captioning network, so this function must only be called while
    ``model`` is still the InceptionV3 feature extractor.

    Args:
        image: Path of the image file to encode.

    Returns:
        1-D numpy array holding the model's output for that image.
    """
    image = preprocess_img(image) # preprocess the image
    # verbose=0: this runs once per image inside a tqdm loop; without it Keras
    # prints its own progress bar for every single call.
    feature_vec = model.predict(image, verbose=0) # Get the encoding vector for the image
    feature_vec = np.reshape(feature_vec, feature_vec.shape[1]) # reshape from (1, 2048) to (2048, )
    return feature_vec

In [None]:
# Feature vector for every image, keyed by the image file name (including ".jpg").
encoding = {}

for img_path in tqdm(img_paths):
    # Use the file name itself as the key. Slicing by len(images) only works while
    # `images` is the directory string; that name is also bound to a list of paths
    # earlier in the notebook, so out-of-order execution would yield wrong keys.
    encoding[os.path.basename(img_path)] = encode(img_path)

In [None]:
import pickle

# Save the image-name -> feature-vector dictionary to images_features.pkl
with open(dataset_path + "images_features.pkl", "wb") as encoded_pickle:
    pickle.dump(encoding, encoded_pickle)

In [None]:
import pickle

# Reload the image features from disk (presumably so the notebook can be resumed
# from here without re-running the encoding pass).
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(dataset_path + "images_features.pkl", "rb") as encoded_pickle:
    encoding = pickle.load(encoded_pickle)

In [None]:
# Flatten the per-image caption lists into one list of caption strings.
training_captions = []

for img_key, captions in clean_data.items():
    for cap in captions:
        training_captions.append(cap)
        
# Cell output: total number of captions.
len(training_captions)

In [None]:
# Keep only words that occur at least this many times across all captions.
word_count_threshold = 5
word_counts = {}

# Tally how often each space-separated token occurs in the cleaned captions.
for cap in training_captions:
    for w in cap.split(' '):
        word_counts[w] = word_counts.get(w, 0) + 1

# NOTE: `all_words` is rebound here from the earlier set of all tokens to the
# list of tokens that meet the frequency threshold.
all_words = [w for w in word_counts if word_counts[w] >= word_count_threshold]
print(f'preprocessed words {len(word_counts)} -> {len(all_words)}')

In [None]:
# Two-way mapping between vocabulary words and their integer ids.
ix_to_word = {}
word_to_ix = {}

# Start numbering at 1: index 0 is reserved for padding. pad_sequences fills with 0
# and the Embedding layer is created with mask_zero=True, so a real word stored at
# index 0 would be masked out as if it were padding.
for ix, w in enumerate(all_words, start=1):
    word_to_ix[w] = ix
    ix_to_word[ix] = w

vocab_size = len(ix_to_word) + 1 # one for appended 0's
vocab_size

In [None]:
# Persist both vocabulary mappings next to the dataset.
with open(dataset_path + "word_to_ix.pkl", "wb") as encoded_pickle:
    pickle.dump(word_to_ix, encoded_pickle)
    
with open(dataset_path + "ix_to_word.pkl", "wb") as encoded_pickle:
    pickle.dump(ix_to_word, encoded_pickle)

In [None]:
def to_lines(data):
    """Flatten a caption dictionary into a single list of captions.

    Args:
        data: dict mapping image id -> list of caption strings.

    Returns:
        list of all captions, in dictionary order and then per-image order.
    """
    return [caption for captions in data.values() for caption in captions]

# calculate the length of the description with the most words
def max_caption_length(data):
    """Return the word count of the longest caption in ``data``.

    The helper has its own name so that assigning the result to ``max_length``
    below does not overwrite the function; the cell can therefore be re-run
    without a "'int' object is not callable" error.

    Args:
        data: dict mapping image id -> list of caption strings.

    Returns:
        Number of whitespace-separated tokens in the longest caption.

    Raises:
        ValueError: if ``data`` contains no captions at all.
    """
    lines = to_lines(data)
    return max(len(d.split()) for d in lines)

# determine the maximum sequence length
max_length = max_caption_length(clean_data)
print(f'Description Length: {max_length}')

In [None]:
def data_generator(descriptions, photos, wordtoix, max_length, num_photos_per_batch, vocab_size=None):
    """Endlessly yield training batches for the captioning model.

    For every caption the word-id sequence is expanded into (prefix -> next word)
    pairs: each prefix is padded to ``max_length`` and each next word is one-hot
    encoded. A batch is emitted after ``num_photos_per_batch`` images have been
    processed, so the number of samples per batch varies with caption length.

    Args:
        descriptions: dict mapping image id -> list of caption strings.
        photos: dict mapping ``<image id>.jpg`` -> image feature vector.
        wordtoix: dict mapping word -> integer id; words missing from it are
            dropped from the captions.
        max_length: Length every input sequence is padded to.
        num_photos_per_batch: Number of images whose samples form one batch.
        vocab_size: Number of classes for the one-hot targets. Defaults to
            ``len(wordtoix) + 1`` (one extra slot for the padding index 0), so
            the generator no longer depends on a module-level variable.

    Yields:
        ``([image_features, padded_prefixes], one_hot_next_words)`` as numpy arrays.
    """
    if vocab_size is None:
        vocab_size = len(wordtoix) + 1
    X1, X2, y = list(), list(), list()
    n = 0
    # loop for ever over images
    while True:
        for key, desc_list in descriptions.items():
            n += 1
            # retrieve the photo feature
            photo = photos[key + '.jpg']
            for desc in desc_list:
                # encode the sequence, silently dropping out-of-vocabulary words
                seq = [wordtoix[word] for word in desc.split(' ') if word in wordtoix]
                # split one sequence into multiple X, y pairs
                for i in range(1, len(seq)):
                    # split into input and output pair
                    in_seq, out_seq = seq[:i], seq[i]
                    # pad input sequence
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    # encode output sequence
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    # store
                    X1.append(photo)
                    X2.append(in_seq)
                    y.append(out_seq)
            # yield the batch data
            if n == num_photos_per_batch:
                yield ([array(X1), array(X2)], array(y))
                # start collecting the next batch from scratch
                X1, X2, y = list(), list(), list()
                n = 0

In [None]:
# word -> pre-trained GloVe vector
embeddings_index = {} # empty dictionary
skipped_lines = 0

with open('../datasets/glove.840B.300d.txt', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # blank line: nothing to parse
            continue
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # The remainder of the line is not purely numeric (e.g. the token
            # itself contains whitespace). Skip it, but keep count so the loss
            # is visible instead of silent.
            skipped_lines += 1
            continue
        embeddings_index[word] = coefs

print(f'Found {len(embeddings_index)} word vectors.')
print(f'Skipped {skipped_lines} lines that could not be parsed.')

In [None]:
embedding_dim = 300

# Build an embedding_dim-wide dense vector for each of the words in our vocabulary;
# row i holds the vector of the word whose index is i.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

num_found = 0
for word, i in word_to_ix.items():
    # Words not found in the embedding index keep their all-zero row.
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        num_found += 1
        embedding_matrix[i] = embedding_vector
        
print(f'{len(all_words) - num_found} words not found.')
# Cell output: (vocab_size, embedding_dim)
embedding_matrix.shape

In [None]:
from tensorflow.keras.layers import Dense, Input, Conv2D, MaxPool2D, LSTM, add
from tensorflow.keras.layers import Activation, Dropout, Flatten, Embedding
from tensorflow.keras.models import Model

# Image branch: 2048-wide feature vector -> dropout -> 256-unit dense layer.
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
# Text branch: padded word-id sequence -> embedding (index 0 masked) -> dropout -> LSTM.
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)
# Merge both 256-wide branches by element-wise addition, then predict the next word.
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)
# NOTE: this rebinds `model`, which previously referred to the InceptionV3 feature extractor.
model = Model(inputs=[inputs1, inputs2], outputs=outputs)

In [None]:
# Locate the Embedding layer by type instead of by position: the order of
# `model.layers` is an implementation detail, and a wrong index would load the
# matrix into (or freeze) the wrong layer.
embedding_layer = next(
    layer for layer in model.layers if isinstance(layer, keras.layers.Embedding)
)
# Initialise the embedding with the pre-trained vectors and keep them fixed during training.
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Training schedule: number of passes over the data, images per generator batch,
# and generator batches per pass.
epochs = 10
# (sic) the misspelled name is referenced by the training cell, so it is kept as is.
number_pics_per_bath = 3
steps = len(clean_data)//number_pics_per_bath

In [None]:
# Load the pickled image-name -> feature-vector dictionary used as the `photos`
# argument of the data generator.
with open(dataset_path + "images_features.pkl", "rb") as p:
    features = pickle.load(p)

In [None]:
# NOTE(review): eager execution is forced for every tf.function from here on;
# presumably a workaround - confirm it is still needed, as it is not explained here.
tf.config.run_functions_eagerly(True)

# Create the output folder before training starts so that the final save_weights
# call cannot fail on a missing directory after all epochs have already run.
os.makedirs('./model_weights', exist_ok=True)

for i in range(epochs):
    # A fresh generator per epoch restarts iteration from the first image.
    generator = data_generator(clean_data, features, word_to_ix, max_length, number_pics_per_bath)
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    # Checkpoint the full model after every epoch.
    model.save('model_' + str(i) + '.h5')
model.save_weights('./model_weights/final_model.h5')