# Lab12-2: Image Captioning

In the last lab, we used a combination of convolutional neural networks (CNNs) to obtain vector representations of images and recurrent neural networks (RNNs) to decode those representations into natural-language sentences.

In [1]:
import tensorflow as tf
# Discover the physical GPUs; the configuration below is skipped when none is found.
gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
if len(gpus) > 0:
    try:
        # Turn on memory growth for every GPU (the setting has to be the
        # same across all of them).
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, enable=True)
        # Make only the first physical GPU visible to TensorFlow.
        tf.config.experimental.set_visible_devices(gpus[0], device_type='GPU')
        logical_gpus = tf.config.experimental.list_logical_devices(device_type='GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        # Device settings must be applied before the GPUs are initialized;
        # if that is no longer possible, report the error and continue.
        print(err)

1 Physical GPUs, 1 Logical GPUs


In [2]:
import matplotlib.pyplot as plt

# Scikit-learn includes many helpful utilities
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import re
import numpy as np
import os
import time
import json
from glob import glob
from PIL import Image
import pickle

In [3]:
# Store captions and image names in vectors
all_captions = []
all_img_name_vector = [] # 140000 images

# Each record in the spec file is "<image name> <word>", separated by whitespace.
# The encoding is pinned so the read does not depend on the platform default.
with open('./words_captcha/spec_train_val.txt', encoding='utf-8') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Skip blank / whitespace-only lines (e.g. a trailing empty line)
            # instead of failing on the tuple unpacking below.
            continue
        # Still raises ValueError for a malformed record (not exactly two fields).
        img_name, caption = fields
        all_img_name_vector.append(f'./words_captcha/{img_name}.png')
        # Put a space between the characters of the word and wrap the
        # result in <start>/<end> markers, e.g. "<start> t h u s <end>".
        all_captions.append('<start> ' + ' '.join(caption) + ' <end>')

# Append image paths a120000.png .. a139999.png; no captions are added for them.
for i in range(120000, 140000):
    all_img_name_vector.append(f'./words_captcha/a{i}.png')

print(len(all_img_name_vector)) # 140000
print(all_img_name_vector[0])
print(all_captions[0])

140000
./words_captcha/a0.png
<start> t h u s <end>


In [7]:
# Find the maximum length of any caption in our dataset
def calc_max_length(tensor):
    """Return the length of the longest item in ``tensor``.

    Args:
        tensor: An iterable of sized items (anything supporting ``len``),
            e.g. a list of token-id sequences.

    Returns:
        The largest ``len(t)`` over all items, or 0 if ``tensor`` is empty.
    """
    # default=0 avoids the ValueError that max() raises on an empty iterable.
    return max((len(t) for t in tensor), default=0)

In [4]:
# Choose the top 5000 words from the vocabulary
top_k = 5000
# The filter set is written as a raw string because it contains a literal
# backslash: in a normal string literal '\]' is an invalid escape sequence
# (DeprecationWarning, SyntaxWarning on newer Pythons). The resulting value
# is unchanged. '<' and '>' are not in the filter set, so the '<start>',
# '<end>' and '<unk>' markers are kept intact.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k,
                                                  oov_token="<unk>",
                                                  filters=r'!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ')
# Build the vocabulary from the captions, then map every caption to a list of token ids.
tokenizer.fit_on_texts(all_captions)
train_seqs = tokenizer.texts_to_sequences(all_captions)

# Show one caption next to its encoded form as a sanity check.
print(all_captions[0])
print(train_seqs[0])

<start> t h u s <end>
[2, 9, 18, 17, 6, 3]


In [5]:
# Split image names and captions by position:
#   [0, 100000)       -> training
#   [100000, 120000)  -> validation
#   [120000, end)     -> test (image names only)
# No shuffling is applied (a shuffle(..., random_state=514) variant was
# commented out), so each split keeps the original ordering.
img_name_train = all_img_name_vector[:100000]
caption_train = all_captions[:100000]

img_name_valid = all_img_name_vector[100000:120000]
caption_valid = all_captions[100000:120000]

img_name_test = all_img_name_vector[120000:]

## Create a tf.data dataset for training

In [6]:
# --- Input pipeline ---
BATCH_SIZE = 100
BUFFER_SIZE = 5000        # presumably the dataset shuffle buffer size — confirm at the use site
IMAGE_SIZE = (160, 300)   # NOTE(review): looks like (height, width) — confirm

# --- Model dimensions ---
embedding_dim = 256
units = 512
# Keras Tokenizer word indices start at 1, so one extra slot covers index 0.
vocab_size = 1 + len(tokenizer.word_index)

# --- Training schedule ---
EPOCHS = 10
LEARNING_RATE = 1e-4
# Number of full batches in one pass over the training images.
num_steps = len(img_name_train) // BATCH_SIZE