First we started by reading the csv file 'captions.txt' and then displaying the first five rows

In [14]:

from os import truncate

import pandas as pd
from keras import Layer
from tensorflow.python.keras import Sequential

captions_path = "../data/captions/captions.txt"

df = pd.read_csv(captions_path)
print(df.head())


                       image  \
0  1000268201_693b08cb0e.jpg   
1  1000268201_693b08cb0e.jpg   
2  1000268201_693b08cb0e.jpg   
3  1000268201_693b08cb0e.jpg   
4  1000268201_693b08cb0e.jpg   

                                             caption  
0  A child in a pink dress is climbing up a set o...  
1              A girl going into a wooden building .  
2   A little girl climbing into a wooden playhouse .  
3  A little girl climbing the stairs to her playh...  
4  A little girl in a pink dress going into a woo...  


Now we want group all rows that have the same image filename together, keeping only the caption column, then for each group we convert all captions into a python list, and finally convert the pandas object into a dictionary

In [15]:
image_to_captions = (
    df.groupby('image')['caption'].apply(list).to_dict()
)

print("Number of images: ", len(image_to_captions))

Number of images:  8091


Creating a new column for captions in lower case and without punctuation

In [16]:

import string

#Removing the upper cases
df['lower_caption'] = df['caption'].str.lower()

#Function to remove the punctuation
def remove_punctuation(lower_caption):
    for char in string.punctuation:
        lower_caption = lower_caption.replace(char, '')
    return lower_caption

df['lower_caption'] = df['lower_caption'].apply(remove_punctuation)

#Comparing the new column with the old column
df.sample(10)



Unnamed: 0,image,caption,lower_caption
30876,3541915243_956c1aa8ef.jpg,Asian spectators .,asian spectators
4250,2043520315_4a2c782c90.jpg,A group of hikers led by a black and white dog...,a group of hikers led by a black and white dog...
15020,2815745115_c8479d560c.jpg,A carnival worker surrounded by stuffed animals .,a carnival worker surrounded by stuffed animals
28079,343218198_1ca90e0734.jpg,a black great Dane running toward the camera i...,a black great dane running toward the camera i...
22408,3205336477_037d4b6bd9.jpg,Two men wrestling at a match .,two men wrestling at a match
26788,3378553508_e37e281d25.jpg,Several people are looking out from a building...,several people are looking out from a building...
416,1095590286_c654f7e5a9.jpg,a dog chases another dog .,a dog chases another dog
6689,2256138896_3e24b0b28d.jpg,A rock climber with a white helmet is repelling .,a rock climber with a white helmet is repelling
8594,2396669903_5217a83641.jpg,Sombody has stuck the face of Groucho Marx on ...,sombody has stuck the face of groucho marx on ...
39756,825918657_d92f1761f4.jpg,A brown and white dog with a pink Frisbee in i...,a brown and white dog with a pink frisbee in i...


Tokenization for each caption

In [17]:

import nltk
from nltk.tokenize import word_tokenize
#Was having trouble with the import word_tokenize so had to add the line below
#nltk.download('punkt_tab')

#Creating a new column for tokens for each caption
df['tokens'] = df['lower_caption'].apply(lambda x: word_tokenize(x))
df.head()






Unnamed: 0,image,caption,lower_caption,tokens
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...,a child in a pink dress is climbing up a set o...,"[a, child, in, a, pink, dress, is, climbing, u..."
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .,a girl going into a wooden building,"[a, girl, going, into, a, wooden, building]"
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .,a little girl climbing into a wooden playhouse,"[a, little, girl, climbing, into, a, wooden, p..."
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...,a little girl climbing the stairs to her playh...,"[a, little, girl, climbing, the, stairs, to, h..."
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...,a little girl in a pink dress going into a woo...,"[a, little, girl, in, a, pink, dress, going, i..."


Building the vocabulary

In [20]:
from collections import Counter

#Couting all the words
word_counter = Counter()

for i in df['tokens']:
    word_counter.update(i)

#Defining special tokens
#To fill empty positions
PAD_TOKEN = "<PAD>"
#When a word is not in the vocabulary
UNK_TOKEN = "<UNK>"
#Start for inference
START_TOKEN = "<START>"
#End for inference
END_TOKEN = "<END>"

#Creating a vocabulary
word_to_idx = {
    PAD_TOKEN: 0,
    UNK_TOKEN: 1,
    START_TOKEN: 2,
    END_TOKEN: 3
}

for word in word_counter:
    #Auto increments the ID generator
    word_to_idx[word] = len(word_to_idx)


print("Vocabulary size:", len(word_to_idx))
print("Sample entries:", list(word_to_idx.items())[:10])
print("last entry", list(word_to_idx.items())[-1])


Vocabulary size: 8831
Sample entries: [('<PAD>', 0), ('<UNK>', 1), ('<START>', 2), ('<END>', 3), ('a', 4), ('child', 5), ('in', 6), ('pink', 7), ('dress', 8), ('is', 9)]
last entry ('patterns', 8830)


Convert the tokens to integers


In [21]:

#Creating a variable for unk token index
UNK_IDX = word_to_idx[UNK_TOKEN]

sequence = []

#For loop to check if the word is in the tokens dataframe; if it is just adds the index; if it's not it will add the unk index
for tokens in df['tokens']:
    seq = []
    for word in tokens:
        if word in word_to_idx:
            seq.append(word_to_idx[word])
        else:
            seq.append(UNK_IDX)
    #Appending to the empty sequence list
    sequence.append(seq)

print("Original: ", df['tokens'][1])
print("Sequence: ", sequence[1])



Original:  ['a', 'girl', 'going', 'into', 'a', 'wooden', 'building']
Sequence:  [4, 18, 19, 20, 4, 21, 22]


Organizing

In [22]:

#Dropping caption, and lower_caption columns to organize the dataframe since they aren't useful anymore
#Inplace = True to change the original and don't give us just a copy
df.drop(['caption', 'lower_caption'], axis= 'columns', inplace= True)


Sequence padding and truncating


In [23]:

#Checking the biggest sequence and the 95%
import numpy as np
lengths = [len(i) for i in sequence]
print(f"Max: {max(lengths)}, 95th percentile: {np.percentile(lengths, 95)}")

#Truncating sequences > 20
#Padding sequences < 20
from tensorflow.keras.preprocessing.sequence import pad_sequences

#Because we just added <START> and <END>
max_length = 22

padded_sequences = pad_sequences(
    sequence,
    maxlen = max_length,
    #Padding at the end
    padding = 'post',
    #Truncating at the end
    truncating = 'post'
)

#Printing the first 20 sequences as numpy arrays
print(padded_sequences[:20])
print(padded_sequences.shape)




Max: 36, 95th percentile: 18.0
[[ 4  5  6  4  7  8  9 10 11  4 12 13 14  6 15 16 17  0  0  0  0  0]
 [ 4 18 19 20  4 21 22  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 4 23 18 10 20  4 21 24  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 4 23 18 10 25 14 26 27 24  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 4 23 18  6  4  7  8 19 20  4 21 28  0  0  0  0  0  0  0  0  0  0]
 [ 4 29 30 31  4 32 30 33 34  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 4 29 30 31  4 35 30 36 37 38 39 40 25 41  0  0  0  0  0  0  0  0]
 [ 4 29 30 31  4 42 30 37 43 44 33 45 46 38 39  6 25 47  0  0  0  0]
 [48 49 13 50 51 52 46 38 39 40 25 41  0  0  0  0  0  0  0  0  0  0]
 [48 49 40 53 54 55 38 39  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 4 23 18 56  6 57 58  6 59 13  4 60 61 37 27 62  6  4 63  0  0  0]
 [ 4 23 18  9 64  6 59 13  4 65 60 61  0  0  0  0  0  0  0  0  0  0]
 [ 4 66 18  6 25 67 68 37 69  6 59 13  4 42 70 37  4 61 40 71  0  0]
 [72  9  4 18 37 73 64  6 59 13  4 61 74  0  0  0  0  0  0  0  0  0]
 [7

Embedding Layer

In [24]:

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.layers import Embedding, LSTM

#To check the size and length
print(f"Size of the vocabulary: {len(word_to_idx)}")
print(f"Length of each sequence: {padded_sequences.shape}")

vocab_size = len(word_to_idx)
#Default dimension for small / medium datasets (flickr8k)
embedding_dim = 128
input_length = 20

#Building the model
model = models.Sequential([
    Embedding(input_dim = vocab_size, output_dim = embedding_dim, input_length = input_length), LSTM(256) #Standard default
])

model.build(input_shape=(None, input_length))
model.summary()


Size of the vocabulary: 8831
Length of each sequence: (40455, 22)




Aligning images with captions

In [25]:

import os

#list to store the image path
image_paths = []
#list to store the tokenized sequence
caption_sequences = []

IMAGE_DIR = "../data/images/flickr8k_images/images"

#Looping over rows not images
for i in range(len(df)):
    #selecting the image file name for the i caption
    image_filename = df.iloc[i]['image']
    #building  a path so the model can find the image
    image_path = os.path.join(IMAGE_DIR, image_filename)

    #Adding to the lists image_path per caption
    image_paths.append(image_path)
    #the padded sequence of that same caption
    caption_sequences.append(padded_sequences[i])

print(len(image_paths))
print(len(caption_sequences))

#testing
idx = 5
print(image_paths[idx])
print(caption_sequences[idx])
print(df.iloc[idx])


40455
40455
../data/images/flickr8k_images/images\1001773457_577c3a7d70.jpg
[ 4 29 30 31  4 32 30 33 34  0  0  0  0  0  0  0  0  0  0  0  0  0]
image                             1001773457_577c3a7d70.jpg
tokens    [a, black, dog, and, a, spotted, dog, are, fig...
Name: 5, dtype: object
