First, we read the CSV file 'captions.txt' and display its first five rows.

In [1]:

from os import truncate

import pandas as pd
from keras import Layer
from tensorflow.python.keras import Sequential

# Location of the captions CSV, relative to the notebook's working directory
captions_path = "../data/captions/captions.txt"

# Load the caption table and preview its first five rows
df = pd.read_csv(filepath_or_buffer=captions_path)
print(df.head(n=5))


                       image  \
0  1000268201_693b08cb0e.jpg   
1  1000268201_693b08cb0e.jpg   
2  1000268201_693b08cb0e.jpg   
3  1000268201_693b08cb0e.jpg   
4  1000268201_693b08cb0e.jpg   

                                             caption  
0  A child in a pink dress is climbing up a set o...  
1              A girl going into a wooden building .  
2   A little girl climbing into a wooden playhouse .  
3  A little girl climbing the stairs to her playh...  
4  A little girl in a pink dress going into a woo...  


  if not hasattr(np, "object"):


Now we want to group all rows that share the same image filename, keeping only the caption column. For each group we convert the captions into a Python list, and finally we convert the resulting pandas object into a dictionary that maps each image filename to its list of captions.

In [2]:
# Map every image filename to the list of captions written for it.
# Iterating the grouped 'caption' column yields (filename, captions) pairs.
image_to_captions = {
    filename: list(captions)
    for filename, captions in df.groupby('image')['caption']
}

print("Number of images: ", len(image_to_captions))

Number of images:  8091


Creating a new column for captions in lower case and without punctuation

In [3]:

import string

#Lowercasing every caption and storing the result in a new column
lowercased_captions = df['caption'].str.lower()
df['lower_caption'] = lowercased_captions

#Translation table that deletes every character in string.punctuation.
#Built once so each call is a single pass over the caption instead of one
#str.replace pass per punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


#Function to remove the punctuation
def remove_punctuation(lower_caption):
    """Return ``lower_caption`` with every ``string.punctuation`` character removed.

    Only the punctuation characters themselves are deleted; surrounding
    whitespace is left untouched, so "old , jeep" becomes "old  jeep"
    (two spaces) and "beat-up" becomes "beatup".

    Parameters
    ----------
    lower_caption : str
        The caption text to clean.

    Returns
    -------
    str
        The caption without punctuation characters.
    """
    return lower_caption.translate(_PUNCTUATION_TABLE)

#Strip the punctuation from each lowercased caption, one element at a time
df['lower_caption'] = df['lower_caption'].map(remove_punctuation)

#Ten random rows, to compare the original captions with the cleaned ones
df.sample(n=10)



Unnamed: 0,image,caption,lower_caption
13444,269898095_d00ac7d7a4.jpg,Two people sit on a bench at a park .,two people sit on a bench at a park
4822,2088910854_c6f8d4f5f9.jpg,A man wearing a black knit cap with a red and ...,a man wearing a black knit cap with a red and ...
4813,2088460083_42ee8a595a.jpg,"An old , beat-up jeep being towed away .",an old beatup jeep being towed away
10287,2484190118_e89363c465.jpg,A small boy stands on a cement stump in a park...,a small boy stands on a cement stump in a park...
15156,2827964381_408a310809.jpg,Two dogs are jumping up at each other in a gra...,two dogs are jumping up at each other in a gra...
19102,3046286572_d2050ab0d9.jpg,A man in a brown shirt and jeans is doing a tr...,a man in a brown shirt and jeans is doing a tr...
16915,2921112724_5cb85d7413.jpg,A man riding a three wheeled vehicle topples o...,a man riding a three wheeled vehicle topples o...
7615,2318659263_c24005a5cb.jpg,A fluffy white dog running across the snow .,a fluffy white dog running across the snow
23273,3240094420_a9eea11d39.jpg,"Four women , two with id badges .",four women two with id badges
3743,1819261140_6c022f4b1d.jpg,a man sitting at a table with a scary mask cov...,a man sitting at a table with a scary mask cov...


Tokenization for each caption

In [4]:

import nltk
from nltk.tokenize import word_tokenize
#word_tokenize needs the 'punkt_tab' tokenizer data, which is downloaded
#separately from the nltk package. Fetch it only when it is missing, so the
#notebook runs on a fresh environment without downloading on every execution.
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab', quiet=True)

#Creating a new column holding the list of word tokens for each caption
df['tokens'] = df['lower_caption'].apply(word_tokenize)
df.head()






Unnamed: 0,image,caption,lower_caption,tokens
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...,a child in a pink dress is climbing up a set o...,"[a, child, in, a, pink, dress, is, climbing, u..."
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .,a girl going into a wooden building,"[a, girl, going, into, a, wooden, building]"
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .,a little girl climbing into a wooden playhouse,"[a, little, girl, climbing, into, a, wooden, p..."
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...,a little girl climbing the stairs to her playh...,"[a, little, girl, climbing, the, stairs, to, h..."
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...,a little girl in a pink dress going into a woo...,"[a, little, girl, in, a, pink, dress, going, i..."


Building the vocabulary

In [5]:
from collections import Counter

#Special tokens
#PAD fills the empty positions of a sequence
PAD_TOKEN = "<PAD>"
#UNK stands in for a word that is not in the vocabulary
UNK_TOKEN = "<UNK>"

#Counting every word across all captions, in first-seen order
word_counter = Counter(
    token
    for caption_tokens in df['tokens']
    for token in caption_tokens
)

#Creating the vocabulary: the special tokens take ids 0 and 1
word_to_idx = {PAD_TOKEN: 0, UNK_TOKEN: 1}

#Each counted word gets the next free id (the current size of the mapping)
for vocab_word in word_counter:
    word_to_idx[vocab_word] = len(word_to_idx)


print("Vocabulary size:", len(word_to_idx))
print("Sample entries:", list(word_to_idx.items())[:10])


Vocabulary size: 8829
Sample entries: [('<PAD>', 0), ('<UNK>', 1), ('a', 2), ('child', 3), ('in', 4), ('pink', 5), ('dress', 6), ('is', 7), ('climbing', 8), ('up', 9)]


Convert the tokens to integers


In [6]:

#Index used for any word that is missing from the vocabulary
UNK_IDX = word_to_idx[UNK_TOKEN]

#Translate the tokens of every caption into vocabulary ids;
#dict.get falls back to UNK_IDX when a word is not a key of word_to_idx
sequence = [
    [word_to_idx.get(token, UNK_IDX) for token in caption_tokens]
    for caption_tokens in df['tokens']
]

print("Original: ", df['tokens'][1])
print("Sequence: ", sequence[1])



Original:  ['a', 'girl', 'going', 'into', 'a', 'wooden', 'building']
Sequence:  [2, 16, 17, 18, 2, 19, 20]


Organizing the DataFrame by dropping the columns that are no longer needed

In [7]:

#Dropping the caption and lower_caption columns to organize the dataframe since they aren't useful anymore
#Rebinding df instead of using inplace=True, with errors='ignore', so that re-running this cell
#after the columns are already gone does not raise a KeyError
df = df.drop(columns=['caption', 'lower_caption'], errors='ignore')


Sequence padding and truncating


In [8]:

#Checking the longest sequence length and the 95th percentile of the lengths
import numpy as np
lengths = list(map(len, sequence))
print(f"Max: {max(lengths)}, 95th percentile: {np.percentile(lengths, 95)}")

#Every sequence ends up with exactly max_length ids:
#sequences longer than 20 are truncated, sequences shorter than 20 are padded
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_length = 20

#'post' for both options: pad at the end and truncate at the end
padded_sequences = pad_sequences(
    sequences=sequence,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

#Printing the first 20 padded sequences, then the shape of the whole array
print(padded_sequences[:20])
print(padded_sequences.shape)




Max: 36, 95th percentile: 18.0
[[ 2  3  4  2  5  6  7  8  9  2 10 11 12  4 13 14 15  0  0  0]
 [ 2 16 17 18  2 19 20  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2 21 16  8 18  2 19 22  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2 21 16  8 23 12 24 25 22  0  0  0  0  0  0  0  0  0  0  0]
 [ 2 21 16  4  2  5  6 17 18  2 19 26  0  0  0  0  0  0  0  0]
 [ 2 27 28 29  2 30 28 31 32  0  0  0  0  0  0  0  0  0  0  0]
 [ 2 27 28 29  2 33 28 34 35 36 37 38 23 39  0  0  0  0  0  0]
 [ 2 27 28 29  2 40 28 35 41 42 31 43 44 36 37  4 23 45  0  0]
 [46 47 11 48 49 50 44 36 37 38 23 39  0  0  0  0  0  0  0  0]
 [46 47 38 51 52 53 36 37  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2 21 16 54  4 55 56  4 57 11  2 58 59 35 25 60  4  2 61  0]
 [ 2 21 16  7 62  4 57 11  2 63 58 59  0  0  0  0  0  0  0  0]
 [ 2 64 16  4 23 65 66 35 67  4 57 11  2 40 68 35  2 59 38 69]
 [70  7  2 16 35 71 62  4 57 11  2 59 72  0  0  0  0  0  0  0]
 [73 16 35 71 72 74  4 23 65  0  0  0  0  0  0  0  0  0  0  0]
 [ 2 75 76 38  2 77 78 7

Embedding Layer

In [9]:

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.layers import Embedding, LSTM

#To check the size and length
print(f"Size of the vocabulary: {len(word_to_idx)}")
print(f"Length of each sequence: {padded_sequences.shape}")

vocab_size = len(word_to_idx)
#Default dimension for small / medium datasets (flickr8k)
embedding_dim = 128
#Read from the padded data instead of repeating the literal 20, so the model
#always matches the sequences it will be fed
input_length = padded_sequences.shape[1]

#Building the model
model = models.Sequential([
    #An explicit Input layer replaces Embedding's input_length argument
    #(deprecated in Keras 3) and builds the model, so no model.build() call is needed
    layers.Input(shape=(input_length,)),
    #mask_zero=True: id 0 is <PAD>, so the padded positions are masked and the
    #LSTM's final state is not computed over padding
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True),
    LSTM(256),  #Standard default
])

model.summary()


Size of the vocabulary: 8829
Length of each sequence: (40455, 20)




Aligning images with captions

In [13]:

import os

IMAGE_DIR = "../data/images/flickr8k_images/images"

#Row i of df and row i of padded_sequences must describe the same caption;
#fail loudly if the two have drifted apart instead of pairing the wrong rows
assert len(df) == len(padded_sequences), (
    "df and padded_sequences must have the same number of rows"
)

#One entry per caption (not per image): the path where the model can find
#the image that the caption belongs to.
#Iterating the column directly avoids building a row object for every df.iloc[i]
image_paths = [
    os.path.join(IMAGE_DIR, image_filename)
    for image_filename in df['image']
]
#The padded sequence of each caption, in the same row order as image_paths
caption_sequences = list(padded_sequences)

print(len(image_paths))
print(len(caption_sequences))

#testing
idx = 5
print(image_paths[idx])
print(caption_sequences[idx])
print(df.iloc[idx])


40455
40455
../data/images/flickr8k_images/images\1001773457_577c3a7d70.jpg
[ 2 27 28 29  2 30 28 31 32  0  0  0  0  0  0  0  0  0  0  0]
image                             1001773457_577c3a7d70.jpg
tokens    [a, black, dog, and, a, spotted, dog, are, fig...
Name: 5, dtype: object
