First, we read the CSV file `captions.txt` and display its first five rows.

In [10]:


import pandas as pd

# Path to the captions CSV, given relative to the notebook's working directory
captions_path = "../data/captions/captions.txt"

# Parse the file into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer=captions_path)
print(df.head(n=5))


                       image  \
0  1000268201_693b08cb0e.jpg   
1  1000268201_693b08cb0e.jpg   
2  1000268201_693b08cb0e.jpg   
3  1000268201_693b08cb0e.jpg   
4  1000268201_693b08cb0e.jpg   

                                             caption  
0  A child in a pink dress is climbing up a set o...  
1              A girl going into a wooden building .  
2   A little girl climbing into a wooden playhouse .  
3  A little girl climbing the stairs to her playh...  
4  A little girl in a pink dress going into a woo...  


Now we want to group all rows that share the same image filename, keeping only the caption column. For each group we collect the captions into a Python list, and finally we convert the resulting pandas object into a dictionary that maps each image filename to its list of captions.

In [11]:
# Build a mapping from each image filename to the list of its captions
image_to_captions = {
    image_name: list(image_captions)
    for image_name, image_captions in df.groupby('image')['caption']
}

print("Number of images: ", len(image_to_captions))

Number of images:  8091


Creating a new column for captions in lower case and without punctuation

In [12]:

import string

# Lower-case every caption and store the result in a new column
caption_text = df['caption']
df['lower_caption'] = caption_text.str.lower()

# Translation table that deletes every character in string.punctuation.
# It is built once here so it is not recreated for each caption.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(lower_caption):
    """Return ``lower_caption`` with all ``string.punctuation`` characters removed.

    Only the punctuation characters themselves are deleted; surrounding
    whitespace is left untouched, so "building ." becomes "building ".

    Args:
        lower_caption: The caption text to clean (a ``str``).

    Returns:
        A new string without any of the characters in ``string.punctuation``.
    """
    # One pass over the string instead of one str.replace call per
    # punctuation character.
    return lower_caption.translate(_PUNCTUATION_TABLE)

# Strip punctuation from the lower-cased captions
df['lower_caption'] = df['lower_caption'].apply(remove_punctuation)

# Compare the new column with the original on a random sample of rows.
# A fixed seed makes the sample identical on every run of the notebook.
df.sample(10, random_state=42)



Unnamed: 0,image,caption,lower_caption
8208,2366421102_2d60d53a0e.jpg,A little girl is holding flowers in her right ...,a little girl is holding flowers in her right ...
28433,3445296377_1e5082b44b.jpg,Two children play in the melting slush .,two children play in the melting slush
20959,3139393607_f0a54ca46d.jpg,The snowboarder is ready to jump .,the snowboarder is ready to jump
27182,3399843227_3b9d2a8dbf.jpg,A man holding a drink poses with a woman .,a man holding a drink poses with a woman
1044,1248940539_46d33ed487.jpg,Three young girls dance on the beach in the sa...,three young girls dance on the beach in the sand
12805,2657301826_aab4c36e6c.jpg,a baby wearing a blue bandanna is playing with...,a baby wearing a blue bandanna is playing with...
1383,1307635496_94442dc21a.jpg,A smiling boy runs through the grass .,a smiling boy runs through the grass
31431,3562816250_6e14d436b1.jpg,A person sits in a yellow kayak on the water w...,a person sits in a yellow kayak on the water w...
21608,3171035252_dba286ae5c.jpg,A person surfs on a craft with a sail .,a person surfs on a craft with a sail
12286,2623247254_3bfc795121.jpg,a young blond man sitting at a table with a br...,a young blond man sitting at a table with a br...


Tokenization for each caption

In [13]:

import nltk
from nltk.tokenize import word_tokenize
# word_tokenize needs the 'punkt_tab' tokenizer data, which has to be
# downloaded separately. Fetch it only when it is missing, so the cell runs on
# a fresh setup without re-downloading on every execution.
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

# Create a new column holding the list of tokens for each caption
df['tokens'] = df['lower_caption'].apply(word_tokenize)
df.head()






Unnamed: 0,image,caption,lower_caption,tokens
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...,a child in a pink dress is climbing up a set o...,"[a, child, in, a, pink, dress, is, climbing, u..."
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .,a girl going into a wooden building,"[a, girl, going, into, a, wooden, building]"
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .,a little girl climbing into a wooden playhouse,"[a, little, girl, climbing, into, a, wooden, p..."
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...,a little girl climbing the stairs to her playh...,"[a, little, girl, climbing, the, stairs, to, h..."
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...,a little girl in a pink dress going into a woo...,"[a, little, girl, in, a, pink, dress, going, i..."


Building the vocabulary

In [14]:
from collections import Counter

# Count how often each token occurs across all captions
word_counter = Counter(
    token for caption_tokens in df['tokens'] for token in caption_tokens
)

# Special tokens
PAD_TOKEN = "<PAD>"  # fills empty positions
UNK_TOKEN = "<UNK>"  # stands in for a word that is not in the vocabulary

# Vocabulary: the two special tokens take IDs 0 and 1
word_to_idx = {PAD_TOKEN: 0, UNK_TOKEN: 1}

# Every counted word then receives the next free ID (the current dict size)
for word in word_counter.keys():
    next_id = len(word_to_idx)
    word_to_idx[word] = next_id

vocab_size = len(word_to_idx)
print("Vocabulary size:", vocab_size)
first_entries = list(word_to_idx.items())[:10]
print("Sample entries:", first_entries)


Vocabulary size: 8829
Sample entries: [('<PAD>', 0), ('<UNK>', 1), ('a', 2), ('child', 3), ('in', 4), ('pink', 5), ('dress', 6), ('is', 7), ('climbing', 8), ('up', 9)]


Convert the tokens to integers


In [20]:

# Index used for any word that is missing from the vocabulary
UNK_IDX = word_to_idx[UNK_TOKEN]

# Map every caption's tokens to their vocabulary indices; dict.get falls back
# to UNK_IDX for words that are not in word_to_idx
sequence = [
    [word_to_idx.get(word, UNK_IDX) for word in tokens]
    for tokens in df['tokens']
]

# Use positional access on both sides so the printed tokens and indices always
# belong to the same caption, even if df's index is not 0..n-1
print("Original: ", df['tokens'].iloc[1])
print("Sequence: ", sequence[1])



Original:  ['a', 'girl', 'going', 'into', 'a', 'wooden', 'building']
Sequence:  [2, 16, 17, 18, 2, 19, 20]
