In [1]:
from string import punctuation

import numpy as np
from PIL import Image

In [2]:
# Root folder of the dataset (relative path).
PATH_TO_DATA = '../data/'

# Folder with the caption/split text files, and (presumably) the image folder;
# IMAGES_DIR is not used in this part of the notebook.
TEXT_FILES_DIR = f'{PATH_TO_DATA}Flickr8k_text/'
IMAGES_DIR = f'{PATH_TO_DATA}Flickr8k_Dataset/'

***

## Understanding and formatting the data

In [3]:
# Load the raw caption file, one record per line.
# Iterating over the file (instead of `read().split('\n')[:-1]`) keeps the last
# record even when the file does not end with a newline; blank lines are
# skipped because they carry no record.
with open(TEXT_FILES_DIR + 'Flickr8k.token.txt', 'r') as flickr8_token:
    raw_image_description = [
        line.rstrip('\n') for line in flickr8_token if line.strip()
    ]

# Preview the first ten records.
print('\n'.join(raw_image_description[:10]))

1000268201_693b08cb0e.jpg#0	A child in a pink dress is climbing up a set of stairs in an entry way .
1000268201_693b08cb0e.jpg#1	A girl going into a wooden building .
1000268201_693b08cb0e.jpg#2	A little girl climbing into a wooden playhouse .
1000268201_693b08cb0e.jpg#3	A little girl climbing the stairs to her playhouse .
1000268201_693b08cb0e.jpg#4	A little girl in a pink dress going into a wooden cabin .
1001773457_577c3a7d70.jpg#0	A black dog and a spotted dog are fighting
1001773457_577c3a7d70.jpg#1	A black dog and a tri-colored dog playing with each other on the road .
1001773457_577c3a7d70.jpg#2	A black dog and a white dog with brown spots are staring at each other in the street .
1001773457_577c3a7d70.jpg#3	Two dogs of different breeds looking at each other on the road .
1001773457_577c3a7d70.jpg#4	Two dogs on pavement moving toward each other .


In [4]:
def get_dict_image_descriptions(raw_image_description):
    """Group raw caption records by image name.

    Each record is split twice: the image name is the text before the first
    '.' of the record, and the caption is the field right after the first
    TAB character.

    Records of the same image do not need to be adjacent: captions are always
    appended to the image's existing list, so a repeated image never loses the
    captions collected earlier. Blank records are ignored.

    Args:
        raw_image_description: list of raw record strings.

    Returns:
        dict mapping image name -> list of captions, in input order.

    Raises:
        IndexError: if a non-blank record contains no TAB character.
    """
    image_descriptions = dict()

    for record in raw_image_description:
        # A blank record has neither an image name nor a caption.
        if not record.strip():
            continue

        img_name = record.split('.')[0]
        descr = record.split('\t')[1]

        if img_name not in image_descriptions:
            image_descriptions[img_name] = []
        image_descriptions[img_name].append(descr)

    return image_descriptions

# Build the image-name -> captions mapping from the raw caption records.
image_descriptions = get_dict_image_descriptions(raw_image_description)

In [5]:
# Inspect the captions collected for one sample image.
image_descriptions['101654506_8eb26cfb60']

['A brown and white dog is running through the snow .',
 'A dog is running in the snow',
 'A dog running through snow .',
 'a white and brown dog is running through a snow covered field .',
 'The white and brown dog is running over the surface of the snow .']

***

## Data Cleaning

In [6]:
def clean(sentence):
    """Normalize a caption into a space-separated string of plain words.

    The sentence is split on whitespace and every token is lower-cased and
    stripped of all `string.punctuation` characters. A token is then kept only
    if it is longer than one character (or is exactly 'a') and consists solely
    of alphabetic characters, which discards anything containing digits.

    Returns the surviving tokens joined by single spaces.
    """
    kept = []

    for raw_token in sentence.split():
        # Lower-case, then drop punctuation characters.
        word = ''.join(ch for ch in raw_token.lower() if ch not in punctuation)

        # Skip hanging single characters (and empty leftovers), except 'a'.
        if len(word) <= 1 and word != 'a':
            continue

        # Skip tokens that are not purely alphabetic (e.g. contain digits).
        if not word.isalpha():
            continue

        kept.append(word)

    return ' '.join(kept)

In [7]:
# Quick check of clean() on a messy sample string.
clean('A HellO! How12 are yoU>??')

'a hello are you'

In [8]:
# Replace each image's captions with their cleaned versions (keys are unchanged,
# so assigning while iterating over items() is safe).
for img_name, descriptions in image_descriptions.items():
    image_descriptions[img_name] = list(map(clean, descriptions))

In [9]:
# Same sample image as before, now showing the cleaned captions.
image_descriptions['101654506_8eb26cfb60']

['a brown and white dog is running through the snow',
 'a dog is running in the snow',
 'a dog running through snow',
 'a white and brown dog is running through a snow covered field',
 'the white and brown dog is running over the surface of the snow']

***

## Create a Vocabulary

In [10]:
# Collect every distinct word that appears in any caption.
# A set comprehension states the intent directly, instead of running a list
# comprehension only for its side effects (which builds throwaway lists).
vocabulary = {
    word
    for captions in image_descriptions.values()
    for caption in captions
    for word in caption.split()
}

print(f'Original Vocabulary size: {len(vocabulary)}')

Original Vocabulary size: 8764


### Add zero for zero-padding

In [11]:
# Reserve '0' as the padding token. `add` inserts the token as a whole;
# `update` iterates over the string and inserts each character separately,
# which only happens to give the same result for a one-character token.
vocabulary.add('0')

### Sort the Vocabulary lexically

In [12]:
# Turn the vocabulary set into a sorted list so every token has a fixed position.
# NOTE(review): presumably this puts the '0' padding token first, since the
# other tokens are lower-case alphabetic words — confirm.
vocabulary = list(vocabulary)
vocabulary.sort()

## Word2Idx and Idx2Word maps

In [13]:
# Two-way lookup between tokens and their position in the sorted vocabulary.
# `enumerate` walks the vocabulary once; the previous `list(vocabulary)[i]`
# copied the whole vocabulary on every iteration (quadratic work).
word2idx = {word: idx for idx, word in enumerate(vocabulary)}
idx2word = dict(enumerate(vocabulary))

In [14]:
# Spot-check: index assigned to the word 'dog'.
word2idx['dog']

2222

In [15]:
# Spot-check the reverse lookup for index 2222.
idx2word[2222]

'dog'

***

## Max Description Length

In [20]:
# Flatten the per-image caption lists into one list of captions.
descriptions = [
    descr
    for captions in image_descriptions.values()
    for descr in captions
]

print(f'Total number of descriptions: {len(descriptions)}')

Total number of descriptions: 40460


In [21]:
# Pick the caption with the most whitespace-separated words
# (the first such caption wins when several share the maximum).
word_counts = [len(descr.split()) for descr in descriptions]
longets_description = descriptions[word_counts.index(max(word_counts))]

print(f'Longes description:\n\n"{longets_description}"')

Longes description:

"an africanamerican man wearing a green sweatshirt and blue vest is holding up dollar bills in front of his face while standing on a busy sidewalk in front of a group of men playing instruments"


In [22]:
# Number of words in the longest caption.
# NOTE(review): this is measured before the '<SOS>'/'<EOS>' markers are added
# further down in the notebook — confirm whether downstream padding needs +2.
longest_tokens = longets_description.split()
max_description_length = len(longest_tokens)

print(f'Max Description Length: {max_description_length}')

Max Description Length: 35


***

## Train/Dev/Test split

In [23]:
def load_set_images(filename):
    """Read a split file and return the image names it lists.

    The file is read line by line; each non-blank line contributes the text
    before its first '.' (i.e. the file name without its extension).

    Iterating over the file, rather than slicing `read().split('\\n')[:-1]`,
    keeps the last entry even when the file has no trailing newline.

    Args:
        filename: path of the text file to read.

    Returns:
        list of image names, in file order.
    """
    with open(filename, 'r') as f:
        return [
            line.rstrip('\n').split('.')[0]
            for line in f
            if line.strip()  # skip blank lines
        ]

In [24]:
# Load the image names of the three predefined splits, in train/dev/test order.
train_images, dev_images, test_images = (
    load_set_images(TEXT_FILES_DIR + split_file)
    for split_file in (
        'Flickr_8k.trainImages.txt',
        'Flickr_8k.devImages.txt',
        'Flickr_8k.testImages.txt',
    )
)

print(f'Train size: {len(train_images)}')
print(f'Dev size: {len(dev_images)}')
print(f'Test size: {len(test_images)}')

Train size: 6000
Dev size: 1000
Test size: 1000


In [25]:
def init_image_descriptions_set(set_images, descriptions_by_image=None):
    """Build the caption dictionary for one data split.

    Every caption of every listed image is wrapped as
    '<SOS> ' + caption + ' <EOS>'.

    Args:
        set_images: iterable of image names belonging to the split.
        descriptions_by_image: optional mapping image name -> list of captions.
            When omitted, the notebook-level `image_descriptions` mapping is
            used, which is the original behaviour.

    Returns:
        dict mapping each image name in `set_images` to its wrapped captions.

    Raises:
        KeyError: if an image name is missing from the captions mapping.
    """
    # NOTE(review): the '<SOS>'/'<EOS>' markers are not among the tokens put
    # into `vocabulary` earlier in this notebook — confirm they are added
    # before captions are encoded.
    if descriptions_by_image is None:
        # Backward-compatible default: fall back to the global mapping.
        descriptions_by_image = image_descriptions

    return {
        img_name: [
            '<SOS> ' + descr + ' <EOS>'
            for descr in descriptions_by_image[img_name]
        ]
        for img_name in set_images
    }

In [26]:
# Build the wrapped-caption dictionaries for the train, dev and test splits.
train_image_descriptions, dev_image_descriptions, test_image_descriptions = map(
    init_image_descriptions_set,
    (train_images, dev_images, test_images),
)

print(f'Train size: {len(train_image_descriptions)}')
print(f'Dev size: {len(dev_image_descriptions)}')
print(f'Test size: {len(test_image_descriptions)}')

Train size: 6000
Dev size: 1000
Test size: 1000


In [27]:
# Inspect the wrapped captions of one training image.
train_image_descriptions['2513260012_03d33305cf']

['<SOS> a black dog is running after a white dog in the snow <EOS>',
 '<SOS> black dog chasing brown dog through snow <EOS>',
 '<SOS> two dogs chase each other across the snowy ground <EOS>',
 '<SOS> two dogs play together in the snow <EOS>',
 '<SOS> two dogs running through a low lying body of water <EOS>']

`<SOS>` = **Start Of Sequence** <br>
`<EOS>` = **End Of Sequence** <br>

***