In [181]:
from os import listdir
from pickle import dump, load
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Seed handed to train_test_split as random_state so the shuffles are fixed.
RANDOM_SEED = 42

In [3]:
# Load the raw captions file into a DataFrame (untouched, un-tokenised text).
captions = pd.read_csv('data/flickr_8k/captions.txt')

In [87]:
def clean_description(text):
    '''
    Tokenise one caption and wrap it in <start> / <end> markers.

    Steps, in order:
    - lowercase the text and strip double quotes
    - split on whitespace
    - keep purely alphabetic tokens; of the other tokens keep only those
      longer than one character that are not purely numeric (this drops
      stand-alone punctuation such as "," and numbers such as "42")
    - glue a stand-alone "'s" token onto the word before it
    - strip the "." from two-character tokens such as "a."

    A "'s" that has no preceding word is dropped, and so is any token that
    ends up empty after the "." has been stripped.

    Parameters:
    text -- the caption as a single string

    Returns:
    a new list of string tokens: ['<start>', ..., '<end>']

    Notes:
    maybe keep in numbers
    maybe remove all dashes
    '''
    text = text.lower().replace('"', '')

    kept = [
        token for token in text.split()
        if token.isalpha() or (len(token) > 1 and not token.isnumeric())
    ]

    # Build the result in a single pass instead of editing the list while
    # iterating over it: removing an element mid-iteration shifts the list and
    # makes the loop skip the token that follows every "'s".
    output = []
    for token in kept:
        if token == "'s":
            # Attach the possessive to the previous word. Without a previous
            # word there is nothing to attach to (indexing i-1 would wrap
            # around to the last token), so the marker is discarded.
            if output:
                output[-1] += "'s"
            continue

        if len(token) == 2 and '.' in token:
            token = token.replace('.', '')
            if not token:
                # e.g. ".." -> "": do not emit an empty-string token.
                continue

        output.append(token)

    return ['<start>'] + output + ['<end>']

def clean_descriptions(filename):
    '''
    Read a captions CSV and tokenise its 'caption' column.

    Parameters:
    filename -- path of the CSV file to read; it must contain a 'caption'
                column holding the caption text

    Returns:
    the loaded DataFrame with every 'caption' value replaced by the token
    list produced by clean_description
    '''
    # Honour the argument: the path used to be hard-coded here, so passing a
    # different file silently had no effect.
    data = pd.read_csv(filename)
    data['caption'] = data['caption'].apply(clean_description)
    return data

In [86]:
# output = []
# for i, caption in enumerate(captions['caption']):
#     words = clean_description(caption)
#     for word in words:
#         word = word.replace('-', '')
#         word = word.replace("'", '')
#         if not word.isalpha() and not word == '.' and not word == ',':
#             output.append(word)

# print(output)

# clean_description('test sequence saaa')

['<start>', 'test', 'sequence', 'saaa', '<end>']

In [90]:
# Load the captions file and replace each caption by its list of tokens.
cleaned_data = clean_descriptions('data/flickr_8k/captions.txt')
# Plain-text dump of the resulting frame (columns: image, caption).
print(cleaned_data)

                           image  \
0      1000268201_693b08cb0e.jpg   
1      1000268201_693b08cb0e.jpg   
2      1000268201_693b08cb0e.jpg   
3      1000268201_693b08cb0e.jpg   
4      1000268201_693b08cb0e.jpg   
...                          ...   
40450   997722733_0cb5439472.jpg   
40451   997722733_0cb5439472.jpg   
40452   997722733_0cb5439472.jpg   
40453   997722733_0cb5439472.jpg   
40454   997722733_0cb5439472.jpg   

                                                 caption  
0      [<start>, a, child, in, a, pink, dress, is, cl...  
1      [<start>, a, girl, going, into, a, wooden, bui...  
2      [<start>, a, little, girl, climbing, into, a, ...  
3      [<start>, a, little, girl, climbing, the, stai...  
4      [<start>, a, little, girl, in, a, pink, dress,...  
...                                                  ...  
40450  [<start>, a, man, in, a, pink, shirt, climbs, ...  
40451  [<start>, a, man, is, rock, climbing, high, in...  
40452  [<start>, a, person, in, a, r

In [116]:
# Split by image filename (not by caption row) so that all captions of one
# image end up in the same split.
# The unique filenames are sorted first: iterating a set of strings has no
# stable order between interpreter runs (string hashing is randomised), so
# random_state alone would not make the split reproducible.
all_filenames = sorted(cleaned_data['image'].unique())

# 80% train; the held-out 20% is then halved into test and validation.
train_filenames, holdout_filenames = train_test_split(all_filenames, test_size=0.2, random_state=RANDOM_SEED)
test_filenames, validation_filenames = train_test_split(holdout_filenames, test_size=0.5, random_state=RANDOM_SEED)

# Caption rows belonging to each group of filenames.
training_samples = cleaned_data.loc[cleaned_data['image'].isin(train_filenames)]
validation_samples = cleaned_data.loc[cleaned_data['image'].isin(validation_filenames)]
test_samples = cleaned_data.loc[cleaned_data['image'].isin(test_filenames)]

In [131]:
# Inspect the caption rows that were assigned to the training split.
training_samples

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,"[<start>, a, child, in, a, pink, dress, is, cl..."
1,1000268201_693b08cb0e.jpg,"[<start>, a, girl, going, into, a, wooden, bui..."
2,1000268201_693b08cb0e.jpg,"[<start>, a, little, girl, climbing, into, a, ..."
3,1000268201_693b08cb0e.jpg,"[<start>, a, little, girl, climbing, the, stai..."
4,1000268201_693b08cb0e.jpg,"[<start>, a, little, girl, in, a, pink, dress,..."
...,...,...
40445,997338199_7343367d7f.jpg,"[<start>, a, person, stands, near, golden, wal..."
40446,997338199_7343367d7f.jpg,"[<start>, a, woman, behind, a, scrolled, wall,..."
40447,997338199_7343367d7f.jpg,"[<start>, a, woman, standing, near, a, decorat..."
40448,997338199_7343367d7f.jpg,"[<start>, the, walls, are, covered, in, gold, ..."


In [117]:
# Sanity check: one training filename, then the number of images in each
# split as (train, test, validation).
print(train_filenames[0])
len(train_filenames), len(test_filenames), len(validation_filenames)

3298199743_d8dd8f94a0.jpg


(6472, 809, 810)

In [178]:
# Fit the vocabulary on the training captions only; each caption is already a
# list of tokens.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(training_samples['caption']))
# +1 because the Keras Tokenizer numbers words from 1 (index 0 is reserved),
# so this is the count of index values including 0.
VOCAB_SIZE = len(tokenizer.word_index) + 1
# training_samples_indexed = tokenizer.texts_to_sequences(list(training_samples['caption']))


In [179]:
# Show the vocabulary size derived from the fitted tokenizer.
VOCAB_SIZE

8133

In [147]:
def samples_to_dict(samples):
    """Group captions by the image they describe.

    Parameters:
    samples -- object indexable by 'image' and 'caption', the two giving
               parallel sequences (e.g. a DataFrame with those columns)

    Returns:
    dict mapping each image value to the list of its captions, both in
    order of first appearance
    """
    grouped = {}
    for image_name, tokens in zip(samples['image'], samples['caption']):
        # setdefault creates the list the first time an image is seen.
        grouped.setdefault(image_name, []).append(tokens)
    return grouped

# One {image filename: [captions]} mapping per split.
training_dict = samples_to_dict(training_samples)
validation_dict = samples_to_dict(validation_samples)
test_dict = samples_to_dict(test_samples)

In [152]:
# load photo features
def load_photo_features(features_file, corresponding_filenames):
    """Load pickled image features and keep only the requested images.

    Parameters:
    features_file -- path of a pickle file holding a mapping keyed by image
                     name without extension
    corresponding_filenames -- iterable of image filenames to keep

    Returns:
    dict mapping each requested filename (as given) to its stored features

    Raises:
    KeyError if a requested image has no entry in the pickled mapping.
    """
    # NOTE: unpickling can execute arbitrary code - only load feature files
    # from a trusted source.
    # The with-block guarantees the file handle is closed after loading.
    with open(features_file, 'rb') as handle:
        all_features = load(handle)

    # Stored keys are the part of the filename before the first '.'.
    features = {k: all_features[k.split('.')[0]] for k in corresponding_filenames}
    return features

In [154]:
# Features for the training images only, keyed by full image filename.
training_image_features = load_photo_features('8k_features.pkl', train_filenames)

In [182]:

def dictionary_to_model_samples(dictionary):
    """Expand captions into (image, word prefix, next word) training samples.

    Each caption is converted to word indexes with the module-level
    `tokenizer`; every proper, non-empty prefix of that index sequence then
    yields one sample whose target is the word that follows the prefix.

    Parameters:
    dictionary -- mapping of image filename to a list of captions

    Returns:
    tuple of three parallel lists:
    - the image filename of each sample
    - the prefix as a list of word indexes (variable length, not padded)
    - the next word, one-hot encoded over VOCAB_SIZE classes
    """
    image_keys, prefixes, next_words = [], [], []

    for filename, image_captions in dictionary.items():
        for sequence in tokenizer.texts_to_sequences(image_captions):
            # split_at is the position of the word to predict; everything
            # before it is the model's text input.
            for split_at in range(1, len(sequence)):
                image_keys.append(filename)
                prefixes.append(sequence[:split_at])
                next_words.append(to_categorical(sequence[split_at], VOCAB_SIZE))

    return image_keys, prefixes, next_words

# Build the samples for the training split and display the one-hot targets
# (third element of the returned tuple).
# NOTE(review): this builds every sample and dumps the whole list into the
# cell output - consider slicing before displaying.
dictionary_to_model_samples(training_dict)[2]

[array([0., 1., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 1., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 1., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 1., 0., ..., 0.,

In [187]:
# Token count of the longest training caption (<start>/<end> included).
MAX_LENGTH = max(len(caption) for caption in training_samples['caption'])