In [1]:
from os import listdir
from pickle import dump, load
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dropout, Embedding, LSTM, Dense, Input
from keras.models import Model
from keras.layers.merging import add
import numpy as np
import tensorflow as tf
from nltk.translate.bleu_score import corpus_bleu



RANDOM_SEED = 42

In [2]:
captions = pd.read_csv('data/flickr_8k/captions.txt')

In [3]:
def clean_description(text):
    '''
    returns new array of tokens representing the text

    - lowercased
    - removes 1 - letter punctuation
    - removes numbers
    - appends 's to previous words
    - reconstructs string

    <start> is appended to the start
    <end> is appended to the end

    Notes:
    maybe keep in numbers
    maybe remove all dashes 
    '''
    output = []

    text = text.lower().replace('"', '')
    
    tokens = text.split()
    for token in tokens:
        if token.isalpha() or ((not token.isalpha() and len(token) > 1) and not token.isnumeric()):
            output.append(token)

    for i, token in enumerate(output):
        if token == "'s":
            output[i-1] = output[i-1] + "'s"
            output.remove("'s")

        if len(token) == 2 and '.' in token:
            output[i] = token.replace('.', '')
    
    output = ['<start>'] + output + ['<end>']

    return output

def clean_descriptions(filename):
    data = pd.read_csv('data/flickr_8k/captions.txt')
    data['caption'] = data['caption'].apply(lambda caption: clean_description(caption))
    return data

In [4]:
# output = []
# for i, caption in enumerate(captions['caption']):
#     words = clean_description(caption)
#     for word in words:
#         word = word.replace('-', '')
#         word = word.replace("'", '')
#         if not word.isalpha() and not word == '.' and not word == ',':
#             output.append(word)

# print(output)

# clean_description('test sequence saaa')

In [5]:
cleaned_data = clean_descriptions('data/flickr_8k/captions.txt')
print(cleaned_data)

                           image  \
0      1000268201_693b08cb0e.jpg   
1      1000268201_693b08cb0e.jpg   
2      1000268201_693b08cb0e.jpg   
3      1000268201_693b08cb0e.jpg   
4      1000268201_693b08cb0e.jpg   
...                          ...   
40450   997722733_0cb5439472.jpg   
40451   997722733_0cb5439472.jpg   
40452   997722733_0cb5439472.jpg   
40453   997722733_0cb5439472.jpg   
40454   997722733_0cb5439472.jpg   

                                                 caption  
0      [<start>, a, child, in, a, pink, dress, is, cl...  
1      [<start>, a, girl, going, into, a, wooden, bui...  
2      [<start>, a, little, girl, climbing, into, a, ...  
3      [<start>, a, little, girl, climbing, the, stai...  
4      [<start>, a, little, girl, in, a, pink, dress,...  
...                                                  ...  
40450  [<start>, a, man, in, a, pink, shirt, climbs, ...  
40451  [<start>, a, man, is, rock, climbing, high, in...  
40452  [<start>, a, person, in, a, r

In [6]:
all_filenames = list(set(cleaned_data['image']))
train_filenames, test_filenames = train_test_split(all_filenames, test_size=0.2, random_state=RANDOM_SEED)
test_filenames, validation_filenames = train_test_split(test_filenames, test_size=0.5, random_state=RANDOM_SEED)

training_samples = cleaned_data.loc[cleaned_data['image'].isin(train_filenames)]
validation_samples = cleaned_data.loc[cleaned_data['image'].isin(validation_filenames)]
test_samples = cleaned_data.loc[cleaned_data['image'].isin(test_filenames)]

In [7]:
training_samples

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,"[<start>, a, child, in, a, pink, dress, is, cl..."
1,1000268201_693b08cb0e.jpg,"[<start>, a, girl, going, into, a, wooden, bui..."
2,1000268201_693b08cb0e.jpg,"[<start>, a, little, girl, climbing, into, a, ..."
3,1000268201_693b08cb0e.jpg,"[<start>, a, little, girl, climbing, the, stai..."
4,1000268201_693b08cb0e.jpg,"[<start>, a, little, girl, in, a, pink, dress,..."
...,...,...
40450,997722733_0cb5439472.jpg,"[<start>, a, man, in, a, pink, shirt, climbs, ..."
40451,997722733_0cb5439472.jpg,"[<start>, a, man, is, rock, climbing, high, in..."
40452,997722733_0cb5439472.jpg,"[<start>, a, person, in, a, red, shirt, climbi..."
40453,997722733_0cb5439472.jpg,"[<start>, a, rock, climber, in, a, red, shirt,..."


In [8]:
print(train_filenames[0])
len(train_filenames), len(test_filenames), len(validation_filenames)

3171250845_5ae0d2a8bc.jpg


(6472, 809, 810)

In [9]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(training_samples['caption']))
VOCAB_SIZE = len(tokenizer.word_index) + 1
# training_samples_indexed = tokenizer.texts_to_sequences(list(training_samples['caption']))


In [10]:
VOCAB_SIZE

8089

In [11]:
def samples_to_dict(samples):
	descriptions = dict()
	for image, caption in zip(samples['image'], samples['caption']):
		if image not in descriptions.keys():
			descriptions[image] = [caption]
		else:
			descriptions[image].append(caption)	

		
	return descriptions

training_dict = samples_to_dict(training_samples)
validation_dict = samples_to_dict(validation_samples)
test_dict = samples_to_dict(test_samples)

In [12]:
# load photo features
def load_photo_features(features_file, corresponding_filenames):
    # load all features
    all_features = load(open(features_file, 'rb'))
    # filter features
    features = {k: all_features[k.split('.')[0]] for k in corresponding_filenames}
    return features

In [13]:
train_image_features = load_photo_features('8k_features.pkl', train_filenames)
test_image_features = load_photo_features('8k_features.pkl', test_filenames)
val_image_features = load_photo_features('8k_features.pkl', validation_filenames)

In [14]:
MAX_LENGTH = max(training_samples['caption'].apply(lambda c : len(c)))

In [15]:

def dictionary_to_model_samples(dictionary, image_features):
    # list of image features
    X1 = []
    # word inputs (as word indexes)
    X2 = []
    # next word
    y = []

    for filename, samples in dictionary.items():
        samples = tokenizer.texts_to_sequences(samples)
        for sample in samples:
            for i in range(len(sample) - 1):
                X1.append(image_features[filename].reshape(-1,))
                # padding x2 to always be max sequence length
                x2 = pad_sequences([sample[:i + 1]], maxlen=MAX_LENGTH, padding='post')[0]
                X2.append(x2)


                y.append(to_categorical(sample[i+1], VOCAB_SIZE))

    return tf.convert_to_tensor(np.asarray(X1)), tf.convert_to_tensor(np.asarray(X2)), tf.convert_to_tensor(y)


In [22]:
# define the RNN model to predict image captions
def generate_model(vocab_size, max_length):

    # first input - VGG generated image features
    image_input = Input(shape=(4096,))
    condensed_image = Dense(256, activation='relu')(image_input)

    # taking in text input which is words 1 through n-1 where y is next word
    # all sequences are padded to be max_length so netowrk has same sized inputs
    text_input = Input(shape=(max_length,))
    embedding_layer = Embedding(vocab_size, 256, mask_zero=True)(text_input)
    lstm_layer = LSTM(256)(embedding_layer)

    # combining condensed image and text layers via addition
    combo_layer1 = add([condensed_image, lstm_layer])
    combo_layer2 = Dense(256, activation='relu')(combo_layer1)

    # softmax layer for all words in vocabulary to generate final prediction
    output = Dense(vocab_size, activation='softmax')(combo_layer2)

    # creating and compiling model
    model = Model(inputs=[image_input, text_input], outputs=output)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

In [17]:
# X1train, X2train, ytrain = dictionary_to_model_samples(training_dict, train_image_features)
X1test, X2test, ytest = dictionary_to_model_samples(test_dict, test_image_features)
X1val, X2val, yval = dictionary_to_model_samples(validation_dict, val_image_features)

In [18]:
X1test[0].shape

TensorShape([4096])

In [19]:
X2test[0].shape

TensorShape([37])

In [20]:
ytest[0].shape

TensorShape([8089])

In [23]:
# fit model
model = generate_model(VOCAB_SIZE, MAX_LENGTH)
model.fit([X1val, X2val], yval, epochs=1, verbose=1, validation_data=([X1test, X2test], ytest))
model.save('VGG_RNN_model')

   2/1468 [..............................] - ETA: 1:33:45 - loss: 8.9752

KeyboardInterrupt: 

In [58]:
def generate_caption(model, filename, image_features):
    tokens = ['<start>']
    file_image_features = image_features[filename]
    for i in range(0, MAX_LENGTH):
        seq = tokenizer.texts_to_sequences([tokens])[0]
        seq = pad_sequences([seq], maxlen=MAX_LENGTH, padding='post')
        pred = model.predict([file_image_features, seq], verbose=0).argmax()
        next_word = tokenizer.index_word[pred]
        tokens.append(next_word)
        if next_word == '<end>':
            break
    return tokens

In [59]:
generate_caption(model, '667626_18933d713e.jpg', train_image_features)

['<start>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>',
 '<end>']

In [60]:
def model_score(model, image_features, eval_dict):
    # creating all predictions for file in given dict
    all_pred = [generate_caption(model, file, image_features) for file in eval_dict.keys()]
    
    # getting original captions for each file in given dict
    all_original_captions = list(eval_dict.values)
    
def bleu_score(all_original_captions, all_pred):
    # printing final model score
    print('BLEU-1: %f' % corpus_bleu(all_original_captions, all_pred, weights=(1.0, 0, 0, 0)))
    return corpus_bleu(all_original_captions, all_pred, weights=(1.0, 0, 0, 0))





    

In [62]:
model_score(model, test_image_features, test_dict)

KeyboardInterrupt: 