In [57]:
from os import listdir
from pickle import dump, load
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

# from keras.layers import Dropout, Embedding, LSTM, Dense, Input
from tensorflow.keras.layers import Dropout, Embedding, LSTM, Dense, Input, add
from keras.models import Model

import numpy as np
import tensorflow as tf
from nltk.translate.bleu_score import corpus_bleu
from common import get_tokenizer_from_samples, clean_descriptions, samples_to_dict, import_image_features



RANDOM_SEED = 42

In [4]:
import tensorflow as tf
tf.__version__

# import keras
# keras.__version__

'2.11.0'

In [5]:
captions = pd.read_csv('data/flickr_8k/captions.txt')

In [28]:
cleaned_data = clean_descriptions('data/flickr_8k/captions.txt')
print(cleaned_data)

                           image  \
0      1000268201_693b08cb0e.jpg   
1      1000268201_693b08cb0e.jpg   
2      1000268201_693b08cb0e.jpg   
3      1000268201_693b08cb0e.jpg   
4      1000268201_693b08cb0e.jpg   
...                          ...   
40450   997722733_0cb5439472.jpg   
40451   997722733_0cb5439472.jpg   
40452   997722733_0cb5439472.jpg   
40453   997722733_0cb5439472.jpg   
40454   997722733_0cb5439472.jpg   

                                                 caption  
0      [<start>, a, child, in, a, pink, dress, is, cl...  
1      [<start>, a, girl, going, into, a, wooden, bui...  
2      [<start>, a, little, girl, climbing, into, a, ...  
3      [<start>, a, little, girl, climbing, the, stai...  
4      [<start>, a, little, girl, in, a, pink, dress,...  
...                                                  ...  
40450  [<start>, a, man, in, a, pink, shirt, climbs, ...  
40451  [<start>, a, man, is, rock, climbing, high, in...  
40452  [<start>, a, person, in, a, r

In [8]:
all_filenames = list(set(cleaned_data['image']))
train_filenames, test_filenames = train_test_split(all_filenames, test_size=0.2, random_state=RANDOM_SEED)
test_filenames, validation_filenames = train_test_split(test_filenames, test_size=0.5, random_state=RANDOM_SEED)

training_samples = cleaned_data.loc[cleaned_data['image'].isin(train_filenames)]
validation_samples = cleaned_data.loc[cleaned_data['image'].isin(validation_filenames)]
test_samples = cleaned_data.loc[cleaned_data['image'].isin(test_filenames)]

In [9]:
training_samples

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,"[<start>, a, child, in, a, pink, dress, is, cl..."
1,1000268201_693b08cb0e.jpg,"[<start>, a, girl, going, into, a, wooden, bui..."
2,1000268201_693b08cb0e.jpg,"[<start>, a, little, girl, climbing, into, a, ..."
3,1000268201_693b08cb0e.jpg,"[<start>, a, little, girl, climbing, the, stai..."
4,1000268201_693b08cb0e.jpg,"[<start>, a, little, girl, in, a, pink, dress,..."
...,...,...
40450,997722733_0cb5439472.jpg,"[<start>, a, man, in, a, pink, shirt, climbs, ..."
40451,997722733_0cb5439472.jpg,"[<start>, a, man, is, rock, climbing, high, in..."
40452,997722733_0cb5439472.jpg,"[<start>, a, person, in, a, red, shirt, climbi..."
40453,997722733_0cb5439472.jpg,"[<start>, a, rock, climber, in, a, red, shirt,..."


In [58]:
tokenizer = get_tokenizer_from_samples(training_samples)
VOCAB_SIZE = len(tokenizer.word_index) + 1
print(VOCAB_SIZE)

8087


In [29]:
training_dict = samples_to_dict(training_samples)
validation_dict = samples_to_dict(validation_samples)
test_dict = samples_to_dict(test_samples)

In [55]:
train_image_features = import_image_features('8k_features.pkl', train_filenames)
test_image_features = import_image_features('8k_features.pkl', test_filenames)
val_image_features = import_image_features('8k_features.pkl', validation_filenames)

In [56]:
MAX_LENGTH = max(training_samples['caption'].apply(lambda caption : len(caption)))

In [15]:
def dictionary_to_model_samples(dictionary, image_features):
    # list of image features
    X1 = []
    # word inputs (as word indexes)
    X2 = []
    # next word
    y = []

    for filename, samples in dictionary.items():
        samples = tokenizer.texts_to_sequences(samples)
        for sample in samples:
            for i in range(len(sample) - 1):
                X1.append(image_features[filename].reshape(-1,))
                # padding x2 to always be max sequence length
                x2 = pad_sequences([sample[:i + 1]], maxlen=MAX_LENGTH, padding='post')[0]
                X2.append(x2)


                y.append(to_categorical(sample[i+1], VOCAB_SIZE))

    return tf.convert_to_tensor(np.asarray(X1)), tf.convert_to_tensor(np.asarray(X2)), tf.convert_to_tensor(y)

In [16]:
# TODO change and comment

def create_sequences(image_features, descriptions):
    '''
    
    given 5 descriptions corresponding to one image, output a list of:
    (image feature vector, first i-1 words in a sequence, ith word) 

    '''
    # list of image features
    X1 = []
    # word inputs (as word indexes)
    X2 = []
    # next word
    y = []

    image_features = image_features.reshape(-1,)

    descriptions = tokenizer.texts_to_sequences(descriptions)
    for description in descriptions:
        for i in range(len(description) - 1):
            X1.append(image_features)

            # padding x2 to always be max sequence length
            x2 = pad_sequences([description[:i + 1]], maxlen=MAX_LENGTH, padding='post')[0]
            X2.append(x2)
            y.append(to_categorical(description[i+1], VOCAB_SIZE))

    return tf.convert_to_tensor(np.asarray(X1)), tf.convert_to_tensor(np.asarray(X2)), tf.convert_to_tensor(y)


In [17]:
# TODO change and comment

# data generator, intended to be used in a call to model.fit_generator()
def data_generator(filename_description_dictionary, img_features_dict, loops):
    while loops >= 1:
        np.random.seed(RANDOM_SEED)

        # shuffle filename order for better distribution over multiple loops (epochs)
        all_filenames = list(filename_description_dictionary.keys())
        np.random.shuffle(all_filenames)
        
        # loop for ever over files
        for filename in all_filenames:
            # get the corresponding descriptions
            descriptions = filename_description_dictionary[filename]

            # retrieve the photo feature
            img_features = img_features_dict[filename][0]

            in_img, in_seq, out_word = create_sequences(img_features, descriptions)
            yield [in_img, in_seq], out_word

        loops -= 1

In [18]:
test_gen = data_generator(training_dict, train_image_features, 1)

In [19]:
for item in test_gen:
    print(item[1].shape)
    break

(81, 8087)


In [20]:
# TODO maybe add dropouts

# define the RNN model to predict image captions
def generate_model(vocab_size, max_length):

    # first input - VGG generated image features
    image_input = Input(shape=(4096,))
    condensed_image = Dense(256, activation='relu')(image_input)

    # taking in text input which is words 1 through n-1 where y is next word
    # all sequences are padded to be max_length so netowrk has same sized inputs
    text_input = Input(shape=(max_length,))
    embedding_layer = Embedding(vocab_size, 256, mask_zero=True)(text_input)
    lstm_layer = LSTM(256)(embedding_layer)

    # combining condensed image and text layers via addition
    combo_layer1 = add([condensed_image, lstm_layer])
    combo_layer2 = Dense(256, activation='relu')(combo_layer1)

    # softmax layer for all words in vocabulary to generate final prediction
    output = Dense(vocab_size, activation='softmax')(combo_layer2)

    # creating and compiling model
    model = Model(inputs=[image_input, text_input], outputs=output)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

In [21]:
# X1train, X2train, ytrain = dictionary_to_model_samples(training_dict, train_image_features)
# X1test, X2test, ytest = dictionary_to_model_samples(test_dict, test_image_features)
# X1val, X2val, yval = dictionary_to_model_samples(validation_dict, val_image_features)

In [22]:
# fit model
model = generate_model(VOCAB_SIZE, MAX_LENGTH)

In [23]:
# model.fit([X1train, X2train], ytrain, epochs=15, verbose=1, validation_data=([X1val, X2val], yval))

generator = data_generator(training_dict, train_image_features, 5)
model.fit_generator(generator, epochs=1, verbose=1)


  model.fit_generator(generator, epochs=1, verbose=1)


    302/Unknown - 465s 2s/step - loss: 5.6007

KeyboardInterrupt: 

In [None]:
model.save('5_epochs_random_order_trained_on_train_with_generator')



INFO:tensorflow:Assets written to: 5_epochs_random_order_trained_on_train_with_generator\assets


INFO:tensorflow:Assets written to: 5_epochs_random_order_trained_on_train_with_generator\assets
  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)


In [None]:
# TODO comment

def generate_caption(model, filename, image_features):
    tokens = ['<start>']
    file_image_features = image_features[filename]
    for i in range(0, MAX_LENGTH):
        seq = tokenizer.texts_to_sequences([tokens])[0]
        seq = pad_sequences([seq], maxlen=MAX_LENGTH, padding='post')
        pred = model.predict([file_image_features, seq], verbose=0).argmax()
        next_word = tokenizer.index_word[pred]
        tokens.append(next_word)
        if next_word == '<end>':
            break
    return tokens

In [None]:
test_filenames

['2934022873_3fdd69aee4.jpg',
 '537758332_8beb9cf522.jpg',
 '2929272606_2a5923b38e.jpg',
 '3724759125_2dc0e1f4a3.jpg',
 '3404906655_bc51c69c1e.jpg',
 '2822891602_ff61df2ece.jpg',
 '98377566_e4674d1ebd.jpg',
 '161905204_247c6ca6de.jpg',
 '1459250022_bf1eddad11.jpg',
 '3423802527_94bd2b23b0.jpg',
 '507758961_e63ca126cc.jpg',
 '2551632823_0cb7dd779b.jpg',
 '2623982903_58ec7c5026.jpg',
 '3173014908_b3e69594b6.jpg',
 '115684808_cb01227802.jpg',
 '3555729342_cc7a3b67fd.jpg',
 '3284887033_e2e48f1863.jpg',
 '3677954655_df4c0845aa.jpg',
 '3396157719_6807d52a81.jpg',
 '542317719_ed4dd95dc2.jpg',
 '223299142_521aedf9e7.jpg',
 '2553550034_5901aa9d6c.jpg',
 '3325497914_f9014d615b.jpg',
 '2885891981_6b02620ae9.jpg',
 '3244171699_ace4b5d999.jpg',
 '2588456052_8842b47005.jpg',
 '256283122_a4ef4a17cb.jpg',
 '97105139_fae46fe8ef.jpg',
 '1539166395_0cdc0accee.jpg',
 '3406802138_ef77bbddd0.jpg',
 '3539840291_1c3eed701d.jpg',
 '2514581496_8f4102377e.jpg',
 '2534424894_ccd091fcb5.jpg',
 '2950637275_98f1e30c

In [None]:
generate_caption(model, '2822891602_ff61df2ece.jpg', test_image_features)

['<start>',
 'a',
 'man',
 'in',
 'a',
 'blue',
 'bathing',
 'suit',
 'is',
 'surfing',
 'on',
 'a',
 'wave',
 '<end>']

In [None]:
# TODO finish function
def model_score(model, image_features, eval_dict):
    # creating all predictions for file in given dict
    all_pred = [generate_caption(model, file, image_features) for file in eval_dict.keys()]
    
    # getting original captions for each file in given dict
    all_original_captions = list(eval_dict.values)
    
# maybe add other BLEU scores
def bleu_score(all_original_captions, all_pred):
    # printing final model score
    print('BLEU-1: %f' % corpus_bleu(all_original_captions, all_pred, weights=(1.0, 0, 0, 0)))
    return corpus_bleu(all_original_captions, all_pred, weights=(1.0, 0, 0, 0))





    

In [None]:
model_score(model, test_image_features, test_dict)

In [None]:
from sklearn.linear_model import LogisticRegression



class LogisticDecoder():
    '''
    '''

    def __init__(self, caption_max_length):
        '''
        '''
        self.max_len = caption_max_length
        self.models = [LogisticRegression(multi_class='ovr', solver='liblinear') for i in range(caption_max_length)]

    