In [1]:
from os import listdir
from pickle import dump, load
import pandas as pd
from keras_preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import ReLU

from tensorflow.keras.layers import Dropout, Embedding, LSTM, Dense, Input, add
from keras.models import Sequential

import numpy as np
import tensorflow as tf
from nltk.translate.bleu_score import corpus_bleu

from common import RANDOM_SEED, clean_descriptions, ALL_FILENAMES, TRAIN_FILENAMES, VALIDATION_FILENAMES, TEST_FILENAMES, TRAIN_AND_VAL_FILENAMES, samples_to_dict, import_image_features, max_and_average_sequence_length

In [2]:
# Load the cleaned captions table from the Flickr8k captions file and print its first row
# as a quick sanity check (the printed row shows the `image` and `caption` columns).
cleaned_data = clean_descriptions('data/flickr_8k/captions.txt')
print(cleaned_data.iloc[0])

image                              1000268201_693b08cb0e.jpg
caption    [<start>, a, child, in, a, pink, dress, is, cl...
Name: 0, dtype: object


In [3]:
# Pull out of the cleaned dataframe the rows that belong to each dataset split.
def _rows_for_images(filenames):
    '''Return the rows of cleaned_data whose `image` value is one of the given filenames.'''
    belongs_to_split = cleaned_data['image'].isin(filenames)
    return cleaned_data.loc[belongs_to_split]


train_samples = _rows_for_images(TRAIN_FILENAMES)
validation_samples = _rows_for_images(VALIDATION_FILENAMES)
test_samples = _rows_for_images(TEST_FILENAMES)
train_and_val_samples = _rows_for_images(TRAIN_AND_VAL_FILENAMES)


In [4]:
# Show the first training row to confirm the split kept the `image` and `caption` columns.
train_samples.iloc[0]

image                              1000268201_693b08cb0e.jpg
caption    [<start>, a, child, in, a, pink, dress, is, cl...
Name: 0, dtype: object

In [5]:

# Convert every split into its dictionary form with samples_to_dict
# (same order as before: train, validation, train+validation, test).
training_dict, validation_dict, train_and_val_dict, test_dict = (
    samples_to_dict(split_samples)
    for split_samples in (train_samples, validation_samples, train_and_val_samples, test_samples)
)

In [6]:
# Every split reads its image features from the same pickle file.
FEATURES_FILE = '8k_features.pkl'

# Load order is unchanged: all, train, validation, train+validation, test.
(
    all_image_features,
    train_image_features,
    val_image_features,
    train_and_val_image_features,
    test_image_features,
) = (
    import_image_features(FEATURES_FILE, split_filenames)
    for split_filenames in (
        ALL_FILENAMES,
        TRAIN_FILENAMES,
        VALIDATION_FILENAMES,
        TRAIN_AND_VAL_FILENAMES,
        TEST_FILENAMES,
    )
)

In [7]:
# Longest and average caption length over the training + validation samples.
# The helper also prints the largest sequence lengths it found (see the cell output).
MAX_LENGTH, AVG_LENGTH = max_and_average_sequence_length(train_and_val_samples)

The top 30 sequence lengths are:
[37, 35, 35, 35, 34, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30]
The longest sequence length from the training and validation samples is 37
The average sequence length from the training and validation samples is 12


In [11]:
def get_samples_of_specific_size(image_vector, descriptions, desired_caption_size, tokenizer):
    '''
    Build the samples that a single image contributes to the model for caption position N.

    Given:
    - image_vector: the feature vector of one image (flattened to one dimension here)
    - descriptions: the captions of that image, where each caption is a list of word strings
    - desired_caption_size: N, the number of leading caption words used as input
    - tokenizer: a fitted Tokenizer used to convert the words to integer ids

    Return X:
        a tensor with one row per usable caption, where each row is the flattened image
        vector followed by the first N word ids of that caption
    And y:
        a tensor with one row per usable caption, where each row is the one-hot encoding
        (length VOCAB_SIZE) of the N+1 word of that caption

    NOTE: a caption is only usable when it has MORE than N tokens after tokenization,
    because token N+1 is the prediction target. If no caption qualifies, both returned
    tensors are empty.
    '''
    # reshape the image vector to be one dimension
    flat_image = image_vector.reshape(-1,)

    # convert the descriptions to number lists instead of string lists
    sequences = tokenizer.texts_to_sequences(descriptions)

    X = []
    y = []

    for sequence in sequences:
        # the target is sequence[N], so the sequence needs at least N + 1 tokens;
        # anything shorter would raise IndexError or produce a row of the wrong width
        if len(sequence) <= desired_caption_size:
            continue

        # image features followed by the first N word ids
        prefix = np.array(sequence[:desired_caption_size])
        X.append(np.concatenate([flat_image, prefix]))

        # one-hot encode the N+1 word as a full row of length VOCAB_SIZE
        # (built directly so that every y row is a vector, never a single element)
        target = np.zeros(VOCAB_SIZE, dtype='float32')
        target[sequence[desired_caption_size]] = 1.0
        y.append(target)

    return tf.convert_to_tensor(X), tf.convert_to_tensor(y)


In [33]:
def data_generator(filename_description_dictionary, desired_caption_size, loops, tokenizer, image_features=None):
    '''
    Given:
    - A dictionary containing the samples we want to create a generator for where
        key: filename (string)
        value: list of captions (where each caption is a list of strings)
    - A desired caption size (N)
    - A number of loops L
    - A tokenizer for converting seen words to numbers
    - Optionally, a dictionary of image features keyed by filename
      (defaults to the notebook-level all_image_features)

    loops are used because a generator is expended once it yields its last result, and
    therefore cannot be used over multiple epochs

    Iterate through the filename_description dictionary (L times).
    For each filename, yield one (X, y) batch built by get_samples_of_specific_size:
        each X row is the image features followed by the first N words of a caption
        each y row is the one-hot encoding of the N+1 word
    Filenames whose captions are all too short for N produce no samples and are skipped.

    Used to save memory
    Each loop it shuffles the order of the filenames; the sequence of orders is reproducible
    because it comes from a private generator seeded with RANDOM_SEED
    '''
    if image_features is None:
        image_features = all_image_features

    # Seed ONCE, outside the loop, on a private generator: re-seeding inside the loop would
    # replay the identical order every loop and would also reset numpy's global random state.
    rng = np.random.RandomState(RANDOM_SEED)

    for _ in range(loops):
        # shuffle filename order for better distribution over multiple loops (epochs)
        all_filenames = list(filename_description_dictionary.keys())
        rng.shuffle(all_filenames)

        for filename in all_filenames:
            # get the corresponding descriptions
            descriptions = filename_description_dictionary[filename]

            # retrieve the image feature vector
            image_vector = image_features[filename][0]

            # get the samples of the desired size (N)
            x_samples, y_samples = get_samples_of_specific_size(
                image_vector, descriptions, desired_caption_size, tokenizer
            )

            # an empty batch carries no training signal and does not have the
            # (rows, features) shape the models expect, so do not yield it
            if x_samples.shape[0] == 0:
                continue

            yield x_samples, y_samples


In [18]:
# TODO REMOVE IF NOT USED

# def dictionary_to_model_samples(dictionary, image_features, max_length):
#     # list of image features concatenated with corresponding first i-1 words
#     all_Xs = [[] for i in range(max_length)]

#     # next word for corresponding sentence
#     all_ys = [[] for i in range(max_length)]

#     for filename, samples in dictionary.items():
#         samples = tokenizer.texts_to_sequences(samples)
#         for sample in samples:
#             for i in range(len(sample) - 1):

#                 if i > max_length - 1:
#                     break 

#                 x1 = image_features[filename].reshape(-1,)
#                 x2 = np.array(sample[:i + 1])

#                 combined_X = np.concatenate([x1, x2])

#                 y = to_categorical(sample[i+1], VOCAB_SIZE)
            
#                 all_Xs[i].append(combined_X)
#                 all_ys[i].append(y)


#     all_Xs = [tf.convert_to_tensor(samples) for samples in all_Xs]
#     all_ys = [tf.convert_to_tensor(samples) for samples in all_ys]
    
#     return all_Xs, all_ys

In [44]:
def generate_logistic_model(input_size):
    '''
    Build and compile the keras model used for one caption position.

    The model is a Sequential stack of two Dense layers:
    - a Dense layer with `input_size` units and an identity (linear) activation
    - a Dense layer with VOCAB_SIZE units and a softmax activation

    It is compiled with categorical cross-entropy loss and the adam optimizer.
    keras is used so that the many per-position models can be trained on the gpu.
    '''
    # a ReLU with a negative slope of 1 passes every value through unchanged,
    # i.e. it acts as a linear activation
    identity_activation = ReLU(negative_slope=1)

    position_model = Sequential([
        # same number of units as inputs (4096 + number of words for the corresponding decoder)
        Dense(input_size, activation=identity_activation),
        # probability distribution over the whole vocabulary
        Dense(VOCAB_SIZE, activation='softmax'),
    ])

    position_model.compile(optimizer='adam', loss='categorical_crossentropy')
    return position_model

In [13]:
# Fit the word -> id mapping on the training captions only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_samples['caption'].tolist())

# One more than the number of known words (the ids in word_index start at 1, so id 0 needs a slot too).
VOCAB_SIZE = 1 + len(tokenizer.word_index)
print(VOCAB_SIZE)

8080


In [14]:
# NOTE(review): this looks like a scratch cell. test1 / test2 are not used by any other cell
# visible in this notebook, and the import belongs with the other imports in the first cell.
# The recorded NameError for ALL_FILENAMES suggests the cell was run before the import cell;
# ALL_FILENAMES is imported from `common` at the top of the notebook.
from sklearn.model_selection import train_test_split

# 80% / 20% split of all filenames, made reproducible with RANDOM_SEED
test1, test2 = train_test_split(ALL_FILENAMES, test_size=0.2, random_state=RANDOM_SEED)

NameError: name 'ALL_FILENAMES' is not defined

In [23]:
class LogisticDecoder():
    '''
    this class represents a model that can generate captions for an image based on its features

    it works by creating one classifier (see generate_logistic_model) for each position in the
    output caption

    the model at index i takes the image features plus the first i + 1 words of the caption
    and predicts word i + 2
    for example:
        the model at index 2 takes a feature vector of length 4096 + 3
        the first 4096 values are the image input, and the +3 are the first 3 words of the caption
        the model predicts the 4th word
    '''

    def __init__(self, caption_max_length, tokenizer=None):
        '''
        given a max caption length N, initialize N models, one for each position in the caption

        tokenizer is the fitted Tokenizer used to convert words to ids and back; when it is
        omitted, the notebook-level `tokenizer` is looked up at the time it is first needed
        '''
        self.max_len = caption_max_length
        self.tokenizer = tokenizer

        # model i takes the image feature vector and the first i + 1 words, and outputs the next word
        self.models = [generate_logistic_model(4096 + i + 1) for i in range(caption_max_length)]

    def _get_tokenizer(self):
        '''Return the tokenizer given at construction, or the notebook-level one if none was given.'''
        own_tokenizer = getattr(self, 'tokenizer', None)
        if own_tokenizer is not None:
            return own_tokenizer
        # no local variable shadows the name here, so this resolves to the notebook-level tokenizer
        return tokenizer

    def fit(self, sample_dictionary, epochs, verbose=False):
        '''
        train every position model on the samples in sample_dictionary

        sample_dictionary maps a filename to its list of captions (each a list of words)
        epochs is the number of passes each model's data generator makes over the dictionary
        verbose prints which model is currently being trained
        '''
        active_tokenizer = self._get_tokenizer()

        for i in range(self.max_len):
            if verbose:
                print(f'Training model #{i+1}')

            # TODO deal with saving
            # TODO deal with importing

            # the generator itself loops `epochs` times over the data, because a
            # generator cannot be restarted once it is exhausted
            current_generator = data_generator(
                sample_dictionary,
                desired_caption_size=i+1,
                loops=epochs,
                tokenizer=active_tokenizer,
            )
            self.models[i].fit(current_generator)

    def prediction(self, image_filename, image_features=None, start_token='<start>', end_token='<end>'):
        '''
        greedily generate a caption for one image

        Given:
        - image_filename: key of the image in the image feature dictionary
        - image_features: dictionary of image features keyed by filename
          (defaults to the notebook-level all_image_features)
        - start_token: the word every caption starts with
        - end_token: the word that ends a caption early
          (assumes the cleaned captions are terminated with this token - TODO confirm)

        Starting from the start token, model i is fed the image features followed by the
        i + 1 words generated so far, and its most probable word is appended to the caption.
        Generation stops after max_len models, when end_token is produced, or when a model
        predicts an id that has no word in the tokenizer.

        Returns the caption as a list of words, beginning with start_token.
        Raises ValueError when the tokenizer does not know start_token.
        '''
        if image_features is None:
            image_features = all_image_features
        active_tokenizer = self._get_tokenizer()

        # same lookup and flattening as the training data generator
        image_vector = np.asarray(image_features[image_filename][0]).reshape(-1,)

        word_ids = active_tokenizer.texts_to_sequences([[start_token]])[0]
        if len(word_ids) != 1:
            raise ValueError(f'start token {start_token!r} is not known to the tokenizer')

        caption = [start_token]
        for i in range(self.max_len):
            # a batch with one row: image features followed by the i + 1 word ids so far
            model_input = np.concatenate([image_vector, np.array(word_ids)])[np.newaxis, :]
            probabilities = self.models[i].predict(model_input, verbose=0)

            next_id = int(np.argmax(probabilities[0]))
            next_word = active_tokenizer.index_word.get(next_id)
            if next_word is None:
                # the id has no word attached to it, so the caption cannot be extended
                break

            word_ids.append(next_id)
            caption.append(next_word)
            if next_word == end_token:
                break

        return caption

In [24]:
# Smoke test: build a decoder with a single position model (max caption length of 1).
logistic_decoder = LogisticDecoder(1)

In [46]:
# Take the decoder's first (and here only) model: the one built for an input size of 4096 + 1.
test_model = logistic_decoder.models[0]

In [49]:
# One pass over the validation samples with one input word per sample.
# Arguments are passed by keyword to match data_generator's signature
# (dictionary, desired_caption_size, loops, tokenizer); the image features are looked up
# inside the generator, so they are not an argument here.
first_generator = data_generator(validation_dict, desired_caption_size=1, loops=1, tokenizer=tokenizer)

In [50]:
# Model.fit accepts generators directly; fit_generator is deprecated in its favour.
test_model.fit(first_generator, epochs=1, verbose=1)

  """Entry point for launching an IPython kernel.




<keras.callbacks.History at 0x1c211bd0ec8>