In [1]:
import pandas as pd
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import ReLU

from tensorflow.keras.layers import Dense
from keras.models import Sequential

import numpy as np
import tensorflow as tf
from nltk.translate.bleu_score import corpus_bleu

from keras.models import load_model

from common import RANDOM_SEED, clean_descriptions, ALL_FILENAMES
from common import samples_to_dict, import_image_features, max_and_average_sequence_length, START_TOK, END_TOK, get_tokenizer_from_samples, reset_keras

In [2]:
# run the full captions file through clean_descriptions and print the first row as a sanity check
# (as the output below shows, each row holds an image filename and a caption that is already
# split into tokens, beginning with the <start> token)
cleaned_data = clean_descriptions('data/flickr_8k/captions.txt')
print(cleaned_data.iloc[0])

image                              1000268201_693b08cb0e.jpg
caption    [<start>, a, child, in, a, pink, dress, is, cl...
Name: 0, dtype: object


In [29]:
''' 
get the actual rows from the df corresponding to the different sets
'''
# each split lives in its own csv file; run clean_descriptions on every one of them
# (train, validation, test, and train + validation combined) so that all splits are
# in the same cleaned format as cleaned_data
train_samples = clean_descriptions('data/flickr_8k/train.csv')
validation_samples = clean_descriptions('data/flickr_8k/validation.csv')
test_samples = clean_descriptions('data/flickr_8k/test.csv')
train_and_val_samples = clean_descriptions('data/flickr_8k/train_and_val.csv')


In [30]:
# show the first training row to check the split was loaded correctly
# NOTE(review): the output has an extra 'Unnamed: 0' column that captions.txt does not have,
# presumably an index column that was written when the split csv files were created - confirm
train_samples.iloc[0]

Unnamed: 0                                                    0
image                                 1000268201_693b08cb0e.jpg
caption       [<start>, a, child, in, a, pink, dress, is, cl...
Name: 0, dtype: object

In [27]:

# convert every split into a dictionary with samples_to_dict
# (presumably key: image filename, value: list of tokenized captions, which is the
# format data_generator documents for its input - verify against common.samples_to_dict)
training_dict = samples_to_dict(train_samples)
validation_dict = samples_to_dict(validation_samples)
train_and_val_dict = samples_to_dict(train_and_val_samples)
test_dict = samples_to_dict(test_samples)

In [6]:
# load the pre-extracted image features for every filename in ALL_FILENAMES
# all_image_features is indexed by image filename and is read as a global
# by data_generator and LogisticDecoder.generate_caption
all_image_features = import_image_features('8k_features.pkl', ALL_FILENAMES)

# NOTE: we don't use the per-split variables below anymore; features for every split
# are looked up in the single all_image_features variable instead
# train_image_features = import_image_features('8k_features.pkl', TRAIN_FILENAMES)
# val_image_features = import_image_features('8k_features.pkl', VALIDATION_FILENAMES)
# train_and_val_image_features = import_image_features('8k_features.pkl', TRAIN_AND_VAL_FILENAMES)
# test_image_features = import_image_features('8k_features.pkl', TEST_FILENAMES)

In [28]:
# compute the longest and the average caption length over the train + validation captions
# (the helper also prints the report shown below: longest is 37, average is 12)
MAX_LENGTH, AVG_LENGTH = max_and_average_sequence_length(train_and_val_samples)

The top 30 sequence lengths are:
[37, 35, 35, 35, 34, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30]
The longest sequence length from the training and validation samples is 37
The average sequence length from the training and validation samples is 12


In [8]:
def get_samples_of_specific_size(image_vector, descriptions, desired_caption_size, tokenizer):
    '''
    Build the (X, y) samples of one image for the decoder that reads exactly
    N = desired_caption_size caption words.

    Given:
    - image_vector: the feature vector of a single image (flattened to one dimension here)
    - descriptions: the captions of that image, each caption being a list of string tokens
    - desired_caption_size: the number N of leading caption words used as input
    - tokenizer: the tokenizer that converts words to integer indices

    Return X:
        a tensor with one row per usable caption, where each row is the flattened image
        vector followed by the indices of the first N words of the caption
        (4096 + N values when the image vector holds the 4096 VGG features)
    And y:
        a tensor with one row per usable caption, where each row is the one-hot
        encoding of the N+1 word of that caption

    NOTE: a caption with N or fewer tokens has no N+1 word and is skipped, so both
    returned tensors can be empty when no caption of the image is long enough
    '''
    # flatten the image features so they can be concatenated with the word indices
    flat_image = image_vector.reshape(-1,)

    # width of the one-hot target: one more than the number of words the tokenizer knows
    vocab_size = len(tokenizer.index_word) + 1

    X, y = [], []

    # texts_to_sequences turns every caption into a list of word indices
    for sequence in tokenizer.texts_to_sequences(descriptions):
        # a caption without an N+1 word cannot provide a target, skip it
        if len(sequence) <= desired_caption_size:
            continue

        # input: image features followed by the first N word indices
        leading_words = np.array(sequence[:desired_caption_size])
        X.append(np.concatenate([flat_image, leading_words]))

        # target: one-hot encoding of the N+1 word (required for keras models)
        next_word = sequence[desired_caption_size]
        y.append(to_categorical(next_word, vocab_size))

    return tf.convert_to_tensor(X), tf.convert_to_tensor(y)


In [9]:
def data_generator(filename_description_dictionary, desired_caption_size, loops, tokenizer):
    '''
    Given:
    - A dictionary containing the samples we want to create a generator for where
        key: filename (string)
        value: list of captions (where each caption is a list of strings)
    - A desired caption size (N)
    - A number of loops L
    - A tokenizer for converting seen words to numbers

    loops are used because a generator is expended once it yields its last result, and
    therefore cannot be used over multiple epochs

    Iterate through the filename_description dictionary (L times).
    For each filename, yield one (X, y) pair built by get_samples_of_specific_size:
        each row of X is the image feature vector followed by the first N words of a caption
        each row of y is the one-hot encoding of the N+1 word of that caption
    Filenames that have no caption longer than N words are skipped.

    Used to save memory (samples are built one image at a time instead of all at once)

    Each loop visits the filenames in a different order. The orders come from a private
    random state seeded once with RANDOM_SEED, so the whole sequence of orders is
    reproducible between runs and the global numpy random state is left untouched.
    '''
    # seed ONCE, outside the loop: re-seeding at the start of every loop would make every
    # loop produce exactly the same "shuffled" order, and seeding the global numpy state
    # would also make later np.random sampling (e.g. caption generation) repeat itself
    shuffler = np.random.RandomState(RANDOM_SEED)

    for _ in range(loops):
        # shuffle filename order for better distribution over multiple loops (epochs)
        all_filenames = list(filename_description_dictionary.keys())
        shuffler.shuffle(all_filenames)

        # one pass over every filename
        for filename in all_filenames:
            # get the corresponding descriptions
            descriptions = filename_description_dictionary[filename]

            # retrieve the image feature vector (read from the global all_image_features)
            image_vector = all_image_features[filename][0]

            # get the samples of the desired size (N)
            x_samples, y_samples = get_samples_of_specific_size(image_vector, descriptions, desired_caption_size, tokenizer)

            # an empty target tensor means no caption of this image is longer than N,
            # so there is nothing to yield for this filename
            if y_samples.shape == (0,):
                continue

            yield x_samples, y_samples


In [10]:
# TODO REMOVE IF NOT USED

# def dictionary_to_model_samples(dictionary, image_features, max_length):
#     # list of image features concatenated with corresponding first i-1 words
#     all_Xs = [[] for i in range(max_length)]

#     # next word for corresponding sentence
#     all_ys = [[] for i in range(max_length)]

#     for filename, samples in dictionary.items():
#         samples = tokenizer.texts_to_sequences(samples)
#         for sample in samples:
#             for i in range(len(sample) - 1):

#                 if i > max_length - 1:
#                     break 

#                 x1 = image_features[filename].reshape(-1,)
#                 x2 = np.array(sample[:i + 1])

#                 combined_X = np.concatenate([x1, x2])

#                 y = to_categorical(sample[i+1], VOCAB_SIZE)
            
#                 all_Xs[i].append(combined_X)
#                 all_ys[i].append(y)


#     all_Xs = [tf.convert_to_tensor(samples) for samples in all_Xs]
#     all_ys = [tf.convert_to_tensor(samples) for samples in all_ys]
    
#     return all_Xs, all_ys

In [11]:
def generate_logistic_model(input_size, output_size):
    '''
    build and compile the keras model used as the classifier for one caption position

    the network is a Dense layer with input_size units and an identity (linear) activation,
    followed by a Dense softmax layer with output_size units (one per vocabulary entry)
    because the first layer has no non-linearity, the network as a whole is still a linear
    model followed by a softmax, i.e. a multinomial logistic regression

    since our decoder uses multiple logistic regression models,
    we wanted to run them on the gpu which is simple with keras

    input_size: length of the feature vector (4096 + number of words for the corresponding decoder)
    output_size: number of classes (the vocabulary size)
    returns the compiled model (categorical crossentropy loss, adam optimizer)
    '''
    # ReLU with a negative slope of 1 passes values < 0 through unchanged,
    # which makes it the identity function, i.e. a linear activation
    identity_activation = ReLU(negative_slope=1)

    # FF NN: linear Dense layer as wide as the input, then softmax over the whole vocabulary
    model = Sequential([
        Dense(input_size, activation=identity_activation),
        Dense(output_size, activation='softmax'),
    ])

    # compile and return
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    return model

In [12]:
class LogisticDecoder():
    '''
    this class represents a model that can generate captions for an image based on its feature vector

    it works by creating a logistic regression classifier for each position in the output string

    each logistic regression model is assigned an index corresponding to the number of words it takes as input
    for example:
        model at index 3 is responsible for taking in a feature vector of length 4096 + 3
        the first 4096 values are the image input, and the +3 represents the first 3 words in the caption
        the model predicts the 4th word

    NOTE: the tokenizer is not saved together with the decoders, so a decoder loaded with load()
    must be paired with the exact tokenizer it was trained with (generate_caption raises a
    ValueError when the sizes do not match)
    '''

    def __init__(self, caption_max_length, tokenizer):
        '''
        given a max caption length N, reserve N slots for logistic regression models, one for each position in the caption
        (the slots stay None until fit() or load() fills them)
        given a tokenizer, store it in the model
        '''
        self.max_len = caption_max_length

        # store the tokenizer for later use
        self.tokenizer = tokenizer

        # get the vocab size (add one due to the way keras tokenizer works: word indices start at 1)
        self.vocab_size = len(tokenizer.word_index) + 1

        # one slot per caption position; models[i] reads the image vector plus the first i+1 words
        self.models = [None for i in range(caption_max_length)]


    def fit(self, sample_dictionary, epochs, model_save_directory, verbose=False):
        '''
        given a dictionary of samples (key is a filename and value is all associated captions tokenized into lists of strings)
        and a number of epochs

        train the logistic decoders to generate captions

        model_save_directory is required: every decoder is saved to
        <model_save_directory>/decoder<position> right after it is trained, and all of
        them are reloaded from that directory once training is finished
        don't add a / at the end of the directory

        if verbose is True, print a message before training and after saving each decoder
        '''
        for i in range(self.max_len):
            if verbose:
                print(f'Training model #{i+1}')

            # the generator repeats the data `epochs` times itself (loops=epochs)
            current_generator = data_generator(sample_dictionary, desired_caption_size=i+1, loops=epochs, tokenizer=self.tokenizer)

            # decoder i reads the 4096 image features plus the first i+1 words
            current_model = generate_logistic_model(4096 + i + 1, self.vocab_size)

            # NOTE(review): fit_generator is called once with its default arguments, so it is
            # expected to consume the generator until it is exhausted - confirm with the installed
            # keras version (newer versions deprecate fit_generator in favour of fit)
            current_model.fit_generator(current_generator)

            # save the model to a designated parent folder
            save_path = f'{model_save_directory}/decoder{i+1}'
            current_model.save(save_path)

            # clear memory for next model
            reset_keras(current_model)

            if verbose:
                print(f'Model #{i+1} saved to {save_path}')

            # TODO save tokenizer as well and load it in the load function

        # after training, reload all saved models
        self.load(model_save_directory)

    def load(self, directory_path):
        '''
        load in a model from a folder that has all decoders saved into it
        (expects one saved model per position, named decoder1 ... decoder<max_len>)

        do not add a / at the end of the directory path
        '''
        for i in range(self.max_len):
            self.models[i] = load_model(f'{directory_path}/decoder{i+1}')

        print(f'Model loaded from {directory_path}')


    def generate_caption(self, image_filename, verbose=True):
        '''
        given a filename use the trained models to decode each next word for a full caption

        every next word is randomly sampled from the probability distribution that the decoder
        of the current position predicts; generation stops at the end token or after max_len words

        returns the caption as a list of words (beginning with the start token)
        if verbose is True the caption is also printed

        raises ValueError if a decoder outputs a distribution whose size differs from the
        tokenizer vocabulary size (the decoders were trained with a different tokenizer),
        or if the distribution leaves no probability for any real word
        '''
        caption = [START_TOK]

        image_vector = all_image_features[image_filename].reshape(-1,)

        for i in range(self.max_len):
            # should be length i + 1 because one word is added each iteration
            caption_as_indices = self.tokenizer.texts_to_sequences([caption])[0]

            # should be length 4096 + (i+1) for the input of the corresponding decoder
            next_input = np.concatenate([image_vector, np.array(caption_as_indices)])

            # reshape it into 1 x 4096 + (i+1) shape for keras input
            next_input = next_input.reshape(1, len(next_input))

            # get the current model
            current_model = self.models[i]

            # get the probability distribution for output layer
            # (copied into float64 so it can be edited and renormalized precisely)
            probabilities = np.array(current_model.predict(next_input), dtype=np.float64).reshape(-1,)

            # a size mismatch means the decoders were trained with another tokenizer, so their
            # output indices would map to the wrong words; fail with an explicit message
            if probabilities.shape[0] != self.vocab_size:
                raise ValueError(
                    f'decoder #{i+1} outputs {probabilities.shape[0]} probabilities but the tokenizer '
                    f'vocabulary size is {self.vocab_size}; use the tokenizer the decoders were trained with'
                )

            # index 0 is counted in vocab_size but is never assigned to a word by the tokenizer,
            # so sampling it would fail the index_word lookup below: never sample it
            if 0 not in self.tokenizer.index_word:
                probabilities[0] = 0.0

            # renormalize so the remaining probabilities sum to 1 again
            total = probabilities.sum()
            if not total > 0:
                raise ValueError(f'decoder #{i+1} assigned no probability to any word of the vocabulary')
            probabilities = probabilities / total

            # predict the index of the next word (randomly sample from the vocab based on the prediction output distribution)
            predicted_word_index = np.random.choice(self.vocab_size, p=probabilities)

            # convert the index to a word based on the tokenizer
            predicted_word = self.tokenizer.index_word[predicted_word_index]

            # add the word to our caption
            caption.append(predicted_word)

            # if it is the end of sequence token break out the loop
            if predicted_word == END_TOK:
                break

        if verbose:
            # TODO come back and make caption into a string
            print(f'Caption for {image_filename}: {caption}')

        return caption

In [13]:
# build the word <-> index tokenizer from the train + validation captions
# NOTE: decoders saved to disk only work with the tokenizer they were trained with,
# so this must be built from the same samples that were used for training
tokenizer = get_tokenizer_from_samples(train_and_val_samples)

In [14]:
'''
we choose 12 as our max caption length, despite there being many longer captions,
computationally we were limited on time, and therefore chose to decrease the number of decoders
to the average caption length rather than the maximum seen caption length
'''
# 12 is the average train + validation caption length printed above (the longest is 37)
# the decoder slots are empty until fit() or load() is called on this object
logistic_decoder = LogisticDecoder(12, tokenizer)

In [15]:
# logistic_decoder.load('LogisticDecoders/train_val_20_epoch_maxlen_12')

Model loaded from LogisticDecoders/train_val_20_epoch_maxlen_12


In [16]:
# caption = logistic_decoder.generate_caption('111766423_4522d36e56.jpg')

In [24]:
# logistic_decoder.vocab_size, caption.shape

(8523, (8497,))

In [22]:
# np.random.choice(logistic_decoder.vocab_size, p=caption)

ValueError: 'a' and 'p' must have same size

In [17]:
# logistic_decoder.fit(train_and_val_dict, epochs=20, model_save_directory='', verbose=True)


In [18]:
# check = list(data_generator(validation_dict, 1, 1, tokenizer))

# x = [item[0].numpy() for item in check]
# y = [item[1].numpy() for item in check]


# output_x = []
# output_y = []
# for itemsx, itemsy in zip(x, y):
#     for itemx, itemy in zip(itemsx, itemsy):
#         output_x.append(itemx)
#         output_y.append(itemy.argmax())
        
# x = output_x
# y = output_y


# from sklearn.linear_model import LogisticRegression


# test = LogisticRegression(multi_class='ovr', solver='liblinear')
# test.fit(x, y)


In [19]:
# caption = logistic_decoder.generate_caption('111766423_4522d36e56.jpg')

In [20]:
# new_model = LogisticDecoder(5, tokenizer)
# new_model.load('LogisticDecoders/train_val_1_epoch_maxlen_4')

In [21]:
# caption = new_model.generate_caption('111766423_4522d36e56.jpg')