In [1]:
import cv2, numpy as np
import time
import theano
import os
from collections import OrderedDict, defaultdict
import six.moves.cPickle as pkl
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, LSTM
from keras.layers import Embedding
from keras.layers import Convolution2D, MaxPooling2D, ZeroPadding2D 
from keras.optimizers import SGD
from keras.layers import GRU, TimeDistributed, RepeatVector, Merge, TimeDistributedDense
import h5py
import json
from collections import Counter
import matplotlib.pyplot as plt
import skimage.transform

Using Theano backend.


In [2]:
# Per-channel mean pixel values, shaped (3, 1, 1) so they broadcast over a
# channel-first (3, H, W) image; prep_image subtracts them after its BGR flip.
MEAN_VALUES = np.array([104, 117, 123]).reshape((3,1,1))
# Length of every caption index array built by process_caption_features.
SEQUENCE_LENGTH = 32
MAX_SENTENCE_LENGTH = SEQUENCE_LENGTH - 3 # 1 for image, 1 for start token, 1 for end token
# NOTE(review): BATCH_SIZE is not referenced by any cell visible in this notebook section.
BATCH_SIZE = 20
# Matches the 1000-unit softmax output layer of VGG_16.
CNN_FEATURE_SIZE = 1000
# Width of the word embedding; also reused for the image Dense layer and the final GRU.
EMBEDDING_SIZE = 256

In [3]:
def word_processing(dataset):
    """Build the vocabulary and both lookup tables from tokenised captions.

    dataset: iterable of records, each holding a 'sentences' list whose
        entries carry a 'tokens' list.

    Returns (vocab, word_to_index, index_to_word). vocab starts with
    '<NULL>' (index 0), followed by every distinct token, followed by
    '<START>', '<END>' and '<UNK>'.
    """
    token_counts = Counter()
    every_sentence = (s for record in dataset for s in record['sentences'])
    for sentence in every_sentence:
        token_counts.update(sentence['tokens'])

    # Special markers wrap the regular tokens: padding first, the rest last.
    vocab = ['<NULL>'] + list(token_counts) + ['<START>', '<END>', '<UNK>']

    positions = range(len(vocab))
    word_to_index = dict(zip(vocab, positions))
    index_to_word = dict(zip(positions, vocab))
    return vocab, word_to_index, index_to_word

def import_flickr8kdataset(path='captions/dataset_flickr8k.json', n_train=800, n_val=200):
    """Load the Flickr8k caption JSON and return a reduced train+val subset.

    path: location of the JSON file; its top-level 'images' list holds
        records that each carry a 'split' key.
    n_train: maximum number of 'train' records to keep (reduced for testing).
    n_val: maximum number of 'val' records to keep (reduced for testing).

    Returns a list with the first n_train training records followed by the
    first n_val validation records. Records of any other split are ignored.
    """
    # Context manager so the file handle is closed as soon as parsing is done.
    with open(path) as json_file:
        images = json.load(json_file)['images']

    train_set = [img for img in images if img['split'] == 'train']
    val_set = [img for img in images if img['split'] == 'val']
    return train_set[:n_train] + val_set[:n_val]

In [5]:
def floatX(arr):
    """Return arr as a NumPy array cast to Theano's configured float dtype."""
    target_dtype = theano.config.floatX
    return np.asarray(arr, dtype=target_dtype)

#Prep Image uses an skimage transform
def prep_image(im):
    """Resize, centre-crop and normalise one image for the VGG network.

    im: image array of shape (H, W) or (H, W, C); the mean subtraction
        below expects C == 3.

    Returns (rawim, batch):
        rawim - the 224x224 crop as uint8, still in (H, W, C) layout.
        batch - the same crop in channel-first layout with the channel
            order reversed, MEAN_VALUES subtracted, a leading batch axis
            added, and cast through floatX.
    """
    if len(im.shape) == 2:
        # Greyscale input: replicate the single channel three times.
        im = im[:, :, np.newaxis]
        im = np.repeat(im, 3, axis=2)

    # Resize so smallest dim = 224, preserving aspect ratio.
    # Floor division keeps the target shape integral on Python 3 as well
    # (identical to the old int `/` on Python 2, and consistent with the crop below).
    h, w, _ = im.shape
    if h < w:
        im = skimage.transform.resize(im, (224, w*224//h), preserve_range=True)
    else:
        im = skimage.transform.resize(im, (h*224//w, 224), preserve_range=True)

    # Central crop to 224x224
    h, w, _ = im.shape
    im = im[h//2-112:h//2+112, w//2-112:w//2+112]

    # Keep a displayable copy before the layout/colour changes.
    rawim = np.copy(im).astype('uint8')

    # Shuffle axes to c01: (H, W, C) -> (C, H, W)
    im = np.transpose(im, (2, 0, 1))

    # Convert to BGR by reversing the channel axis
    im = im[::-1, :, :]

    im = im - MEAN_VALUES
    return rawim, floatX(im[np.newaxis])

In [6]:
def VGG_16(weights_path=None):
    """Assemble the 16-weight-layer VGG network (13 conv + 3 dense layers).

    The network takes channel-first 3x224x224 input and ends in a
    1000-unit softmax.

    weights_path: optional path handed to model.load_weights(); when falsy
        the freshly built model is returned without loading weights.
    """
    # (filters, number of conv layers) for each of the five conv stages;
    # every stage ends with a 2x2 max-pool of stride 2.
    conv_stages = [(64, 2), (128, 2), (256, 3), (512, 3), (512, 3)]

    model = Sequential()
    is_first_layer = True
    for n_filters, n_convs in conv_stages:
        for _ in range(n_convs):
            if is_first_layer:
                # Only the very first layer declares the input shape.
                model.add(ZeroPadding2D((1,1),input_shape=(3,224,224)))
                is_first_layer = False
            else:
                model.add(ZeroPadding2D((1,1)))
            model.add(Convolution2D(n_filters, 3, 3, activation='relu'))
        model.add(MaxPooling2D((2,2), strides=(2,2)))

    # Classifier head: two dropout-regularised 4096-unit layers, then softmax.
    model.add(Flatten())
    for _ in range(2):
        model.add(Dense(4096, activation='relu'))
        model.add(Dropout(0.5))
    model.add(Dense(1000, activation='softmax'))

    if weights_path:
        model.load_weights(weights_path)

    return model

In [8]:
# Currently testing it out: keep only the first 100 loaded records.
dataset = import_flickr8kdataset()[:100]
vocab, word_to_index, index_to_word = word_processing(dataset)

In [16]:
def chunks(l, n):
    """Yield successive slices of l, each holding at most n items.

    l: any sliceable sequence with a length.
    n: chunk size (step of the underlying range).

    The last chunk is shorter when len(l) is not a multiple of n.
    """
    # `range` instead of the Python-2-only `xrange`, so this runs on both.
    for start in range(0, len(l), n):
        yield l[start:start + n]

def process_cnn_features(dataset, model, coco=False, d_set="Flicker8k_Dataset"):
    """Run every image through `model` and store the output on its record.

    dataset: list of image records; each needs a 'filename' key (and a
        'filepath' key when coco is true).
    model: object whose predict() accepts a (N, 3, 224, 224) array.
    coco: when true, images are read from './coco/<filepath>/<filename>'.
    d_set: directory holding the images when coco is false.

    Images are processed in chunks of 25. Each record gains a
    'cnn features' entry holding its row of the prediction output.
    """
    total = len(dataset)
    ind_process = 1
    for chunk in chunks(dataset, 25):
        cnn_input = floatX(np.zeros((len(chunk), 3, 224, 224)))
        for i, image in enumerate(chunk):
            # Single-argument print(...) gives the same output on Python 2 and 3.
            print("ind_process %s total %s" % (str(ind_process), str(total)))
            ind_process += 1
            if coco:
                fn = './coco/{}/{}'.format(image['filepath'], image['filename'])
            else:
                fn = d_set+'/{}'.format(image['filename'])
            try:
                im = plt.imread(fn)
                _, cnn_input[i] = prep_image(im)
            except IOError as err:
                # Best-effort: the row stays all-zero and still receives
                # features below, so report it instead of failing silently.
                print("Could not read %s (%s); using an all-zero input for it" % (fn, err))
        features = model.predict(cnn_input)
        print("Processing Features For Chunk")
        for i, image in enumerate(chunk):
            image['cnn features'] = features[i]

In [43]:
def process_caption_features(dataset, coco=False, d_set="Flicker8k_Dataset"):
    """Encode the first caption of every image as a fixed-length index array.

    dataset: list of image records, each with a 'sentences' list whose
        first entry carries a 'tokens' list.
    coco, d_set: not used by this function; kept so existing calls keep working.

    Each record gains a 'captions' entry: an integer array of length
    SEQUENCE_LENGTH holding the indices of '<START>', the tokens and
    '<END>', with the remaining positions left at 0. Words missing from
    word_to_index are stored as the '<UNK>' index.

    Raises AssertionError when the caption plus its two markers is longer
    than SEQUENCE_LENGTH.
    """
    for image in dataset:
        # Builtin int replaces the `np.int` alias, which newer NumPy releases removed.
        partial_caption_ar = np.zeros(SEQUENCE_LENGTH, dtype=int)
        words = ['<START>'] + image['sentences'][0]['tokens'] + ['<END>']
        assert len(words)<=SEQUENCE_LENGTH
        for i, word in enumerate(words):
            if word in word_to_index:
                partial_caption_ar[i] = word_to_index[word]
            else:
                partial_caption_ar[i] = word_to_index["<UNK>"]
        image['captions'] = partial_caption_ar

In [10]:
# Build VGG-16 with its stored weights and compile it; the compiled model is
# later handed to process_cnn_features, which calls model.predict on it.
model = VGG_16(weights_path='weights/vgg16_weights.h5')
sgd = SGD(lr=0.1, momentum=0.9, decay=1e-6, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd)

  mode='max')
  mode='max')
  mode='max')


In [12]:
import json
# NOTE(review): this replaces the `dataset` that the vocabulary was built from
# in an earlier cell, so captions here may contain words outside that
# vocabulary (process_caption_features maps those to '<UNK>') - confirm intended.
# Context manager so the file handle is closed once parsing is done.
with open('captions/dataset_flickr8k.json') as json_file:
    dataset = json.load(json_file)['images']
val_set = [img for img in dataset if img['split'] == 'val']
train_set = [img for img in dataset if img['split'] == 'train']
test_set = [img for img in dataset if img['split'] == 'test']
# Reduced subset: first 200 training records followed by first 50 validation records.
dataset = train_set[:200]+val_set[:50]

In [13]:
# Store the VGG output for every record under its 'cnn features' key.
process_cnn_features(dataset, model, coco=False, d_set="Flicker8k_Dataset")

ind_process 1 total 250
ind_process 2 total 250
ind_process 3 total 250
ind_process 4 total 250
ind_process 5 total 250
ind_process 6 total 250
ind_process 7 total 250
ind_process 8 total 250
ind_process 9 total 250
ind_process 10 total 250
ind_process 11 total 250
ind_process 12 total 250
ind_process 13 total 250
ind_process 14 total 250
ind_process 15 total 250
ind_process 16 total 250
ind_process 17 total 250
ind_process 18 total 250
ind_process 19 total 250
ind_process 20 total 250
ind_process 21 total 250
ind_process 22 total 250
ind_process 23 total 250
ind_process 24 total 250
ind_process 25 total 250
Processing Features For Chunk
ind_process 26 total 250
ind_process 27 total 250
ind_process 28 total 250
ind_process 29 total 250
ind_process 30 total 250
ind_process 31 total 250
ind_process 32 total 250
ind_process 33 total 250
ind_process 34 total 250
ind_process 35 total 250
ind_process 36 total 250
ind_process 37 total 250
ind_process 38 total 250
ind_process 39 total 250
ind_

In [55]:
# Attach the index-encoded caption array to every record under 'captions'.
process_caption_features(
    dataset,
    coco=False,
    d_set="Flicker8k_Dataset",
)

In [59]:
# Stack the per-record data into arrays, one row per record:
#   img_cnn_input   - the stored CNN feature vector
#   sentences_input - caption indices without the last position
#   nextwords_input - caption indices shifted left by one (without the first position)
img_cnn_input = np.array([record['cnn features'] for record in dataset])
sentences_input = np.array([record['captions'][:-1] for record in dataset])
nextwords_input = np.array([record['captions'][1:] for record in dataset])

In [66]:
# Number of entries in vocab: the regular tokens plus the '<NULL>', '<START>',
# '<END>' and '<UNK>' entries that word_processing adds.
VOCAB_COUNT = len(vocab)

In [67]:

image_model = Sequential()
language_model = Sequential()

# Language branch: embed the partial caption and encode it as a sequence of
# 128-dimensional vectors, one per time step.
language_model.add(Embedding(VOCAB_COUNT, EMBEDDING_SIZE, input_length=SEQUENCE_LENGTH - 1))
language_model.add(GRU(output_dim=128, return_sequences=True))
language_model.add(TimeDistributed(Dense(128)))

# Image branch: project the CNN feature vector to EMBEDDING_SIZE.
# input_shape has to be a tuple, hence the trailing comma.
image_model.add(Dense(EMBEDDING_SIZE, input_shape=(CNN_FEATURE_SIZE,)))
# let's repeat the image vector to turn it into a sequence with one copy per
# caption time step, so it can be concatenated with the language branch.
image_model.add(RepeatVector(SEQUENCE_LENGTH - 1))

# the output of both models are now tensors of shape
# (samples, SEQUENCE_LENGTH - 1, features): EMBEDDING_SIZE features for the
# image branch and 128 for the language branch.
# let's concatenate these 2 vector sequences along the feature axis.
imcap_model = Sequential()
imcap_model.add(Merge([image_model, language_model], mode='concat', concat_axis=-1))
# let's encode this vector sequence into a single vector
imcap_model.add(GRU(EMBEDDING_SIZE, return_sequences=False))
# which will be used to compute a probability
# distribution over what the next word in the caption should be!
imcap_model.add(Dense(VOCAB_COUNT))
imcap_model.add(Activation('softmax'))

# NOTE(review): the model emits one distribution per sample, while
# nextwords_input built earlier holds a whole index sequence per sample -
# the training targets will need reshaping/one-hot encoding; confirm.
imcap_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')


AssertionError: Keyword argument not understood: input_length