In [123]:
import cv2, numpy as np
import time
import theano
import os
from collections import OrderedDict, defaultdict
import six.moves.cPickle as pkl
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, LSTM
from keras.layers import Embedding
from keras.layers import Convolution2D, MaxPooling2D, ZeroPadding2D 
from keras.optimizers import SGD
from keras.layers import GRU, TimeDistributed, RepeatVector, Merge, TimeDistributedDense
import h5py
import json
from collections import Counter
import matplotlib.pyplot as plt
import skimage.transform

In [3]:
MEAN_VALUES = np.array([104, 117, 123]).reshape((3,1,1))
SEQUENCE_LENGTH = 32
MAX_SENTENCE_LENGTH = SEQUENCE_LENGTH - 3 # 1 for image, 1 for start token, 1 for end token
BATCH_SIZE = 20
CNN_FEATURE_SIZE = 1000
EMBEDDING_SIZE = 256

In [100]:
def word_processing(dataset):
    allwords = Counter()
    for item in dataset:
        for sentence in item['sentences']:
            allwords.update(sentence['tokens'])
            
    vocab = [k for k, v in allwords.items() if v >= 5]
    vocab.insert(0, '#START#')
    vocab.append('#UNK#')
    vocab.append('#END#')

    word_to_index = {w: i for i, w in enumerate(vocab)}
    index_to_word = {i: w for i, w in enumerate(vocab)}
    return vocab, word_to_index, index_to_word

def import_flickr8kdataset():
    dataset = json.load(open('captions/dataset_flickr8k.json'))['images']
    #reduced length to a 300 for testing
    val_set = list(filter(lambda x: x['split'] == 'val', dataset))
    train_set = list(filter(lambda x: x['split'] == 'train', dataset))
    test_set = list(filter(lambda x: x['split'] == 'test', dataset))
    return train_set[:800]+val_set[:200]


In [101]:
def floatX(arr):
    return np.asarray(arr, dtype=theano.config.floatX)

#Prep Image uses an skimage transform
def prep_image(im):
    if len(im.shape) == 2:
        im = im[:, :, np.newaxis]
        im = np.repeat(im, 3, axis=2)
    # Resize so smallest dim = 224, preserving aspect ratio
    h, w, _ = im.shape
    if h < w:
        im = skimage.transform.resize(im, (224, w*224/h), preserve_range=True)
    else:
        im = skimage.transform.resize(im, (h*224/w, 224), preserve_range=True)

    # Central crop to 224x224
    h, w, _ = im.shape
    im = im[h//2-112:h//2+112, w//2-112:w//2+112]
    
    rawim = np.copy(im).astype('uint8')
    
    # Shuffle axes to c01
    im = np.swapaxes(np.swapaxes(im, 1, 2), 0, 1)
    
    # Convert to BGR
    im = im[::-1, :, :]

    im = im - MEAN_VALUES
    return rawim, floatX(im[np.newaxis])

In [217]:
def VGG_16(weights_path=None):
    model = Sequential()
    model.add(ZeroPadding2D((1,1),input_shape=(3,224,224)))
    model.add(Convolution2D(64, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(64, 3, 3, activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(128, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(128, 3, 3, activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(256, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(256, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(256, 3, 3, activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, 3, 3, activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, 3, 3, activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

    model.add(Flatten())
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1000, activation='softmax'))

    if weights_path:
        model.load_weights(weights_path)

    return model


In [216]:
def language_model():
    model = Sequential()
    print('Adding Embedding')
    model.add(Embedding(VOCAB_COUNT, EMBEDDING_SIZE, input_length=SEQUENCE_LENGTH-1))
    print('Adding LSTM')
    model.add(LSTM(EMBEDDING_SIZE, return_sequences=True))
    print('Adding TimeDistributed Dense')
    model.add(TimeDistributed(Dense(CNN_FEATURE_SIZE)))
    return model

In [219]:
dataset = import_flickr8kdataset()
# Currently testing it out
dataset = [i for i in dataset[:100]]
vocab,word_to_index, index_to_word = word_processing(dataset)
print vocab

['#START#', u'pointing', u'yellow', u'four', u'children', u'young', u'to', u'bike', u'brown', u'woman', u'sitting', u'bubbles', u'large', u'race', u'sidewalk', u'round', u'sign', u'street', u'blue', u'plays', u'stands', u'near', u'uniform', u'pose', u'men', u'water', u'baseball', u'along', u'boy', u'family', u'cheerleader', u'standing', u'from', u'camera', u'tennis', u'under', u'trail', u'carrying', u'stick', u'women', u'car', u'grassy', u'high', u'something', u'sunglasses', u'tan', u'pink', u'sit', u'beach', u'after', u'jumping', u'wave', u'man', u'a', u'green', u'playing', u'shoes', u'over', u'through', u'looks', u'smiling', u'its', u'group', u'cheerleaders', u'covered', u'runs', u'hands', u'front', u'slide', u'rock', u'side', u'catching', u'girl', u'out', u'looking', u'hill', u'red', u'dirt', u'scarf', u'one', u'another', u'city', u'little', u'toy', u'top', u'girls', u'their', u'shorts', u'white', u'dogs', u'store', u'park', u'tree', u'light', u'and', u'blond', u'mountain', u'snow',

In [220]:
def chunks(l, n):
    for i in xrange(0, len(l), n):
        yield l[i:i + n]

def process_images(dataset, coco=False, d_set="Flicker8k_Dataset"):
    ind_process = 1
    total = len(dataset)
    cnn_input = floatX(np.zeros((len(dataset), 3, 224, 224)))
    rawim_input = []
    sentences_tokens = []
    for i, image in enumerate(dataset):
        print "ind_process %s total %s" %(str(ind_process),str(total))
        ind_process+=1
        if coco:
            fn = './coco/{}/{}'.format(image['filepath'], image['filename'])
        else:
            fn = d_set+'/{}'.format(image['filename'])
        try:
            im = plt.imread(fn)
            rawim, cnn_input[i] = prep_image(im)
            sentences_tokens.append(image['sentences'][0]['tokens'])
            rawim_input.append(rawim)
        except IOError:
            continue
    return rawim_input, cnn_input, sentences_tokens


In [221]:
rawim_array, cnnim_array, sentences_tokens = process_images(dataset, coco=False, d_set="Flicker8k_Dataset")
%matplotlib inline

ind_process 1 total 100
ind_process 2 total 100
ind_process 3 total 100
ind_process 4 total 100
ind_process 5 total 100
ind_process 6 total 100
ind_process 7 total 100
ind_process 8 total 100
ind_process 9 total 100
ind_process 10 total 100
ind_process 11 total 100
ind_process 12 total 100
ind_process 13 total 100
ind_process 14 total 100
ind_process 15 total 100
ind_process 16 total 100
ind_process 17 total 100
ind_process 18 total 100
ind_process 19 total 100
ind_process 20 total 100
ind_process 21 total 100
ind_process 22 total 100
ind_process 23 total 100
ind_process 24 total 100
ind_process 25 total 100
ind_process 26 total 100
ind_process 27 total 100
ind_process 28 total 100
ind_process 29 total 100
ind_process 30 total 100
ind_process 31 total 100
ind_process 32 total 100
ind_process 33 total 100
ind_process 34 total 100
ind_process 35 total 100
ind_process 36 total 100
ind_process 37 total 100
ind_process 38 total 100
ind_process 39 total 100
ind_process 40 total 100
ind_proce

In [222]:
def gen_image_partial_captions(images, captions, word_to_index, vocab_count):
    a_images = []
    a_captions = []
    next_words = []
    #vocab_size = len(vocab)
    for ind, image in enumerate(images):
        sentence = captions[ind]
        partial_caption_ar = np.zeros(SEQUENCE_LENGTH-1, dtype=np.int)
        
        words = ['#START#'] + sentence + ['#END#']
        assert len(words)<SEQUENCE_LENGTH
        for i in range(len(words) - 1):
            pc_copy = partial_caption_ar.copy()
            if words[i] in word_to_index:
                pc_copy[i] = word_to_index[words[i]]
            else:
                pc_copy[i] = word_to_index["#UNK#"]
            a_images.append(image)
            a_captions.append(pc_copy)
            #Generate next word output vector
            next_word = words[i + 1]
            if next_word in word_to_index:
                next_word_index = word_to_index[next_word]
            else:
                next_word_index = word_to_index["#UNK#"]
            next_word_ar = np.zeros(vocab_count, dtype=np.int)
            next_word_ar[next_word_index] = 1
            next_words.append(next_word_ar)
    v_i = np.array(a_images)
    v_c = np.array(a_captions)
    v_nw = np.array(next_words)
    return v_i, v_c, v_nw 

In [223]:
vocab_count = len(word_to_index)
print cnnim_array.shape
v_i, v_c, v_nw = gen_image_partial_captions(cnnim_array, sentences_tokens, word_to_index, vocab_count)

(100, 3, 224, 224)


In [224]:
VOCAB_COUNT = len(word_to_index)

In [225]:
def build_model(weights_path):
    image_model = VGG_16(weights_path)
    image_model.add(RepeatVector(SEQUENCE_LENGTH-1))
    print('Built Image Model')
    print('Building Language Model')
    lang_model = language_model()
    model = Sequential()
    model.add(Merge([image_model, lang_model], mode='concat',  concat_axis=-1))
    model.add(LSTM(EMBEDDING_SIZE, return_sequences=False))
    #print(vocab_size)
    model.add(Dense(VOCAB_COUNT, activation='softmax'))

    #print(model.summary())
    return model

In [None]:
model=build_model('weights/vgg16_weights.h5')
print('Built model.')
print('Compiling Now')
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
print('Fitting Now')
model.fit([v_i, v_c], v_nw, batch_size=3, nb_epoch=1)

Built Image Model
Building Language Model
Adding Embedding
Adding LSTM
Adding TimeDistributed Dense
Built model.
Compiling Now
Fitting Now
Epoch 1/1
  30/1274 [..............................] - ETA: 2805s - loss: 5.1810 - acc: 0.0333

In [None]:
def predict(model, images, index_to_word, word_to_index):
    for image in images:
        caption = np.zeros(SEQUENCE_LENGTH - 1).reshape(1, SEQUENCE_LENGTH - 1)
        print(caption.shape)
        caption[0,0] = 0
        count=0
        sentence = []
        a = image.reshape(1,3,224,224)
        #a = np.array([image])
        while True:
            out = model.predict([a, caption])
            index = out.argmax(-1)
            index = index[0]
            word = index_to_word[index]
            sentence.append(word)
            count+= 1
            if count >= SEQUENCE_LENGTH - 1 or index == word_to_index["#END#"]: #max caption length reach of '<eos>' encountered
                break
            caption[0,count] = index
        sent_str = " ".join(sentence)
        print("The Oracle says : %s" %sent_str)


In [None]:
cnnim_list = []
for i in cnnim_array:
    cnnim_list.append(i)

In [None]:
predict(model, cnnim_list, index_to_word, word_to_index)

In [211]:
predict(model, cnnim_list, index_to_word)

TypeError: predict() takes exactly 4 arguments (3 given)

In [203]:
caption = np.zeros(SEQUENCE_LENGTH).reshape(1, SEQUENCE_LENGTH)
caption[0] = word_to_index["#START#"]
print cnnim_array[0].shape
t = np.array(cnnim_array[0])
print t.shape
out = model.predict([v_i, v_c])
print out


(3, 224, 224)
(3, 224, 224)
[[ 0.01479965  0.02197597  0.1271223   0.0757821   0.11408521  0.02933407
   0.02511705  0.06509422  0.47890654  0.04778289]
 [ 0.01412566  0.02101637  0.13756378  0.06709128  0.09927694  0.03693941
   0.02461312  0.06562667  0.4821761   0.05157067]
 [ 0.01461078  0.02260523  0.13023429  0.07412055  0.10645258  0.03562364
   0.02546796  0.06531136  0.47494298  0.05063061]
 [ 0.01535605  0.02531418  0.12274525  0.08307311  0.11395948  0.03609204
   0.02680946  0.06548851  0.46061364  0.05054829]
 [ 0.01574896  0.02529401  0.11877078  0.08701741  0.12167319  0.0309793
   0.02692072  0.06540363  0.45984092  0.04835107]
 [ 0.01577112  0.0246546   0.11802998  0.0869009   0.12414648  0.02843006
   0.02665788  0.06534739  0.46276993  0.04729164]
 [ 0.01573384  0.02382465  0.11917216  0.08517403  0.12476258  0.02666877
   0.02624967  0.0654726   0.46632922  0.04661248]
 [ 0.01556872  0.02274938  0.12185793  0.08195729  0.12369877  0.02534524
   0.02561617  0.0658129

In [202]:
print out.shape
print dataset[0]

(33, 10)
{u'filename': u'2513260012_03d33305cf.jpg', u'imgid': 0, u'sentences': [{u'tokens': [u'a', u'black', u'dog', u'is', u'running', u'after', u'a', u'white', u'dog', u'in', u'the', u'snow'], u'raw': u'A black dog is running after a white dog in the snow .', u'imgid': 0, u'sentid': 0}, {u'tokens': [u'black', u'dog', u'chasing', u'brown', u'dog', u'through', u'snow'], u'raw': u'Black dog chasing brown dog through snow', u'imgid': 0, u'sentid': 1}, {u'tokens': [u'two', u'dogs', u'chase', u'each', u'other', u'across', u'the', u'snowy', u'ground'], u'raw': u'Two dogs chase each other across the snowy ground .', u'imgid': 0, u'sentid': 2}, {u'tokens': [u'two', u'dogs', u'play', u'together', u'in', u'the', u'snow'], u'raw': u'Two dogs play together in the snow .', u'imgid': 0, u'sentid': 3}, {u'tokens': [u'two', u'dogs', u'running', u'through', u'a', u'low', u'lying', u'body', u'of', u'water'], u'raw': u'Two dogs running through a low lying body of water .', u'imgid': 0, u'sentid': 4}], 

In [196]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
zeropadding2d_209 (ZeroPadding2D)(None, 3, 226, 226)   0                                            
____________________________________________________________________________________________________
convolution2d_209 (Convolution2D)(None, 64, 224, 224)  1792                                         
____________________________________________________________________________________________________
zeropadding2d_210 (ZeroPadding2D)(None, 64, 226, 226)  0                                            
____________________________________________________________________________________________________
convolution2d_210 (Convolution2D)(None, 64, 224, 224)  36928                                        
___________________________________________________________________________________________