In [None]:
import numpy as np
from keras import backend as K
from keras.applications.inception_resnet_v2 import InceptionResNetV2
from keras.applications.inception_resnet_v2 import preprocess_input
from keras.callbacks import EarlyStopping
from keras.layers import Embedding, GRU, TimeDistributed, RepeatVector, LSTM, concatenate, Input, Reshape, Cropping1D
from keras.layers.advanced_activations import PReLU, LeakyReLU
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.models import Model
from keras.models import Sequential
from keras.optimizers import SGD, Adadelta, Adagrad
from keras.preprocessing import image
from keras.preprocessing import sequence
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.preprocessing.text import one_hot
from keras.utils import np_utils, generic_utils
# Switch the Keras backend to 'th' (Theano-style) image dimension ordering for
# everything built after this point in the notebook.
# NOTE(review): `set_image_dim_ordering` is the legacy spelling; newer Keras
# releases expose this as `set_image_data_format('channels_first')` — confirm
# against the installed Keras version.
K.set_image_dim_ordering('th')

In [None]:
# Sequence length and vocabulary size used to shape both the training arrays
# and the model layers. Both are 11, the number of tokens in the `html` list
# defined further down in this notebook.
max_caption_len, vocab_size = 11, 11

In [None]:
# Load the screenshot resized to 299x299, turn it into an array, prepend a
# batch axis of size 1, and apply the Inception-ResNet-v2 `preprocess_input`.
img_path = 'screenshot.jpg'
screenshot = image.load_img(img_path, target_size=(299, 299))
screenshot_batch = image.img_to_array(screenshot)[np.newaxis, ...]
img = preprocess_input(screenshot_batch)

In [None]:
# Target markup: "<HTML><BODY><center><H1>Hello World!</H1></center></BODY></HTML>",
# split into the individual tokens the model works with.
html = ['<HTML>', '<BODY>', '<center>', '<H1>', 'Hello', ' ', 'World!', '</H1>', '</center>', '</BODY>', '</HTML>']

# Two-way lookup between a token and its position in `html`.
syntax_to_index = {token: position for position, token in enumerate(html)}
index_to_syntax = dict(enumerate(html))

# The token sequence as integer indices, wrapped in a batch of one: shape (1, len(html)).
token_ids = [syntax_to_index[token] for token in html]
html_input = np.array([token_ids])

In [None]:
# All-zero float32 target array for a single sample: one row per position
# (max_caption_len - 1 of them), each row vocab_size wide.
next_words = np.zeros(shape=(1, max_caption_len - 1, vocab_size), dtype=np.float32)

In [None]:
# For every token after the first, set a 1 at its vocabulary index in the row
# belonging to the preceding position, so row t one-hot encodes token t + 1.
for position, token in enumerate(html[1:]):
    next_words[0, position, syntax_to_index[token]] = 1.

In [None]:
# Image encoder: an Inception-ResNet-v2 built without pretrained weights
# (weights=None) and with its top layer kept (include_top=True), followed by a
# Dense(1024, relu) -> Dropout(0.2) -> Dense(1024, relu) head.
IR2 = InceptionResNetV2(include_top=True, weights=None)
image_features = Dense(1024, activation='relu')(IR2.output)
image_features_dropped = Dropout(0.2)(image_features)
image_embedding = Dense(1024, activation='relu')(image_features_dropped)

# Repeat the image embedding max_caption_len times so it has one copy per
# token position.
ir2_out = RepeatVector(max_caption_len)(image_embedding)
image_vector = Model(inputs=IR2.input, outputs=ir2_out)

In [None]:
# Language encoder: token indices -> 256-d embeddings -> LSTM(128) returning
# the full sequence -> a Dense(128) applied independently at every position.
syntax = Input(shape=(max_caption_len,))
token_embeddings = Embedding(vocab_size, 256)(syntax)
token_states = LSTM(128, return_sequences=True)(token_embeddings)
language_model = TimeDistributed(Dense(128))(token_states)

In [None]:
# Fuse the repeated image features with the per-token language features along
# the feature axis. `concatenate` works on tensors, so pass the encoder's
# output tensor rather than the `image_vector` Model object itself.
decoder = concatenate([image_vector.output, language_model])
decoder = LSTM(512, return_sequences=True)(decoder)
# Keep the whole sequence: the training target `next_words` holds one one-hot
# "next token" row per position, not a single vector for the sample.
decoder = LSTM(512, return_sequences=True)(decoder)
decoder = TimeDistributed(Dense(vocab_size, activation='softmax'))(decoder)
# The final input token has no successor, so drop its prediction. That leaves
# max_caption_len - 1 time steps, matching the time axis of `next_words`.
decoder_output = Cropping1D(cropping=(0, 1))(decoder)

# Full model: (screenshot, token indices) -> next-token distribution per position.
model = Model(inputs=[IR2.input, syntax], outputs=decoder_output)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

In [None]:
# Train on the single (screenshot, token sequence) sample for 5 passes.
# `epochs` is the Keras 2 name for this argument; the older `nb_epoch`
# spelling is deprecated and rejected by later releases.
model.fit([img, html_input], next_words, batch_size=1, epochs=5)