In [30]:


%matplotlib inline
import os, argparse
import numpy as np
import cv2 as cv2
import spacy as spacy
import matplotlib.pyplot as plt
from keras.models import Model, Input
from keras.layers.core import Dense, Dropout, Reshape
from keras.layers.recurrent import LSTM
from keras.layers.merge import concatenate
from keras.applications.vgg16 import VGG16
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input
import numpy as np
from sklearn.externals import joblib
import PIL.Image



In [31]:
# mapping id -> labels for categories
label_encoder_file_name = 'FULL_labelencoder_trainval.pkl'
# max length across corpus
max_length_questions = 30
# VGG output 
length_vgg_features = 4096
# Embedding outout
length_feature_space = 300
# pre-trained weights
VQA_weights_file = 'VQA_MODEL_WEIGHTS.hdf5'

In [32]:
'''image features'''
def get_image_features(img_path, VGG16modelFull):
    '''given an image returns a tensor with (1, 4096) VGG16 features'''
    # Since VGG was trained as a image of 224x224, every new image
    # is required to go through the same transformation
    img = image.load_img(img_path, target_size=(224, 224))
    x = image.img_to_array(img)
    # this is required because of the original training of VGG was batch
    # even if we have only one image we need to be consistent 
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    features = VGG16modelFull.predict(x)
    model_extractfeatures = Model(inputs=VGG16modelFull.input, 
                                  outputs=VGG16modelFull.get_layer('fc2').output)
    fc2_features = model_extractfeatures.predict(x)
    fc2_features = fc2_features.reshape((1, length_vgg_features))
    return fc2_features

In [33]:
'''embedding'''
def get_question_features(question):
    ''' given a question, a unicode string, returns the time series vector
    with each word (token) transformed into a 300 dimension representation
    calculated using Glove Vector '''
    word_embeddings = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    tokens = word_embeddings(question)
    ntokens = len(tokens)
    if (ntokens > max_length_questions) :
        ntokens = max_length_questions
    question_tensor = np.zeros((1, max_length_questions, 384))
    for j in range(len(tokens)):
            question_tensor[0,j,:] = tokens[j].vector
    return question_tensor

In [60]:
image_file_name = 'cropped_panda.jpg'
img0 = PIL.Image.open(image_file_name)
img0.show()
#get the salient features
model = VGG16(weights='imagenet', include_top=True)
model.summary()
image_features = get_image_features(image_file_name, model)
print (image_features.shape)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

In [61]:
question = u"Who is in this picture?"
language_features = get_question_features(question)
print (language_features.shape)

(1, 30, 384)


In [62]:
'''combine'''
def build_combined_model(
    number_of_LSTM              = 3,
    number_of_hidden_units_LSTM = 512,
    number_of_dense_layers      = 3,
    number_of_hidden_units      = 1024,
    activation_function         = 'tanh',
    dropout_pct                 = 0.5
):
    
    #input image
    # not clear how to read 4096 from input
    input_image = Input(shape=(length_vgg_features,),
                       name="input_image")
    model_image = Reshape((length_vgg_features,), 
                          input_shape=(length_vgg_features,))(input_image)
   
    
    #input language 
    # not clear our to read max_length_questions, 300 from input. Should i reshape it?
    input_language = Input(shape=(max_length_questions,length_feature_space,),
                          name="input_language")
    
    #build a sequence of LSTM
    model_language = LSTM(number_of_hidden_units_LSTM, 
                         return_sequences=True, 
                         name = "lstm_1")(input_language)
    model_language = LSTM(number_of_hidden_units_LSTM, 
                          return_sequences=True,
                         name = "lstm_2")(model_language)
    model_language = LSTM(number_of_hidden_units_LSTM, 
                          return_sequences=False,
                         name = "lstm_3")(model_language)

    #concatenate 4096+512
    model = concatenate([model_image, model_language])
    
    #Dense, Dropout
    for _ in range(number_of_dense_layers):
        model = Dense(number_of_hidden_units, 
                     kernel_initializer='uniform')(model)
        model = Dropout(dropout_pct)(model)

    model = Dense(1000,
                 activation='softmax')(model)
    

    #create model from tensors
    model = Model(inputs=[input_image, input_language], outputs = model)
    
    return model

In [63]:
combined_model = build_combined_model()
combined_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_language (InputLayer)     (None, 30, 300)      0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 30, 512)      1665024     input_language[0][0]             
__________________________________________________________________________________________________
input_image (InputLayer)        (None, 4096)         0                                            
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (None, 30, 512)      2099200     lstm_1[0][0]                     
__________________________________________________________________________________________________
reshape_5 

In [64]:
combined_model.load_weights(VQA_weights_file)

In [65]:
combined_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

In [66]:
print(image_features.shape)
print(language_features.shape)
language_features=language_features[:,:,0:300]
print (language_features.shape)

(1, 4096)
(1, 30, 384)
(1, 30, 300)


In [67]:
y_output = combined_model.predict([image_features, language_features])

In [68]:
# This task here is represented as a classification into a 1000 top answers
# this means some of the answers were not part of training and thus would 
# not show up in the result.
# These 1000 answers are stored in the sklearn Encoder class
labelencoder = joblib.load(label_encoder_file_name)
for label in reversed(np.argsort(y_output)[0,-5:]):
    print(str(round(y_output[0,label]*100,2)).zfill(5), "% ", labelencoder.inverse_transform(label))



  """
  if diff:


100.0 %  black


  if diff:


000.0 %  zoo


  if diff:


000.0 %  english


  if diff:


000.0 %  field


  if diff:


000.0 %  fence
