In [None]:
import tensorflow as tf
from tensorflow.keras.utils import Sequence, to_categorical, plot_model
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Concatenate, LSTM
from tensorflow.keras import callbacks
import math
import numpy as np
import pandas as pd
import string
import random
import pydot
import os
# Root directory holding the datasets and pre-trained embeddings.
# Set the VQA_DATASET_DIR environment variable to point somewhere else; the
# historical location stays the default. os.path.join(..., '') guarantees a
# trailing separator, because the rest of the notebook builds paths by plain
# string concatenation onto dataset_dir.
dataset_dir = os.path.join(os.environ.get('VQA_DATASET_DIR') or '/ML/datasets/', '')

In [None]:
# Show the GPUs TensorFlow can see; the returned list is displayed as the
# cell output (an empty list means none were detected).
tf.config.list_physical_devices('GPU')

In [None]:
# Ensure the GPU is used with dynamic memory allocation (otherwise the run
# could run out of memory).
physical_devices = tf.config.list_physical_devices('GPU')
if not physical_devices:
    # Indexing physical_devices[0] would raise IndexError on a CPU-only machine.
    print('No GPU detected; running on CPU.')
else:
    try:
        # Memory growth has to be configured the same way on every visible GPU.
        for device in physical_devices:
            tf.config.experimental.set_memory_growth(device, True)
    except RuntimeError as err:
        # Raised when the GPUs were already initialised, e.g. when this cell
        # is re-run in a kernel that has already used TensorFlow.
        print(err)

In [None]:
# Training split of the question/answer table. Later cells read its
# 'im_path', 'ques' and 'answ' columns.
train_data = pd.read_csv(f'{dataset_dir}vqa-v1/train_data.csv')

# Display the frame as the cell output.
train_data

In [None]:
# Validation split of the question/answer table, loaded the same way as the
# training split.
val_data = pd.read_csv(f'{dataset_dir}vqa-v1/val_data.csv')

# Display the frame as the cell output.
val_data

In [None]:
# Load the pre-computed feature array of every training image into memory,
# keyed by the feature file name with its last four characters (the
# extension) removed.
train_image_embeddings = dict()
directory = dataset_dir+'vqa-v1/images/train_image_features/'
files = os.listdir(directory)
for file in files:
    # np.load returns an archive object that keeps the file open; the context
    # manager closes each handle as soon as its array has been read.
    with np.load(directory+file) as archive:
        feature = archive['arr_0']
    name = file[:-4]
    train_image_embeddings[name] = feature

In [None]:
# Load the pre-computed feature array of every validation image into memory,
# keyed by the feature file name with its last four characters (the
# extension) removed.
val_image_embeddings = dict()
directory = dataset_dir+'vqa-v1/images/val_image_features/'
files = os.listdir(directory)
for file in files:
    # np.load returns an archive object that keeps the file open; the context
    # manager closes each handle as soon as its array has been read.
    with np.load(directory+file) as archive:
        feature = archive['arr_0']
    name = file[:-4]
    val_image_embeddings[name] = feature

In [None]:
#question feature extraction
#glove embeddings

# Map each word to its embedding vector. Every line of the GloVe text file is
# the word followed by its whitespace-separated coefficients.
glove_embeddings = dict()
# The with-block closes the file even if parsing a line raises.
with open(dataset_dir+'pre-trained/glove/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line has no word to index; skip it instead of failing.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float64')
        glove_embeddings[word] = coefs

# Sanity check: vocabulary size and the start of one known vector.
print(len(glove_embeddings))
print(glove_embeddings['king'][:50])

In [None]:
#answer feature extraction
# The position of an answer in this list is its class index: the data
# generator one-hot encodes answers with it and predictions are decoded through
# it. The shuffle therefore has to be reproducible, otherwise weights saved in
# one kernel session would be paired with a different answer order after a
# restart. A dedicated seeded generator keeps the global `random` state
# untouched.
ANSWER_SHUFFLE_SEED = 0
answers = list(pd.unique(train_data['answ']))
random.Random(ANSWER_SHUFFLE_SEED).shuffle(answers)
print(len(answers))
print(answers[:100])

In [None]:
class datagen(Sequence):
    """Keras ``Sequence`` that turns rows of a question/answer dataframe into batches.

    A batch is ``((image_features, question_features), answer_features)``, or the
    1-tuple ``((image_features, question_features),)`` when ``predict_mode`` is
    set.

    The generator reads these notebook-level globals at batch time:
    ``train_image_embeddings``, ``val_image_embeddings``, ``glove_embeddings``
    and ``answers``.
    """

    # Sizes of the arrays produced per example.
    IMAGE_FEATURE_DIM = 2048
    MAX_QUESTION_WORDS = 15
    WORD_EMBEDDING_DIM = 300
    NUM_ANSWER_CLASSES = 3000

    # Translation table deleting every ASCII punctuation character. It never
    # changes, so it is built once instead of once per question.
    _PUNCTUATION_TABLE = str.maketrans(dict.fromkeys(string.punctuation))

    def __init__(self, dataframe, split, batch_size, predict_mode=False):
        """Store the batching configuration.

        Args:
            dataframe: table with 'im_path' and 'ques' columns, plus 'answ'
                unless ``predict_mode`` is set.
            split: 'train' selects ``train_image_embeddings``; any other value
                selects ``val_image_embeddings``.
            batch_size: number of rows per batch.
            predict_mode: when True, batches carry inputs only (no answers).
        """
        # Newer Keras releases expect the base-class constructor to be called.
        super().__init__()

        self.dataframe = dataframe
        self.split = split
        self.batch_size = batch_size
        self.predict_mode = predict_mode

    def __len__(self):
        """Return the number of batches per epoch; the last one may be partial."""
        return math.ceil(len(self.dataframe) / self.batch_size)

    def __getitem__(self, ind):
        """Build batch number ``ind``.

        Returns:
            ``(x, y)`` in training mode or ``(x,)`` in predict mode, where ``x``
            is ``(image_features, question_features)``.
        """
        start = ind * self.batch_size
        stop = start + self.batch_size
        # iloc makes the positional slicing explicit. The frame is reshuffled
        # after every epoch, so its index labels are not in order.
        partial_dataframe = self.dataframe.iloc[start:stop]

        img_path = list(partial_dataframe['im_path'])
        ques = list(partial_dataframe['ques'])

        x = (self.get_img_feature(img_path), self.get_ques_feature(ques))

        if self.predict_mode:
            return (x,)

        y = self.get_ans_feature(list(partial_dataframe['answ']))
        return x, y

    def get_img_feature(self, img_path):
        """Look up the pre-computed feature of each image path.

        The lookup key is the path with its last four characters removed.

        Returns:
            float32 array with one row of ``IMAGE_FEATURE_DIM`` values per path.

        Raises:
            KeyError: if a path has no entry in the selected embedding dict.
        """
        if self.split == 'train':
            image_embeddings = train_image_embeddings
        else:
            image_embeddings = val_image_embeddings

        # Every row is overwritten below, so no zero-fill is needed.
        r = np.empty((len(img_path), self.IMAGE_FEATURE_DIM), dtype='float32')

        for i, path in enumerate(img_path):
            r[i] = image_embeddings[path[:-4]]

        return r

    def get_ques_feature(self, ques):
        """Embed each question as a zero-padded sequence of word vectors.

        Punctuation is removed, the text is lower-cased and split on whitespace,
        and at most ``MAX_QUESTION_WORDS`` words are kept. Words missing from
        ``glove_embeddings`` map to a zero vector.

        Returns:
            float32 array of shape
            ``(len(ques), MAX_QUESTION_WORDS, WORD_EMBEDDING_DIM)``.
        """
        r = np.zeros(
            (len(ques), self.MAX_QUESTION_WORDS, self.WORD_EMBEDDING_DIM),
            dtype='float32',
        )
        default = np.zeros((self.WORD_EMBEDDING_DIM), dtype='float32')

        for i, q in enumerate(ques):
            q = q.translate(self._PUNCTUATION_TABLE).lower()
            words = q.split()[:self.MAX_QUESTION_WORDS]
            e = [glove_embeddings.get(w, default) for w in words]
            # A question with no words stays all-zero. Assigning an empty list
            # to the empty slice would raise a broadcasting error.
            if e:
                r[i, :len(e)] = e

        return r

    def get_ans_feature(self, ans):
        """One-hot encode each answer by its position in the global ``answers`` list.

        Returns:
            float32 array of shape ``(len(ans), NUM_ANSWER_CLASSES)``.

        Raises:
            ValueError: if an answer is not present in ``answers``.
            IndexError: if an answer's position is not below
                ``NUM_ANSWER_CLASSES``.
        """
        # One pass to build a lookup table replaces a linear list.index() scan
        # per example. setdefault keeps the first position of a value, exactly
        # as list.index() would.
        index_of = {}
        for position, answer in enumerate(answers):
            index_of.setdefault(answer, position)

        r = np.zeros((len(ans), self.NUM_ANSWER_CLASSES), dtype='float32')

        for i, a in enumerate(ans):
            try:
                ind = index_of[a]
            except KeyError:
                # Same exception type list.index() raised for unknown answers.
                raise ValueError(f'{a!r} is not in list') from None
            r[i][ind] = 1.0

        return r

    def on_epoch_end(self):
        """Reshuffle the rows so the next epoch is batched in a new random order."""
        self.dataframe = self.dataframe.sample(frac = 1)

In [None]:
# Disabled experiment, kept inside a bare string so none of it executes: it
# narrows train_data to the questions starting with "What animal" and sets the
# row labelled 768 aside as a one-row test frame (test_data_whatanimal).
'''
mask = train_data['ques'].str.startswith("What animal")

train_data_whatanimal = train_data[mask]
print(len(train_data_whatanimal))

test_data_whatanimal = train_data_whatanimal.loc[[768]]

train_data_whatanimal = train_data_whatanimal.drop([768])
print(len(train_data_whatanimal))

#train_data_whatanimal.loc[[768]]
'''

In [None]:
# Number of examples per batch, shared by both generators.
batch_size = 512

# Generator over the training rows; split='train' selects the training image features.
train_datagen = datagen(dataframe=train_data, split='train', batch_size=batch_size)
'''
for x in train_datagen:
    
    print('image features')
    print(x[0][0][0].shape)
    print(x[0][0][0])
        
    print('question features')
    print(x[0][1][0].shape)
    print(x[0][1][0])
        
    print('answer features')
    print(x[1][0].shape)
    print(x[1][0])
    break
'''

# Generator over the validation rows; any split other than 'train' selects the
# validation image features.
val_datagen = datagen(dataframe=val_data, split='val', batch_size=batch_size)

In [None]:
# Image branch: the 2048-value image feature vector is fed in unchanged.
# `shape` must be a tuple: `(2048)` without the trailing comma is just the
# int 2048, which not every Keras version accepts as a shape.
input1 = Input(shape=(2048,), dtype='float32', name='image_input')

# Question branch: 15 word positions with a 300-value embedding each.
input2 = Input(shape=(15, 300), dtype='float32', name='question_input')
#flattened = Flatten(dtype='float32', name='flatten_question')(input2)

#img_encode = Dense(512, activation='relu', dtype='float32', name='img_encode')(input1)
# The LSTM condenses the word sequence into a single 512-value vector.
ques_encode = LSTM(512, name='ques_encode')(input2)

# Fuse both modalities by concatenating the image vector and the question encoding.
concatenated = Concatenate(dtype='float32', name='concatenate')([input1, ques_encode])

# Two hidden layers followed by a softmax over the 3000 answer classes.
fc1 = Dense(512, activation='relu', dtype='float32', name='fc1')(concatenated)
fc2 = Dense(512, activation='relu', dtype='float32', name='fc2')(fc1)
output = Dense(3000, activation='softmax', dtype='float32', name='classification')(fc2)

model = Model(inputs=[input1,input2], outputs=output)

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [None]:
# Draw the model graph; the result is shown as the cell output.
# NOTE(review): pydot is imported at the top of the notebook, presumably
# because plot_model needs it — confirm it is installed before running.
plot_model(model)

In [None]:
# Multi-class classification over the answer vocabulary, optimised with Adam
# at a small learning rate.
model.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
)

# File that receives the best weights seen during training.
checkpoint_filepath = f'{dataset_dir}vqa-v1/weights.hdf5'

# Save weights only, and only when validation accuracy reaches a new maximum.
model_checkpoint_callback = callbacks.ModelCheckpoint(
    checkpoint_filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)

# Train on the training generator and validate on the validation generator
# after every epoch.
history = model.fit(
    x=train_datagen,
    validation_data=val_datagen,
    epochs=200,
    callbacks=[model_checkpoint_callback],
)

In [None]:
# Disabled experiment, kept inside a bare string so none of it executes: it
# predicts the answer for test_data_whatanimal, which is only defined by the
# other disabled "What animal" cell, and decodes the arg-max class through
# `answers`.
# NOTE(review): the split passed here is 'train2014', which is not 'train', so
# datagen.get_img_feature would take its else-branch and look the image up in
# val_image_embeddings although the row comes from train_data — confirm
# before re-enabling.
'''
test_datagen = datagen(test_data_whatanimal, 'train2014', batch_size=32, predict_mode=True)
test_output = model.predict(test_datagen)
answers[np.argmax(test_output)]
'''

In [None]:
# Restore the weights written by the ModelCheckpoint callback, which was
# configured with save_best_only=True and monitor='val_accuracy'.
model.load_weights(checkpoint_filepath)