In [19]:
import tensorflow as tf

#BERT related
import tensorflow_hub as hub
import tensorflow_text as text

from tensorflow.keras.utils import Sequence, to_categorical, plot_model
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Concatenate, LSTM
from tensorflow.keras import callbacks

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import math
import string
import random
import pydot
import os
import shutil

tf.get_logger().setLevel('ERROR')

dataset_dir = '/ML/datasets/'

In [20]:
print(tf.config.list_physical_devices('GPU'))

#ensure gpu is used and use dynamic memory allocation(otherwise could run out of memory)
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], True)

[]


IndexError: list index out of range

In [None]:
train_data = pd.read_csv(dataset_dir+'vqa-v1/train_data.csv')

train_data

In [None]:
val_data = pd.read_csv(dataset_dir+'vqa-v1/val_data.csv')

val_data

In [None]:
train_image_embeddings = dict()
directory = dataset_dir+'vqa-v1/images/train_image_features/'
files = os.listdir(directory)
for i, file in enumerate(files):
    #print('processed',i+1,'features',end='\r')
    feature = np.load(directory+file)['arr_0']
    name = file[:-4]
    train_image_embeddings[name] = feature

In [None]:
val_image_embeddings = dict()
directory = dataset_dir+'vqa-v1/images/val_image_features/'
files = os.listdir(directory)
for i, file in enumerate(files):
    #print('processed',i+1,'features',end='\r')
    feature = np.load(directory+file)['arr_0']
    name = file[:-4]
    val_image_embeddings[name] = feature

In [None]:
tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
tfhub_handle_encoder = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1"

bert_preprocessor = hub.KerasLayer(tfhub_handle_preprocess, name='BERT_preprocess')
bert_encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encode')

In [None]:
#answer feature extraction
answers = list(pd.unique(train_data['answ']))
random.shuffle(answers)
print(len(answers))
print(answers[:100])

In [None]:
class datagen(Sequence):
    
    def __init__(self, dataframe, split, batch_size, predict_mode=False):
        
        self.dataframe = dataframe
        self.split = split
        self.batch_size = batch_size
        self.predict_mode = predict_mode
        
    def __len__(self):
        return math.ceil(len(self.dataframe) / self.batch_size)
        
    def __getitem__(self, ind):
        
        #print('Starting batch',ind)
        
        partial_dataframe = self.dataframe[ind * self.batch_size : (ind + 1) * self.batch_size]
        
        img_path = list(partial_dataframe['im_path'])
        ques = list(partial_dataframe['ques'])
        if not self.predict_mode:
            ans = list(partial_dataframe['answ'])
        
        image_features = self.get_img_feature(img_path)
        question_features = self.get_ques_feature(ques)

        if not self.predict_mode:
            answer_features = self.get_ans_feature(ans)
        
        x = (image_features, question_features)
        if not self.predict_mode:
            y = answer_features
        
        #print('Finished batch',ind+1)
        
        if self.predict_mode:
            return x, 
        else:
            return x, y
    
    def get_img_feature(self, img_path):
        
        #print('getting image feature')
        
        if self.split == 'train':
            image_embeddings = train_image_embeddings
        else:
            image_embeddings = val_image_embeddings
        
        r = np.ndarray((len(img_path), 2048), dtype='float32')
        
        for i, path in enumerate(img_path):
            img = image_embeddings[path[:-4]]
            r[i] = img
            
        return r
    
    def get_ques_feature(self, ques):
        
        #print('getting question feature')
        
        return tf.constant(ques);
    
    def get_ans_feature(self, ans):
        
        #print('getting answer feature')
        
        r = np.zeros((len(ans), 1000), dtype='float32')
        
        for i, a in enumerate(ans):
            ind = answers.index(a)
            r[i][ind] = 1.0
            
        return r
    
    def on_epoch_end(self):
        
        self.dataframe = self.dataframe.sample(frac = 1) 

In [11]:
batch_size = 512

train_datagen = datagen(train_data, 'train', batch_size=batch_size)

for x in train_datagen:
    
    print('image features')
    print(x[0][0][0].shape)
    print(x[0][0][0])
        
    print('question features')
    print(x[0][1][0].shape)
    print(x[0][1][0])
        
    print('answer features')
    print(x[1][0].shape)
    print(x[1][0])
    break


val_datagen = datagen(val_data, 'val', batch_size=batch_size)

image features
(2048,)
[0.7878499 4.1447735 1.6018527 ... 5.839238  9.191399  0.4151845]
question features
What sport is on the TV?
answer features
(1000,)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0

In [15]:
input1 = Input(shape=(2048), dtype='float32', name='image_input')
input2 = Input(shape=(), dtype=tf.string, name='question_input')

encoded_img = Dense(512, activation='relu', dtype='float32', name='img_encode')(input1)
    
preprocessed_ques = bert_preprocessor(input2)
encoded_ques = bert_encoder(preprocessed_ques)['pooled_output']

concatenated = Concatenate(dtype='float32', name='concatenate')([encoded_img, encoded_ques])

fc1 = Dense(512, activation='relu', dtype='float32', name='fc1')(concatenated)
fc2 = Dense(512, activation='relu', dtype='float32', name='fc2')(fc1)
output = Dense(1000, activation='softmax', dtype='float32', name='classification')(fc2)

model = Model(inputs=[input1,input2], outputs=output)

In [16]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
question_input (InputLayer)     [(None,)]            0                                            
__________________________________________________________________________________________________
image_input (InputLayer)        [(None, 2048)]       0                                            
__________________________________________________________________________________________________
BERT_preprocess (KerasLayer)    {'input_mask': (None 0           question_input[0][0]             
__________________________________________________________________________________________________
img_encode (Dense)              (None, 512)          1049088     image_input[0][0]                
______________________________________________________________________________________________

In [17]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),
    loss = 'categorical_crossentropy',
    metrics = ['accuracy']
)

checkpoint_filepath = '../datasets/vqa-v1/weights.hdf5'

model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

history = model.fit(
    train_datagen, 
    epochs = 200,
    validation_data = val_datagen,
    callbacks = [model_checkpoint_callback]
)

AttributeError: 'str' object has no attribute 'shape'