In [4]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [5]:
import numpy as np 
import pandas as pd 
import os
import tensorflow as tf
import numpy as np

# Seed TensorFlow and NumPy so that random operations are repeatable across runs.
SEED = 1234
tf.random.set_seed(SEED)
np.random.seed(SEED)

# Working directory of the kernel when this cell runs.
cwd = os.getcwd()

# Turn on memory growth for every physical GPU, then report how many physical
# and logical GPU devices TensorFlow sees.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # NOTE(review): presumably raised when memory growth is changed after the
    # GPUs were already initialised; the error is printed, not re-raised.
    print(e)

1 Physical GPUs, 1 Logical GPUs


In [6]:
from google.colab import drive
drive.mount('/content/drive')
!unzip /content/drive/My\ Drive/VQA_Dataset.zip

[1;30;43mGörüntülenen çıkış son 5000 satıra kısaltıldı.[0m
  inflating: VQA_Dataset/Images/971.png  
  inflating: __MACOSX/VQA_Dataset/Images/._971.png  
  inflating: VQA_Dataset/Images/23861.png  
  inflating: __MACOSX/VQA_Dataset/Images/._23861.png  
  inflating: VQA_Dataset/Images/12464.png  
  inflating: __MACOSX/VQA_Dataset/Images/._12464.png  
  inflating: VQA_Dataset/Images/7894.png  
  inflating: __MACOSX/VQA_Dataset/Images/._7894.png  
  inflating: VQA_Dataset/Images/14015.png  
  inflating: __MACOSX/VQA_Dataset/Images/._14015.png  
  inflating: VQA_Dataset/Images/9661.png  
  inflating: __MACOSX/VQA_Dataset/Images/._9661.png  
  inflating: VQA_Dataset/Images/6552.png  
  inflating: __MACOSX/VQA_Dataset/Images/._6552.png  
  inflating: VQA_Dataset/Images/10273.png  
  inflating: __MACOSX/VQA_Dataset/Images/._10273.png  
  inflating: VQA_Dataset/Images/2734.png  
  inflating: __MACOSX/VQA_Dataset/Images/._2734.png  
  inflating: VQA_Dataset/Images/4345.png  
  inflating: __MA

In [7]:
from PIL import Image
import numpy as np
import json
import cv2
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# --- Dataset locations (Colab VM) ---
train_json_path = "/content/VQA_Dataset/train_questions_annotations.json"
test_json_path = "/content/VQA_Dataset/test_questions.json"
imgs_path = "/content/VQA_Dataset/Images/"

# --- Experiment settings ---
SEED = 1234
BATCH_SIZE = 16
DATASET_SPLIT = 0.75      # fraction of the annotated questions used for training
img_h = img_w = 299       # images are resized to a square of this side

# Answer vocabulary: each answer string maps to its position in this list,
# which is the class index used as the model target.
classes = {
    label: index
    for index, label in enumerate([
        '0', '1', '2', '3', '4', '5',
        'apple', 'baseball', 'bench', 'bike', 'bird', 'black', 'blanket',
        'blue', 'bone', 'book', 'boy', 'brown',
        'cat', 'chair', 'couch',
        'dog',
        'floor', 'food', 'football',
        'girl', 'grass', 'gray', 'green',
        'left', 'log',
        'man', 'monkey bars',
        'no', 'nothing',
        'orange',
        'pie', 'plant', 'playing',
        'red', 'right', 'rug',
        'sandbox', 'sitting', 'sleeping', 'soccer', 'squirrel', 'standing',
        'stool', 'sunny',
        'table', 'tree',
        'watermelon', 'white', 'wine', 'woman',
        'yellow', 'yes',
    ])
}

# Number of distinct answers, i.e. the width of the softmax output.
N_CLASSES = len(classes)

if 'tokenizer' not in globals():        # build the vocabulary only once per kernel session
    # The Tokenizer turns question text into sequences of integer word ids.
    tokenizer = tf.keras.preprocessing.text.Tokenizer()

    # The vocabulary must come from the *training* questions: the original
    # code opened test_json_path here, so words that only occur in training
    # questions were never given an id.
    with open(train_json_path, 'r') as f:
        train_data_jsonload = json.load(f)
    train_data = list(train_data_jsonload.values())

    # Split every question on spaces and strip the question marks, exactly as
    # the JSON readers do, then fit once on the flat word list. A single
    # fit_on_texts call yields the same counts as one call per question but
    # rebuilds the frequency-ranked index only once.
    vocabulary_words = []
    for question in train_data:
        vocabulary_words.extend(
            word.replace("?", "") for word in question['question'].split(" ")
        )
    tokenizer.fit_on_texts(vocabulary_words)

# +1 because word ids start at 1; id 0 is left for padding.
words_number = len(tokenizer.word_index) + 1

In [8]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras Sequence that yields ``[questions, images]`` batches (plus labels).

    Args:
        list_IDs: one entry per sample. When ``to_fit`` is True each entry is
            used as the sample's target value; otherwise only the length of
            the list is used to size the dataset.
        image_path: one image identifier per sample; the file read is
            ``imgs_path + identifier + ".png"``.
        train_input_questions: one encoded question per sample, each of
            length ``max_length``.
        max_length: length of every encoded question.
        to_fit: when True ``__getitem__`` returns ``(X, y)``, else only ``X``.
        batch_size: number of samples per batch.
        dim: ``(height, width)`` every image is resized to.
        n_channels: number of image channels.
        n_classes: number of answer classes (stored, not used by the generator).
        shuffle: reshuffle the sample order at the end of every epoch.
    """

    def __init__(self, list_IDs, image_path, train_input_questions, max_length, to_fit=True,
                 batch_size=BATCH_SIZE, dim=(img_h, img_w), n_channels=3, n_classes=N_CLASSES, shuffle=True):
        self.list_IDs = list_IDs
        self.train_input_questions = train_input_questions
        self.image_path = image_path
        self.to_fit = to_fit
        self.batch_size = batch_size
        self.dim = dim
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.img_h = dim[0]
        self.img_w = dim[1]
        self.max_length = max_length
        self.on_epoch_end()

    def __len__(self):
        """Number of full batches per epoch (a trailing partial batch is dropped)."""
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        """Return batch number ``index`` as ``(X, y)`` or, if not fitting, ``X``."""
        # Positions (into the per-sample lists) of the samples in this batch.
        # These positions, not the label values, are what the helpers need.
        batch_positions = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]

        X = self._generate_X(batch_positions)

        if self.to_fit:
            y = self._generate_y(batch_positions)
            return X, y
        return X

    def on_epoch_end(self):
        """Reset the sample order and reshuffle it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def _generate_X(self, list_IDs_temp):
        """Build the model inputs for the samples at positions ``list_IDs_temp``.

        Returns:
            ``[questions, images]`` in the order the model's inputs are declared.
        """
        n_samples = len(list_IDs_temp)
        X = np.empty((n_samples, *self.dim, self.n_channels))
        X2 = np.empty((n_samples, self.max_length))

        for row, sample in enumerate(list_IDs_temp):
            # Index the data with the sample position; using the within-batch
            # counter here would return the first batch_size samples every time.
            X[row,] = self._load_image(self.image_path[sample], self.img_w, self.img_h)
            X2[row,] = self.train_input_questions[sample]

        return [X2, X]

    def _generate_y(self, list_IDs_temp):
        """Build the ``(n_samples, 1)`` integer target array for the given positions."""
        y = np.empty((len(list_IDs_temp), 1), dtype=int)

        for row, sample in enumerate(list_IDs_temp):
            y[row] = self.list_IDs[sample]

        return y

    def _load_image(self, image_path, img_w, img_h):
        """Read ``image_path`` (name without extension), resize it and scale it by 1/255.

        Raises:
            FileNotFoundError: if the image file cannot be read.
        """
        full_path = imgs_path + image_path + ".png"
        image = cv2.imread(full_path)
        if image is None:
            # cv2.imread signals failure by returning None; fail with the path
            # instead of an opaque error from cv2.resize.
            raise FileNotFoundError("Could not read image: " + full_path)
        image = cv2.resize(image, (img_w, img_h))  # cv2 expects (width, height)
        image = image / 255.
        return image

In [9]:
def readTrainJson(data, first, last):
    """Extract image ids, tokenised questions and answer classes from ``data[first:last]``.

    Each record is read through its 'image_id', 'question' and 'answer'
    keys. The question is split on single spaces and every '?' is removed
    from each token; the answer string is converted to its class index via
    the module-level ``classes`` mapping.

    Returns:
        A tuple ``(images, questions, answers)`` of three parallel lists.
    """
    images, questions, answers = [], [], []

    for record in data[first:last]:
        tokens = [word.replace("?", "") for word in record.get('question').split(" ")]
        images.append(record.get('image_id'))
        questions.append(tokens)
        answers.append(classes[record.get('answer')])

    return images, questions, answers

def readTestJson(data, first, last):
    """Extract image ids, tokenised questions and question ids from ``data[first:last]``.

    Each record is read through its 'image_id' and 'question' keys. The
    question is split on single spaces and every '?' is removed from each
    token. The returned id list holds the raw value of the 'question' key.

    Returns:
        A tuple ``(images, questions, quest_id)`` of three parallel lists.
    """
    selected = data[first:last]

    images = [record.get('image_id') for record in selected]
    quest_id = [record.get('question') for record in selected]
    questions = [
        [word.replace("?", "") for word in record.get('question').split(" ")]
        for record in selected
    ]

    return images, questions, quest_id

In [10]:
#read train JSON file
# Read the train JSON file: a dict whose values are the question records.
with open(train_json_path, 'r') as f:
    train_data_jsonload = json.load(f)
train_data = list(train_data_jsonload.values())

# Read the test JSON file: the keys are kept in image_data (they become the
# Id column of the results CSV), the values are the question records.
with open(test_json_path, 'r') as f:
    test_data_jsonload = json.load(f)
image_data = list(test_data_jsonload.keys())
test_data = list(test_data_jsonload.values())

# Sequential train/validation split of the annotated questions.
TOT_QUESTIONS = len(train_data)
TRAIN_QUESTIONS = int(TOT_QUESTIONS*DATASET_SPLIT)
VALID_QUESTIONS = TOT_QUESTIONS-TRAIN_QUESTIONS

# Extract images, questions and answers (or question ids) from the train and test files.
train_images, train_questions, train_answers = readTrainJson(train_data, 0, TRAIN_QUESTIONS)
valid_images, valid_questions, valid_answers = readTrainJson(train_data, TRAIN_QUESTIONS, TOT_QUESTIONS)
test_images, test_questions, questions_id = readTestJson(test_data, 0, len(test_data))

# Encode every split with the SAME fitted tokenizer. The tokenizer is
# deliberately not refitted on the test questions: fit_on_texts re-ranks
# word_index by frequency, so refitting after the train/validation questions
# were encoded would give the test questions a different word -> id mapping
# than the one the model is trained on.
sequences = tokenizer.texts_to_sequences(train_questions)
max_length = max(len(sequence) for sequence in sequences)   # longest training question
train_input_questions = pad_sequences(sequences, maxlen=max_length)

sequences = tokenizer.texts_to_sequences(valid_questions)
valid_input_questions = pad_sequences(sequences, maxlen=max_length)

sequences = tokenizer.texts_to_sequences(test_questions)
test_input_questions = pad_sequences(sequences, maxlen=max_length)

# +1 because word ids start at 1; id 0 is the padding value.
words_number = len(tokenizer.word_index) + 1

training_generator = DataGenerator(train_answers, train_images, train_input_questions, max_length, batch_size=BATCH_SIZE, dim=(img_h, img_w), n_classes=N_CLASSES)
validation_generator = DataGenerator(valid_answers, valid_images, valid_input_questions, max_length, batch_size=BATCH_SIZE, dim=(img_h, img_w), n_classes=N_CLASSES)
# The test generator is unshuffled with batch_size=1 so batch i is test sample i.
test_generator = DataGenerator(questions_id, test_images, test_input_questions,  max_length, to_fit=False, batch_size=1, dim=(img_h, img_w), n_classes=N_CLASSES, shuffle=False)

In [11]:
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.applications import VGG19
 
INPUT_SIZE_MERGE = 128   # width of the question encoding fed into the merge
EMBEDDING_SIZE = 512     # word-embedding size, image-feature size and merged hidden size

# Image branch: VGG19 convolutional base with ImageNet weights (no classifier
# head), followed by dropout and a dense projection to EMBEDDING_SIZE features.
base_model = tf.keras.applications.VGG19(input_shape=(img_h, img_w, 3), include_top=False, weights='imagenet')
vision_model = tf.keras.models.Sequential()
vision_model.add(base_model)
vision_model.add(tf.keras.layers.Dropout(0.2))
vision_model.add(tf.keras.layers.Flatten())
vision_model.add(tf.keras.layers.Dense(EMBEDDING_SIZE))
image_input = tf.keras.layers.Input(shape=(img_h, img_w, 3))
encoded_image = vision_model(image_input)

# Question branch: word embedding followed by two stacked LSTMs.
question_input = tf.keras.layers.Input(shape=[max_length], dtype='int32')
# The sequence length is already fixed by question_input, so no input_length is
# passed (the previous hard-coded 100 contradicted max_length).
embedded_question = tf.keras.layers.Embedding(input_dim=words_number, output_dim=EMBEDDING_SIZE)(question_input)
hidden_layer = LSTM(128,return_sequences=True)(embedded_question)
# Feed the first LSTM's sequence output into the second one; previously the
# second LSTM read embedded_question directly, leaving hidden_layer unused.
encoded_question = LSTM(INPUT_SIZE_MERGE, dropout=0.2, recurrent_dropout=0.1, unroll=True)(hidden_layer)

# Merge both encodings and classify over the answer vocabulary.
merged = tf.keras.layers.concatenate([encoded_question, encoded_image])
output = tf.keras.layers.Dense(EMBEDDING_SIZE, activation='relu')(merged)
output = tf.keras.layers.Dropout(0.2)(output)
output = tf.keras.layers.Dense(len(classes), activation='softmax')(output)
# Input order [question, image] matches the [X2, X] list produced by DataGenerator.
vqa_model = tf.keras.models.Model(inputs=[question_input, image_input], outputs=output)

vision_model.summary()
vqa_model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5




Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg19 (Functional)           (None, 9, 9, 512)         20024384  
_________________________________________________________________
dropout (Dropout)            (None, 9, 9, 512)         0         
_________________________________________________________________
flatten (Flatten)            (None, 41472)             0         
_________________________________________________________________
dense (Dense)                (None, 512)               21234176  
Total params: 41,258,560
Trainable params: 41,258,560
Non-trainable params: 0
_________________________________________________________________
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)         

In [12]:
from keras.callbacks import EarlyStopping

# CALLBACKS
# -------------------
callbacks=[]

# Stop training when the validation loss has not improved for 5 epochs.
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
callbacks.append(es_callback)

# learning rate
lr = 1e-5
optimizer = tf.keras.optimizers.RMSprop(learning_rate=lr, rho=0.9)

# Loss: targets are integer class indices, hence the sparse variant.
loss = tf.keras.losses.SparseCategoricalCrossentropy()
# Compile Model
vqa_model.compile(optimizer = optimizer , loss=loss, metrics=['sparse_categorical_accuracy'])
# Model.fit accepts keras.utils.Sequence objects directly; fit_generator is
# deprecated. Each epoch runs 100 training and 100 validation batches.
history = vqa_model.fit(
    training_generator,
    validation_data=validation_generator,
    steps_per_epoch=100,
    validation_steps=100,
    epochs=5,
    callbacks=callbacks,
)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f6e0357b320>

In [13]:
import os
from datetime import datetime

def create_csv(results_dir='/content/'):
    """Write the test-set predictions to a time-stamped CSV in ``results_dir``.

    The file is named ``results_<Mon><day>_<H>-<M>-<S>.csv`` and has the
    header ``Id,Category``. Row ``i`` pairs ``image_data[i]`` with the argmax
    of the model's prediction for batch ``i`` of ``test_generator``.
    """
    timestamp = datetime.now().strftime('%b%d_%H-%M-%S')
    csv_path = os.path.join(results_dir, 'results_' + timestamp + '.csv')

    with open(csv_path, 'w') as out:
        out.write('Id,Category\n')

        for idx in range(len(test_generator)):
            predicted_class = np.argmax(vqa_model.predict(test_generator[idx]))
            out.write(str(image_data[idx]) + ',' + str(predicted_class) + '\n')

# Write the predictions CSV into the default results directory.
create_csv()