# Dataset Handling

The VQA dataset contains ~120K images and ~330K question/answer pairs, which is quite large for this baby VQA project. We will therefore extract a small subset of it for training and testing our model.

In [2]:
# a bit of setup as usual
import h5py, json

import numpy as np
import cPickle as pickle

## 0 - Loading the full VQA dataset

In [3]:
# Image features for the train and test splits. The HDF5 files are left open
# on purpose: the dataset handles obtained here are indexed by later cells.
h5_img_file_train = h5py.File('data/vqa_data_img_vgg_train.h5', 'r')
fv_im_train = h5_img_file_train.get('/images_train') # 82460 x 14 x 14 x 512

h5_img_file_test = h5py.File('data/vqa_data_img_vgg_test.h5', 'r')
fv_im_test = h5_img_file_test.get('/images_test') # 40504 x 14 x 14 x 512

# Preprocessed question/answer data; train and test live in the same file.
h5_ques_file = h5py.File('data/vqa_data_prepro.h5', 'r')
ques_train = h5_ques_file.get('/ques_train') # 215375 x 26
ques_len_train = h5_ques_file.get('/ques_len_train') # 215375 x 1
img_pos_train = h5_ques_file.get('/img_pos_train') # 215375 x 1
ques_id_train = h5_ques_file.get('/ques_id_train') # 215375 x 1
answers = h5_ques_file.get('/answers') # 215375 x 1
split_train = h5_ques_file.get('/split_train') # 215375 x 1

ques_test = h5_ques_file.get('/ques_test') # 121512 x26
ques_len_test = h5_ques_file.get('/ques_len_test')
img_pos_test = h5_ques_file.get('/img_pos_test')
ques_id_test = h5_ques_file.get('/ques_id_test')
split_test = h5_ques_file.get('/split_test')
ans_test = h5_ques_file.get('/ans_test')

# Lookup tables stored next to the HDF5 data. The JSON file is fully parsed
# here, so open it in a `with` block to make sure the handle gets closed.
with open('data/vqa_data_prepro.json', 'r') as fp:
    json_file = json.load(fp)
ix_to_word = json_file['ix_to_word']
ix_to_ans = json_file['ix_to_ans']

vocab_size = len(ix_to_word) # 12604

## 1 - Extracting a small dataset for training & testing

In [4]:
# Small training subset: scan the training questions in order until
# `train_small_size` distinct images have been seen, keeping every question
# visited on the way.
train_small_size = 8000
train_im_small_idx = []    # image positions, in order of first appearance
qa_data_train_small = []   # (question, question length, local image index, answer)

ix = 0
while len(train_im_small_idx) < train_small_size:
    # Register the image the first time it shows up ...
    if img_pos_train[ix] not in train_im_small_idx:
        train_im_small_idx.append(img_pos_train[ix])
    # ... then refer to it by its slot inside the small set.
    im_ix = train_im_small_idx.index(img_pos_train[ix])
    qa_data_train_small.append((ques_train[ix], ques_len_train[ix], im_ix, answers[ix]))
    ix += 1

# Feature rows of the selected images, in local-index order.
# NOTE(review): positions are used as-is as row indices into fv_im_train;
# confirm that they are 0-based.
train_im_small = [fv_im_train[im_ix, :] for im_ix in train_im_small_idx]

# Persist the question/answer tuples and the matching image features.
with open('data/qa_data_train_small.pkl', 'wb') as fp:
    pickle.dump(qa_data_train_small, fp)

with h5py.File('data/vqa_data_img_vgg_train_small.h5', 'w') as hf:
    hf.create_dataset('images_train', data=train_im_small)

In [6]:
# Small test subset: scan the test questions in order until `test_small_size`
# distinct images have been seen, keeping every question visited on the way.
test_small_size = 4000
test_im_small_idx = []  # image positions, in order of first appearance

# Start scanning at the first test question. Without this reset the loop
# would continue from whatever value `ix` was left at by a previously run
# cell, making the extracted subset depend on execution order.
ix = 0
qa_data_test_small = []  # (question, question length, local image index, answer)
while len(test_im_small_idx) < test_small_size:
    if img_pos_test[ix] in test_im_small_idx:
        # Image already selected: reuse its slot inside the small set.
        im_ix = test_im_small_idx.index(img_pos_test[ix])
    else:
        # First question about this image: give it the next free slot.
        im_ix = len(test_im_small_idx)
        test_im_small_idx.append(img_pos_test[ix])
    qa_data_test_small.append((ques_test[ix], ques_len_test[ix], im_ix, ans_test[ix]))
    ix += 1

# Feature rows of the selected images, in local-index order.
# NOTE(review): positions are used as-is as row indices into fv_im_test;
# confirm that they are 0-based.
test_im_small = []
for im_ix in test_im_small_idx:
    test_im_small.append(fv_im_test[im_ix, :])

# Persist the question/answer tuples and the matching image features.
with open('data/qa_data_test_small.pkl', 'wb') as fp:
    pickle.dump(qa_data_test_small, fp)

with h5py.File('data/vqa_data_img_vgg_test_small.h5', 'w') as hf:
    hf.create_dataset('images_test', data=test_im_small)

## 2 - Extracting a tiny dataset for debugging

In [29]:
# Tiny training subset for debugging: scan the training questions in order
# until `train_tiny_size` distinct images have been seen, keeping every
# question visited on the way.
train_tiny_size = 100
train_im_tiny_idx = []  # image positions, in order of first appearance

ix = 0
qa_data_train_tiny = []  # (question, question length, local image index, answer)
while len(train_im_tiny_idx) < train_tiny_size:
    # Membership must be tested against the *train* list that is being built
    # here; the test list belongs to a different cell and a different split.
    if img_pos_train[ix] in train_im_tiny_idx:
        # Image already selected: reuse its slot inside the tiny set.
        im_ix = train_im_tiny_idx.index(img_pos_train[ix])
    else:
        # First question about this image: give it the next free slot.
        im_ix = len(train_im_tiny_idx)
        train_im_tiny_idx.append(img_pos_train[ix])
    qa_data_train_tiny.append((ques_train[ix], ques_len_train[ix], im_ix, answers[ix]))
    ix += 1

# Feature rows of the selected images, in local-index order.
# NOTE(review): positions are used as-is as row indices into fv_im_train;
# confirm that they are 0-based.
train_im_tiny = []
for im_ix in train_im_tiny_idx:
    train_im_tiny.append(fv_im_train[im_ix, :])

# Persist the question/answer tuples and the matching image features.
with open('data/qa_data_train_tiny.pkl', 'wb') as fp:
    pickle.dump(qa_data_train_tiny, fp)

with h5py.File('data/vqa_data_img_vgg_train_tiny.h5', 'w') as hf:
    hf.create_dataset('images_train', data=train_im_tiny)

In [32]:
# Tiny test subset for debugging: scan the test questions in order until
# `test_tiny_size` distinct images have been seen, keeping every question
# visited on the way.
test_tiny_size = 100
test_im_tiny_idx = []    # image positions, in order of first appearance
qa_data_test_tiny = []   # (question, question length, local image index, answer)

ix = 0
while len(test_im_tiny_idx) < test_tiny_size:
    if img_pos_test[ix] not in test_im_tiny_idx:
        # First question about this image: give it the next free slot.
        im_ix = len(test_im_tiny_idx)
        test_im_tiny_idx.append(img_pos_test[ix])
    else:
        # Image already selected: reuse its slot inside the tiny set.
        im_ix = test_im_tiny_idx.index(img_pos_test[ix])
    qa_data_test_tiny.append((ques_test[ix], ques_len_test[ix], im_ix, ans_test[ix]))
    ix += 1

# Feature rows of the selected images, in local-index order.
# NOTE(review): positions are used as-is as row indices into fv_im_test;
# confirm that they are 0-based.
test_im_tiny = [fv_im_test[im_ix, :] for im_ix in test_im_tiny_idx]

# Persist the question/answer tuples and the matching image features.
with open('data/qa_data_test_tiny.pkl', 'wb') as fp:
    pickle.dump(qa_data_test_tiny, fp)

with h5py.File('data/vqa_data_img_vgg_test_tiny.h5', 'w') as hf:
    hf.create_dataset('images_test', data=test_im_tiny)

In [25]:
# Transpose the tiny test samples into four parallel sequences:
# questions, question lengths, local image indices and answers.
(ques, ques_len, im_ix, ans) = list(zip(*qa_data_test_tiny))
# Leftover debugging snippet, kept for reference:
# print [ix_to_word.get(str(ix), 'UNK') for ix in ques], im_ix, ix_to_ans.get(str(ans), 'UNK')