In [11]:
import h5py
import numpy as np
import os
import pickle
import json
import random
import torch
import yaml
from tqdm import tqdm

from dask import delayed, threaded, compute

import vqa.datasets as datasets

In [7]:
def batchify(example_list, batch_size, shuffle=True):
    if shuffle:
        random.shuffle(example_list)

    batched_dataset = []
    for i in range(0, len(example_list), batch_size):
        batched_dataset.append(example_list[i:min(i + batch_size, len(example_list))])

    return batched_dataset

In [3]:
with open('options/vqa2/counterexamples_default.yaml', 'r') as handle:
    options = yaml.load(handle)
options['vgenome'] = None

trainset_fname = 'trainset_augmented.pickle'
trainset = pickle.load(open(os.path.join(options['vqa']['path_trainset'], 'pickle_old', trainset_fname), 'rb'))

In [4]:
cache_train = h5py.File('data/cx/vqa_trainset_cached.hdf5', 'r')
cache_val = h5py.File('data/cx/vqa_valset_cached.hdf5', 'r')

In [5]:
idx_to_qid = list(cache_train.get('q_ids'))

In [9]:
import time

start = time.time()

for batch in batchify(trainset['examples_list'], batch_size=10):
    batch_size = len(batch)
    
    q_ids = [ex['question_id'] for ex in batch]
    
    cache_idxs = [idx_to_qid.index(q_id) for q_id in q_ids]
    
    
    answers = cache_train.get('answers')[sorted(cache_idxs)]
    
    print(type(answers))
    
    break
    
print(time.time() - start)

<class 'numpy.ndarray'>
3.129316568374634


In [13]:
len(cache_train['answers'])

211626

In [77]:
def batch_iterator(example_list, cache, batch_size, shuffle=True):
    
    data_size = len(example_list)
    
    shuffle_indices = np.arange(data_size)
    if shuffle:
        shuffle_indices = np.random.permutation(shuffle_indices)

    num_batches_per_epoch = int((data_size-1)/batch_size) + 1
    for batch_num in range(num_batches_per_epoch):
        start_index = batch_num * batch_size
        end_index = min((batch_num + 1) * batch_size, data_size)
        batch_indices = sorted(list(shuffle_indices[start_index:end_index]))
        
        yield batch_num, \
              compute([delayed(cache['answers'].__getitem__)(i) for i in batch_indices], get=threaded.get)
        
train_iterator = batch_iterator(trainset['examples_list'], cache_train, batch_size=10)

start = time.time()

print(next(train_iterator))
for i, x in train_iterator:
    y = np.array(x[0])
    if i > 100:
        break
    
print(time.time() - start)

2.956390380859375e-05


In [34]:
train_iterator

<generator object batch_iterator at 0x7fbf9742afc0>

6.985664367675781e-05


In [None]:
def batch_iterator(number_of_examples: int=None, batch_size: int=None, num_epochs: int=10, shuffle=False):
    """Generates a batch iterator for a dataset."""
    if batch_size is None:
        batch_size = self.batch_size
    names = self.h5_file['names']
    data_size = len(names)
    if number_of_examples is not None:
        data_size = number_of_examples
    x_dat = self.h5_file['x_data']
    y_dat = self.h5_file['y_data']
    num_batches_per_epoch = int((data_size-1)/batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        shuffle_indices = np.arange(data_size)
        if shuffle:
            shuffle_indices = np.random.permutation(shuffle_indices)
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            batch_indices = sorted(list(shuffle_indices[start_index:end_index]))
            yield epoch, batch_num, \
                  compute([delayed(x_dat.__getitem__)(i) for i in batch_indices], get=threaded.get), \
                  compute([delayed(y_dat.__getitem__)(i) for i in batch_indices], get=threaded.get), \
                  compute([delayed(names.__getitem__)(i) for i in batch_indices], get=threaded.get)
