Data Preprocessing
--

In [1]:
%who

Interactive namespace is empty.


In [1]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from build_vocab import *
from build_answers import *
from vqaTools.vqa import VQA
from PIL import Image

In [2]:
rootDir = '../../data2'
dataSubType = 'val2014'
annFile ='{}/v2_mscoco_{}_annotations.json'.format(rootDir, dataSubType)
quesFile ='{}/v2_OpenEnded_mscoco_{}_questions.json'.format(rootDir, dataSubType)

Build an answers wrapper
--

In [3]:
# instantiate VQA object
vqa = VQA(annFile, quesFile)

loading VQA annotations and questions into memory...
0:00:04.365680
creating index...
index created!


In [4]:
answers = build_answers(annFile, quesFile)

loading VQA annotations and questions into memory...
0:00:04.538660
creating index...
index created!
len of annotations dict: 214354
[214354/214354] Answers tally completed.


In [5]:
answers_path = '../../dotCuda/notebook/valanswers.pkl'

In [6]:
with open(answers_path, 'wb') as f:
    pickle.dump(answers, f)
print("Total answers size: {}".format(len(answers)))
print("Saved the answers wrapper to '{}'".format(answers_path))

Total answers size: 3001
Saved the answers wrapper to '../../dotCuda/notebook/valanswers.pkl'


In [7]:
with open(answers_path, 'rb') as f:
    answers = pickle.load(f)

In [8]:
answers.idx2ans[0]

'<unk>'

Build a vocabulary wrapper
--

In [9]:
vocab = build_vocab(annFile, quesFile, threshold=4)

loading VQA annotations and questions into memory...
0:00:03.997276
creating index...
index created!
[214354/214354] Tokenized the questions.


In [10]:
vocab_path = '../../dotCuda/notebook/vocab.pkl'

In [9]:
with open(vocab_path, 'wb') as f:
    pickle.dump(vocab, f)
print("Total vocabulary size: {}".format(len(vocab)))
print("Saved the vocabulary wrapper to '{}'".format(vocab_path))

Total vocabulary size: 5556
Saved the vocabulary wrapper to '../../dotCuda/notebook/valvocab.pkl'


In [11]:
with open(vocab_path, 'rb') as f:
    vocab = pickle.load(f)

Transform the image
--

In [12]:
transform = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

Dataset
--

In [16]:
class COCODataset(Dataset):
    
    def __init__(self, vocab, answers, rootDir='../../data2', dataSubType='train2014', transform=transform):
        
        annFile ='{}/v2_mscoco_{}_annotations.json'.format(rootDir, dataSubType)
        quesFile ='{}/v2_OpenEnded_mscoco_{}_questions.json'.format(rootDir, dataSubType)
        self.vqa = VQA(annFile, quesFile)
        self.imgDir = '{}/{}'.format(rootDir, dataSubType)
        self.vocab = vocab
        self.answers = answers
        self.quesIds = self.vqa.getQuesIds()
        self.dataSubType = dataSubType
        self.transform = transform
        
    def __getitem__(self, index):
        
        """
        returns:
            question: tensor of word-indices
            transformed image: tensor of shape (3, 224, 224)
            answers: tensor of indices mapped to 3000 most frequently occurring answers
            answers not found among 300 most frequently occurring answers are eliminated
        """
        
        quesId = self.quesIds[index]
        
        img_id = self.vqa.qqa[quesId]['image_id'] 
        img_id = str(img_id).zfill(12)
        path = 'COCO_{}_{}.jpg'.format(self.dataSubType, img_id)
#         print(os.path.join(self.imgDir, path))
        image = Image.open(os.path.join(self.imgDir, path)).convert('RGB')

        if self.transform is not None:
            image = self.transform(image)
            
            
        # Convert question to word ids
        vocab = self.vocab
        question = self.vqa.qqa[quesId]['question']
        print(question)
        
        tokens = nltk.tokenize.word_tokenize(question.lower())
        question_list = []
        question_list.append(vocab('<start>'))
        question_list.extend([vocab(token) for token in tokens])
        question_list.append(vocab('<end>'))
        question_tensor = torch.Tensor(question_list)
        
        qa = self.vqa.loadQA(quesId)
        
        ans_list = [a['answer'] for a in qa[0]['answers']]
        print(ans_list)
        
        ans_index_list = [self.answers(ans) for ans in ans_list]
        answer_tensor = torch.Tensor(ans_index_list)
        
        return question_tensor, image, answer_tensor     
        
    def __len__(self):
        return len(self.vqa.dataset['annotations'])
        

In [19]:
li = ['1','2','3']
li2 = ['1','2','4']

li == li2

# [ans for ans in ans_list if ans in self.answers.ans2idx.keys() else unk]

False

In [19]:
dataset = COCODataset(vocab=vocab, answers=answers)

loading VQA annotations and questions into memory...
0:00:08.065886
creating index...
index created!


In [20]:
dataset[0]

What is this photo taken looking through?
['net', 'net', 'net', 'netting', 'net', 'net', 'mesh', 'net', 'net', 'net']


(tensor([  1.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,   2.]),
 tensor([[[ 0.2282,  0.2453,  0.3138,  ...,  0.9474,  0.8618,  0.9988],
          [ 0.0741,  0.1083,  0.2111,  ...,  1.0331,  0.9474,  1.0673],
          [ 0.4851,  0.4508,  0.5193,  ...,  1.0159,  0.9303,  1.0502],
          ...,
          [-0.1143, -0.2684, -0.1999,  ..., -1.1589, -1.1418, -1.1418],
          [-0.2856, -0.1486, -0.1143,  ..., -1.2445, -1.2788, -1.1760],
          [-0.1828, -0.3198, -0.1999,  ..., -1.2103, -1.2274, -1.2103]],
 
         [[ 0.5203,  0.5378,  0.6429,  ...,  0.5903,  0.5728,  0.7129],
          [ 0.2752,  0.3452,  0.3978,  ...,  0.7129,  0.6954,  0.8354],
          [ 0.3803,  0.4153,  0.5378,  ...,  0.7129,  0.6954,  0.8354],
          ...,
          [-0.3725, -0.5476, -0.4951,  ..., -1.0378, -1.0378, -1.0553],
          [-0.4951, -0.3550, -0.3200,  ..., -1.1078, -1.1604, -1.0903],
          [-0.3725, -0.5126, -0.4076,  ..., -1.1078, -1.1604, -1.2129]],
 
         [[ 0.0082,  0.0256, 

In [50]:
question_tensor, image, ans_index_list = dataset[0]
ans_index_list.shape

len(dataset)

443757

Collator
--

In [62]:
def collate(batch):
    """
    args: list of (question, image, answer) tuples
         question: 1D tensor of variable length
         image: tensor of shape (3, 224, 224)
         answer: 1D tensor of variable length
         
    returns:
        question: packed sequence (data: 1D tensor of total questions length, batch_sizes: 1D tensor of max ques length)
        image: tensor of shape (batchsize, 3, 224, 224)
        answer: tuple of 1D tensors of variable length
    """
    # sort batch in descending order by question length
    sorted_batch = sorted(batch, key=lambda x: len(x[0]), reverse=True) 
    question, image, answer = zip(*sorted_batch)
    
    return torch.nn.utils.rnn.pack_sequence(question), torch.stack(image), answer

Dataloader
--

In [66]:
data_loader = DataLoader(dataset=dataset,
                         batch_size=5,
                         shuffle=True,
                         collate_fn=collate)

In [67]:
iter(data_loader).next()

(PackedSequence(data=tensor([    1.,     1.,     1.,     1.,     1.,    85.,   162.,     4.,
             4.,    82.,    16.,    16.,    72.,    73.,     5.,   142.,
           344.,    73.,   174.,    16.,   336.,   164.,    16.,   721.,
            38.,    86.,   453.,  1409.,    25.,    11.,  2907.,   318.,
            11.,    11.,     2.,   601.,    99.,     2.,     2.,    16.,
            11.,  1229.,     2.,    11.,     2.]), batch_sizes=tensor([ 5,  5,  5,  5,  5,  5,  5,  4,  2,  2,  1,  1])),
 tensor([[[[-2.0837, -2.0837, -2.0837,  ..., -2.1179, -2.1179, -2.1179],
           [-1.9295, -1.9295, -1.8953,  ..., -2.0494, -2.0494, -2.0494],
           [-0.3198, -0.1486, -0.1143,  ..., -1.3987, -1.4672, -1.4672],
           ...,
           [ 0.7077,  0.7591,  0.7591,  ..., -0.3027,  0.1426,  0.8789],
           [-1.8439, -1.8439, -1.8439,  ..., -1.9467, -1.8953, -1.8439],
           [-2.1008, -2.1008, -2.1008,  ..., -2.1179, -2.1008, -2.1008]],
 
          [[-2.0007, -2.0007, -2.000