In [70]:
%matplotlib inline
import os

Data folder structure
--

In [3]:
dataDir ='../../data2'
os.listdir(dataDir)

['v2_mscoco_val2014_complementary_pairs.json',
 'v2_Questions_Val_mscoco.zip',
 'v2_mscoco_val2014_annotations.json',
 'v2_OpenEnded_mscoco_val2014_questions.json',
 'v2_mscoco_train2014_annotations.json',
 'annotations',
 'train2014',
 'v2_OpenEnded_mscoco_train2014_questions.json',
 'v2_Questions_Train_mscoco.zip',
 'v2_mscoco_train2014_complementary_pairs.json',
 'val2014']

Notes on data folder
--
- `annotations`: not required, only for image captioning
- `train2014` and `val2014`: img folders with format e.g. `COCO_val2014_000000059710.jpg`
- `v2_Questions_Train_mscoco.zip` and `v2_Questions_Val_mscoco.zip`: unzip into respective questions `json`

In [8]:
from vqaTools.vqa import VQA

dataDir ='../../data2'
versionType ='v2_' # this should be '' when using VQA v2.0 dataset
taskType ='OpenEnded' # 'OpenEnded' only for v2.0. 'OpenEnded' or 'MultipleChoice' for v1.0
dataType ='mscoco'  # 'mscoco' only for v1.0. 'mscoco' for real and 'abstract_v002' for abstract for v1.0.
dataSubType ='train2014'
annFile ='{}/{}{}_{}_annotations.json'.format(dataDir, versionType, dataType, dataSubType)
quesFile ='{}/{}{}_{}_{}_questions.json'.format(dataDir, versionType, taskType, dataType, dataSubType)
imgDir = '{}/{}/'.format(dataDir, dataSubType)


In [9]:
# initialize VQA api for QA annotations
vqa = VQA(annFile, quesFile)

loading VQA annotations and questions into memory...
0:00:10.382276
creating index...
index created!


In [79]:
import nltk
image = vqa.qqa[458752000]['image_id']
qa = vqa.loadQA(458752002)
# qa['multiple_choice_answer']
# [answer for answer in qa[0]['answers']

qa
 
# for a in qa[0]['answers']:
#     print(a['answer'])
# ans_list = [a['answer'] for a in qa[0]['answers']]


# len(vqa.dataset['annotations']) # 443757
# len(vqa.getQuesIds())

[{'answer_type': 'other',
  'multiple_choice_answer': 'orange',
  'answers': [{'answer': 'orange', 'answer_confidence': 'yes', 'answer_id': 1},
   {'answer': 'orange', 'answer_confidence': 'yes', 'answer_id': 2},
   {'answer': 'orange', 'answer_confidence': 'maybe', 'answer_id': 3},
   {'answer': 'orange', 'answer_confidence': 'yes', 'answer_id': 4},
   {'answer': 'orange', 'answer_confidence': 'yes', 'answer_id': 5},
   {'answer': 'orange', 'answer_confidence': 'yes', 'answer_id': 6},
   {'answer': 'orange', 'answer_confidence': 'yes', 'answer_id': 7},
   {'answer': 'orange', 'answer_confidence': 'yes', 'answer_id': 8},
   {'answer': 'orange', 'answer_confidence': 'yes', 'answer_id': 9},
   {'answer': 'orange', 'answer_confidence': 'yes', 'answer_id': 10}],
  'image_id': 458752,
  'question_type': 'what color is the',
  'question_id': 458752002}]

Questions
--

### Structure
```
{
"info" : info,
"task_type" : str,
"data_type": str,
"data_subtype": str,
"questions" : [question],
"license" : license
}

info {
"year" : int,
"version" : str,
"description" : str,
"contributor" : str,
"url" : str,
"date_created" : datetime
}

license{
"name" : str,
"url" : str
}

question{
"question_id" : int,
"image_id" : int,
"question" : str
}
```

### Accessing keys

In [36]:
list(vqa.questions.keys())

['info', 'task_type', 'data_type', 'license', 'data_subtype', 'questions']

### `question` dict 

In [37]:
list(vqa.questions['questions'])[0:3]

[{'image_id': 458752,
  'question': 'What is this photo taken looking through?',
  'question_id': 458752000},
 {'image_id': 458752,
  'question': 'What position is this man playing?',
  'question_id': 458752001},
 {'image_id': 458752,
  'question': 'What color is the players shirt?',
  'question_id': 458752002}]

#### Accessing items in `question` dict by `question_id`

In [41]:
vqa.qqa[458752000]

{'image_id': 458752,
 'question': 'What is this photo taken looking through?',
 'question_id': 458752000}

Annotation
--

### Structure

```
{
"info" : info,
"data_type": str,
"data_subtype": str,
"annotations" : [annotation],
"license" : license
}

info {
"year" : int,
"version" : str,
"description" : str,
"contributor" : str,
"url" : str,
"date_created" : datetime
}

license{
"name" : str,
"url" : str
}

annotation{
"question_id" : int,
"image_id" : int,
"question_type" : str,
"answer_type" : str,
"answers" : [answer],
"multiple_choice_answer" : str
}

answer{
"answer_id" : int,
"answer" : str,
"answer_confidence": str
}
```

### Accessing keys

In [38]:
list(vqa.dataset.keys())

['info', 'license', 'data_subtype', 'annotations', 'data_type']

### `annotation` dict 

In [40]:
list(vqa.dataset['annotations'])[0:2]

[{'question_type': 'what is this',
  'multiple_choice_answer': 'net',
  'answers': [{'answer': 'net', 'answer_confidence': 'maybe', 'answer_id': 1},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 2},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 3},
   {'answer': 'netting', 'answer_confidence': 'yes', 'answer_id': 4},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 5},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 6},
   {'answer': 'mesh', 'answer_confidence': 'maybe', 'answer_id': 7},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 8},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 9},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 10}],
  'image_id': 458752,
  'answer_type': 'other',
  'question_id': 458752000},
 {'question_type': 'what',
  'multiple_choice_answer': 'pitcher',
  'answers': [{'answer': 'pitcher',
    'answer_confidence': 'yes',
    'answer_id': 1},
   {'answer': 'c

#### Accessing items in `annotation` dict by `question_id` 

In [43]:
vqa.qa[458752000]

{'question_type': 'what is this',
 'multiple_choice_answer': 'net',
 'answers': [{'answer': 'net', 'answer_confidence': 'maybe', 'answer_id': 1},
  {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 2},
  {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 3},
  {'answer': 'netting', 'answer_confidence': 'yes', 'answer_id': 4},
  {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 5},
  {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 6},
  {'answer': 'mesh', 'answer_confidence': 'maybe', 'answer_id': 7},
  {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 8},
  {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 9},
  {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 10}],
 'image_id': 458752,
 'answer_type': 'other',
 'question_id': 458752000}

In [51]:
vqa.loadQA([458752003, 458752002, 458752001])

[{'question_type': 'is this',
  'multiple_choice_answer': 'yes',
  'answers': [{'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 1},
   {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 2},
   {'answer': 'yes', 'answer_confidence': 'maybe', 'answer_id': 3},
   {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 4},
   {'answer': 'yes', 'answer_confidence': 'maybe', 'answer_id': 5},
   {'answer': 'no', 'answer_confidence': 'maybe', 'answer_id': 6},
   {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 7},
   {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 8},
   {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 9},
   {'answer': 'yes', 'answer_confidence': 'maybe', 'answer_id': 10}],
  'image_id': 458752,
  'answer_type': 'yes/no',
  'question_id': 458752003},
 {'answer_type': 'other',
  'multiple_choice_answer': 'orange',
  'answers': [{'answer': 'orange', 'answer_confidence': 'yes', 'answer_id': 1},
   {'answer': 'orange', 'answer_c

Data Preprocessing
--

In [1]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from build_vocab import *
from build_answers import *
from vqaTools.vqa import VQA
from PIL import Image

In [2]:
rootDir = '../../data2'
dataSubType = 'train2014'
annFile ='{}/v2_mscoco_{}_annotations.json'.format(rootDir, dataSubType)
quesFile ='{}/v2_OpenEnded_mscoco_{}_questions.json'.format(rootDir, dataSubType)

Build an answers wrapper
--

In [3]:
# instantiate VQA object
vqa = VQA(annFile, quesFile)

loading VQA annotations and questions into memory...
0:00:10.408995
creating index...
index created!


In [4]:
answers = build_answers(annFile, quesFile)

loading VQA annotations and questions into memory...
0:00:08.195402
creating index...
index created!
len of annotations dict: 443757
[443757/443757] Answers tally completed.


In [5]:
answers_path = '../../dotCuda/notebook/answers.pkl'

In [6]:
with open(answers_path, 'wb') as f:
    pickle.dump(answers, f)
print("Total answers size: {}".format(len(answers)))
print("Saved the answers wrapper to '{}'".format(answers_path))

Total answers size: 3000
Saved the answers wrapper to '../../dotCuda/notebook/answers.pkl'


In [7]:
with open(answers_path, 'rb') as f:
    answers = pickle.load(f)

Build a vocabulary wrapper
--

In [8]:
vocab = build_vocab(annFile, quesFile, threshold=4)

loading VQA annotations and questions into memory...
0:00:06.805285
creating index...
index created!
[443757/443757] Tokenized the questions.


In [9]:
vocab_path = '../../dotCuda/notebook/vocab.pkl'

In [10]:
with open(vocab_path, 'wb') as f:
    pickle.dump(vocab, f)
print("Total vocabulary size: {}".format(len(vocab)))
print("Saved the vocabulary wrapper to '{}'".format(vocab_path))

Total vocabulary size: 7521
Saved the vocabulary wrapper to '../../dotCuda/notebook/vocab.pkl'


In [11]:
with open(vocab_path, 'rb') as f:
    vocab = pickle.load(f)

Transform the image
--

In [12]:
transform = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

Dataset
--

In [28]:
class COCODataset(Dataset):
    
    def __init__(self, vocab, answers, rootDir='../../data2/', dataSubType='train2014', transform=transform):
        
        annFile ='{}/v2_mscoco_{}_annotations.json'.format(rootDir, dataSubType)
        quesFile ='{}/v2_OpenEnded_mscoco_{}_questions.json'.format(rootDir, dataSubType)
        self.vqa = VQA(annFile, quesFile)
        self.imgDir = '{}/{}'.format(rootDir, dataSubType)
        self.vocab = vocab
        self.answers = answers
        self.quesIds = self.vqa.getQuesIds()
        self.dataSubType = dataSubType
        self.transform = transform
        
    def __getitem__(self, index):
        
        """
        returns:
            question: tensor of word-indices
            transformed image: tensor of shape [3, 224, 224]
            answers: tensor of indices mapped to 3000 most frequently occurring answers
            answers with not found among 300 most frequently occurring answers are eliminated
        """
        
        quesId = self.quesIds[index]
        
        img_id = self.vqa.qqa[quesId]['image_id']        
        path = 'COCO_{}_000000{}.jpg'.format(self.dataSubType, img_id)
        image = Image.open(os.path.join(self.imgDir, path)).convert('RGB')

        if self.transform is not None:
            image = self.transform(image)
            
            
        # Convert question to word ids
        vocab = self.vocab
        question = self.vqa.qqa[quesId]['question']
        print(question)
        
        tokens = nltk.tokenize.word_tokenize(question.lower())
        question_list = []
        question_list.append(vocab('<start>'))
        question_list.extend([vocab(token) for token in tokens])
        question_list.append(vocab('<end>'))
        question_tensor = torch.Tensor(question_list)
        
        qa = self.vqa.loadQA(quesId)
        
        ans_list = [a['answer'] for a in qa[0]['answers']]
        print(ans_list)
        
        ans_index_list = [self.answers.ans2idx[ans] for ans in ans_list if ans in self.answers.ans2idx.keys()]
        answer_tensor = torch.Tensor(ans_index_list)
        
        return question_tensor, image, answer_tensor     
        
    def __len__():
        return len(self.vqa.dataset['annotations'])
        

In [14]:
dataset = COCODataset(vocab=vocab, answers=answers)

loading VQA annotations and questions into memory...
0:00:06.751295
creating index...
index created!


In [15]:
dataset[0]

What is this photo taken looking through?
['net', 'net', 'net', 'netting', 'net', 'net', 'mesh', 'net', 'net', 'net']


(tensor([  1.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,   2.]),
 tensor([[[ 0.2282,  0.2453,  0.3138,  ...,  0.9474,  0.8618,  0.9988],
          [ 0.0741,  0.1083,  0.2111,  ...,  1.0331,  0.9474,  1.0673],
          [ 0.4851,  0.4508,  0.5193,  ...,  1.0159,  0.9303,  1.0502],
          ...,
          [-0.1143, -0.2684, -0.1999,  ..., -1.1589, -1.1418, -1.1418],
          [-0.2856, -0.1486, -0.1143,  ..., -1.2445, -1.2788, -1.1760],
          [-0.1828, -0.3198, -0.1999,  ..., -1.2103, -1.2274, -1.2103]],
 
         [[ 0.5203,  0.5378,  0.6429,  ...,  0.5903,  0.5728,  0.7129],
          [ 0.2752,  0.3452,  0.3978,  ...,  0.7129,  0.6954,  0.8354],
          [ 0.3803,  0.4153,  0.5378,  ...,  0.7129,  0.6954,  0.8354],
          ...,
          [-0.3725, -0.5476, -0.4951,  ..., -1.0378, -1.0378, -1.0553],
          [-0.4951, -0.3550, -0.3200,  ..., -1.1078, -1.1604, -1.0903],
          [-0.3725, -0.5126, -0.4076,  ..., -1.1078, -1.1604, -1.2129]],
 
         [[ 0.0082,  0.0256, 

In [21]:
question_tensor, image, ans_index_list = dataset[0]
ans_index_list.shape

What is this photo taken looking through?
['net', 'net', 'net', 'netting', 'net', 'net', 'mesh', 'net', 'net', 'net']


torch.Size([9])

Collator
--

In [133]:
def collate(batch):
    """
    args: list of (question, image, answer) tuples
         question: 1D tensor of variable length
         image: tensor of shape (3, 3, 256)
         answer: 1D tensor of variable length
         
    returns:
        question: packed sequence (data: 1D tensor of total questions length, batch_sizes: 1D tensor of max ques length)
        image: tensor of shape (batchsize, 3, 3, 256)
        answer: 1D tensor of variable length
    """
    # sort batch in descending order by question length
    sorted_batch = sorted(batch, key=lambda x: len(x[0]), reverse=True) # x is a tuple
    question, image, answer = zip(*sorted_batch)
    
    return torch.nn.utils.rnn.pack_sequence(question), torch.stack(image), answer

Dataloader
--