In [70]:
%matplotlib inline
import os

Data folder structure
--

In [3]:
dataDir ='../../data2'
os.listdir(dataDir)

['v2_mscoco_val2014_complementary_pairs.json',
 'v2_Questions_Val_mscoco.zip',
 'v2_mscoco_val2014_annotations.json',
 'v2_OpenEnded_mscoco_val2014_questions.json',
 'v2_mscoco_train2014_annotations.json',
 'annotations',
 'train2014',
 'v2_OpenEnded_mscoco_train2014_questions.json',
 'v2_Questions_Train_mscoco.zip',
 'v2_mscoco_train2014_complementary_pairs.json',
 'val2014']

Notes on data folder
--
- `annotations`: not required, only for image captioning
- `train2014` and `val2014`: img folders with format e.g. `COCO_val2014_000000059710.jpg`
- `v2_Questions_Train_mscoco.zip` and `v2_Questions_Val_mscoco.zip`: unzip into respective questions `json`

In [8]:
from vqaTools.vqa import VQA

dataDir ='../../data2'
versionType ='v2_' # this should be '' when using VQA v2.0 dataset
taskType ='OpenEnded' # 'OpenEnded' only for v2.0. 'OpenEnded' or 'MultipleChoice' for v1.0
dataType ='mscoco'  # 'mscoco' only for v1.0. 'mscoco' for real and 'abstract_v002' for abstract for v1.0.
dataSubType ='train2014'
annFile ='{}/{}{}_{}_annotations.json'.format(dataDir, versionType, dataType, dataSubType)
quesFile ='{}/{}{}_{}_{}_questions.json'.format(dataDir, versionType, taskType, dataType, dataSubType)
imgDir = '{}/{}/'.format(dataDir, dataSubType)


In [9]:
# initialize VQA api for QA annotations
vqa = VQA(annFile, quesFile)

loading VQA annotations and questions into memory...
0:00:10.382276
creating index...
index created!


In [79]:
import nltk
image = vqa.qqa[458752000]['image_id']
qa = vqa.loadQA(458752002)
# qa['multiple_choice_answer']
# [answer for answer in qa[0]['answers']

qa
 
# for a in qa[0]['answers']:
#     print(a['answer'])
# ans_list = [a['answer'] for a in qa[0]['answers']]


# len(vqa.dataset['annotations']) # 443757
# len(vqa.getQuesIds())

[{'answer_type': 'other',
  'multiple_choice_answer': 'orange',
  'answers': [{'answer': 'orange', 'answer_confidence': 'yes', 'answer_id': 1},
   {'answer': 'orange', 'answer_confidence': 'yes', 'answer_id': 2},
   {'answer': 'orange', 'answer_confidence': 'maybe', 'answer_id': 3},
   {'answer': 'orange', 'answer_confidence': 'yes', 'answer_id': 4},
   {'answer': 'orange', 'answer_confidence': 'yes', 'answer_id': 5},
   {'answer': 'orange', 'answer_confidence': 'yes', 'answer_id': 6},
   {'answer': 'orange', 'answer_confidence': 'yes', 'answer_id': 7},
   {'answer': 'orange', 'answer_confidence': 'yes', 'answer_id': 8},
   {'answer': 'orange', 'answer_confidence': 'yes', 'answer_id': 9},
   {'answer': 'orange', 'answer_confidence': 'yes', 'answer_id': 10}],
  'image_id': 458752,
  'question_type': 'what color is the',
  'question_id': 458752002}]

Questions
--

### Structure
```
{
"info" : info,
"task_type" : str,
"data_type": str,
"data_subtype": str,
"questions" : [question],
"license" : license
}

info {
"year" : int,
"version" : str,
"description" : str,
"contributor" : str,
"url" : str,
"date_created" : datetime
}

license{
"name" : str,
"url" : str
}

question{
"question_id" : int,
"image_id" : int,
"question" : str
}
```

### Accessing keys

In [36]:
list(vqa.questions.keys())

['info', 'task_type', 'data_type', 'license', 'data_subtype', 'questions']

### `question` dict 

In [37]:
list(vqa.questions['questions'])[0:3]

[{'image_id': 458752,
  'question': 'What is this photo taken looking through?',
  'question_id': 458752000},
 {'image_id': 458752,
  'question': 'What position is this man playing?',
  'question_id': 458752001},
 {'image_id': 458752,
  'question': 'What color is the players shirt?',
  'question_id': 458752002}]

#### Accessing items in `question` dict by `question_id`

In [41]:
vqa.qqa[458752000]

{'image_id': 458752,
 'question': 'What is this photo taken looking through?',
 'question_id': 458752000}

Annotation
--

### Structure

```
{
"info" : info,
"data_type": str,
"data_subtype": str,
"annotations" : [annotation],
"license" : license
}

info {
"year" : int,
"version" : str,
"description" : str,
"contributor" : str,
"url" : str,
"date_created" : datetime
}

license{
"name" : str,
"url" : str
}

annotation{
"question_id" : int,
"image_id" : int,
"question_type" : str,
"answer_type" : str,
"answers" : [answer],
"multiple_choice_answer" : str
}

answer{
"answer_id" : int,
"answer" : str,
"answer_confidence": str
}
```

### Accessing keys

In [38]:
list(vqa.dataset.keys())

['info', 'license', 'data_subtype', 'annotations', 'data_type']

### `annotation` dict 

In [40]:
list(vqa.dataset['annotations'])[0:2]

[{'question_type': 'what is this',
  'multiple_choice_answer': 'net',
  'answers': [{'answer': 'net', 'answer_confidence': 'maybe', 'answer_id': 1},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 2},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 3},
   {'answer': 'netting', 'answer_confidence': 'yes', 'answer_id': 4},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 5},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 6},
   {'answer': 'mesh', 'answer_confidence': 'maybe', 'answer_id': 7},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 8},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 9},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 10}],
  'image_id': 458752,
  'answer_type': 'other',
  'question_id': 458752000},
 {'question_type': 'what',
  'multiple_choice_answer': 'pitcher',
  'answers': [{'answer': 'pitcher',
    'answer_confidence': 'yes',
    'answer_id': 1},
   {'answer': 'c

#### Accessing items in `annotation` dict by `question_id` 

In [43]:
vqa.qa[458752000]

{'question_type': 'what is this',
 'multiple_choice_answer': 'net',
 'answers': [{'answer': 'net', 'answer_confidence': 'maybe', 'answer_id': 1},
  {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 2},
  {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 3},
  {'answer': 'netting', 'answer_confidence': 'yes', 'answer_id': 4},
  {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 5},
  {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 6},
  {'answer': 'mesh', 'answer_confidence': 'maybe', 'answer_id': 7},
  {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 8},
  {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 9},
  {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 10}],
 'image_id': 458752,
 'answer_type': 'other',
 'question_id': 458752000}

In [23]:
vqa.loadQA([458752000, 458752001, 458752002])

[{'question_type': 'what is this',
  'multiple_choice_answer': 'net',
  'answers': [{'answer': 'net', 'answer_confidence': 'maybe', 'answer_id': 1},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 2},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 3},
   {'answer': 'netting', 'answer_confidence': 'yes', 'answer_id': 4},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 5},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 6},
   {'answer': 'mesh', 'answer_confidence': 'maybe', 'answer_id': 7},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 8},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 9},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 10}],
  'image_id': 458752,
  'answer_type': 'other',
  'question_id': 458752000},
 {'question_type': 'what',
  'multiple_choice_answer': 'pitcher',
  'answers': [{'answer': 'pitcher',
    'answer_confidence': 'yes',
    'answer_id': 1},
   {'answer': 'c

In [1]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from build_vocab import *
from vqaTools.vqa import VQA
from PIL import Image

In [3]:
rootDir = '../../data2'
dataSubType = 'train2014'
annFile ='{}/v2_mscoco_{}_annotations.json'.format(rootDir, dataSubType)
quesFile ='{}/v2_OpenEnded_mscoco_{}_questions.json'.format(rootDir, dataSubType)

vocab = build_vocab(annFile, quesFile, threshold=4)
vocab_path = '../../dotCuda/notebook/vocab.pkl'

with open(vocab_path, 'wb') as f:
    pickle.dump(vocab, f)
print("Total vocabulary size: {}".format(len(vocab)))
print("Saved the vocabulary wrapper to '{}'".format(vocab_path))

loading VQA annotations and questions into memory...
0:00:07.573418
creating index...
index created!
[443757/443757] Tokenized the questions.
Total vocabulary size: 7521
Saved the vocabulary wrapper to '../../dotCuda/notebook/vocab.pkl'


In [4]:
with open(vocab_path, 'rb') as f:
    vocab = pickle.load(f)

In [5]:
class COCODataset(Dataset):
    
    def __init__(self, vocab, rootDir='../../data2/', dataSubType='train2014', transform=None):
        
        annFile ='{}/v2_mscoco_{}_annotations.json'.format(rootDir, dataSubType)
        quesFile ='{}/v2_OpenEnded_mscoco_{}_questions.json'.format(rootDir, dataSubType)
        self.vqa = VQA(annFile, quesFile)
        self.imgDir = '{}/{}'.format(rootDir, dataSubType)
        self.vocab = vocab
        self.quesIds = self.vqa.getQuesIds()
        self.dataSubType = dataSubType
        self.transform = transform
        
    def __getitem__(self, index):
        
        """
        returns:
            actual question as tensor of word-indices
            image_id
            indices of answers mapped to 3000 most frequent answers ?
        """
        
        quesId = self.quesIds[index]
        
        img_id = self.vqa.qqa[quesId]['image_id']        
        path = 'COCO_{}_000000{}.jpg'.format(self.dataSubType, img_id)
        image = Image.open(os.path.join(self.imgDir, path)).convert('RGB')

        if self.transform is not None:
            image = self.transform(image)
            
            
        # Convert question to word ids
        vocab = self.vocab
        question = self.vqa.qqa[quesId]['question']
        print(question)
        tokens = nltk.tokenize.word_tokenize(question.lower())
        question_list = []
        question_list.append(vocab('<start>'))
        question_list.extend([vocab(token) for token in tokens])
        question_list.append(vocab('<end>'))
        question_tensor = torch.Tensor(question_list)
        
        qa = self.vqa.loadQA(quesId)
        
        ans_list = [a['answer'] for a in qa[0]['answers']] # ?

        return question_tensor, image, ans_list
        
        
    def __len__():
        return len(self.vqa.dataset['annotations'])
        

In [6]:
dataset = COCODataset(vocab=vocab)

loading VQA annotations and questions into memory...
0:00:07.472525
creating index...
index created!


In [14]:
dataset[0]

What is this photo taken looking through?


(tensor([  1.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,   2.]),
 <PIL.Image.Image image mode=RGB size=640x480 at 0x7F5938DFADD8>,
 ['net', 'net', 'net', 'netting', 'net', 'net', 'mesh', 'net', 'net', 'net'])