In [1]:
from typing import Any, Dict, List, Optional

import torch
from torch.nn.functional import normalize
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset

from visdial.data.readers import (
    DialogsReader,
    DenseAnnotationsReader,
    ImageFeaturesHdfReader,
)
from visdial.data.vocabulary import Vocabulary

In [2]:
class VisDialDataset(Dataset):
    def __init__(
        self,
        config: Dict[str, Any],
        dialogs_jsonpath: str,
        dense_annotations_jsonpath: Optional[str] = None,
        overfit: bool = False,
        in_memory: bool = False,
        return_options: bool = True,
        add_boundary_toks: bool = False,
    ):
        super().__init__()
        self.config = config
        self.return_options = return_options
        self.add_boundary_toks = add_boundary_toks
        self.dialogs_reader = DialogsReader(dialogs_jsonpath)

        if "val" in self.split and dense_annotations_jsonpath is not None:
            self.annotations_reader = DenseAnnotationsReader(
                dense_annotations_jsonpath
            )
        else:
            self.annotations_reader = None

        self.vocabulary = Vocabulary(
            config["word_counts_json"], min_count=config["vocab_min_count"]
        )

        # Initialize image features reader according to split.
        image_features_hdfpath = config["image_features_train_h5"]
        if "val" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_val_h5"]
        elif "test" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_test_h5"]

        self.hdf_reader = ImageFeaturesHdfReader(
            image_features_hdfpath, in_memory
        )

        # Keep a list of image_ids as primary keys to access data.
        self.image_ids = list(self.dialogs_reader.dialogs.keys())
        if overfit:
            self.image_ids = self.image_ids[:5]

    @property
    def split(self):
        return self.dialogs_reader.split

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, index):
        # Get image_id, which serves as a primary key for current instance.
        image_id = self.image_ids[index]

        # Get image features for this image_id using hdf reader.
        image_features = self.hdf_reader[image_id]
        image_features = torch.tensor(image_features)
        # Normalize image features at zero-th dimension (since there's no batch
        # dimension).
        if self.config["img_norm"]:
            image_features = normalize(image_features, dim=0, p=2)

        # Retrieve instance for this image_id using json reader.
        visdial_instance = self.dialogs_reader[image_id]
        caption = visdial_instance["caption"]
        dialog = visdial_instance["dialog"]

        # Convert word tokens of caption, question, answer and answer options
        # to integers.
        caption = self.vocabulary.to_indices(caption)
        for i in range(len(dialog)):
            dialog[i]["question"] = self.vocabulary.to_indices(
                dialog[i]["question"]
            )
            if self.add_boundary_toks:
                dialog[i]["answer"] = self.vocabulary.to_indices(
                    [self.vocabulary.SOS_TOKEN]
                    + dialog[i]["answer"]
                    + [self.vocabulary.EOS_TOKEN]
                )
            else:
                dialog[i]["answer"] = self.vocabulary.to_indices(
                    dialog[i]["answer"]
                )

            if self.return_options:
                for j in range(len(dialog[i]["answer_options"])):
                    if self.add_boundary_toks:
                        dialog[i]["answer_options"][
                            j
                        ] = self.vocabulary.to_indices(
                            [self.vocabulary.SOS_TOKEN]
                            + dialog[i]["answer_options"][j]
                            + [self.vocabulary.EOS_TOKEN]
                        )
                    else:
                        dialog[i]["answer_options"][
                            j
                        ] = self.vocabulary.to_indices(
                            dialog[i]["answer_options"][j]
                        )

        questions, question_lengths = self._pad_sequences(
            [dialog_round["question"] for dialog_round in dialog]
        )
        history, history_lengths = self._get_history(
            caption,
            [dialog_round["question"] for dialog_round in dialog],
            [dialog_round["answer"] for dialog_round in dialog],
        )
        answers_in, answer_lengths = self._pad_sequences(
            [dialog_round["answer"][:-1] for dialog_round in dialog]
        )
        answers_out, _ = self._pad_sequences(
            [dialog_round["answer"][1:] for dialog_round in dialog]
        )

        # Collect everything as tensors for ``collate_fn`` of dataloader to
        # work seamlessly questions, history, etc. are converted to
        # LongTensors, for nn.Embedding input.
        item = {}
        item["img_ids"] = torch.tensor(image_id).long()
        item["img_feat"] = image_features
        item["ques"] = questions.long()
        item["hist"] = history.long()
        item["ans_in"] = answers_in.long()
        item["ans_out"] = answers_out.long()
        item["ques_len"] = torch.tensor(question_lengths).long()
        item["hist_len"] = torch.tensor(history_lengths).long()
        item["ans_len"] = torch.tensor(answer_lengths).long()
        item["num_rounds"] = torch.tensor(
            visdial_instance["num_rounds"]
        ).long()

        if self.return_options:
            if self.add_boundary_toks:
                answer_options_in, answer_options_out = [], []
                answer_option_lengths = []
                for dialog_round in dialog:
                    options, option_lengths = self._pad_sequences(
                        [
                            option[:-1]
                            for option in dialog_round["answer_options"]
                        ]
                    )
                    answer_options_in.append(options)

                    options, _ = self._pad_sequences(
                        [
                            option[1:]
                            for option in dialog_round["answer_options"]
                        ]
                    )
                    answer_options_out.append(options)

                    answer_option_lengths.append(option_lengths)
                answer_options_in = torch.stack(answer_options_in, 0)
                answer_options_out = torch.stack(answer_options_out, 0)

                item["opt_in"] = answer_options_in.long()
                item["opt_out"] = answer_options_out.long()
                item["opt_len"] = torch.tensor(answer_option_lengths).long()
            else:
                answer_options = []
                answer_option_lengths = []
                for dialog_round in dialog:
                    options, option_lengths = self._pad_sequences(
                        dialog_round["answer_options"]
                    )
                    answer_options.append(options)
                    answer_option_lengths.append(option_lengths)
                answer_options = torch.stack(answer_options, 0)

                item["opt"] = answer_options.long()
                item["opt_len"] = torch.tensor(answer_option_lengths).long()

            if "test" not in self.split:
                answer_indices = [
                    dialog_round["gt_index"] for dialog_round in dialog
                ]
                item["ans_ind"] = torch.tensor(answer_indices).long()

        # Gather dense annotations.
        if self.annotations_reader is not None:
            dense_annotations = self.annotations_reader[image_id]
            item["gt_relevance"] = torch.tensor(
                dense_annotations["gt_relevance"]
            ).float()
            item["round_id"] = torch.tensor(
                dense_annotations["round_id"]
            ).long()

        return item

    def _pad_sequences(self, sequences: List[List[int]]):
        """Given tokenized sequences (either questions, answers or answer
        options, tokenized in ``__getitem__``), padding them to maximum
        specified sequence length. Return as a tensor of size
        ``(*, max_sequence_length)``.

        This method is only called in ``__getitem__``, chunked out separately
        for readability.

        Parameters
        ----------
        sequences : List[List[int]]
            List of tokenized sequences, each sequence is typically a
            List[int].

        Returns
        -------
        torch.Tensor, torch.Tensor
            Tensor of sequences padded to max length, and length of sequences
            before padding.
        """

        for i in range(len(sequences)):
            sequences[i] = sequences[i][
                : self.config["max_sequence_length"] - 1
            ]
        sequence_lengths = [len(sequence) for sequence in sequences]

        # Pad all sequences to max_sequence_length.
        maxpadded_sequences = torch.full(
            (len(sequences), self.config["max_sequence_length"]),
            fill_value=self.vocabulary.PAD_INDEX,
        )
        padded_sequences = pad_sequence(
            [torch.tensor(sequence) for sequence in sequences],
            batch_first=True,
            padding_value=self.vocabulary.PAD_INDEX,
        )
        maxpadded_sequences[:, : padded_sequences.size(1)] = padded_sequences
        return maxpadded_sequences, sequence_lengths

    def _get_history(
        self,
        caption: List[int],
        questions: List[List[int]],
        answers: List[List[int]],
    ):
        # Allow double length of caption, equivalent to a concatenated QA pair.
        caption = caption[: self.config["max_sequence_length"] * 2 - 1]

        for i in range(len(questions)):
            questions[i] = questions[i][
                : self.config["max_sequence_length"] - 1
            ]

        for i in range(len(answers)):
            answers[i] = answers[i][: self.config["max_sequence_length"] - 1]

        # History for first round is caption, else concatenated QA pair of
        # previous round.
        history = []
        history.append(caption)
        for question, answer in zip(questions, answers):
            history.append(question + answer + [self.vocabulary.EOS_INDEX])
        # Drop last entry from history (there's no eleventh question).
        history = history[:-1]
        max_history_length = self.config["max_sequence_length"] * 2

        if self.config.get("concat_history", False):
            # Concatenated_history has similar structure as history, except it
            # contains concatenated QA pairs from previous rounds.
            concatenated_history = []
            concatenated_history.append(caption)
            for i in range(1, len(history)):
                concatenated_history.append([])
                for j in range(i + 1):
                    concatenated_history[i].extend(history[j])

            max_history_length = (
                self.config["max_sequence_length"] * 2 * len(history)
            )
            history = concatenated_history

        history_lengths = [len(round_history) for round_history in history]
        maxpadded_history = torch.full(
            (len(history), max_history_length),
            fill_value=self.vocabulary.PAD_INDEX,
        )
        padded_history = pad_sequence(
            [torch.tensor(round_history) for round_history in history],
            batch_first=True,
            padding_value=self.vocabulary.PAD_INDEX,
        )
        maxpadded_history[:, : padded_history.size(1)] = padded_history
        return maxpadded_history, history_lengths

In [5]:
import argparse
import itertools

from tensorboardX import SummaryWriter
import torch
from torch import nn, optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader
from tqdm import tqdm
import yaml
from bisect import bisect

from visdial.data.dataset import VisDialDataset
from visdial.encoders import Encoder
from visdial.decoders import Decoder
from visdial.metrics import SparseGTMetrics, NDCG
from visdial.model import EncoderDecoderModel
from visdial.utils.checkpointing import CheckpointManager, load_checkpoint


parser = argparse.ArgumentParser()
parser.add_argument(
        "--config-yml",
        default="configs/lf_gen_faster_rcnn_x101.yml",
        help="Path to a config file listing reader, model and solver parameters.",
        )
parser.add_argument(
        "--train-json",
        default="/home/ubuntu/datasets/myvisdial/data/visdial_1.0_train.json",
        help="Path to json file containing VisDial v1.0 training data.",
        )
parser.add_argument(
        "--val-json",
        default="/home/ubuntu/datasets/myvisdial/data/visdial_1.0_val.json",
        help="Path to json file containing VisDial v1.0 validation data.",
        )
parser.add_argument(
        "--val-dense-json",
        default="/home/ubuntu/datasets/myvisdial/data/visdial_1.0_val_dense_annotations.json",
        help="Path to json file containing VisDial v1.0 validation dense ground "
             "truth annotations.",
        )

parser.add_argument_group(
        "Arguments independent of experiment reproducibility"
        )
parser.add_argument(
        "--gpu-ids",
        nargs="+",
        type=int,
        default=1,
        help="List of ids of GPUs to use.",
        )
parser.add_argument(
        "--cpu-workers",
        type=int,
        default=4,
        help="Number of CPU workers for dataloader.",
        )
parser.add_argument(
        "--overfit",
        action="store_true",
        help="Overfit model on 5 examples, meant for debugging.",
        )
parser.add_argument(
        "--validate",
        action="store_true",
        help="Whether to validate on val split after every epoch.",
        )
parser.add_argument(
        "--in-memory",
        action="store_true",
        help="Load the whole dataset and pre-extracted image features in memory. "
             "Use only in presence of large RAM, atleast few tens of GBs.",
        )

parser.add_argument_group("Checkpointing related arguments")
parser.add_argument(
        "--save-dirpath",
        default="/home/ubuntu/datasets/myvisdial/checkpoints/",
        help="Path of directory to create checkpoint directory and save "
             "checkpoints.",
        )
parser.add_argument(
        "--load-pthpath",
        default="",
        help="To continue training, path to .pth file of saved checkpoint.",
        )

args = parser.parse_args()

usage: ipykernel_launcher.py [-h] [--config-yml CONFIG_YML]
                             [--train-json TRAIN_JSON] [--val-json VAL_JSON]
                             [--val-dense-json VAL_DENSE_JSON]
                             [--gpu-ids GPU_IDS [GPU_IDS ...]]
                             [--cpu-workers CPU_WORKERS] [--overfit]
                             [--validate] [--in-memory]
                             [--save-dirpath SAVE_DIRPATH]
                             [--load-pthpath LOAD_PTHPATH]
ipykernel_launcher.py: error: unrecognized arguments: -f /run/user/1000/jupyter/kernel-b7158e4d-e834-47b2-8f42-94bdf17521e6.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [6]:
run train_experiment.py

dataset:
  concat_history: true
  image_features_test_h5: /home/ubuntu/datasets/myvisdial/data/features_faster_rcnn_x101_test.h5
  image_features_train_h5: /home/ubuntu/datasets/myvisdial/data/features_faster_rcnn_x101_train.h5
  image_features_val_h5: /home/ubuntu/datasets/myvisdial/data/features_faster_rcnn_x101_val.h5
  img_norm: 1
  max_sequence_length: 20
  vocab_min_count: 5
  word_counts_json: /home/ubuntu/datasets/myvisdial/data/visdial_1.0_word_counts_train.json
model:
  decoder: gen
  dropout: 0.2
  encoder: lf
  img_feature_size: 2048
  lstm_hidden_size: 512
  lstm_num_layers: 2
  word_embedding_size: 300
solver:
  batch_size: 32
  initial_lr: 1e-3
  num_epochs: 100
  training_splits: train

config_yml          : configs/lf_gen_faster_rcnn_x101.yml
train_json          : /home/ubuntu/datasets/myvisdial/data/visdial_1.0_train.json
val_json            : /home/ubuntu/datasets/myvisdial/data/visdial_1.0_val.json
val_dense_json      : /home/ubuntu/datasets/myvisdial/data/visdial_1

  0%|          | 903/376083 [00:00<00:45, 8252.78it/s]

[train] Tokenizing questions...


100%|██████████| 376083/376083 [00:22<00:00, 16661.68it/s]
  1%|          | 3408/337528 [00:00<00:19, 17061.49it/s]

[train] Tokenizing answers...


100%|██████████| 337528/337528 [00:21<00:00, 15431.05it/s]
  1%|          | 1320/123287 [00:00<00:09, 13197.68it/s]

[train] Tokenizing captions...


100%|██████████| 123287/123287 [00:08<00:00, 15003.22it/s]
  0%|          | 1271/376083 [00:00<00:29, 12708.10it/s]

[train] Tokenizing questions...


100%|██████████| 376083/376083 [00:22<00:00, 16601.61it/s]
  0%|          | 1671/337528 [00:00<00:20, 16708.82it/s]

[train] Tokenizing answers...


100%|██████████| 337528/337528 [00:24<00:00, 14032.56it/s]
  1%|          | 1314/123287 [00:00<00:09, 13136.97it/s]

[train] Tokenizing captions...


100%|██████████| 123287/123287 [00:08<00:00, 15057.62it/s]


In [7]:
vd = val_dataset

In [9]:
elem = vd[0]
for key in elem:
    print(key)

img_ids
img_feat
ques
hist
ans_in
ans_out
ques_len
hist_len
ans_len
num_rounds
opt_in
opt_out
opt_len
ans_ind


In [10]:
vocab = vd.vocabulary

In [12]:
elem['opt_in'].shape

torch.Size([10, 100, 20])

In [13]:
elem['opt_out'].shape

torch.Size([10, 100, 20])

In [22]:
print(elem['opt_in'][1, 3, :])
print(elem['opt_out'][1, 3, :])


tensor([ 1, 82,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0])
tensor([82,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0])


In [26]:
' '.join(vocab.to_words((elem['opt_out'][1, 3, :].numpy())))

'green </S> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>'

In [27]:
' '.join(vocab.to_words((elem['opt_in'][1, 3, :].numpy())))

'<S> green <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>'

In [28]:
tr = train_dataset

In [34]:
elem = tr[0]
elem['ans_in'].shape

torch.Size([10, 20])

In [42]:
for i in range(10):
    print(' '.join(vocab.to_words(elem['ans_in'][i].numpy())))

<S> adult <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
<S> male <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
<S> inside <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
<S> yes , but there is a blanket in between them and the floor <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
<S> it is tile <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
<S> red and white <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
<S> orange red <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
<S> boxer <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
<S> yes <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <

In [43]:
for i in range(10):
    print(' '.join(vocab.to_words(elem['ans_out'][i].numpy())))

adult </S> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
male </S> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
inside </S> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
yes , but there is a blanket in between them and the floor </S> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
it is tile </S> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
red and white </S> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
orange red </S> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
boxer </S> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
yes </S> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD

In [44]:
vel = vd[0]

In [45]:
vel['ans_ind']

tensor([74, 12, 30, 54, 22, 24, 33, 54, 70, 43])