# The Vision-Language Toolkit (VLTK)

* Define FRCNN Adapter
* Define Vision Dataset Adapters
    * Define Adapter for COCO
    * Define Adapter for Visual Genome
* Define Vision-Language Dataset Adapters
    * Define Adapter for VQA
    * Define Adapter for GQA
* Register User-Defined Adapters with VLTK to Superset Datasets
* Extract Datasets for Each User-Defined Adapter Class
* Define Config to Super-Set Datasets Together + View First Row

In [24]:
#!pip install -e .

In [1]:
from collections import Counter, defaultdict

import vltk
from vltk import Features, adapters, compat
from vltk.adapters import Adapters
from vltk.configs import DataConfig, ProcessorConfig
from vltk.loader.builder import init_datasets
from vltk.metrics import soft_score
from vltk.modeling.frcnn import FRCNN as FasterRCNN
from vltk.processing.label import clean_imgid_default, label_default
from pprint import pprint

# Define FRCNN Adapter

In [2]:
class FRCNN(adapters.VisnExtraction):
    """Visual-feature extraction adapter built on the pretrained Faster R-CNN.

    The class attributes configure the image preprocessing, the checkpoint to
    load and the model class; ``schema`` declares the extracted fields and
    ``forward`` maps one preprocessed entry to those fields.
    """

    # Preprocessing pipeline applied to each image before it reaches the model.
    default_processor = ProcessorConfig(
        **{
            "transforms": ["ToPILImage", "ToTensor", "ResizeTensor", "Normalize"],
            "size": (800, 1333),
            "mode": "bilinear",
            "pad_value": 0.0,
            "mean": [102.9801, 115.9465, 122.7717],
            "sdev": [1.0, 1.0, 1.0],
        }
    )
    weights = "unc-nlp/frcnn-vg-finetuned"
    model = FasterRCNN
    # Load the pretrained config exactly once, from the same identifier as the
    # weights, so the two cannot drift apart.
    model_config = compat.Config.from_pretrained(weights)

    # NOTE(review): `schema` and `forward` take no `self`; presumably the vltk
    # adapter machinery calls them as plain functions -- confirm before adding
    # decorators.
    def schema(max_detections=36, visual_dim=2048):
        """Return the feature schema for the fields produced by ``forward``.

        Args:
            max_detections: passed to the feature and box-tensor schema builders.
            visual_dim: passed to the feature schema builder.
        """
        return {
            "attr_ids": Features.ids,
            "object_ids": Features.ids,
            vltk.features: Features.features(max_detections, visual_dim),
            vltk.boxtensor: Features.boxtensor(max_detections),
        }

    def forward(model, entry):
        """Run ``model`` on a single entry and return the extracted fields.

        Args:
            model: callable detector invoked with keyword arguments.
            entry: mapping that provides ``"size"``, ``"scale"`` and ``"image"``;
                each value must support ``unsqueeze(0)``.

        Returns:
            A dict with object ids, attribute ids, normalized boxes and ROI
            features taken from the model output.
        """
        size = entry["size"]
        scale_hw = entry["scale"]
        image = entry["image"]

        # The model is called on a batch, so every input gets a leading
        # dimension of size one.
        model_out = model(
            images=image.unsqueeze(0),
            image_shapes=size.unsqueeze(0),
            scales_yx=scale_hw.unsqueeze(0),
            padding="max_detections",
            pad_value=0.0,
            return_tensors="np",
            location="cpu",
        )
        return {
            "object_ids": model_out["obj_ids"],
            "attr_ids": model_out["attr_ids"],
            vltk.boxtensor: model_out["normalized_boxes"],
            vltk.features: model_out["roi_features"],
        }

# Define Vision Dataset Adapters

## Define Adapter for COCO dataset

In [3]:
class Coco2014(adapters.VisnDataset):
    """Vision dataset adapter for the COCO 2014 instance annotations."""

    def imgid_to_filename(imgid, split):
        """Return the image file name for ``imgid`` in the given ``split``.

        Args:
            imgid: image id without leading zeros (string or integer).
            split: split name; ``"test"`` maps to the 2015 release, every
                other split to 2014.
        """
        year = 2014 if split != "test" else 2015
        # COCO file names carry the image id left-padded with zeros to 12
        # digits, e.g. COCO_val2014_000000391895.jpg.
        return f"COCO_{split}{year}_{str(imgid).zfill(12)}.jpg"

    def schema():
        """Return the feature schema of the per-image annotation fields."""
        return {vltk.box: Features.box, vltk.segmentation: Features.segmentation}

    def forward(json_files, splits):
        """Group the instance annotations by image.

        Args:
            json_files: iterable of ``(file name, parsed JSON)`` pairs; only
                files whose name contains ``"instance"`` are read.
            splits: unused here.

        Returns:
            A list with one dict per annotated image holding the image id and
            the parallel lists of labels, boxes and segmentations.
        """
        # img_id -> {field -> list of per-annotation values}
        total_annos = defaultdict(lambda: defaultdict(list))
        id_to_cat = {}
        for file, data in json_files:
            if "instance" not in file:
                continue

            id_to_cat.update({cat["id"]: cat["name"] for cat in data["categories"]})

            for entry in data["annotations"]:
                img_id = clean_imgid_default(str(entry["image_id"]))
                bbox = entry["bbox"]
                segmentation = entry["segmentation"]
                category = id_to_cat[entry["category_id"]]
                if entry["iscrowd"]:
                    # Crowd annotations contribute an empty segmentation.
                    seg_mask = []
                else:
                    seg_mask = segmentation
                    # Wrap a single flat polygon so the result is always a
                    # list of polygons; an empty list is left untouched.
                    if seg_mask and not isinstance(seg_mask[0], list):
                        seg_mask = [seg_mask]

                img_entry = total_annos[img_id]
                img_entry[vltk.label].append(category)
                img_entry[vltk.box].append(bbox)
                img_entry[vltk.segmentation].append(seg_mask)

        return [{vltk.imgid: img_id, **entry} for img_id, entry in total_annos.items()]

## Define Adapter for Visual Genome

In [4]:
class VisualGenome(adapters.VisnDataset):
    """Vision dataset adapter for Visual Genome images.

    This adapter only maps image ids to file names; it declares no annotation
    fields and produces no annotations.
    """

    def imgid_to_filename(imgid, split):
        """Return ``"<imgid>.jpg"``; ``split`` does not affect the name."""
        return "{}.jpg".format(imgid)

    def schema():
        """Return an empty schema (no annotation fields)."""
        return dict()

    def forward(json_files, splits):
        """Return an empty mapping; neither argument is read."""
        # NOTE(review): this yields a dict, not a list of entries -- confirm
        # that is what the vltk extraction step expects from a stub adapter.
        return dict()

# Define Adapters for Vision-Language Datasets 

## Define Adapter for VQA

In [5]:
class VQA(adapters.VisnLangDataset):
    """Vision-language dataset adapter for VQA questions and annotations."""

    # For each VQA split: the vision dataset and its splits that hold the images.
    data_info = {
        "val": {"coco2014": ["val"]},
        "train": {"coco2014": ["train"]},
        "test": {"coco2014": ["test"]},
    }

    def schema():
        """Return the schema of the extra per-question fields."""
        return {"qid": Features.string}

    def forward(json_files, split, min_label_frequency=9):
        """Join questions with their scored answers.

        Args:
            json_files: iterable of parsed JSON dicts; a dict with a
                ``"questions"`` key is a question file, any other dict is read
                as an annotation file through its ``"annotations"`` key.
            split: unused here.
            min_label_frequency: a label is kept only when its normalised
                ``multiple_choice_answer`` occurs more often than this.

        Returns:
            The list of question entries. Questions without annotations are
            returned unlabelled; annotated questions whose labels are all too
            rare are dropped.
        """
        batch_entries = []
        all_questions = []
        qid2answers = {}
        label_frequencies = Counter()
        for x in json_files:
            if "questions" in x:
                all_questions.extend(x["questions"])
                continue

            annotations = x["annotations"]
            accepted_answers = {
                label_default(anno["multiple_choice_answer"]) for anno in annotations
            }
            for anno in annotations:
                qid = str(anno["question_id"])
                label_frequencies[label_default(anno["multiple_choice_answer"])] += 1
                answer_counter = Counter()
                for ans_dict in anno["answers"]:
                    # Normalise before the membership test: `accepted_answers`
                    # holds normalised labels, so comparing the raw answer
                    # would reject answers that only match after cleaning.
                    ans = label_default(ans_dict["answer"])
                    if ans in accepted_answers:
                        answer_counter[ans] += 1
                qid2answers[qid] = {
                    k: soft_score(v) for k, v in answer_counter.items()
                }

        for entry in all_questions:
            entry[vltk.imgid] = str(entry.pop("image_id"))
            entry[vltk.text] = entry.pop("question")
            entry["qid"] = str(entry.pop("question_id"))

            # Only the annotation lookup is allowed to miss; any other error
            # raised while building the labels is a real failure and surfaces.
            answers = qid2answers.get(entry["qid"])
            if answers is not None:
                labels = {
                    label: score
                    for label, score in answers.items()
                    if label_frequencies[label] > min_label_frequency
                }
                if not labels:
                    continue

                labels, scores = adapters.VisnLangDataset._label_handler(labels)
                entry[vltk.score] = scores
                entry[vltk.label] = labels

            batch_entries.append(entry)
        return batch_entries

## Define Adapter for GQA

In [6]:
class GQA(adapters.VisnLangDataset):
    """Vision-language dataset adapter for GQA questions."""

    # For each GQA split: the vision dataset and its splits that hold the images.
    data_info = {
        "dev": {"coco2014": ["test"]},
        "train": {"visualgenome": ["train"]},
        "val": {"visualgenome": ["train"]},
        "test": {"coco2014": ["test"]},
        "testdev": {"coco2014": ["val"]},
    }

    def schema():
        """Return an empty schema (no extra per-question fields)."""
        return {}

    def forward(json_files, split, min_label_frequency=2):
        """Turn the GQA question dicts into a flat list of entries.

        Args:
            json_files: iterable of dicts whose values are question records
                with ``"question"``, ``"imageId"`` and, outside the test
                split, ``"answer"``.
            split: split name; for ``"test"`` the label is ``None`` and no
                frequency filtering is applied.
            min_label_frequency: questions whose normalised answer occurs
                fewer times than this are dropped.

        Returns:
            A list of dicts with text, image id, a one-element label list and
            a one-element score list.

        Raises:
            KeyError: if a record outside the test split has no ``"answer"``.
        """
        label_frequencies = Counter()
        batch_entries = []

        # NOTE(review): counts accumulate file by file, so filtering of an
        # earlier file does not see answers counted in later files -- confirm
        # whether that is intended.
        for t in json_files:
            for v in t.values():
                if "answer" in v:
                    label_frequencies[label_default(v["answer"])] += 1

            for v in t.values():
                if split == "test":
                    answer = None
                else:
                    # Look the frequency up under the normalised label, which
                    # is the key the counter above was filled with.
                    answer = label_default(v["answer"])
                    if label_frequencies[answer] < min_label_frequency:
                        continue

                entry = {
                    vltk.text: v["question"],
                    vltk.imgid: v["imageId"].lstrip("n"),
                    vltk.label: [answer],
                    vltk.score: [1.0],
                }
                batch_entries.append(entry)

        return batch_entries

# Register User-Defined Adapters with VLTK to Superset Datasets

In [7]:
# Register every user-defined adapter class with the library in one call.
Adapters().add(
    VQA,
    GQA,
    Coco2014,
    VisualGenome,
    FRCNN,
)

# Extract Datasets for Each Defined Adapter Class

In [9]:
# demo data dir
datadir = "/home/eltoto/demodata"

# Run the extraction step of every adapter against the demo data directory.
# The FRCNN extractor is run once per vision dataset, selected by name.
cocofeats = FRCNN.extract(datadir, dataset_name="coco2014")
vgfeats = FRCNN.extract(datadir, dataset_name="visualgenome")
# Annotation extraction for the vision datasets.
coco2014 = Coco2014.extract(datadir)
visualgenome = VisualGenome.extract(datadir)
# Extraction for the vision-language datasets.
vqa = VQA.extract(datadir)
gqa = GQA.extract(datadir)

# Define Config to Super-Set Datasets Together + View First Row

In [8]:

# Data configuration consumed by `init_datasets`.
config = DataConfig(
    # choose which dataset and dataset split for train and eval
    train_datasets=[["gqa", "train"], ["vqa", "trainval"]],
    eval_datasets=["gqa", "testdev"],
    # choose which tokenizer to use
    tokenizer="BertWordPieceTokenizer",
    # choose which feature extractor to use
    extractor="frcnn",
    # same path as the `datadir` used for extraction
    datadir="/home/eltoto/demodata",
    train_batch_size=1,
    eval_batch_size=1,
    img_first=True,
)

In [9]:
# superset datasets together
# The call returns a (train, val) pair, a value that is discarded here, and
# two objects bound to `answer_to_id` and `object_to_id`.
(train, val), _, answer_to_id, object_to_id = init_datasets(config)

Added VisnLangDataset gqa: testdev
Added VisnDataset coco2014: val
Added VisnLangDataset gqa: train
Added VisnDataset visualgenome: train
Added VisnLangDataset vqa: train
Added VisnDataset coco2014: train
Added VisnLangDataset vqa: val


In [20]:
# Pretty-print only the first batch yielded by `train[1]`, then stop.
# NOTE(review): presumably `train[1]` is the data loader -- confirm against
# the return value of `init_datasets`.
for batch in train[1]:
    pprint(batch)
    break

{'attr_ids': tensor([[  7.,   7.,   7., 234.,   7., 234.,   7.,   7.,   7.,   7.,   7., 234.,
           7., 234.,   7.,   7.,   4.,   7.,   4., 234.,   7., 234., 234.,   7.,
           7., 234.,   7., 234.,   7.,   7.,   7.,   7.,   7., 234.,   7.,   7.]]),
 'boxtensor': tensor([[[3.6219e-03, 0.0000e+00, 7.5334e-01, 2.9445e-01],
         [1.4685e-01, 0.0000e+00, 1.1711e+00, 3.5520e-01],
         [4.3438e-03, 0.0000e+00, 6.3741e-01, 3.6989e-01],
         [0.0000e+00, 1.3093e-01, 5.6910e-01, 8.2327e-01],
         [2.5827e-01, 0.0000e+00, 9.7401e-01, 3.4704e-01],
         [1.0965e-01, 1.4731e-01, 1.1224e+00, 8.7836e-01],
         [0.0000e+00, 3.6662e-02, 5.5663e-01, 4.6126e-01],
         [0.0000e+00, 6.2447e-01, 8.2906e-01, 9.9546e-01],
         [3.4754e-01, 0.0000e+00, 9.9833e-01, 4.5211e-01],
         [2.4611e-01, 5.5362e-01, 1.2640e+00, 1.0000e+00],
         [6.9048e-04, 0.0000e+00, 4.0233e-01, 4.2276e-01],
         [0.0000e+00, 3.6096e-01, 6.0552e-01, 9.9193e-01],
         [1.9048e-0