# The Vision-Language Tookit (VLTK)

* Define FRCNN Adapter
* Define Vision Dataset Adapters
    * COCO and Visual Genome
* Define Vision-Language Dataset Adapters
    * VQA, GQA, and Visual Genome QA
* Extract Datasets for Each User-Defined Adapter Class
* Register User-Defined Adapters and Config  with VLTK to Superset Datasets

In [1]:
!pip install -e .

Obtaining file:///home/eltoto/vltk


Installing collected packages: vltk
  Attempting uninstall: vltk
    Found existing installation: vltk 1.0.0
    Uninstalling vltk-1.0.0:
      Successfully uninstalled vltk-1.0.0
  Running setup.py develop for vltk
Successfully installed vltk-1.0.0
You should consider upgrading via the '/home/eltoto/mlnv/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import vltk
from vltk import Features, adapters
from vltk.configs import DataConfig, VisionConfig, LangConfig
from vltk.loader import build
from vltk.utils.adapters import rescale_box, clean_label, soft_score

from pprint import pprint
from PIL import Image
import torch
from collections import Counter, defaultdict

DATADIR = '/home/eltoto/demodata/'


# Define FRCNN Adapter

In [3]:
class FRCNN(adapters.VisnExtraction):

    # TODO: currently, this image preprocessing config is not correct
    default_processor = VisionConfig(
        **{
            "transforms": ["FromFile", "Resize", "ToTensor", "Normalize"],
            "size": 800,
            "max_size": 1333,
            "mode": Image.BILINEAR,
            "pad_value": 0.0,
            "mean": [0.404, 0.455,0.482],
            "std": [1.0, 1.0, 1.0],
        }
    )

    def setup():
        from vltk import compat
        from vltk.modeling.frcnn import FRCNN as FasterRCNN

        weights = "unc-nlp/frcnn-vg-finetuned"
        model_config = compat.Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
        return FasterRCNN.from_pretrained(weights, model_config), model_config

    def schema(max_detections=36, visual_dim=2048):
        return {
            "attr_ids": Features.Ids,
            "object_ids": Features.Ids,
            vltk.features: Features.Features3D(max_detections, visual_dim),
            vltk.box: Features.Box,
        }

    def forward(model, entry):

        size = entry[vltk.size]
        scale_hw = entry[vltk.scale]
        image = entry[vltk.img]

        model_out = model(
            images=image.unsqueeze(0),
            image_shapes=size.unsqueeze(0),
            scales_yx=scale_hw.unsqueeze(0),
            padding="max_detections",
            pad_value=0.0,
            location="cpu",
        )
        normalized_boxes = torch.round(rescale_box(model_out["boxes"][0], 1 / scale_hw))

        return {
            "object_ids": [model_out["obj_ids"][0].tolist()],
            "attr_ids": [model_out["attr_ids"][0].tolist()],
            vltk.box: [normalized_boxes.tolist()],
            vltk.features: [model_out["roi_features"][0]],
        }

# Define Vision Dataset Adapters

### COCO and Visual Genome

In [4]:
class Coco2014(adapters.VisnDataset):
    def schema():
        return {
            vltk.box: Features.Box,
            vltk.polygons: Features.Polygons,
            vltk.objects: Features.StringList,
        }

    def forward(json_files, splits):

        total_annos = {}
        id_to_cat = {}
        file_to_id_to_stem = defaultdict(dict)
        for file, json in json_files.items():
            if "instance" not in file:
                continue
            info = json["images"]
            for i in info:
                img_id = i["file_name"].split(".")[0]
                file_to_id_to_stem[file][i["id"]] = img_id
        for file, json in json_files.items():
            if "instance" not in file:
                continue

            categories = json["categories"]
            for cat in categories:
                id_to_cat[cat["id"]] = cat["name"]

            for entry in json["annotations"]:
                # TODO: change this image ID thing later

                img_id = str(file_to_id_to_stem[file][entry["image_id"]])
                bbox = entry["bbox"]
                segmentation = entry["segmentation"]
                category_id = id_to_cat[entry["category_id"]]
                if entry["iscrowd"]:
                    seg_mask = []
                else:
                    seg_mask = segmentation
                    if not isinstance(seg_mask[0], list):
                        seg_mask = [seg_mask]
                img_data = total_annos.get(img_id, None)
                if img_data is None:
                    img_entry = defaultdict(list)
                    img_entry[vltk.objects].append(category_id)
                    img_entry[vltk.box].append(bbox)
                    img_entry[vltk.polygons].append(seg_mask)
                    total_annos[img_id] = img_entry
                else:
                    total_annos[img_id][vltk.box].append(bbox)
                    total_annos[img_id][vltk.objects].append(category_id)
                    total_annos[img_id][vltk.polygons].append(seg_mask)

        return [{vltk.imgid: img_id, **entry} for img_id, entry in total_annos.items()]


class VisualGenome(adapters.VisnDataset):
    def schema():
        return {}

    def forward(json_files, splits):
        return {}

# Define Adapters for Vision-Language Datasets 

### VQA, GQA, and Visual Genome QA

In [5]:
class GQA(adapters.VisnLangDataset):
    data_info = {
        "dev": {"coco2014": ["test"]},
        "train": {"visualgenome": ["train"]},
        "val": {"visualgenome": ["train"]},
        "test": {"coco2014": ["test"]},
        "testdev": {"coco2014": ["val"]},
    }

    filters = ["unbalanced", "train"]

    def schema():
        return {vltk.label: Features.StringList, "layout": Features.StringList}

    def forward(json_files, split, min_label_frequency=2):
        label_frequencies = Counter()
        batch_entries = []

        for filename, data in json_files.items():
            for i, (k, v) in enumerate(data.items()):
                if "answer" in v:
                    answer = clean_label(v["answer"])
                    label_frequencies.update([answer])

            for i, (k, v) in enumerate(data.items()):
                if split == "test":
                    answer = None
                    layout = None
                elif label_frequencies[v["answer"]] < min_label_frequency:
                    continue
                else:
                    answer = clean_label(v["answer"])
                    layout = [layout["operation"] for layout in v["semantic"]]

                text = v["question"]
                img_id = v["imageId"].lstrip("n")

                entry = {
                    vltk.text: text,
                    vltk.imgid: img_id,
                    vltk.label: [answer],
                    "layout": layout,
                }

                batch_entries.append(entry)

        return batch_entries
    
class VQA(adapters.VisnLangDataset):
    data_info = {
        "val": {"coco2014": ["val"]},
        "train": {"coco2014": ["train"]},
        "test": {"coco2014": ["test"]},
    }

    def schema():
        return {
            vltk.qid: Features.String,
            vltk.label: Features.StringList,
            vltk.score: Features.FloatList,
        }

    def adjust_imgid(imgid, vdset_name, vdset_split):
        imgid = f'{"COCO"}_{vdset_split[0].lower()}{2014}_{"".join(["0"] * (12 - len(imgid)))}{imgid}'
        return imgid

    def forward(json_files, split, min_label_frequency=9):
        batch_entries = []
        all_questions = []
        qid2answers = {}
        label_frequencies = Counter()
        for filename, x in json_files.items():
            if "questions" in x:
                all_questions.extend(x["questions"])
            else:
                annotations = x["annotations"]
                accepted_answers = {
                    clean_label(anno["multiple_choice_answer"]) for anno in annotations
                }
                for anno in annotations:
                    qid = str(anno["question_id"])
                    answers = anno["answers"]
                    label_frequencies.update(
                        [clean_label(anno["multiple_choice_answer"])]
                    )
                    answer_counter = Counter()
                    for ans_dict in answers:
                        ans = ans_dict["answer"]
                        if ans not in accepted_answers:
                            pass
                        else:
                            ans = clean_label(ans)
                            answer_counter.update([ans])
                    qid2answers[qid] = {
                        k: soft_score(v) for k, v in answer_counter.items()
                    }

        for entry in all_questions:
            try:
                entry[vltk.imgid] = str(entry.pop("image_id"))
            except Exception:
                raise Exception(entry.keys())
            entry[vltk.text] = entry.pop("question")
            # entry.pop("question_id")
            entry["qid"] = str(entry.pop("question_id"))
            try:
                entry[vltk.label] = qid2answers[entry["qid"]]
                labels = {
                    l: s
                    for l, s in entry[vltk.label].items()
                    if label_frequencies[l] > min_label_frequency
                }
                if not labels:
                    continue

                labels, scores = adapters.VisnLangDataset._label_handler(labels)
                entry[vltk.score] = scores
                entry[vltk.label] = labels
            except KeyError:
                pass

            batch_entries.append(entry)
        return batch_entries
    
class COCOCaptions(adapters.VisnLangDataset):
    data_info = {
        "train": {"coco2014": ["train"]},
        "val": {"coco2014": ["val"]},
    }

    def schema():
        return {}

    def forward(json_files, split, min_label_frequency=2):
        batch_entries = []
        id2imgid = {}
        for filename, data in json_files.items():
            if "annotations" not in data:
                continue
            if "caption" not in data["annotations"][0]:
                continue
            for img in data["images"]:
                id2imgid[img["id"]] = img["file_name"]
            for item in (data["annotations"]):
                imgid = id2imgid[item["image_id"]].split(".")[0]
                entry = {vltk.imgid: imgid, vltk.text: item["caption"]}

                batch_entries.append(entry)

        return batch_entries
    
    
class VGQA(adapters.VisnLangDataset):
    data_info = {
        "train": {"visualgenome": ["train"]},
    }

    def schema():
        return {
            vltk.qid: Features.String,
            vltk.label: Features.StringList,
        }

    def forward(json_files, split, min_label_frequency=9):
        batch_entries = []
        answer_counts = Counter()
        for filename, data in json_files.items():
            for y in data:
                for x in y["qas"]:
                    answer_counts.update([clean_label(x["answer"])]),

        for filename, data in json_files.items():
            for y in data:
                for x in y["qas"]:
                    if answer_counts[clean_label(x["answer"])] >= min_label_frequency:
                        entry = {
                            vltk.qid: str(x["qa_id"]),
                            vltk.imgid: str(x["image_id"]),
                            vltk.text: x["question"],
                            vltk.label: [clean_label(x["answer"])],
                        }
                        batch_entries.append(entry)
        return batch_entries

# Extract Datasets for Each Defined Adapter Class 

In [6]:
cocofeats = FRCNN.extract(DATADIR, dataset="coco2014")
vgfeats = FRCNN.extract(DATADIR, dataset="visualgenome")
coco2014 = Coco2014.extract(DATADIR)
vqa = VQA.extract(DATADIR)
gqa = GQA.extract(DATADIR)
vgqa = VGQA.extract(DATADIR)
cococaptions = COCOCaptions.extract(DATADIR)

will write to /home/eltoto/demodata/coco2014/frcnn




loading weights file https://cdn.huggingface.co/unc-nlp/frcnn-vg-finetuned/pytorch_model.bin from cache at /home/eltoto/.cache/torch/transformers/57f6df6abe353be2773f2700159c65615babf39ab5b48114d2b49267672ae10f.77b59256a4cf8343ae0f923246a81489fc8d82f98d082edc2d2037c977c0d9d0
All model checkpoint weights were used when initializing FRCNN.

All the weights of FRCNN were initialized from the model checkpoint at unc-nlp/frcnn-vg-finetuned.
If your task is similar to the task the model of the checkpoint was trained on, you can already use FRCNN for predictions without further training.
extracting from ['/home/eltoto/demodata/coco2014/test', '/home/eltoto/demodata/coco2014/train', '/home/eltoto/demodata/coco2014/val']
100%|██████████| 10/10 [01:03<00:00,  6.39s/it]
saving...
Success! You wrote 8 entry(s) and 2 mb
Located: /home/eltoto/demodata/coco2014/frcnn/val.arrow
Success! You wrote 1 entry(s) and 0 mb
Located: /home/eltoto/demodata/coco2014/frcnn/test.arrow
Success! You wrote 1 entry(s)

  0%|          | 0/6 [00:00<?, ?it/s]


saving...
Success! You wrote 11 entry(s) and 3 mb
Located: /home/eltoto/demodata/visualgenome/frcnn/train.arrow
loading annotations...


100%|██████████| 6/6 [00:19<00:00,  3.18s/it]


writing to Datasets/Arrow object
saving...
Success! You wrote 122218 entry(s) and 203 mb
Located: /home/eltoto/demodata/coco2014/annotations.arrow


  0%|          | 0/2 [00:00<?, ?it/s]

searching for input files for splits: {'test', 'train', 'val', 'validation', 'evaluation', 'eval', 'dev'}
loading json files from: ['/home/eltoto/demodata/vqa/v2_mscoco_train2014_annotations.json', '/home/eltoto/demodata/vqa/v2_OpenEnded_mscoco_train2014_questions.json']


100%|██████████| 2/2 [00:04<00:00,  2.16s/it]


begin extraction
writing rows to arrow dataset
Success! You wrote 432628 entry(s) and 46 mb
Located: /home/eltoto/demodata/vqa/train.arrow


  0%|          | 0/2 [00:00<?, ?it/s]

loading json files from: ['/home/eltoto/demodata/vqa/v2_mscoco_val2014_annotations.json', '/home/eltoto/demodata/vqa/v2_OpenEnded_mscoco_val2014_questions.json']


100%|██████████| 2/2 [00:01<00:00,  1.07it/s]


begin extraction
writing rows to arrow dataset
Success! You wrote 206454 entry(s) and 21 mb
Located: /home/eltoto/demodata/vqa/val.arrow


100%|██████████| 1/1 [00:00<00:00, 12.81it/s]

searching for input files for splits: {'test', 'train', 'val', 'validation', 'evaluation', 'eval', 'dev'}
loading json files from: ['/home/eltoto/demodata/gqa/test_balanced_questions.json']
begin extraction
writing rows to arrow dataset



  0%|          | 0/1 [00:00<?, ?it/s]

Success! You wrote 95336 entry(s) and 5 mb
Located: /home/eltoto/demodata/gqa/test.arrow
loading json files from: ['/home/eltoto/demodata/gqa/val_balanced_questions.json']


100%|██████████| 1/1 [00:03<00:00,  3.26s/it]


begin extraction
writing rows to arrow dataset


  0%|          | 0/1 [00:00<?, ?it/s]

Success! You wrote 129563 entry(s) and 13 mb
Located: /home/eltoto/demodata/gqa/val.arrow
loading json files from: ['/home/eltoto/demodata/gqa/testdev_balanced_questions.json']


100%|██████████| 1/1 [00:00<00:00,  4.61it/s]


begin extraction
writing rows to arrow dataset


  0%|          | 0/1 [00:00<?, ?it/s]

Success! You wrote 11025 entry(s) and 1 mb
Located: /home/eltoto/demodata/gqa/dev.arrow
searching for input files for splits: {'test', 'train', 'val', 'validation', 'evaluation', 'eval', 'dev'}
No files pattern matched with corresponding split, falling back to searching all json in top level directory
loading json files from: ['/home/eltoto/demodata/vgqa/question_answers.json']


100%|██████████| 1/1 [00:06<00:00,  6.31s/it]


begin extraction
writing rows to arrow dataset
Success! You wrote 1185566 entry(s) and 78 mb
Located: /home/eltoto/demodata/vgqa/train.arrow


  0%|          | 0/3 [00:00<?, ?it/s]

searching for input files for splits: {'test', 'train', 'val', 'validation', 'evaluation', 'eval', 'dev'}
loading json files from: ['/home/eltoto/demodata/cococaptions/instances_train2014.json', '/home/eltoto/demodata/cococaptions/captions_train2014.json', '/home/eltoto/demodata/cococaptions/person_keypoints_train2014.json']


100%|██████████| 3/3 [00:12<00:00,  4.17s/it]


begin extraction
writing rows to arrow dataset
Success! You wrote 414113 entry(s) and 34 mb
Located: /home/eltoto/demodata/cococaptions/train.arrow


  0%|          | 0/3 [00:00<?, ?it/s]

loading json files from: ['/home/eltoto/demodata/cococaptions/instances_val2014.json', '/home/eltoto/demodata/cococaptions/person_keypoints_val2014.json', '/home/eltoto/demodata/cococaptions/captions_val2014.json']


100%|██████████| 3/3 [00:05<00:00,  1.81s/it]


begin extraction
writing rows to arrow dataset
Success! You wrote 202654 entry(s) and 16 mb
Located: /home/eltoto/demodata/cococaptions/val.arrow


# Register User-Defined Adapters and Define Config with VLTK to Superset Datasets

In [24]:
# add adapters to library
adapters.Adapters().add(VQA, GQA, Coco2014, VisualGenome, FRCNN, COCOCaptions, VGQA)

#config
config = DataConfig(
    # choose which dataset and dataset split for train and eval
    train_datasets=[["vqa", "trainval"],["cococaptions", 'trainval'], ['vgqa', 'train'], ['gqa', 'trainval']],
    # choose which feature extractor to use
    extractor="frcnn",
    datadir=DATADIR,
    train_batch_size=1,
    # iterate with through datasets via images first versus text
    img_first=True,
    #ignore segmentation annotations from being prcoessed with the COCO dataset
    ignore_segmentation=True
)
train, val = build(config)

Added VisnLangDataset cococaptions: train
Added VisnDataset coco2014: train
Added VisnLangDataset cococaptions: val
Added VisnDataset coco2014: val
Added VisnLangDataset gqa: train
Added VisnDataset visualgenome: train
Added VisnLangDataset gqa: val
Added VisnLangDataset vgqa: train
Added VisnLangDataset vqa: train
Added VisnLangDataset vqa: val
Max spanning column names for each batch: {'label', 'attr_ids', 'object_ids', 'imgid', 'layout', 'box', 'text', 'objects', 'features', 'qid', 'score'} (not including extra columns/features from processors)
resizing datasets to account for 228610 missing image IDs


# First Batch

In [25]:
for batch in train:
    pprint(batch.keys())
    print()
    pprint(batch)
    break

dict_keys(['layout', 'text_attention_mask', 'input_ids', 'type_ids', 'label', 'score', 'attr_ids', 'box', 'features', 'imgid', 'object_ids'])

{'attr_ids': tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 7., 0., 7., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 7., 0., 0., 7., 7., 0., 0., 7., 0., 7., 0., 0., 0.]]),
 'box': tensor([[[ 32.,  40., 800., 634.],
         [ 53.,   1., 800., 461.],
         [  2., 211., 800., 781.],
         [100.,   2., 800., 315.],
         [185., 108., 800., 683.],
         [  1.,   5., 526., 603.],
         [  3., 118., 514., 725.],
         [  2.,   4., 573., 388.],
         [  1.,  14., 402., 684.],
         [ 68., 314., 800., 846.],
         [  2.,   5., 460., 465.],
         [  5.,   3., 562., 263.],
         [281.,  50., 800., 621.],
         [  4.,   4., 447., 311.],
         [280., 172., 800., 742.],
         [415., 132., 800., 716.],
         [435.,  28., 800., 629.],
         [310.,   8., 800., 490.],
         [219., 281., 800., 800.],
      