In [1]:
import os
from collections import Counter, defaultdict

import pandas as pd
import pyarrow as pa
from tqdm import tqdm

# from glossary import normalize_word


def statistics(iid2captions, iid2split):
    all_images = {"train": [], "val": [], "test": []}
    all_texts = {"train": [], "val": [], "test": []}

    for iid, texts in iid2captions.items():
        split = iid2split[iid]
        all_images[split].append(iid)
        all_texts[split].extend(texts)

    for split, images in all_images.items():
        print(f"+ {split} set: {len(images)} images")

    for split, texts in all_texts.items():
        lengths = [len(text.split()) for text in texts]
        avg_len = sum(lengths) / len(lengths)
        print(f"+ {split} set: {len(texts)} texts")
        print(f"+ {split} set: {avg_len} words in average.")
        lengths = [length // 10 * 10 for length in lengths]
        print(Counter(lengths))


def path2rest(path, iid2captions, iid2split):
    name = path
    with open(path, "rb") as fp:
        binary = fp.read()
    captions = iid2captions[name]
    split = iid2split[name]
    return [binary, captions, name, split]


def make_arrow(data, dataset_name, save_dir):
    print(f"+ Pre-processing {dataset_name}...")
    iid2captions = defaultdict(list)
    iid2split = dict()

    for split, split_data in data.items():
        for sample in split_data:
            iid2captions[sample["img_path"]].extend(sample["texts"])
            iid2split[sample["img_path"]] = split

    path = len(iid2captions)
    caption_paths = [path for path in iid2captions if os.path.exists(path)]
    print(f"+ {len(caption_paths)} images / {path} annotations")
    statistics(iid2captions, iid2split)
    import pdb
    pdb.set_trace()
    bs = [path2rest(path, iid2captions, iid2split) for path in tqdm(caption_paths)]

    for split in ["train", "val", "test"]:
        batches = [b for b in bs if b[-1] == split]
        dataframe = pd.DataFrame(batches, columns=["image", "caption", "image_id", "split"])
        table = pa.Table.from_pandas(dataframe)
        os.makedirs(save_dir, exist_ok=True)
        with pa.OSFile(f"{save_dir}/{dataset_name}_{split}.arrow", "wb") as sink:
            with pa.RecordBatchFileWriter(sink, table.schema) as writer:
                writer.write_table(table)


def path2rest_mimic_cxr(path, iid2captions, iid2chexpert, iid2split):
    name = path
    with open(path, "rb") as fp:
        binary = fp.read()
    captions = iid2captions[name]
    chexpert = iid2chexpert[name]
    split = iid2split[name]
    return [binary, captions, name, chexpert, split]


def make_arrow_mimic_cxr(data, dataset_name, save_dir):
    print(f"+ Pre-processing {dataset_name}...")
    iid2captions = defaultdict(list)
    iid2chexpert = defaultdict(list)
    iid2split = dict()

    for split, split_data in data.items():
        for sample in split_data:
            iid2captions[sample["img_path"]].extend(sample["texts"])
            iid2chexpert[sample["img_path"]].extend(sample["chexpert"])
            iid2split[sample["img_path"]] = split

    path = len(iid2captions)
    caption_paths = [path for path in iid2captions if os.path.exists(path)]
    print(f"+ {len(caption_paths)} images / {path} annotations")
    statistics(iid2captions, iid2split)
    import pdb
    pdb.set_trace()
    bs = [path2rest_mimic_cxr(path, iid2captions, iid2chexpert, iid2split) for path in tqdm(caption_paths)]

    for split in ["train", "val", "test"]:
        batches = [b for b in bs if b[-1] == split]
        dataframe = pd.DataFrame(batches, columns=["image", "caption", "image_id", "chexpert", "split"])
        table = pa.Table.from_pandas(dataframe)
        os.makedirs(save_dir, exist_ok=True)
        with pa.OSFile(f"{save_dir}/{dataset_name}_{split}.arrow", "wb") as sink:
            with pa.RecordBatchFileWriter(sink, table.schema) as writer:
                writer.write_table(table)


def get_score(occurences):
    return 1.0


def path2rest_vqa(path, split, annotations, label2ans):
    with open(path, "rb") as fp:
        binary = fp.read()

    iid = path
    _annotation = annotations[split][iid]
    _annotation = list(_annotation.items())
    qids, qas = [a[0] for a in _annotation], [a[1] for a in _annotation]
    questions = [qa[0] for qa in qas]
    answers = [qa[1] for qa in qas]
    answer_labels = [a["labels"] for a in answers]
    answer_scores = [a["scores"] for a in answers]
    question_types = [a["answer_type"] for a in answers]
    answers = [[label2ans[l] for l in al] for al in answer_labels]

    return [binary, questions, answers, answer_labels, answer_scores, iid, qids, question_types, split]


def make_arrow_vqa(data, dataset_name, save_dir):
    questions_train, questions_val, questions_test = data["train"], data["val"], data["test"]

    # Record Questions
    annotations = dict()
    for split, questions in zip(["train", "val", "test"], [questions_train, questions_val, questions_test]):
        _annotation = defaultdict(dict)
        for q in tqdm(questions):
            _annotation[q["img_path"]][q["qid"]] = [q["question"]]
        annotations[split] = _annotation

    # Construct Vocabulary
    all_major_answers = list()
    for split, questions in zip(["train", "val", "test"], [questions_train, questions_val, questions_test]):
        for q in tqdm(questions):
            all_major_answers.append(str(q["answer"]).lower())
    all_major_answers = [normalize_word(word) for word in tqdm(all_major_answers)]

    counter = {k: v for k, v in Counter(all_major_answers).items() if v >= 0}
    ans2label = {k: i for i, k in enumerate(counter.keys())}
    label2ans = list(counter.keys())

    # print("@@@@@@@@@@", ans2label, "$$$$$$$", label2ans, "&&&&&&&", counter)
    # later by hxj
    # save label2ans into json
    # import json
    # with open('/home/coder/projects/METER/data/vqa_rad/label2ans.json', 'w') as f:
    #     json.dump(label2ans, f)
    # print("Label size ({}): {}.".format(dataset_name, len(ans2label)))
    # print("########", len(label2ans))
    print("######把生成label2ans.json的文件注释了")

    # Record Answers
    for split, questions in zip(["train", "val", "test"], [questions_train, questions_val, questions_test]):
        _annotation = annotations[split]
        for q in tqdm(questions):
            answers = normalize_word(str(q["answer"]).lower())
            answer_count = {}
            answer_count[answers] = answer_count.get(answers, 0) + 1
            labels = []
            scores = []
            for answer in answer_count:
                assert answer in ans2label
                labels.append(ans2label[answer])
                score = get_score(answer_count[answer])
                scores.append(score)
            assert q['answer_type'].strip().lower() == "closed" or q['answer_type'].strip().lower() == "open"
            answer_type = 0 if q['answer_type'].strip().lower() == "closed" else 1
            _annotation[q["img_path"]][q["qid"]].append(
                {"labels": labels, "scores": scores, "answer_type": answer_type})

    # Write to the files
    for split in ["train", "val", "test"]:
        annot = annotations[split]
        annot_paths = [path for path in annot if os.path.exists(path)]
        print("######", len(annot_paths), len(annot))
        assert len(annot_paths) == len(annot) or len(annot_paths) == len(annot) - 1
        print("{} set: {} images, {} questions".format(split,
                                                       len(annot),
                                                       len([vv for k, v in annot.items() for kk, vv in v.items()])))

        bs = [
            path2rest_vqa(path, split, annotations, label2ans) for path in tqdm(annot_paths)
        ]
        dataframe = pd.DataFrame(
            bs,
            columns=[
                "image",
                "questions",
                "answers",
                "answer_labels",
                "answer_scores",
                "image_id",
                "question_id",
                "answer_type",
                "split",
            ],
        )

        # print("#########max", max(dataframe["answer_labels"]))

        # print("文件生成被我注释掉了")    
        table = pa.Table.from_pandas(dataframe)
        os.makedirs(save_dir, exist_ok=True)
        with pa.OSFile(f"{save_dir}/{dataset_name}_{split}.arrow", "wb") as sink:
            with pa.RecordBatchFileWriter(sink, table.schema) as writer:
                writer.write_table(table)


def make_arrow_vqa_ovqa(data, dataset_name, save_dir):
    questions_train, questions_val, questions_test = data["train"], data["val"], data["test"]

    # Construct Vocabulary
    all_major_answers = list()
    for split, questions in zip(["train", "val", "test"], [questions_train, questions_val, questions_test]):
        for q in tqdm(questions):
            all_major_answers.append(str(q["answer"]).lower())
    all_major_answers = [normalize_word(word) for word in tqdm(all_major_answers)]
    # 针对ovqa数据集的处理, 仅把标签数量大于或等于2的保留
    # 原始参数设置
    counter = {k: v for k, v in Counter(all_major_answers).items() if v >= 2}
    ans2label = {k: i for i, k in enumerate(counter.keys())}
    label2ans = list(counter.keys())

    # Record Questions
    annotations = dict()
    for split, questions in zip(["train", "val", "test"], [questions_train, questions_val, questions_test]):
        _annotation = defaultdict(dict)
        for q in tqdm(questions):
            answer_tmp = normalize_word(str(q["answer"]).lower())
            if answer_tmp not in ans2label:
                continue
            _annotation[q["img_path"]][q["qid"]] = [q["question"]]
        annotations[split] = _annotation

    print("@@@@@@@@@@", ans2label, "$$$$$$$", label2ans, "&&&&&&&", counter)
    # later by hxj
    # save label2ans into json
    # import json
    # with open('/home/coder/projects/METER/data/vqa_rad/label2ans.json', 'w') as f:
    #     json.dump(label2ans, f)
    # print("Label size ({}): {}.".format(dataset_name, len(ans2label)))
    # print("########", len(label2ans))
    print("######把生成label2ans.json的文件注释了")

    # Record Answers
    for split, questions in zip(["train", "val", "test"], [questions_train, questions_val, questions_test]):
        _annotation = annotations[split]
        for q in tqdm(questions):
            answers = normalize_word(str(q["answer"]).lower())
            if answers not in ans2label:
                continue
            # print("#######answers", answers)
            answer_count = {}
            answer_count[answers] = answer_count.get(answers, 0) + 1
            labels = []
            scores = []
            for answer in answer_count:
                assert answer in ans2label
                labels.append(ans2label[answer])
                score = get_score(answer_count[answer])
                scores.append(score)
            assert q['answer_type'].strip().lower() == "closed" or q['answer_type'].strip().lower() == "open"
            answer_type = 0 if q['answer_type'].strip().lower() == "closed" else 1
            _annotation[q["img_path"]][q["qid"]].append(
                {"labels": labels, "scores": scores, "answer_type": answer_type})

    # Write to the files
    for split, questions in zip(["train", "val", "test"], [questions_train, questions_val, questions_test]):
        annot = annotations[split]
        annot_paths = []
        for path, q in zip(annot, questions):
            answers = normalize_word(str(q["answer"]).lower())
            if answers not in ans2label:
                continue
            if os.path.exists(path):
                annot_paths.append(path)
        # annot_paths = [path for path in annot if os.path.exists(path)]
        print("######", len(annot_paths), len(annot))
        # assert len(annot_paths) == len(annot) or len(annot_paths) == len(annot) - 1
        print("{} set: {} images, {} questions".format(split,
                                                       len(annot),
                                                       len([vv for k, v in annot.items() for kk, vv in v.items()])))

        bs = [
            path2rest_vqa(path, split, annotations, label2ans) for path in tqdm(annot_paths)
        ]
        dataframe = pd.DataFrame(
            bs,
            columns=[
                "image",
                "questions",
                "answers",
                "answer_labels",
                "answer_scores",
                "image_id",
                "question_id",
                "answer_type",
                "split",
            ],
        )

        print("#########max", max(dataframe["answer_labels"]))
        print("文件生成被我注释掉了")    
        # table = pa.Table.from_pandas(dataframe)
        # os.makedirs(save_dir, exist_ok=True)
        # with pa.OSFile(f"{save_dir}/{dataset_name}_{split}.arrow", "wb") as sink:
        #     with pa.RecordBatchFileWriter(sink, table.schema) as writer:
        #         writer.write_table(table)


def path2rest_melinda(path, iid2captions, iid2i_meth, iid2p_meth, iid2i_meth_label, iid2p_meth_label, iid2split):
    name = path
    with open(path, "rb") as fp:
        binary = fp.read()
    captions = iid2captions[name]
    i_meth = iid2i_meth[name]
    p_meth = iid2p_meth[name]
    i_meth_label = iid2i_meth_label[name]
    p_meth_label = iid2p_meth_label[name]
    assert len(captions) == len(i_meth)
    assert len(captions) == len(p_meth)
    assert len(captions) == len(i_meth_label)
    assert len(captions) == len(p_meth_label)
    split = iid2split[name]
    return [binary, captions, name, i_meth, p_meth, i_meth_label, p_meth_label, split]


def make_arrow_melinda(data, dataset_name, save_dir):
    print(f"+ Pre-processing {dataset_name}...")
    iid2captions = defaultdict(list)
    iid2i_meth = defaultdict(list)
    iid2p_meth = defaultdict(list)
    iid2i_meth_label = defaultdict(list)
    iid2p_meth_label = defaultdict(list)
    iid2split = dict()

    for split, split_data in data.items():
        for sample in split_data:
            iid2captions[sample["img_path"]].extend(sample["texts"])
            iid2split[sample["img_path"]] = split
            iid2i_meth[sample["img_path"]].append(sample["i_meth"])
            iid2p_meth[sample["img_path"]].append(sample["p_meth"])
            iid2i_meth_label[sample["img_path"]].append(sample["i_meth_label"])
            iid2p_meth_label[sample["img_path"]].append(sample["p_meth_label"])

    i_meth_set = set([vv for k, v in iid2i_meth.items() for vv in v])
    i_meth_label_set = set([vv for k, v in iid2i_meth_label.items() for vv in v])
    p_meth_set = set([vv for k, v in iid2p_meth.items() for vv in v])
    p_meth_label_set = set([vv for k, v in iid2p_meth_label.items() for vv in v])

    i_meth_set = sorted(i_meth_set)
    i_meth_label_set = sorted(i_meth_label_set)
    p_meth_set = sorted(p_meth_set)
    p_meth_label_set = sorted(p_meth_label_set)

    i_meth_dict = {j: i for i, j in enumerate(i_meth_set)}
    p_meth_dict = {j: i for i, j in enumerate(p_meth_set)}
    i_meth_label_dict = {j: i for i, j in enumerate(i_meth_label_set)}
    p_meth_label_dict = {j: i for i, j in enumerate(p_meth_label_set)}

    iid2i_meth = {k: [i_meth_dict[vv] for vv in v] for k, v in iid2i_meth.items()}
    iid2p_meth = {k: [p_meth_dict[vv] for vv in v] for k, v in iid2p_meth.items()}
    iid2i_meth_label = {k: [i_meth_label_dict[vv] for vv in v] for k, v in iid2i_meth_label.items()}
    iid2p_meth_label = {k: [p_meth_label_dict[vv] for vv in v] for k, v in iid2p_meth_label.items()}

    path = len(iid2captions)
    caption_paths = [path for path in iid2captions if os.path.exists(path)]
    print(f"+ {len(caption_paths)} images / {path} annotations")
    statistics(iid2captions, iid2split)
    bs = [path2rest_melinda(path, iid2captions, iid2i_meth, iid2p_meth, iid2i_meth_label, iid2p_meth_label, iid2split)
          for path in tqdm(caption_paths)]

    for split in ["train", "val", "test"]:
        batches = [b for b in bs if b[-1] == split]
        dataframe = pd.DataFrame(batches, columns=["image", "caption", "image_id", "i_meth", "p_meth", "i_meth_label",
                                                   "p_meth_label", "split"])
        table = pa.Table.from_pandas(dataframe)
        os.makedirs(save_dir, exist_ok=True)
        with pa.OSFile(f"{save_dir}/{dataset_name}_{split}.arrow", "wb") as sink:
            with pa.RecordBatchFileWriter(sink, table.schema) as writer:
                writer.write_table(table)


def path2rest_chexpert(path, iid2captions, iid2chexpert, iid2split):
    name = path
    with open(path, "rb") as fp:
        binary = fp.read()
    captions = iid2captions[name]
    chexpert = iid2chexpert[name]
    split = iid2split[name]
    return [binary, captions, name, chexpert, split]


def make_arrow_chexpert(data, dataset_name, save_dir):
    print(f"+ Pre-processing {dataset_name}...")
    for split, split_data in data.items():
        iid2captions = defaultdict(list)
        iid2chexpert = defaultdict(list)
        iid2split = dict()

        for sample in split_data:
            iid2captions[sample["img_path"]].extend(sample["texts"])
            iid2chexpert[sample["img_path"]].extend(sample["chexpert"])
            iid2split[sample["img_path"]] = split

        path = len(iid2captions)
        caption_paths = [path for path in iid2captions if os.path.exists(path)]
        print(f"+ {len(caption_paths)} images / {path} annotations")
        bs = [path2rest_chexpert(path, iid2captions, iid2chexpert, iid2split) for path in tqdm(caption_paths)]

        batches = [b for b in bs if b[-1] == split]
        dataframe = pd.DataFrame(batches, columns=["image", "caption", "image_id", "chexpert", "split"])
        table = pa.Table.from_pandas(dataframe)
        os.makedirs(save_dir, exist_ok=True)
        with pa.OSFile(f"{save_dir}/{dataset_name}_{split}.arrow", "wb") as sink:
            with pa.RecordBatchFileWriter(sink, table.schema) as writer:
                writer.write_table(table)


def path2rest_pnsa_pneumonia(path, iid2captions, iid2pnsa_pneumonia, iid2split):
    name = path

    with open(path, "rb") as fp:
        binary = fp.read()
    captions = iid2captions[name]
    pnsa_pneumonia = iid2pnsa_pneumonia[name]
    split = iid2split[name]
    return [binary, captions, name, pnsa_pneumonia, split]


def make_arrow_pnsa_pneumonia(data, dataset_name, save_dir):
    print(f"+ Pre-processing {dataset_name}...")
    for split, split_data in data.items():
        iid2captions = defaultdict(list)
        iid2pnsa_pneumonia = defaultdict(list)
        iid2split = dict()

        for sample in split_data:
            iid2captions[sample["img_path"]].extend(sample["texts"])
            iid2pnsa_pneumonia[sample["img_path"]].extend(sample["pnsa_pneumonia"])
            iid2split[sample["img_path"]] = split

        path = len(iid2captions)
        caption_paths = [path for path in iid2captions if os.path.exists(path)]
        print(f"+ {len(caption_paths)} images / {path} annotations")
        bs = [path2rest_pnsa_pneumonia(path, iid2captions, iid2pnsa_pneumonia, iid2split) for path in
              tqdm(caption_paths)]

        batches = [b for b in bs if b[-1] == split]
        dataframe = pd.DataFrame(batches, columns=["image", "caption", "image_id", "pnsa_pneumonia", "split"])
        table = pa.Table.from_pandas(dataframe)
        os.makedirs(save_dir, exist_ok=True)
        with pa.OSFile(f"{save_dir}/{dataset_name}_{split}.arrow", "wb") as sink:
            with pa.RecordBatchFileWriter(sink, table.schema) as writer:
                writer.write_table(table)


def path2rest_clm_mimic_cxr(path, iid2captions, iid2findings, iid2impression, iid2chexpert, iid2split):
    name = path
    with open(path, "rb") as fp:
        binary = fp.read()
    captions = iid2captions[name]
    findings = iid2findings[name]
    impression = iid2impression[name]
    assert len(captions) == 1 and len(impression) == 1
    chexpert = iid2chexpert[name]
    split = iid2split[name]
    return [binary, captions, name, findings, impression, chexpert, split]


def make_arrow_clm_mimic_cxr(data, dataset_name, save_dir):
    print(f"+ Pre-processing {dataset_name}...")
    iid2captions = defaultdict(list)
    iid2findings = defaultdict(list)
    iid2impression = defaultdict(list)
    iid2chexpert = defaultdict(list)
    iid2split = dict()

    for split, split_data in data.items():
        for sample in split_data:
            iid2captions[sample["img_path"]].extend(sample["texts"])
            iid2findings[sample["img_path"]].extend(sample["findings"])
            iid2impression[sample["img_path"]].extend(sample["impression"])
            iid2chexpert[sample["img_path"]].extend(sample["chexpert"])
            iid2split[sample["img_path"]] = split

    path = len(iid2captions)
    caption_paths = [path for path in iid2captions if os.path.exists(path)]
    print(f"+ {len(caption_paths)} images / {path} annotations")
    statistics(iid2captions, iid2split)
    bs = [path2rest_clm_mimic_cxr(path, iid2captions, iid2findings, iid2impression, iid2chexpert, iid2split) for path in
          tqdm(caption_paths)]

    for split in ["train", "val", "test"]:
        batches = [b for b in bs if b[-1] == split]
        dataframe = pd.DataFrame(batches, columns=["image", "caption", "image_id", "findings", "impression",
                                                   "chexpert", "split"])
        table = pa.Table.from_pandas(dataframe)
        os.makedirs(save_dir, exist_ok=True)
        with pa.OSFile(f"{save_dir}/{dataset_name}_{split}.arrow", "wb") as sink:
            with pa.RecordBatchFileWriter(sink, table.schema) as writer:
                writer.write_table(table)


def path2rest_text_classification(guid, iid2text_a, iid2labels, iid2split):
    text_a = iid2text_a[guid]
    labels = iid2labels[guid]
    split = iid2split[guid]
    assert len(text_a) == 1
    return [text_a, guid, labels, split]


def make_arrow_text_classification(data, dataset_name, save_dir):
    print(f"+ Pre-processing {dataset_name}...")
    iid2text_a = defaultdict(list)
    iid2labels = dict()
    iid2split = dict()

    for split, split_data in data.items():
        for sample in split_data:
            iid2text_a[sample["guid"]].extend(sample["text_a"])
            iid2labels[sample["guid"]] = sample["label"]
            iid2split[sample["guid"]] = split

    statistics(iid2text_a, iid2split)
    bs = [path2rest_text_classification(guid, iid2text_a, iid2labels, iid2split) for guid in tqdm(iid2text_a)]

    for split in ["train", "val", "test"]:
        batches = [b for b in bs if b[-1] == split]
        dataframe = pd.DataFrame(batches, columns=["text_a", "guid", "label", "split"])
        table = pa.Table.from_pandas(dataframe)
        os.makedirs(save_dir, exist_ok=True)
        with pa.OSFile(f"{save_dir}/{dataset_name}_{split}.arrow", "wb") as sink:
            with pa.RecordBatchFileWriter(sink, table.schema) as writer:
                writer.write_table(table)


def path2rest_nli(guid, iid2text_a, iid2text_b, iid2text, iid2labels, iid2split):
    text_a = iid2text_a[guid]
    text_b = iid2text_b[guid]
    text = iid2text[guid]
    labels = iid2labels[guid]
    split = iid2split[guid]
    assert len(text_a) == 1
    assert len(text_b) == 1
    assert len(text) == 1
    return [text_a, text_b, text, guid, labels, split]


def make_arrow_text_nli(data, dataset_name, save_dir):
    print(f"+ Pre-processing {dataset_name}...")
    iid2text_a = defaultdict(list)
    iid2text_b = defaultdict(list)
    iid2text = defaultdict(list)
    iid2labels = dict()
    iid2split = dict()

    for split, split_data in data.items():
        for sample in split_data:
            iid2text_a[sample["guid"]].extend(sample["text_a"])
            iid2text_b[sample["guid"]].extend(sample["text_b"])
            iid2text[sample["guid"]].extend(sample["text"])
            iid2labels[sample["guid"]] = sample["label"]
            iid2split[sample["guid"]] = split

    statistics(iid2text_a, iid2split)
    statistics(iid2text_b, iid2split)
    statistics(iid2text, iid2split)
    bs = [path2rest_nli(guid, iid2text_a, iid2text_b, iid2text, iid2labels, iid2split) for guid in tqdm(iid2text_a)]

    for split in ["train", "val", "test"]:
        batches = [b for b in bs if b[-1] == split]
        dataframe = pd.DataFrame(batches, columns=["text_a", "text_b", "text", "guid", "label", "split"])
        table = pa.Table.from_pandas(dataframe)
        os.makedirs(save_dir, exist_ok=True)
        with pa.OSFile(f"{save_dir}/{dataset_name}_{split}.arrow", "wb") as sink:
            with pa.RecordBatchFileWriter(sink, table.schema) as writer:
                writer.write_table(table)

In [2]:
import re

contractions = {
    "aint": "ain't",
    "arent": "aren't",
    "cant": "can't",
    "couldve": "could've",
    "couldnt": "couldn't",
    "couldn'tve": "couldn't've",
    "couldnt've": "couldn't've",
    "didnt": "didn't",
    "doesnt": "doesn't",
    "dont": "don't",
    "hadnt": "hadn't",
    "hadnt've": "hadn't've",
    "hadn'tve": "hadn't've",
    "hasnt": "hasn't",
    "havent": "haven't",
    "hed": "he'd",
    "hed've": "he'd've",
    "he'dve": "he'd've",
    "hes": "he's",
    "howd": "how'd",
    "howll": "how'll",
    "hows": "how's",
    "Id've": "I'd've",
    "I'dve": "I'd've",
    "Im": "I'm",
    "Ive": "I've",
    "isnt": "isn't",
    "itd": "it'd",
    "itd've": "it'd've",
    "it'dve": "it'd've",
    "itll": "it'll",
    "let's": "let's",
    "maam": "ma'am",
    "mightnt": "mightn't",
    "mightnt've": "mightn't've",
    "mightn'tve": "mightn't've",
    "mightve": "might've",
    "mustnt": "mustn't",
    "mustve": "must've",
    "neednt": "needn't",
    "notve": "not've",
    "oclock": "o'clock",
    "oughtnt": "oughtn't",
    "ow's'at": "'ow's'at",
    "'ows'at": "'ow's'at",
    "'ow'sat": "'ow's'at",
    "shant": "shan't",
    "shed've": "she'd've",
    "she'dve": "she'd've",
    "she's": "she's",
    "shouldve": "should've",
    "shouldnt": "shouldn't",
    "shouldnt've": "shouldn't've",
    "shouldn'tve": "shouldn't've",
    "somebody'd": "somebodyd",
    "somebodyd've": "somebody'd've",
    "somebody'dve": "somebody'd've",
    "somebodyll": "somebody'll",
    "somebodys": "somebody's",
    "someoned": "someone'd",
    "someoned've": "someone'd've",
    "someone'dve": "someone'd've",
    "someonell": "someone'll",
    "someones": "someone's",
    "somethingd": "something'd",
    "somethingd've": "something'd've",
    "something'dve": "something'd've",
    "somethingll": "something'll",
    "thats": "that's",
    "thered": "there'd",
    "thered've": "there'd've",
    "there'dve": "there'd've",
    "therere": "there're",
    "theres": "there's",
    "theyd": "they'd",
    "theyd've": "they'd've",
    "they'dve": "they'd've",
    "theyll": "they'll",
    "theyre": "they're",
    "theyve": "they've",
    "twas": "'twas",
    "wasnt": "wasn't",
    "wed've": "we'd've",
    "we'dve": "we'd've",
    "weve": "we've",
    "werent": "weren't",
    "whatll": "what'll",
    "whatre": "what're",
    "whats": "what's",
    "whatve": "what've",
    "whens": "when's",
    "whered": "where'd",
    "wheres": "where's",
    "whereve": "where've",
    "whod": "who'd",
    "whod've": "who'd've",
    "who'dve": "who'd've",
    "wholl": "who'll",
    "whos": "who's",
    "whove": "who've",
    "whyll": "why'll",
    "whyre": "why're",
    "whys": "why's",
    "wont": "won't",
    "wouldve": "would've",
    "wouldnt": "wouldn't",
    "wouldnt've": "wouldn't've",
    "wouldn'tve": "wouldn't've",
    "yall": "y'all",
    "yall'll": "y'all'll",
    "y'allll": "y'all'll",
    "yall'd've": "y'all'd've",
    "y'alld've": "y'all'd've",
    "y'all'dve": "y'all'd've",
    "youd": "you'd",
    "youd've": "you'd've",
    "you'dve": "you'd've",
    "youll": "you'll",
    "youre": "you're",
    "youve": "you've",
}

manual_map = {
    "none": "0",
    "zero": "0",
    "one": "1",
    "two": "2",
    "three": "3",
    "four": "4",
    "five": "5",
    "six": "6",
    "seven": "7",
    "eight": "8",
    "nine": "9",
    "ten": "10",
}
articles = ["a", "an", "the"]
period_strip = re.compile("(?!<=\d)(\.)(?!\d)")
comma_strip = re.compile("(\d)(\,)(\d)")
punct = [
    ";",
    r"/",
    "[",
    "]",
    '"',
    "{",
    "}",
    "(",
    ")",
    "=",
    "+",
    "\\",
    "_",
    "-",
    ">",
    "<",
    "@",
    "`",
    ",",
    "?",
    "!",
]


def normalize_word(token):
    _token = token
    for p in punct:
        if (p + " " in token or " " + p in token) or (
                re.search(comma_strip, token) != None
        ):
            _token = _token.replace(p, "")
        else:
            _token = _token.replace(p, " ")
    token = period_strip.sub("", _token, re.UNICODE)

    _token = []
    temp = token.lower().split()
    for word in temp:
        word = manual_map.setdefault(word, word)
        if word not in articles:
            _token.append(word)
    for i, word in enumerate(_token):
        if word in contractions:
            _token[i] = contractions[word]
    token = " ".join(_token)
    token = token.replace(",", "")
    return token

In [6]:
import json
import os
import random
import re

import pandas as pd

# from make_arrow import make_arrow, make_arrow_vqa, make_arrow_melinda

def prepro_vqa_ovqa():
    random.seed(42)

    data = {
        "train": [],
        "val": [],
        "test": []
    }

    data_root = "/home/coder/projects/Med-VQA/data_OVQA/"
    image_root = f"{data_root}/img"

    for split in ["train", "val", "test"]:
        with open(f"{data_root}/{split}set.json", "r") as fp:
            samples = json.load(fp)
            for sample in samples:
                img_path = os.path.join(image_root, sample["image_name"])
                if sample["answer"].lower() == "yes" or sample["answer"].lower() == "no":
                    answer_type = "CLOSED"
                else:
                    answer_type = "OPEN"
                print("#######answer_type",  sample["answer"].lower(), answer_type)
                qid = sample["qid"]
                question = sample["question"]
                answer = sample["answer"]
                # answer_type = sample["answer_type"]
                data[split].append({
                    "img_path": img_path,
                    "qid": qid,
                    "question": question,
                    "answer": answer,
                    "answer_type": answer_type
                })
    make_arrow_vqa_ovqa(data, "vqa_ovqa", "data/vqa_ovqa/")


def prepro_robot_demo():
    random.seed(42)

    data = {
        "train": [],
        "val": [],
        "test": []
    }

    data_root = "/home/coder/projects/Med-VQA/data/"
    image_root = f"{data_root}/images"

    for split in ["train", "val"]:
        with open(f"{data_root}/test-val-train/{split}set.json", "r") as fp:
            samples = json.load(fp)
            for sample in samples:
                img_path = os.path.join(image_root, sample["image_name"])
                qid = sample["qid"]
                question = sample["question"]
                answer = sample["answer"]
                answer_type = sample["answer_type"]
                data[split].append({
                    "img_path": img_path,
                    "qid": qid,
                    "question": question,
                    "answer": answer,
                    "answer_type": answer_type
                })

    # for split in ["train", "val", "test"]:
    data["test"].append({
        "img_path": '/home/coder/projects/SystemDataset/robot/upload.jpg',
        "qid": 0,
        "question": 'Is there evidence of an aortic aneurysm?',
        "answer": 'Yes',
        "answer_type": 'CLOSED'
    })
    make_arrow_vqa(data, "vqa_vqa_rad", "data/finetune_arrows/")


def prepro_vqa_vqa_rad():
    random.seed(42)

    data = {
        "train": [],
        "val": [],
        "test": []
    }

    data_root = "/home/coder/projects/Med-VQA/data/"
    image_root = f"{data_root}/images"

    for split in ["train", "val", "test"]:
        with open(f"{data_root}/test-val-train/{split}set.json", "r") as fp:
            samples = json.load(fp)
            for sample in samples:
                img_path = os.path.join(image_root, sample["image_name"])
                qid = sample["qid"]
                question = sample["question"]
                answer = sample["answer"]
                answer_type = sample["answer_type"]
                data[split].append({
                    "img_path": img_path,
                    "qid": qid,
                    "question": question,
                    "answer": answer,
                    "answer_type": answer_type
                })
    make_arrow_vqa(data, "vqa_vqa_rad", "data/finetune_arrows/")


def prepro_vqa_vqa_rad_20231019():
    random.seed(42)

    data = {
        "train": [],
        "val": [],
        "test": []
    }

    data_root = "/home/coder/projects/Med-VQA/data/"
    image_root = f"{data_root}/images"

    for split in ["train", "val", "test"]:
        with open(f"{data_root}/test-val-train/{split}set.json", "r") as fp:
            samples = json.load(fp)
            for sample in samples:
                img_path = os.path.join(image_root, sample["image_name"])
                qid = sample["qid"]
                question = "{}#{}#{}#{}#{}".format(sample["question"], sample["image_name"], sample["image_organ"], sample["answer"], sample["answer_type"], )
                answer = sample["answer"]
                answer_type = sample["answer_type"]
                data[split].append({
                    "img_path": img_path,
                    "qid": qid,
                    "question": question,
                    "answer": answer,
                    "answer_type": answer_type
                })
    make_arrow_vqa(data, "vqa_vqa_rad", "data/finetune_arrows_20231019/")


def prepro_vqa_slack():
    random.seed(42)

    data = {
        "train": [],
        "val": [],
        "test": []
    }

    data_root = "/home/coder/projects/Med-VQA/data_SLAKE"
    image_root = f"{data_root}/images"

    # for split, file in zip(["train", "val", "test"], ["train.json", "validate.json", "test.json"]):
    #     with open(f"{data_root}/{file}", "r") as fp:
    for split in ["train", "val", "test"]:
        with open(f"{data_root}/test-val-train/{split}set.json", "r") as fp:
            samples = json.load(fp)
            for sample in samples:
                if sample["q_lang"] != "en":
                    continue
                img_path = os.path.join(image_root, sample["image_name"])
                qid = sample["qid"]
                question = sample["question"]
                answer = sample["answer"]
                answer_type = sample["answer_type"]
                data[split].append({
                    "img_path": img_path,
                    "qid": qid,
                    "question": question,
                    "answer": answer,
                    "answer_type": answer_type
                })
    print("######", data["train"][:5])
    make_arrow_vqa(data, "vqa_slack", "data/finetune_arrows/")


def prepro_vqa_path():
    random.seed(42)

    data = {
        "train": [],
        "val": [],
        "test": []
    }

    data_root = "/home/coder/projects/Med-VQA/data_PATH"
    image_root = f"{data_root}/images"

    # for split, file in zip(["train", "val", "test"], ["train.json", "validate.json", "test.json"]):
    #     with open(f"{data_root}/{file}", "r") as fp:
    for split in ["train", "val", "test"]:
        with open(f"{data_root}/test-val-train/{split}set.json", "r") as fp:
            samples = json.load(fp)
            for sample in samples:
                img_path = os.path.join(image_root, sample["image_name"])
                qid = sample["qid"]
                question = sample["question"]
                answer = sample["answer"]
                if sample["answer_type"] == "yes/no":
                    answer_type = "CLOSED"
                else:
                    answer_type = "OPEN"

                data[split].append({
                    "img_path": img_path,
                    "qid": qid,
                    "question": question,
                    "answer": answer,
                    "answer_type": answer_type
                })

    # print("######", data["train"][:10])
    # print("######", data["val"][:10])
    # print("######", data["test"][:10])
    make_arrow_vqa(data, "vqa_path", "/home/coder/projects/PTUnifier-share/data/finetune_arrows")


def prepro_vqa_medvqa2019():
    random.seed(42)

    data = {
        "train": [],
        "val": [],
        "test": []
    }

    data_root = "/home/coder/projects/MEVF/MICCAI19-MedVQA/data_Med/VQA-Med-2019"
    # image_root = "data/finetune_data/medvqa_2019/{}/images"
    image_root = "/home/coder/projects/MMBERT/VQA-Med-2019"
    offset = 0
    for split, imag_path in zip(["train", "val", "test"], ["ImageClef-2019-VQA-Med-Training/train_images", 
                                                "ImageClef-2019-VQA-Med-Validation/val_images",
                                                "ImageClef-2019-VQA-Med-Test/test_images"]):
        # samples = open(f"{data_root}/{split}/QA/Modality.csv").read().strip().split("\n") + \
        #           open(f"{data_root}/{split}/QA/Organ.csv").read().strip().split("\n") + \
        #           open(f"{data_root}/{split}/QA/Plane.csv").read().strip().split("\n")
        # samples = [[idx + offset] + question.split("|") for idx, question in enumerate(samples)]
        # offset += len(samples)
        with open(f"{data_root}/{split}set.json", "r") as fp:
            samples = json.load(fp)       
            for sample in samples:
                # {'image_name': 'synpic41148.jpg', 'question': 'which organ is captured by this ct scan?', 'answer': 'lung, mediastinum, pleura', 'mode': 'train', 
                # 'category': 'organ', 'qid': 500, 'answer_type': 'OPEN', 'question_type': 'ORGAN'}
                # img_path = os.path.join(image_root.format(split), sample[1] + ".jpg")
                print(sample)
                img_path = os.path.join(image_root, imag_path, sample['image_name'])
                qid = sample['qid']
                question = sample['question']
                answer = sample['answer']
                if sample['category'] == 'binary':
                    answer_type = "CLOSED"
                else:
                    answer_type = "OPEN"
                data[split].append({
                    "img_path": img_path,
                    "qid": qid,
                    "question": question,
                    "answer": answer,
                    "answer_type": answer_type
                })
    # print(data["train"][:5])
    make_arrow_vqa(data, "vqa_medvqa_2019", "/home/coder/projects/PTUnifier-share/data/finetune_arrows")


def prepro_cls_melinda():
    random.seed(42)

    data = {
        "train": [],
        "val": [],
        "test": []
    }

    data_root = "data/finetune_data/melinda"
    image_root = f"{data_root}/melinda_images"

    for split, file in zip(["train", "val", "test"], ["train.csv", "dev.csv", "test.csv"]):
        samples = pd.read_csv(f"{data_root}/{file}")
        for sample_idx, sample in samples.iterrows():

            img_path = os.path.join(image_root, sample["figure_file"])
            texts = [sample["caption"]]
            i_meth = sample["i_meth"]
            p_meth = sample["p_meth"]
            i_meth_label = sample["i_meth_label"]
            p_meth_label = sample["p_meth_label"]

            if len(texts) > 0:
                data[split].append({
                    "img_path": img_path,
                    "texts": texts,
                    "i_meth": i_meth,
                    "p_meth": p_meth,
                    "i_meth_label": i_meth_label,
                    "p_meth_label": p_meth_label
                })

    make_arrow_melinda(data, "cls_melinda", "data/finetune_arrows/")


def prepro_irtr_roco(min_length=3):
    random.seed(42)

    data = {
        "train": [],
        "val": [],
        "test": []
    }
    roco_data_root = "data/pretrain_data/roco"
    roco_image_root = "data/pretrain_data/roco/{}/radiology/images/"

    for split in ["train", "val", "test"]:
        with open(f"{roco_data_root}/{split}/radiology/captions.txt", "r") as fp:
            lines = fp.read().strip().split("\n")
            random.shuffle(lines)
            for line_idx, line in enumerate(lines):
                str_splits = line.strip().split('\t')
                if len(str_splits) == 2:
                    img_path = os.path.join(roco_image_root.format(split), str_splits[0] + ".jpg")
                    texts = [str_splits[1]]
                    texts = [re.sub(r"\s+", " ", text) for text in texts]
                    texts = [text for text in texts if len(text.split()) >= min_length]
                    if len(texts) > 0:
                        data[split].append({
                            "img_path": img_path,
                            "texts": texts
                        })
                        if split == "val" and len(data[split]) == 2000:
                            break
                        if split == "test" and len(data[split]) == 2000:
                            break
    make_arrow(data, "irtr_roco", "data/finetune_arrows/")


# if __name__ == '__main__':
# prepro_vqa_vqa_rad()
# prepro_vqa_slack()
# prepro_vqa_medvqa2019()
# prepro_vqa_path()
# prepro_robot_demo()
# prepro_vqa_vqa_rad_20231019()

# prepro_cls_melinda()
# prepro_irtr_roco()
prepro_vqa_ovqa()

#######answer_type ct OPEN
#######answer_type no CLOSED
#######answer_type no CLOSED
#######answer_type ct OPEN
#######answer_type oblique translucent line shadows are seen on the lateral plateau of the left tibia, the cortical bone of the lateral border is discontinuous, and there are no obvious bone abnormalities in the remaining bone of the knee joint and the middle and lower segments of the left tibia OPEN
#######answer_type no CLOSED
#######answer_type ct OPEN
#######answer_type comminuted fracture of bilateral distal radius OPEN
#######answer_type no CLOSED
#######answer_type ulna,elbow,and humerus. OPEN
#######answer_type fractures of the left olecranon and coronal process of the left ulna, the fracture is slightly displaced, with subluxation of the radial head OPEN
#######answer_type no CLOSED
#######answer_type ct OPEN
#######answer_type comminuted fracture of left distal radius, dorsal avulsion fracture of left scaphoid bone, left uncinate fracture, left ulnar styloid process

100%|██████████| 15216/15216 [00:00<00:00, 1163928.54it/s]
100%|██████████| 1902/1902 [00:00<00:00, 1399818.60it/s]
100%|██████████| 1902/1902 [00:00<00:00, 1493087.44it/s]
100%|██████████| 19020/19020 [00:00<00:00, 33563.81it/s]
100%|██████████| 15216/15216 [00:00<00:00, 34978.22it/s]
100%|██████████| 1902/1902 [00:00<00:00, 32675.39it/s]
100%|██████████| 1902/1902 [00:00<00:00, 32947.86it/s]


@@@@@@@@@@ {'ct': 0, 'no': 1, 'oblique translucent line shadows are seen on lateral plateau of left tibia cortical bone of lateral border is discontinuous and there are no obvious bone abnormalities in remaining bone of knee joint and middle and lower segments of left tibia': 2, 'comminuted fracture of bilateral distal radius': 3, 'ulna elbow and humerus': 4, 'fractures of left olecranon and coronal process of left ulna fracture is slightly displaced with subluxation of radial head': 5, 'comminuted fracture of left distal radius dorsal avulsion fracture of left scaphoid bone left uncinate fracture left ulnar styloid process avulsion fracture': 6, 'yes': 7, '0': 8, 'no abnormality': 9, 'ulna ulnar radioulnar and humerus': 10, 'leg': 11, 'old fracture of right tibia intercondylar carina': 12, 'comminuted fractures of base of second third and fourth metacarpal bones of left hand fractures of left wrist large and small polygonal bones and capitate bones': 13, 'p': 14, 'comminuted fracture 

100%|██████████| 15216/15216 [00:00<00:00, 31937.14it/s]
100%|██████████| 1902/1902 [00:00<00:00, 14162.30it/s]
100%|██████████| 1902/1902 [00:00<00:00, 29070.43it/s]


###### 1996 2000
train set: 2000 images, 15179 questions


100%|██████████| 1996/1996 [00:00<00:00, 4427.32it/s]


#########max [[499], [466], [7], [0], [7]]
文件生成被我注释掉了
###### 1212 1228
val set: 1228 images, 1886 questions


100%|██████████| 1212/1212 [00:00<00:00, 4720.43it/s]


#########max [[619]]
文件生成被我注释掉了
###### 1219 1230
test set: 1230 images, 1891 questions


100%|██████████| 1219/1219 [00:00<00:00, 10176.32it/s]

#########max [[633], [7]]
文件生成被我注释掉了





In [12]:
# import pyarrow as pa

# # 读取.arrow文件
# table = pa.Table.from_batch(pa.ipc.open_file('/home/coder/projects/PTUnifier-share/data/finetune_arrows/vqa_vqa_rad_test.arrow').read_all())

# # 将数据转换为Pandas DataFrame
# df = table.to_pandas()

import pyarrow as pa
import pandas as pd
import numpy as np

# # 打开.arrow文件并读取所有记录
# with pa.OSFile('/home/coder/projects/PTUnifier-share/data/finetune_arrows/vqa_vqa_rad_test.arrow') as f:
#     stream = pa.input_stream(f)
#     reader = pa.ipc.RecordBatchStreamReader(stream)
#     record_batch = reader.read_next_batch()

# # 从RecordBatch对象创建Table对象
# table = pa.Table.from_batches([record_batch])

# # 将Table对象转换为Pandas DataFrame
# df = table.to_pandas()

# print(max(df["answer_labels"]))
max_val = 0
ans_lt = []
for path in [
            # "/home/coder/projects/PTUnifier-share/data/finetune_arrows_2.0/vqa_medvqa_2019_test_my.arrow",
            # "/home/coder/projects/PTUnifier-share/data/finetune_arrows/vqa_medvqa_2019_train_my.arrow",
            # "/home/coder/projects/PTUnifier-share/data/finetune_arrows/vqa_medvqa_2019_val_my.arrow",
            # "/home/coder/projects/PTUnifier-share/data/finetune_arrows/vqa_vqa_rad_train.arrow",
            "/home/coder/projects/METER/data/finetune_arrows_20231019/vqa_vqa_rad_train.arrow", 
            "/home/coder/projects/METER/data/finetune_arrows_20231019/vqa_vqa_rad_test.arrow",
            "/home/coder/projects/METER/data/finetune_arrows_20231019/vqa_vqa_rad_val.arrow"
            ]:
    table = pa.ipc.RecordBatchFileReader(
        pa.memory_map(path, "r")
    ).read_all()

    pdtable = table.to_pandas()
    # print(pdtable.to_numpy() )
    # counter = {k: v for k, v in Counter(all_major_answers).items() if v >= 0}
    print("######", pdtable[:2], type(pdtable["image_id"][0]), pdtable["answer_labels"][0])
    for arr in pdtable["answer_labels"]:
        max_val = max(max_val, max(arr)) 
    for tmp in pdtable["answers"]:
        for tp in tmp:
            ans_lt.append(tp[0])

print(max_val)
counter_ans = {k:v for k, v in Counter(ans_lt).items()}
## test [497]



######                                                image  \
0  b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x02\x00...   
1  b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x02\x00...   

                                           questions  \
0  [Is/Are the liver normal?#synpic40596.jpg#ABD#...   
1  [In which lobes do you see a higher density of...   

                                             answers  \
0  [[no], [yes], [yes], [yes], [yes], [yes], [yes...   
1  [[upper lobes], [no], [no], [upper lobes], [no...   

                                       answer_labels  \
0  [[0], [7], [7], [7], [7], [7], [7], [7], [7], ...   
1                     [[1], [0], [0], [1], [0], [0]]   

                                       answer_scores  \
0  [[1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1....   
1         [[1.0], [1.0], [1.0], [1.0], [1.0], [1.0]]   

                                            image_id  \
0  /home/coder/projects/Med-VQA/data//images/synp...   
1  /home/coder/projects/Med-VQA/data

In [13]:
print(counter_ans)
print(len(counter_ans))

{'no': 1017, 'yes': 947, 'upper lobes': 4, 'volume loss': 4, 'right lung': 14, 'paratracheal area': 4, 'infection': 1, 'pa': 15, 'adenopathy': 1, 'right': 41, 'gray matter': 2, 'white matter': 7, 'white matter plaques': 2, '6.5 x 6.2 x 8.8cm': 2, 'right kidney': 4, 'descending colon': 4, 'adjacent to vertebrae': 1, 'with contrast': 7, 'right sylvian fissure': 4, 'subarachnoid': 8, 'in bowel': 1, 'abdomen and pelvis': 4, 'left apical pneumothorax': 4, 'coronal': 3, 'cystic': 5, 'prior surgery': 2, 'pulmonary nodules': 5, 'right superior cavoatrial junction': 4, 'bilateral': 10, 'gi': 4, 'ct': 13, 'atherosclerotic calcification': 4, 'cirrhosis': 2, 'splenule': 2, 'right convexity': 4, 'there is massive cerebral hemisphere edema': 2, 'small subdural hematoma with cerebral edema': 4, 'hyperintense': 6, 'lateral and third ventricular hydrocephalus': 4, 'pineal region': 4, 'left side': 4, 'axial': 43, 'nucleus pulposus': 2, 'cardiomegaly with pulmonary edema': 4, 'kidney cyst': 4, 'well circ

In [23]:
import random
import torch
import io
import pyarrow as pa
import os
from PIL import Image
data_dir = '/home/coder/projects/METER/data/vqa_rad'
names = ['vqa_train', 'vqa_val']
tables = [
    pa.ipc.RecordBatchFileReader(
        pa.memory_map(f"{data_dir}/{name}.arrow", "r")
    ).read_all()
    for name in names
    if os.path.isfile(f"{data_dir}/{name}.arrow")
]
for name in names:
    print(f"{data_dir}/{name}.arrow")
    if os.path.isfile('/home/coder/projects/METER/data/vqa_rad/vqa_train.arrow'):
        print("222222")
print(len(tables))
# /home/coder/projects/METER/data/vqa-rad/vqa_train.arrow

/home/coder/projects/METER/data/vqa_rad/vqa_train.arrow
/home/coder/projects/METER/data/vqa_rad/vqa_val.arrow
0


In [10]:
import json

with open("/home/coder/projects/METER/data/vqa_robot_demo/label2ans.json") as f:
    data = json.load(f)
    print(data[7])

KeyError: 7