In [1]:
import pandas as pd
import numpy as np
import _pickle as cPickle

class Dictionary(object):
    """Bidirectional word <-> id vocabulary used to tokenize questions.

    ``idx2word`` is a list whose position is the word id, and ``word2idx`` is
    the inverse mapping, so ``idx2word[word2idx[w]] == w`` for every word added
    through :meth:`add_word`.

    Args:
        word2idx: optional existing ``{word: id}`` mapping (default: empty dict).
        idx2word: optional existing list of words ordered by id (default: empty list).
    """

    def __init__(self, word2idx=None, idx2word=None):
        # Fresh containers per instance; never share mutable defaults.
        if word2idx is None:
            word2idx = {}
        if idx2word is None:
            idx2word = []
        self.word2idx = word2idx
        self.idx2word = idx2word

    @property
    def ntoken(self):
        """Number of distinct words in the vocabulary."""
        return len(self.word2idx)

    @property
    def padding_idx(self):
        """Id one past the last word id (``len(word2idx)``)."""
        return len(self.word2idx)

    def tokenize(self, sentence, add_word):
        """Normalize ``sentence`` and convert it to a list of word ids.

        The sentence is lower-cased, the question-type suffixes
        (``? -yes/no``, ``? -open``, ``? - open``) and punctuation are removed,
        ``'s`` is split into its own token and ``x ray`` becomes ``x-ray``.

        Args:
            sentence: raw question text.
            add_word: when true, unseen words are added to the vocabulary;
                when false, unseen words map to ``padding_idx - 1``.

        Returns:
            List of integer word ids, one per whitespace-separated token.
        """
        sentence = sentence.lower()
        # str.replace is a no-op when the marker is absent, so no membership test is needed.
        for marker in ("? -yes/no", "? -open", "? - open"):
            sentence = sentence.replace(marker, "")
        # The order of these replacements is significant and kept as-is.
        sentence = (
            sentence.replace(',', '')
            .replace('?', '')
            .replace('\'s', ' \'s')
            .replace('...', '')
            .replace('x ray', 'x-ray')
            .replace('.', '')
        )
        words = sentence.split()
        if add_word:
            return [self.add_word(w) for w in words]
        # If a word is not in the dictionary, it is replaced with the last word of the dictionary.
        unknown_idx = self.padding_idx - 1
        return [self.word2idx.get(w, unknown_idx) for w in words]

    def dump_to_file(self, path):
        """Pickle ``[word2idx, idx2word]`` to ``path`` and print a confirmation."""
        # Context manager guarantees the data is flushed and the handle closed.
        with open(path, 'wb') as f:
            cPickle.dump([self.word2idx, self.idx2word], f)
        print('dictionary dumped to %s' % path)

    @classmethod
    def load_from_file(cls, path):
        """Load a dictionary previously written by :meth:`dump_to_file`.

        NOTE: unpickling can execute arbitrary code; only load trusted files.
        """
        print('loading dictionary from %s' % path)
        with open(path, 'rb') as f:
            word2idx, idx2word = cPickle.load(f)
        d = cls(word2idx, idx2word)
        return d

    def add_word(self, word):
        """Add ``word`` if it is new and return its id.

        The id is the word's 0-based position in ``idx2word``. It must be
        ``len(idx2word) - 1`` after the append; any other offset breaks
        ``idx2word[word2idx[word]]`` and produces ids beyond ``padding_idx``.
        NOTE: pickles dumped with a different id offset should be regenerated.
        """
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)

def create_dictionary(dataroot):
    """Build a ``Dictionary`` from the ``question`` column of the training CSV.

    Args:
        dataroot: directory that contains ``traindf.csv``.

    Returns:
        A ``Dictionary`` holding every token seen in the questions.
    """
    # Imported locally: this cell's import block does not bring `os` into scope,
    # so relying on a later cell's import fails on a fresh kernel.
    import os

    dictionary = Dictionary()
    files = [
        'traindf.csv',
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        # Close the CSV handle as soon as pandas has consumed it.
        with open(question_path) as csv_file:
            qs = pd.read_csv(csv_file)
        for q in qs['question']:
            # add_word=True grows the vocabulary with every unseen token.
            dictionary.tokenize(q, True)
    return dictionary


# Build the question vocabulary for the dataset under `data_dir` and pickle it there.
# Alternative dataset root (currently disabled):
# med2019_dir = "/home/coder/projects/MMBERT/VQA-Med-2019/ImageClef-2019-VQA-Med-Training"
ovqa_dir = "/home/coder/projects/Med-VQA/data_OVQA"
data_dir = ovqa_dir
# create_dictionary reads traindf.csv from data_dir and tokenizes its 'question' column.
dic = create_dictionary(data_dir)
# Prints the full vocabulary list, ordered by insertion.
print(dic.idx2word)
# Writes [word2idx, idx2word] as a pickle next to the dataset.
dic.dump_to_file(data_dir + '/dictionary_test_hxj.pkl')




['which', 'organ', 'is', 'captured', 'by', 'this', 'ct', 'scan', 'what', 'system', 'primarily', 'present', 'in', 'image', 'imaged', 'the', 'mri', 'shows', 'shown', 'x-ray', 'one', 'seen', 'part', 'of', 'body', 'does', 'show', 'visualized', 'pictured', 'here', 'evaluated', 'systems', 'can', 'be', 'with', 'being', 'showing', 'principally', 'angiogram', 'displayed', 'ultrasound', 'gastrointestinal', 'pet', 'nuclear', 'medicine', 'abnormal', 'abnormality', 'primary', 'most', 'alarming', 'about', 'mammograph', 'look', 'normal', 'there', 'something', 'wrong', 'a', 'are', 'abnormalities', 'evidence', 'any', 'an', 'plane', 'taken', 'was', 'film', 'used', 'acquired', 'demonstrated', 'imaging', 'depicted', 'oriented', 'kind', 't1', 'weighted', 'type', 'modality', 'to', 'acquire', 'noncontrast', 'contrast', 'or', 'mr', 'weighting', 'represent', 'gi', 'given', 'patient', 'take', 'did', 'have', 'picture', 'how', 'iv', 'method', 't2', 'flair']
dictionary dumped to /home/coder/projects/MMBERT/VQA-Med

In [1]:
# Prints a literal "##" marker; it reads no variables and defines none.
print("##")

##



def create_glove_embedding_init(idx2word, glove_file):
    """Create an embedding matrix for ``idx2word`` from a GloVe text file.

    Each line of ``glove_file`` is ``<word> <v1> <v2> ...`` separated by single
    spaces; the embedding dimension is taken from the first line.

    Args:
        idx2word: sequence of words; row ``i`` of the result belongs to ``idx2word[i]``.
        glove_file: path to the GloVe vectors file.

    Returns:
        ``(weights, word2emb)`` where ``weights`` is a float32 array of shape
        ``(len(idx2word), emb_dim)`` (rows of words missing from the file stay
        zero) and ``word2emb`` maps every word in the file to its vector.
    """
    with open(glove_file, 'r') as handle:
        lines = handle.readlines()

    emb_dim = len(lines[0].split(' ')) - 1
    print('embedding dim is %d' % emb_dim)

    # Parse every line into word -> vector.
    word2emb = {}
    for line in lines:
        token, *numbers = line.split(' ')
        word2emb[token] = np.array([float(number) for number in numbers])

    # Copy the vectors of known words into their rows; unknown words keep zeros.
    weights = np.zeros((len(idx2word), emb_dim), dtype=np.float32)
    for row, token in enumerate(idx2word):
        if token in word2emb:
            weights[row] = word2emb[token]
    return weights, word2emb

In [62]:
import pandas as pd
import numpy as np
import json
import os


# Convert each split's CSV into a JSON list of records.
# For every split: rename the `img_id` column to `image_name`, append ".jpg"
# to the image name, and write the rows as a JSON array.
# Order (train, test, validation) matches the original cell, so `df` and `js`
# end up holding the validation split afterwards.
for csv_path, json_path in [
    ("/home/coder/projects/MMBERT/VQA-Med-2019/ImageClef-2019-VQA-Med-Training/traindf.csv",
     "/home/coder/projects/MEVF/MICCAI19-MedVQA/data_Med/VQA-Med-2019/trainset.json"),
    # /home/coder/projects/MMBERT/VQA-Med-2019/ImageClef-2019-VQA-Med-Test/testdf.csv
    ("/home/coder/projects/MMBERT/VQA-Med-2019/ImageClef-2019-VQA-Med-Test/testdf.csv",
     "/home/coder/projects/MEVF/MICCAI19-MedVQA/data_Med/VQA-Med-2019/testset.json"),
    ("/home/coder/projects/MMBERT/VQA-Med-2019/ImageClef-2019-VQA-Med-Validation/valdf.csv",
     "/home/coder/projects/MEVF/MICCAI19-MedVQA/data_Med/VQA-Med-2019/valset.json"),
]:
    # Close the CSV handle as soon as pandas has read it.
    with open(csv_path) as csv_file:
        df = pd.read_csv(csv_file)

    df.columns = df.columns.str.replace('img_id', 'image_name')
    df["image_name"] = df["image_name"] + ".jpg"
    js = df.to_json(orient="records", force_ascii=False)

    # json.dump escapes non-ASCII by default, so the bytes written are the same
    # with or without an explicit encoding; utf-8 is used for all splits for consistency.
    with open(json_path, 'w', encoding='utf-8') as json_file:
        json.dump(json.loads(js), json_file)



def create_jsons():
    """Add the full image path as ``qid`` to every record of each split, in place.

    Reads trainset/valset/testset JSON, drops records whose image file does not
    exist in the split's image directory, stores the existing image path under
    ``'qid'`` for the rest, and overwrites the same three JSON files.
    """
    # (json file, directory holding that split's images)
    splits = [
        ("/home/coder/projects/MEVF/MICCAI19-MedVQA/data_Med/VQA-Med-2019/trainset.json",
         "/home/coder/projects/MMBERT/VQA-Med-2019/ImageClef-2019-VQA-Med-Training/train_images"),
        ("/home/coder/projects/MEVF/MICCAI19-MedVQA/data_Med/VQA-Med-2019/valset.json",
         "/home/coder/projects/MMBERT/VQA-Med-2019/ImageClef-2019-VQA-Med-Validation/val_images"),
        ("/home/coder/projects/MEVF/MICCAI19-MedVQA/data_Med/VQA-Med-2019/testset.json",
         "/home/coder/projects/MMBERT/VQA-Med-2019/ImageClef-2019-VQA-Med-Test/test_images"),
    ]

    def _with_existing_images(records, image_dir):
        """Return the records whose image exists, each tagged with its path as 'qid'."""
        kept = []
        for record in records:
            # Path check is done inline so this function does not depend on a
            # helper defined in a later notebook cell.
            image_path = os.path.join(image_dir, str(record['image_name']))
            if os.path.exists(image_path):
                record['qid'] = image_path
                kept.append(record)
            # `del` on the loop variable would not remove the record from the
            # list; records with missing images are dropped by not keeping them.
        return kept

    # Phase 1: read and filter every split before anything is overwritten, so a
    # failure while loading leaves all three files untouched.
    filtered = []
    for json_path, image_dir in splits:
        with open(json_path) as f:
            records = json.load(f)
        filtered.append((json_path, _with_existing_images(records, image_dir)))

    # Phase 2: write the filtered records back over the original files.
    for json_path, records in filtered:
        with open(json_path, 'w') as f:
            json.dump(records, f)

create_jsons()

In [59]:

def _imgpath(img_dir, name):
    # print(name)
    img_path = os.path.join(img_dir, str(name))
    if not os.path.exists(img_path):
        return "nofile"
    return img_path

# Load the converted training records; the context manager closes the file handle.
with open("/home/coder/projects/MEVF/MICCAI19-MedVQA/data_Med/VQA-Med-2019/trainset.json") as f:
    js_test = json.load(f)
print(type(js_test))

# Show the first record before any modification.
for js in js_test:
    print(js)
    break

train_image_dir = "/home/coder/projects/MMBERT/VQA-Med-2019/ImageClef-2019-VQA-Med-Training/train_images"
# js_test = list(map(lambda x:  imgpath(train_image_dir, x['image_name'])), js_test)
# Store the resolved image path (or "nofile") under 'id' on every in-memory record.
for js in js_test:
    js['id'] = _imgpath(train_image_dir, js['image_name'])
# list(map(lambda x: print(x['image_name']), js_test))

# print(js_test)
# Show the first record again, now carrying the 'id' key.
for js in js_test:
    print(js)
    break

<class 'list'>
{'image_name': 'synpic41148.jpg', 'question': 'which organ is captured by this ct scan?', 'answer': 'lung, mediastinum, pleura', 'mode': 'train', 'category': 'organ'}
{'image_name': 'synpic41148.jpg', 'question': 'which organ is captured by this ct scan?', 'answer': 'lung, mediastinum, pleura', 'mode': 'train', 'category': 'organ', 'id': '/home/coder/projects/MMBERT/VQA-Med-2019/ImageClef-2019-VQA-Med-Training/train_images/synpic41148.jpg'}


In [None]:
import pickle

# Open the .pkl file and load its contents into a Python object.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('/home/coder/projects/MEVF/MICCAI19-MedVQA/data_RAD/images224x224.pkl', 'rb') as f:
    data = pickle.load(f)

# Inspect the loaded data.
# NOTE(review): `.shape` assumes the pickle holds an array-like object — confirm.
print(data.shape)
print(type(data))

: 

In [None]:

def create_jsons():
    """Add the full image path as ``id`` to every record of each split, in place.

    Reads trainset/valset/testset JSON, drops records whose image file does not
    exist in the split's image directory, stores the existing image path under
    ``'id'`` for the rest, and overwrites the same three JSON files.
    """
    # (json file, directory holding that split's images).
    # The test split must be read from testset.json: reading valset.json here
    # would overwrite the test set with validation records on write-back.
    splits = [
        ("/home/coder/projects/MEVF/MICCAI19-MedVQA/data_Med/VQA-Med-2019/trainset.json",
         "/home/coder/projects/MMBERT/VQA-Med-2019/ImageClef-2019-VQA-Med-Training/train_images"),
        ("/home/coder/projects/MEVF/MICCAI19-MedVQA/data_Med/VQA-Med-2019/valset.json",
         "/home/coder/projects/MMBERT/VQA-Med-2019/ImageClef-2019-VQA-Med-Validation/val_images"),
        ("/home/coder/projects/MEVF/MICCAI19-MedVQA/data_Med/VQA-Med-2019/testset.json",
         "/home/coder/projects/MMBERT/VQA-Med-2019/ImageClef-2019-VQA-Med-Test/test_images"),
    ]

    def _with_existing_images(records, image_dir):
        """Return the records whose image exists, each tagged with its path as 'id'."""
        kept = []
        for record in records:
            image_path = os.path.join(image_dir, str(record['image_name']))
            if os.path.exists(image_path):
                record['id'] = image_path
                kept.append(record)
            # `del` on the loop variable would not remove the record from the
            # list; records with missing images are dropped by not keeping them.
        return kept

    # Phase 1: read and filter every split before anything is overwritten, so a
    # failure while loading leaves all three files untouched.
    filtered = []
    for json_path, image_dir in splits:
        with open(json_path) as f:
            records = json.load(f)
        filtered.append((json_path, _with_existing_images(records, image_dir)))

    # Phase 2: write the filtered records back over the original files.
    for json_path, records in filtered:
        with open(json_path, 'w') as f:
            json.dump(records, f)

create_jsons()

    ### drop files that don't exist: for some names in csv files, the actual image does not exist
    # train_df = train_df[train_df['name'] != "nofile"]
    # validation_df = validation_df[validation_df['name'] != "nofile"]
    # test_df = test_df[test_df['name'] != "nofile"]

    ### drop zero bytes images
    # train_df['imagesize'] = train_df['name'].apply(lambda x: _imgsize(x))
    # validation_df['imagesize'] = validation_df['name'].apply(lambda x: _imgsize(x))
    # test_df['imagesize'] = test_df['name'].apply(lambda x: _imgsize(x))

    # train_df = train_df[train_df['imagesize'] != 0]
    # validation_df = validation_df[validation_df['imagesize'] != 0]
    # test_df = test_df[test_df['imagesize'] != 0]

    # train_df.drop(columns=['imagesize'], inplace=True)
    # validation_df.drop(columns=['imagesize'], inplace=True)
    # test_df.drop(columns=['imagesize'], inplace=True)

    # train_df.rename(columns={"name": "image_path"}, inplace=True)
    # validation_df.rename(columns={"name": "image_path"}, inplace=True)
    # test_df.rename(columns={"name": "image_path"}, inplace=True)

    # ## convert to dict
    # train_dict = train_df.to_dict('index')
    # validation_dict = validation_df.to_dict('index')
    # test_dict = test_df.to_dict('index')

    # del [[train_df, validation_df, test_df]]
    # gc.collect()

    # # Dump to json
    # ## train
    # logger.info("Dumping json data for train dataset...")
    # with open(os.path.join(jsonpath, "train_dataset.json"), 'w') as f:
    #     for row in tqdm(train_dict):
    #         json.dump(train_dict[row], f)
    #         f.write("\n")
    # ## validation
    # logger.info("Dumping json data for validation dataset...")
    # with open(os.path.join(jsonpath, "validation_dataset.json"), 'w') as f:
    #     for row in tqdm(validation_dict):
    #         json.dump(validation_dict[row], f)
    #         f.write("\n")
    # ## test
    # logger.info("Dumping json data for test dataset...")
    # with open(os.path.join(jsonpath, "test_dataset.json"), 'w') as f:
    #     for row in tqdm(test_dict):
    #         json.dump(test_dict[row], f)
    #         f.write("\n")

import logging

# No visible cell defines `logger`; reuse one if an earlier cell created it,
# otherwise fall back to a module logger so this line cannot raise NameError.
logger = globals().get("logger") or logging.getLogger(__name__)
logger.info("Jsons are successfully created!")

In [None]:
# Map each qid to an integer index and save the mapping as a JSON file
import json

# Load the three split files; context managers close each handle right after parsing.
with open("/home/coder/projects/MEVF/MICCAI19-MedVQA/data_Med/VQA-Med-2019/trainset.json") as f:
    train_js = json.load(f)
with open("/home/coder/projects/MEVF/MICCAI19-MedVQA/data_Med/VQA-Med-2019/valset.json") as f:
    validation_js = json.load(f)
with open("/home/coder/projects/MEVF/MICCAI19-MedVQA/data_Med/VQA-Med-2019/testset.json") as f:
    test_js = json.load(f)

# index = 0
# pid2idx_js = {}
# for type_js in [train_js, validation_js, test_js]:
#     for js in type_js:
#         ky = js['image_name']
#         if ky in pid2idx_js:
#             pass
#         else:   
#             pid2idx_js[ky] = index
#             index += 1

# # print(pid2idx_js)

# json.dump(pid2idx_js, open("/home/coder/projects/MEVF/MICCAI19-MedVQA/data_Med/VQA-Med-2019/imgid2idx.json", 'w'))


# Assign consecutive integer indices (starting at 0) to each distinct 'qid',
# in first-seen order across the train, validation and test records.
index = 0
pid2idx_js = {}
for type_js in [train_js, validation_js, test_js]:
    for js in type_js:
        ky = js['qid']
        # Only the first occurrence of a qid gets a new index.
        if ky not in pid2idx_js:
            pid2idx_js[ky] = index
            index += 1

# print(pid2idx_js)

# Persist the mapping; the context manager flushes and closes the file.
with open("/home/coder/projects/MEVF/MICCAI19-MedVQA/data_Med/VQA-Med-2019/imgid2idx_path_test.json", 'w') as f:
    json.dump(pid2idx_js, f)

: 

In [7]:
# Convert the images to arrays and save them as a pickle (images224x224.pkl)
# Output shape per image: 224x224x3
from torchvision import transforms, models
import cv2

from PIL import Image
import numpy as np
import pickle
import json



# Torchvision pipeline kept from the original cell.
# NOTE: `tfm` is not applied anywhere below in this cell.
tfm = transforms.Compose([transforms.ToPILImage(),  
                            transforms.Resize([224, 224]),
                            transforms.ToTensor(), 
                            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# Mapping of image path -> row index in the output array.
with open("/home/coder/projects/MEVF/MICCAI19-MedVQA/data_Med/VQA-Med-2019/imgid2idx_path.json") as f:
    all_js = json.load(f)

# Built once: the target size is the same for every image.
transform = transforms.Resize((224, 224))

image_np = np.empty((len(all_js), 224, 224, 3))
# `idx` (not `id`) so the builtin id() is not shadowed for the rest of the kernel session.
for path, idx in all_js.items():
    # img = cv2.imread(path)
    # img = tfm(img)

    # Read the image as 3-channel RGB; the context manager closes the file promptly.
    with Image.open(path) as raw:
        img = raw.convert('RGB')
    img = transform(img)
    # Convert to an array and scale the 8-bit pixel values by 1/255.
    img_array = np.asarray(img) / 255
    img_array = np.reshape(img_array, (224, 224, 3))

    image_np[idx] = img_array

print(image_np.shape)
with open('/home/coder/projects/MEVF/MICCAI19-MedVQA/data_Med/VQA-Med-2019/images224x224.pkl', 'wb') as f:
    pickle.dump(image_np, f)


(4200, 224, 224, 3)


In [9]:
# Convert the images to grayscale arrays and save them as a pickle (images84x84.pkl)
from torchvision import transforms, models
import cv2

from PIL import Image
import numpy as np
import pickle



# Torchvision pipeline kept from the original cell.
# NOTE: `tfm` is not applied anywhere below in this cell.
tfm = transforms.Compose([transforms.ToPILImage(),  
                            transforms.Resize([84, 84]),
                            transforms.ToTensor(), 
                            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# Mapping of image path -> row index in the output array.
with open("/home/coder/projects/MEVF/MICCAI19-MedVQA/data_Med/VQA-Med-2019/imgid2idx_path.json") as f:
    all_js = json.load(f)

# Built once: the target size is the same for every image.
transform = transforms.Resize((84, 84))

image_np = np.empty((len(all_js), 84, 84, 1))
# `idx` (not `id`) so the builtin id() is not shadowed for the rest of the kernel session.
for path, idx in all_js.items():
    # img = cv2.imread(path)
    # img = tfm(img)

    # Read the image and convert it to single-channel grayscale ('L');
    # the context manager closes the file promptly.
    with Image.open(path) as raw:
        img = raw.convert('L')
    img = transform(img)
    # Convert to an array, scale the 8-bit pixel values by 1/255,
    # and add the trailing channel axis.
    img_array = np.asarray(img) / 255
    img_array = np.reshape(img_array, (84, 84, 1))

    image_np[idx] = img_array

print(image_np.shape)
with open('/home/coder/projects/MEVF/MICCAI19-MedVQA/data_Med/VQA-Med-2019/images84x84.pkl', 'wb') as f:
    pickle.dump(image_np, f)


(4200, 84, 84, 1)


In [None]:

import pickle

# Load the pickled object from the file and print it.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('/home/coder/projects/Med-VQA/data_SLAKE/dictionary.pkl', 'rb') as f:
    my_obj = pickle.load(f)
    print(my_obj)

In [2]:
import json
# Collect the distinct "question_type" values that occur in the training records.
st = set()
with open('/home/coder/projects/SystemDataset/data_OVQA_as_RAD/trainset.json', 'rb') as f:
    train_js = json.load(f)
    # Each record is indexed by "question_type"; a record without that key raises KeyError.
    for js in train_js:
        st.add(js["question_type"])
print(st)

{'CLOSED', 'OPEN'}


In [7]:
import torch
# Toy check of Tensor.scatter_: build a length-5 target with a score placed at a label index.
labels = torch.tensor([3], dtype=torch.int64 )
scores = torch.tensor([1], dtype=torch.int64 )
target = torch.zeros(5, dtype=torch.int64 )
# `labels` is always a tensor here, so this guard is always true in this cell.
if labels is not None:
    # print("#####", target.shape, labels, scores)
    # In place, along dim 0: target[labels[i]] = scores[i].
    target.scatter_(0, labels, scores)

print(target)

tensor([0, 0, 0, 1, 0])


In [None]:
# Convert the 'labels' entry of `answer` to a NumPy array for display.
# NOTE(review): `answer` is not defined in any visible cell — confirm where it comes from.
np.array(answer['labels'])