In [6]:

import os
import pandas as pd
import numpy as np
import _pickle as cPickle
import sys
import json
class Dictionary(object):
    def __init__(self, word2idx=None, idx2word=None):
        if word2idx is None:
            word2idx = {}
        if idx2word is None:
            idx2word = []
        self.word2idx = word2idx
        self.idx2word = idx2word

    @property
    def ntoken(self):
        return len(self.word2idx)

    @property
    def padding_idx(self):
        return len(self.word2idx)

    def tokenize(self, sentence, add_word):
        sentence = sentence.lower()
        if "? -yes/no" in sentence:
            sentence = sentence.replace("? -yes/no", "")
        if "? -open" in sentence:
            sentence = sentence.replace("? -open", "")
        if "? - open" in sentence:
            sentence = sentence.replace("? - open", "")
        sentence = sentence.replace(',', '').replace('?', '').replace('\'s', ' \'s').replace('...', '').replace('x ray', 'x-ray').replace('.', '')
        words = sentence.split()
        tokens = []
        if add_word:
            for w in words:
                tokens.append(self.add_word(w))
        else:
            for w in words:
                # if a word is not in dictionary, it will be replaced with the last word of dictionary.
                tokens.append(self.word2idx.get(w, self.padding_idx-1))
        return tokens

    def dump_to_file(self, path):
        cPickle.dump([self.word2idx, self.idx2word], open(path, 'wb'))
        print('dictionary dumped to %s' % path)

    @classmethod
    def load_from_file(cls, path):
        print('loading dictionary from %s' % path)
        word2idx, idx2word = cPickle.load(open(path, 'rb'))
        d = cls(word2idx, idx2word)
        return d

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


def create_dictionary(dataroot):
    dictionary = Dictionary()
    questions = []
    files = [
        'trainset.json', #train dataset
        'valset.json' #validate dateset
    ]
    for path in files:
        qa_pairs = os.path.join(dataroot, path)
        print("processing the {}".format(path))
        with open(qa_pairs) as f:
            data_js = json.load(f)
            for item in data_js:
                dictionary.tokenize(item['question'], True)     #row[0]: id , row[1]: question , row[2]: answer
    return dictionary

def create_glove_embedding_init(idx2word, glove_file):
    word2emb = {}
    with open(glove_file, 'r') as f:
        entries = f.readlines()
    emb_dim = len(entries[0].split(' ')) - 1
    print('embedding dim is %d' % emb_dim)
    weights = np.zeros((len(idx2word), emb_dim), dtype=np.float32)

    for entry in entries:
        vals = entry.split(' ')
        word = vals[0]
        vals = list(map(float, vals[1:]))
        word2emb[word] = np.array(vals)
    for idx, word in enumerate(idx2word):
        if word not in word2emb:
            continue
        weights[idx] = word2emb[word]
    return weights, word2emb

In [8]:
# 替换了dictionary.pkl用这个生成/home/coder/projects/MEVF/MICCAI19-MedVQA/tools/create_dictionary.py


data_dir = "/home/coder/projects/Med-VQA/data_OVQA"
d = create_dictionary(data_dir)
d.dump_to_file(data_dir + '/dictionary.pkl')

d = Dictionary.load_from_file(data_dir + '/dictionary.pkl')
emb_dim = 300
glove_file = data_dir + '/glove/glove.6B.%dd.txt' % emb_dim
weights, word2emb = create_glove_embedding_init(d.idx2word, glove_file)
np.save(data_dir + '/glove6b_init_%dd.npy' % emb_dim, weights)

processing the trainset.json
processing the valset.json
dictionary dumped to /home/coder/projects/Med-VQA/data_OVQA/dictionary.pkl
loading dictionary from /home/coder/projects/Med-VQA/data_OVQA/dictionary.pkl
embedding dim is 300


In [None]:
import pandas as pd
import os
import sys
import json
import numpy as np
import re
import _pickle as cPickle

contractions = {
    "aint": "ain't", "arent": "aren't", "cant": "can't", "couldve":
    "could've", "couldnt": "couldn't", "couldn'tve": "couldn't've",
    "couldnt've": "couldn't've", "didnt": "didn't", "doesnt":
    "doesn't", "dont": "don't", "hadnt": "hadn't", "hadnt've":
    "hadn't've", "hadn'tve": "hadn't've", "hasnt": "hasn't", "havent":
    "haven't", "hed": "he'd", "hed've": "he'd've", "he'dve":
    "he'd've", "hes": "he's", "howd": "how'd", "howll": "how'll",
    "hows": "how's", "Id've": "I'd've", "I'dve": "I'd've", "Im":
    "I'm", "Ive": "I've", "isnt": "isn't", "itd": "it'd", "itd've":
    "it'd've", "it'dve": "it'd've", "itll": "it'll", "let's": "let's",
    "maam": "ma'am", "mightnt": "mightn't", "mightnt've":
    "mightn't've", "mightn'tve": "mightn't've", "mightve": "might've",
    "mustnt": "mustn't", "mustve": "must've", "neednt": "needn't",
    "notve": "not've", "oclock": "o'clock", "oughtnt": "oughtn't",
    "ow's'at": "'ow's'at", "'ows'at": "'ow's'at", "'ow'sat":
    "'ow's'at", "shant": "shan't", "shed've": "she'd've", "she'dve":
    "she'd've", "she's": "she's", "shouldve": "should've", "shouldnt":
    "shouldn't", "shouldnt've": "shouldn't've", "shouldn'tve":
    "shouldn't've", "somebody'd": "somebodyd", "somebodyd've":
    "somebody'd've", "somebody'dve": "somebody'd've", "somebodyll":
    "somebody'll", "somebodys": "somebody's", "someoned": "someone'd",
    "someoned've": "someone'd've", "someone'dve": "someone'd've",
    "someonell": "someone'll", "someones": "someone's", "somethingd":
    "something'd", "somethingd've": "something'd've", "something'dve":
    "something'd've", "somethingll": "something'll", "thats":
    "that's", "thered": "there'd", "thered've": "there'd've",
    "there'dve": "there'd've", "therere": "there're", "theres":
    "there's", "theyd": "they'd", "theyd've": "they'd've", "they'dve":
    "they'd've", "theyll": "they'll", "theyre": "they're", "theyve":
    "they've", "twas": "'twas", "wasnt": "wasn't", "wed've":
    "we'd've", "we'dve": "we'd've", "weve": "we've", "werent":
    "weren't", "whatll": "what'll", "whatre": "what're", "whats":
    "what's", "whatve": "what've", "whens": "when's", "whered":
    "where'd", "wheres": "where's", "whereve": "where've", "whod":
    "who'd", "whod've": "who'd've", "who'dve": "who'd've", "wholl":
    "who'll", "whos": "who's", "whove": "who've", "whyll": "why'll",
    "whyre": "why're", "whys": "why's", "wont": "won't", "wouldve":
    "would've", "wouldnt": "wouldn't", "wouldnt've": "wouldn't've",
    "wouldn'tve": "wouldn't've", "yall": "y'all", "yall'll":
    "y'all'll", "y'allll": "y'all'll", "yall'd've": "y'all'd've",
    "y'alld've": "y'all'd've", "y'all'dve": "y'all'd've", "youd":
    "you'd", "youd've": "you'd've", "you'dve": "you'd've", "youll":
    "you'll", "youre": "you're", "youve": "you've"
}
manual_map = { 'none': '0',
              'zero': '0',
              'one': '1',
              'two': '2',
              'three': '3',
              'four': '4',
              'five': '5',
              'six': '6',
              'seven': '7',
              'eight': '8',
               'nine': '9',
              'ten': '10'}
articles = ['a', 'an', 'the']
period_strip = re.compile("(?!<=\d)(\.)(?!\d)")
comma_strip = re.compile("(\d)(\,)(\d)")
punct = [';', r"/", '[', ']', '"', '{', '}',
                '(', ')', '=', '+', '\\', '_', '-',
                '>', '<', '@', '`', ',', '?', '!']

def process_punctuation(inText):
    outText = inText
    for p in punct:
        if (p + ' ' in inText or ' ' + p in inText) \
           or (re.search(comma_strip, inText) != None):
            outText = outText.replace(p, '')
        else:
            outText = outText.replace(p, ' ')
    outText = period_strip.sub("", outText, re.UNICODE)
    return outText

def process_digit_article(inText):
    outText = []
    tempText = inText.lower().split()
    for word in tempText:
        word = manual_map.setdefault(word, word)
        if word not in articles:
            outText.append(word)
        else:
            pass
    for wordId, word in enumerate(outText):
        if word in contractions:
            outText[wordId] = contractions[word]
    outText = ' '.join(outText)
    return outText

def preprocess_answer(answer):
    answer = str(answer)
    answer = process_digit_article(process_punctuation(answer))
    answer = answer.replace(',', '').replace('x ray', 'xray')
    return answer

def filter_answers(qa_pairs, min_occurence):
    """This will change the answer to preprocessed version
    """
    occurence = {}

    for id, row in qa_pairs.iterrows(): # row:[id,ques,ans]
        gtruth = row['answer']
        gtruth = ' '.join(gtruth.split())
        # gtruth = preprocess_answer(gtruth)
        if gtruth not in occurence:
            occurence[gtruth] = set()
        occurence[gtruth].add(row['question'])
    for answer in list(occurence):
        if len(occurence[answer]) < min_occurence:
            occurence.pop(answer)

    print('Num of answers that appear >= %d times: %d' % (
        min_occurence, len(occurence)))
    return occurence

def create_ans2label(occurence,root='data'):
    """Note that this will also create label2ans.pkl at the same time

    occurence: dict {answer -> whatever}
    name: prefix of the output file
    cache_root: str
    """
    ans2label = {}
    label2ans = []
    label = 0
    for answer in occurence:
        label2ans.append(answer)
        ans2label[answer] = label
        label += 1

    print('ans2lab', len(ans2label))
    print('lab2abs', len(label2ans))

    file = os.path.join(root, 'ans2label.pkl')
    cPickle.dump(ans2label, open(file, 'wb'))
    file = os.path.join(root, 'label2ans.pkl')
    cPickle.dump(label2ans, open(file, 'wb'))
    return ans2label

def compute_target(answers_dset, ans2label, name, root='data'):
    """Augment answers_dset with soft score as label

    ***answers_dset should be preprocessed***

    Write result into a cache file
    """
    target = []
    count = 0
    for id,qa_pair in answers_dset.iterrows():
        answers = ' '.join(qa_pair['answer'].split())
        # answer_count = {}
        # for answer in answers:
        #     answer_ = answer['answer']
        #     answer_count[answer_] = answer_count.get(answer_, 0) + 1

        labels = []
        scores = []
        if answers in ans2label:
            scores.append(1.)
            labels.append(ans2label[answers])
        # for answer in answer_count:
        #     if answer not in ans2label:
        #         continue
        #     labels.append(ans2label[answer])
        #     score = get_score(answer_count[answer])
        #     scores.append(score)

        target.append({
            'question': qa_pair['question'],
            'image_name': qa_pair['id'],
            'labels': labels,
            'scores': scores
        })

    file = os.path.join(root, name+'_target.pkl')
    cPickle.dump(target, open(file, 'wb'))
    return target

In [None]:
data = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data')
train_path = os.path.join(data,'VQA-Med-2020-Task1-VQAnswering-TrainVal-Sets/VQAMed2020-VQAnswering-TrainingSet/VQAnswering_2020_Train_QA_pairs.txt')
train_qa_pairs = pd.read_csv(train_path, sep='|', header=None, names=['id', 'question', 'answer'], index_col=None)
occurence = filter_answers(train_qa_pairs, 0)  # select the answer with frequence over min_occurence

label_path = data + 'ans2label.pkl'
if os.path.isfile(label_path):
    print('found %s' % label_path)
    ans2label = cPickle.load(open(label_path, 'rb'))
else:
    ans2label = create_ans2label(occurence,data)     # create ans2label and label2ans

compute_target(train_qa_pairs, ans2label, 'train',data) #dump train target to .pkl {question,image_name,labels,scores}

validate_path = os.path.join(data,'VQA-Med-2020-Task1-VQAnswering-TrainVal-Sets/VQAMed2020-VQAnswering-ValidationSet/VQAnswering_2020_Val_QA_Pairs.txt')
val_qa_pairs = pd.read_csv(validate_path, sep='|', header=None, names=['id', 'question', 'answer'], index_col=None)
compute_target(val_qa_pairs, ans2label, 'validate', data)   #dump validate target to .pkl {question,image_name,labels,scores}


# cache文件夹的文件使用/home/coder/projects/MEVF/MICCAI19-MedVQA/tools/create_label.py生成

In [7]:
#把qid转换为index_id ,并保存为json文件
import json

data_root = "/home/coder/projects/Med-VQA/data_OVQA"
train_js = json.load(open(os.path.join(data_root, "trainset.json")))
validation_js = json.load(open(os.path.join(data_root, "valset.json")))
test_js = json.load(open(os.path.join(data_root, "testset.json")))

# index = 0
# pid2idx_js = {}
# for type_js in [train_js, validation_js, test_js]:
#     for js in type_js:
#         ky = js['image_name']
#         if ky in pid2idx_js:
#             pass
#         else:   
#             pid2idx_js[ky] = index
#             index += 1

# # print(pid2idx_js)

# json.dump(pid2idx_js, open("/home/coder/projects/MEVF/MICCAI19-MedVQA/data_Med/VQA-Med-2019/imgid2idx.json", 'w'))


index = 0
pid2idx_js = {}
for type_js in [train_js, validation_js, test_js]:
    for js in type_js:
        ky = js['image_name']
        if ky in pid2idx_js:
            pass
        else:   
            pid2idx_js[ky] = index
            index += 1

# print(pid2idx_js)

json.dump(pid2idx_js, open(os.path.join(data_root, "imgid2idx.json"), 'w'))

In [2]:
from PIL import Image
import pandas as pd
import os
import json
import numpy as np
import pickle
from tqdm import tqdm


def imageresize(img2idx_jsonpath, img_folderpath, reshape_size, out_path, channels):
    with open(img2idx_jsonpath) as f:
        img2idx = json.load(f)
    
    if channels == 3:
        imgs = np.ndarray(shape=(len(img2idx), reshape_size, reshape_size, 3), dtype=float)
    else:
        imgs = np.ndarray(shape=(len(img2idx), reshape_size, reshape_size, 1), dtype=float)

    for imgid, idx in tqdm(img2idx.items()):
        if ".jpg" in imgid or ".png" in imgid:
            imgpath = os.path.join(img_folderpath, imgid)
        else:
            imgpath = os.path.join(img_folderpath, f"{imgid}.png")
        if os.path.exists(imgpath):
            if channels == 3:
                img = Image.open(imgpath).convert('RGB')
            else:
                img = Image.open(imgpath).convert('L')
        else:
            raise ValueError(f"Image path is not correct: {imgpath}")
        resized = img.resize((reshape_size, reshape_size))
        normalized = np.array(resized) / 255
        if channels == 3:
            normalized = normalized.reshape((reshape_size, reshape_size, 3))
        else:
            normalized = normalized.reshape((reshape_size, reshape_size, 1))
        imgs[idx] = normalized

    with open(out_path, 'wb') as f:
        pickle.dump(imgs, f)
    return

# imageresize("/home/coder/projects/Med-VQA/data_OVQA/imgid2idx.json", "/home/coder/projects/Med-VQA/data_OVQA/img", 
#             84, "/home/coder/projects/Med-VQA/data_OVQA/images84x84.pkl", 1)
# imageresize("/home/coder/projects/Med-VQA/data_OVQA/imgid2idx.json", "/home/coder/projects/Med-VQA/data_OVQA/img", 
#             128, "/home/coder/projects/Med-VQA/data_OVQA/images128x128.pkl", 1)
imageresize("/home/coder/projects/SystemDataset/data_OVQA_as_RAD/imgid2idx.json", "/home/coder/projects/SystemDataset/data_OVQA_as_RAD/images", 
            224, "/home/coder/projects/SystemDataset/data_OVQA_as_RAD/images224x224.pkl", 3)

# imageresize("/home/coder/projects/Med-VQA/data_PATH/imgid2idx.json", "/home/coder/projects/Med-VQA/data_PATH/images", 
#             84, "/home/coder/projects/Med-VQA/data_PATH/images84x84.pkl", 1)
# imageresize("/home/coder/projects/Med-VQA/data_PATH/imgid2idx.json", "/home/coder/projects/Med-VQA/data_PATH/images", 
#             128, "/home/coder/projects/Med-VQA/data_PATH/images128x128.pkl", 1)

# imageresize("/home/coder/projects/Med-VQA/data_PATH/imgid2idx.json", "/home/coder/projects/Med-VQA/data_PATH/images", 
#             224, "/home/coder/projects/Med-VQA/data_PATH/images224x224.pkl", 1)
# imageresize("/home/coder/projects/Med-VQA/data_PATH/imgid2idx.json", "/home/coder/projects/Med-VQA/data_PATH/images", 
#             224, "/home/coder/projects/Med-VQA/data_PATH/images224x224.pkl", 3)

# imageresize("/home/coder/projects/Med-VQA/data_SLAKE/imgid2idx.json", "/home/coder/projects/Med-VQA/data_SLAKE/images", 
#             224, "/home/coder/projects/Med-VQA/data_SLAKE/images224x224.pkl", 3)

100%|██████████| 2000/2000 [01:57<00:00, 17.05it/s]


将ovqa处理为大模型的需要的数据格式在/home/coder/projects/MiniGPT-4/test.ipynb

In [2]:
import torch 
target = torch.zeros(5)

print(target)
try:
    target.scatter_(0, torch.tensor([3]), torch.tensor([1.]))
except:
    print('a_t=0 ')

tensor([0., 0., 0., 0., 0.])


In [4]:
import sys  
sys.setdefaultencoding('utf8') 

AttributeError: module 'sys' has no attribute 'setdefaultencoding'