In [None]:
import csv 
import json 
import re 
import pathlib 
import numpy as np
# Seed NumPy's global RNG so the np.random sampling and shuffling in the cells below are repeatable.
np.random.seed(12)

In [56]:
def read_pred_file(path):
    """Load predictions from a JSON-lines file into a ``{question_id: text}`` dict.

    Each line is parsed as a JSON object. Its ``'question_id'`` entry is
    zipped with the first element of its ``'speaker_utterances'`` entry, so
    the i-th id is paired with the i-th prediction. Every ``<...>`` span is
    removed from a prediction and surrounding whitespace is stripped before
    it is stored. If an id occurs more than once, the last occurrence wins.
    """
    tag_pattern = re.compile("<[^>]*?>")
    predictions = {}
    with open(path) as handle:
        for raw_line in handle:
            record = json.loads(raw_line)
            question_ids = record['question_id']
            utterances = record['speaker_utterances'][0]
            predictions.update(
                (question_id, tag_pattern.sub("", utterance).strip())
                for question_id, utterance in zip(question_ids, utterances)
            )
    return predictions


def read_csv_data(path):
    """Index rows of the annotation CSV under keys of the form ``"<question_id>_<i>"``.

    For every row, one entry is created per element obtained by iterating the
    ``'Answer.answer_questions'`` field. Each entry is a dict holding that
    element under ``"sent"`` and the row's ``'Input.imgUrl'`` value (copied
    verbatim, with no decoding) under ``"img_url"``.

    Args:
        path: Path to a CSV file with a header row containing
            ``Input.question_id``, ``Input.imgUrl`` and
            ``Answer.answer_questions``.

    Returns:
        dict mapping ``"<question_id>_<i>"`` to ``{"sent": ..., "img_url": ...}``.
    """
    data_by_qid = {}
    # The csv module expects files opened with newline="" so that line breaks
    # embedded in quoted fields reach the parser untouched.
    with open(path, newline="") as f1:
        reader = csv.DictReader(f1)
        for row in reader:
            qid = row['Input.question_id']
            # NOTE(review): DictReader yields plain strings, so this enumerates
            # the *characters* of the field and "sent" is a single character.
            # If the column holds a JSON-encoded list, it presumably should be
            # json.loads()-ed first. Left as is because the qid index scheme
            # used by callers of this mapping cannot be confirmed from here.
            for i, sent in enumerate(row['Answer.answer_questions']):
                new_qid = f"{qid}_{i}"
                data_by_qid[new_qid] = {"sent": sent, "img_url": row['Input.imgUrl']}
    return data_by_qid

In [57]:
def _load_json_field(path, field):
    """Return ``field`` from the JSON object stored in the file at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes instead of being left open for the garbage collector.
    """
    with open(path) as f1:
        return json.load(f1)[field]


# Model predictions, keyed by question id.
pred_questions = read_pred_file('/brtx/602-nvme1/estengel/annotator_uncertainty/models/img2q_t5_base_no_limit/output/test_set_predictions_forced.jsonl') 
# Annotator test-set data: question records, annotation records, and per-question image URLs.
ann_questions = _load_json_field("/home/estengel/annotator_uncertainty/jimena_work/cleaned_data/csv/test_set/questions.json", 'questions')
ann_annotations = _load_json_field("/home/estengel/annotator_uncertainty/jimena_work/cleaned_data/csv/test_set/annotations.json", 'annotations')
ann_img_ids = read_csv_data("/home/estengel/annotator_uncertainty/jimena_work/cleaned_data/csv/test_set/consolidate_data_repeat_all_data.csv")
# Question records from the VQA v2 train2014 questions file.
original_questions = _load_json_field("/brtx/603-nvme2/estengel/annotator_uncertainty/vqa/v2_OpenEnded_mscoco_train2014_questions.json", 'questions')

# Re-index each list of records by its 'question_id' for direct lookup.
ann_questions = {q['question_id']: q for q in ann_questions}
ann_annotations = {q['question_id']: q for q in ann_annotations}
original_questions = {q['question_id']: q for q in original_questions}


In [75]:
np.random.seed(12)
# sample 100 question ids 
all_qids = sorted(pred_questions)
chosen = np.random.choice(all_qids, size=100, replace=False)

# The distractor loop below rejects every qid that shares the current original
# question id, so it can only terminate if at least two distinct ones exist.
if len({q.split("_")[0] for q in all_qids}) < 2:
    raise ValueError("need qids from at least two original questions to draw a distractor")

data = []
# get the pred, annotator, original, and random questions for each qid 
for qid in chosen:

    pred = pred_questions[qid]
    ann = ann_questions[qid]['new_question']
    img_url = ann_img_ids[qid]['img_url']
    answer = ann_annotations[qid]['answers'][0]['answer']
    # qids look like "<original question id>_<index>"; the prefix keys original_questions
    orig_qid = int(qid.split("_")[0])
    orig = original_questions[orig_qid]['question']

    # choose random qid and get original question for distractor.
    # Reject by *original* question id rather than by full qid: a different qid
    # with the same prefix would produce a distractor identical to `orig`.
    while True:
        random_qid = np.random.choice(all_qids, size=1, replace=False)[0]
        rand_orig_qid = int(random_qid.split("_")[0])
        if rand_orig_qid != orig_qid:
            break
    rand_question = original_questions[rand_orig_qid]['question']

    datapoint = {"qid": qid, "pred_question": pred, "ann_question": ann, "orig_question": orig, "rand_question": rand_question, "answer": answer, "img_url": img_url}
    data.append(datapoint)


In [93]:
# Build one HIT row per datapoint, presenting its four questions in a random order.
np.random.seed(12)

question_fields = ("pred_question", "ann_question", "orig_question", "rand_question")

data_for_hit = []
for datapoint in data:
    # Start from the canonical order every time, then shuffle in place.
    shuffled_fields = list(question_fields)
    np.random.shuffle(shuffled_fields)
    # (position, source field) pairs record which question landed in which slot.
    questions = list(enumerate(shuffled_fields))

    hit_datapoint = {
        "qid": datapoint["qid"],
        # img_url is stored as a JSON-encoded string; decode it for the HIT row
        "img_url": json.loads(datapoint["img_url"]),
        "answer": datapoint["answer"],
        "indices_and_questions": questions,
    }
    hit_datapoint.update(
        {f"question_{slot}": datapoint[field] for slot, field in questions}
    )
    data_for_hit.append(hit_datapoint)




In [94]:
# Spot-check one example: the HIT row, then the datapoint it was built from.
print(data_for_hit[4], data[4], sep="\n")


{'qid': '238290005_6', 'img_url': 'https://cs.jhu.edu/~esteng/images_for_hit/COCO_train2014_000000238290.jpg', 'answer': 'plane', 'indices_and_questions': [(0, 'pred_question'), (1, 'orig_question'), (2, 'rand_question'), (3, 'ann_question')], 'question_0': 'What is in the air? source', 'question_1': 'Besides the sun, what is the other light source in this scene?', 'question_2': 'What time of day is this?', 'question_3': 'Besides the sun, what could be the other sky light source in this scene?'}
{'qid': '238290005_6', 'pred_question': 'What is in the air? source', 'ann_question': 'Besides the sun, what could be the other sky light source in this scene?', 'orig_question': 'Besides the sun, what is the other light source in this scene?', 'rand_question': 'What time of day is this?', 'answer': 'plane', 'img_url': '"https://cs.jhu.edu/~esteng/images_for_hit/COCO_train2014_000000238290.jpg"'}


In [96]:
# Columns whose values are nested structures and must be JSON-encoded to fit in one CSV cell.
json_keys = ['indices_and_questions']

# newline="" is what the csv module asks for when writing, so the writer's own
# line terminators are not translated a second time by the text layer.
with open("../eval_hit/csvs/data_for_hit.csv", "w", newline="") as f1:
    writer = csv.DictWriter(f1, fieldnames=data_for_hit[0].keys())
    writer.writeheader()
    for row in data_for_hit:
        # Select by column name (k). Testing the value (v) against json_keys never
        # matched, so the nested column was written as a Python repr, not JSON.
        row = {k: json.dumps(v) if k in json_keys else v for k, v in row.items() }
        writer.writerow(row)