In [None]:
import csv
import json
import pathlib
import re
from collections import defaultdict

import numpy as np
# Seed NumPy's global random state up front (later cells re-seed it before they draw).
np.random.seed(12)

In [None]:
def read_pred_file(path):
    """Load model predictions from a JSON-lines file, keyed by question id.

    Every non-blank line must be a JSON object with a ``question_id`` entry
    (a sequence of ids) and a ``speaker_utterances`` entry whose first element
    is the sequence of predicted strings for those ids, in the same order.

    Args:
        path: Location of the JSON-lines prediction file.

    Returns:
        dict mapping each question id to its prediction with every ``<...>``
        token removed and surrounding whitespace stripped. If an id occurs
        more than once, the last occurrence wins.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a batch lacks ``question_id`` or ``speaker_utterances``.
    """
    data_by_qid = {}
    with open(path) as f1:
        for line in f1:
            # Tolerate blank lines (e.g. extra newlines at the end of the
            # file) instead of crashing inside json.loads.
            if not line.strip():
                continue
            batch = json.loads(line)
            qids = batch['question_id']
            preds = batch['speaker_utterances'][0]
            # zip pairs ids and predictions positionally and stops at the
            # shorter of the two sequences.
            for qid, pred in zip(qids, preds):
                # Drop anything that looks like a <tag> before trimming.
                pred = re.sub("<[^>]*?>", "", pred)
                data_by_qid[qid] = pred.strip()
    return data_by_qid


def read_csv_data(path):
    """Index rows of an annotation CSV by ``"<question id>_<i>"`` keys.

    The file must start with a header row that includes the columns
    ``Input.question_id``, ``Answer.answer_questions`` and ``Input.imgUrl``.

    Args:
        path: Location of the CSV file.

    Returns:
        dict mapping ``f"{qid}_{i}"`` to ``{"sent": ..., "img_url": ...}``,
        with one entry per element obtained by iterating the row's
        ``Answer.answer_questions`` value. Later rows overwrite earlier
        entries that share a key.
    """
    data_by_qid = {}
    # The csv module asks for newline='' on files given to a reader so that
    # line breaks embedded in quoted fields reach the parser untranslated.
    with open(path, newline='') as f1:
        reader = csv.DictReader(f1)
        for row in reader:
            qid = row['Input.question_id']
            # NOTE(review): csv.DictReader hands back each cell as a str, so
            # this enumerates the *characters* of the field: `sent` is a single
            # character and `i` runs over the string length. If the column
            # holds a JSON-encoded list it presumably should be decoded first;
            # confirm before changing, since callers look entries up by these keys.
            for i, sent in enumerate(row['Answer.answer_questions']):
                new_qid = f"{qid}_{i}"
                data_by_qid[new_qid] = {"sent": sent, "img_url": row['Input.imgUrl']}
    return data_by_qid

In [None]:
pred_questions = read_pred_file('/brtx/602-nvme1/estengel/annotator_uncertainty/models/img2q_t5_base_no_limit/output/test_set_predictions_forced.jsonl') 
ann_questions = json.load(open("/home/estengel/annotator_uncertainty/jimena_work/cleaned_data/csv/test_set/questions.json"))['questions']
ann_annotations = json.load(open("/home/estengel/annotator_uncertainty/jimena_work/cleaned_data/csv/test_set/annotations.json"))['annotations']
ann_img_ids = read_csv_data("/home/estengel/annotator_uncertainty/jimena_work/cleaned_data/csv/test_set/consolidate_data_repeat_all_data.csv")
original_questions = json.load(open("/brtx/603-nvme2/estengel/annotator_uncertainty/vqa/v2_OpenEnded_mscoco_train2014_questions.json"))['questions']

ann_questions = {q['question_id']: q for q in ann_questions}
ann_annotations = {q['question_id']: q for q in ann_annotations}
original_questions = {q['question_id']: q for q in original_questions}


In [None]:
# Spot-check one annotation record; as the cell's last expression its value is displayed.
ann_annotations['35884005_1']

In [45]:
np.random.seed(12)

# Number of prediction keys that go into the HIT.
# NOTE(review): the pool below is exactly the first N_SAMPLES keys of
# `pred_questions`, so the draw only permutes those keys rather than sampling
# from every prediction. Confirm this is intended before widening the pool.
N_SAMPLES = 100

keys = list(pred_questions.keys())
all_qids_short = [x.split("_")[0] for x in keys]
all_qids_long = keys
all_idxs = list(range(N_SAMPLES))
chosen_idxs = np.random.choice(all_idxs, size=N_SAMPLES, replace=False)
chosen_qids_short = [all_qids_short[i] for i in chosen_idxs]
chosen_qids_long = [all_qids_long[i] for i in chosen_idxs]

# Lookup: short qid -> annotator question -> first answers recorded for it.
ann_by_short_qid = defaultdict(lambda: defaultdict(list))
for long_qid, question_record in ann_questions.items():
    first_answer = ann_annotations[long_qid]['answers'][0]['answer']
    base_qid, _ = long_qid.split("_")
    ann_by_short_qid[base_qid][question_record['new_question']].append(first_answer)

data = []
# For each chosen qid collect the predicted, annotator and original question,
# each paired once with the real answer and once with a distractor answer.
for qid in chosen_qids_long:
    short_qid, _ = qid.split("_")

    pred = pred_questions[qid]
    ann = ann_questions[qid]['new_question']
    orig = original_questions[int(short_qid)]['question']
    answer = ann_annotations[qid]['answers'][0]['answer']

    # The stored value is JSON text; decode it once instead of once per datapoint.
    img_url = ann_img_ids[qid]['img_url']
    decoded_img_url = json.loads(img_url)

    # Other annotator questions for the same short qid, in dict insertion
    # order. Going through a set of strings would make the order depend on
    # per-process hash randomisation, so the seeded draw below could pick a
    # different distractor after every kernel restart.
    distractor_questions = [q for q in ann_by_short_qid[short_qid] if q != ann]
    if not distractor_questions:
        raise ValueError(f"no distractor question available for qid {qid!r}")
    distractor_question = np.random.choice(distractor_questions, size=1)[0]
    distractor_answer = np.random.choice(ann_by_short_qid[short_qid][distractor_question], size=1)[0]

    questions_and_types = [(pred, "pred"), (ann, "ann"), (orig, "orig")]
    for quest, qtype in questions_and_types:
        main_datapoint = {"qid": qid, "question": quest, "question_type": qtype, "answer": answer, "img_url": decoded_img_url, "is_distractor": False}
        dist_datapoint = {"qid": qid, "question": quest, "question_type": qtype, "answer": distractor_answer, "img_url": decoded_img_url, "is_distractor": True}
        data.append(main_datapoint)
        data.append(dist_datapoint)


In [46]:
print(len(data))

# Reset NumPy's global RNG; nothing in this cell draws from it.
np.random.seed(12)

# Fields that get JSON-encoded for the HIT; all other fields pass through untouched.
metadata_keys = ["qid", "question_type", "is_distractor"]


def _encode_metadata(record):
    """Return a copy of ``record`` whose ``metadata_keys`` values went through json.dumps."""
    encoded = {}
    for field, value in record.items():
        encoded[field] = json.dumps(value) if field in metadata_keys else value
    return encoded


data_for_hit = [_encode_metadata(datapoint) for datapoint in data]





600


In [50]:
# print("\n".join([str(x) for x in data_for_hit[0:6]]))
# print(data_for_hit[])
# print(data[4])


{'qid': '"44463014_5"', 'question': 'Where is the car?', 'question_type': '"pred"', 'answer': 'near road', 'img_url': 'https://cs.jhu.edu/~esteng/images_for_hit/COCO_train2014_000000044463.jpg', 'is_distractor': 'false'}
{'qid': '"44463014_5"', 'question': 'Where is the car?', 'question_type': '"pred"', 'answer': 'on poles', 'img_url': 'https://cs.jhu.edu/~esteng/images_for_hit/COCO_train2014_000000044463.jpg', 'is_distractor': 'true'}
{'qid': '"44463014_5"', 'question': 'What is the sign board by?', 'question_type': '"ann"', 'answer': 'near road', 'img_url': 'https://cs.jhu.edu/~esteng/images_for_hit/COCO_train2014_000000044463.jpg', 'is_distractor': 'false'}
{'qid': '"44463014_5"', 'question': 'What is the sign board by?', 'question_type': '"ann"', 'answer': 'on poles', 'img_url': 'https://cs.jhu.edu/~esteng/images_for_hit/COCO_train2014_000000044463.jpg', 'is_distractor': 'true'}
{'qid': '"44463014_5"', 'question': 'Where is the sign board placed?', 'question_type': '"orig"', 'answe

In [51]:
# Shuffle the rows for the actual HIT, then write them out.
# Seeding in this cell (rather than relying on a seed set in an earlier cell)
# gives the same order whenever the cell runs on a freshly built `data_for_hit`.
np.random.seed(12)
np.random.shuffle(data_for_hit)  # in place: `data_for_hit` itself is reordered

# The csv module asks for newline='' on files handed to a writer; without it,
# platforms that translate "\n" on write end every row with "\r\r\n".
with open("../eval_hit/csvs/data_for_ordinal_hit.csv", "w", newline="") as f1:
    # Column order follows the key order of the first row.
    writer = csv.DictWriter(f1, fieldnames=list(data_for_hit[0].keys()))
    writer.writeheader()
    writer.writerows(data_for_hit)