In [1]:
import numpy as np 
import torch 
from collections import defaultdict
import json 
import pathlib 
import sys
from tqdm import tqdm



In [2]:
annotations = json.load(open("/brtx/603-nvme2/estengel/annotator_uncertainty/vqa/dev_from_mturk/annotations.json"))['annotations']
questions = json.load(open("/brtx/603-nvme2/estengel/annotator_uncertainty/vqa/dev_from_mturk/questions.json"))['questions']

In [3]:
# get the clusters from annotations 
def get_annotator_clusters(questions, annotations): 
    anns_by_qid = defaultdict(list)
    for quest, ann in zip(questions, annotations):

        qid, i = quest['question_id'].split("_")
        anns_by_qid[qid].append((quest, ann))

    clusters_by_qid = {}
    for qid, list_of_qas in anns_by_qid.items():
        clusters = defaultdict(list)
        for quest, ann in list_of_qas:
            rewritten = quest['new_question']
            answer = ann['answers'][0]['answer']
            answer_id = ann['answers'][0]['mturk_id']
            cluster_dict = {"answer": answer, "id": answer_id} 
            clusters[rewritten].append(cluster_dict)
        clusters_by_qid[qid] = clusters
    return clusters_by_qid

# get the clusters from kmeans preprocessing
def get_preprocessed_clusters(questions, annotations): 
    anns_by_qid = defaultdict(list)
    for quest, ann in zip(questions, annotations):

        qid, i = quest['question_id'].split("_")
        anns_by_qid[qid].append((quest, ann))

    clusters_by_qid = {}
    for qid, list_of_qas in anns_by_qid.items():
        clusters = defaultdict(list)
        for quest, ann in list_of_qas:
            answer = ann['answers'][0]['answer']
            answer_id = ann['answers'][0]['mturk_id']
            id_key, answer_id_suffix = answer_id.split(".")
            cluster_dict = {"answer": answer, "id": answer_id} 
            clusters[id_key].append(cluster_dict)
        clusters_by_qid[qid] = clusters
    return clusters_by_qid


In [93]:
import re
from scipy.cluster.hierarchy import linkage, fcluster
import scipy 
np.set_printoptions(precision=2)

def read_generations(output_path):
    flat_data_by_qid = {}
    data = open(output_path).readlines()
    for line in data:
        batch_data = json.loads(line)
        for qid, generation in zip(batch_data['question_id'], batch_data['speaker_utterances'][0]):
            flat_data_by_qid[qid] = generation
    return flat_data_by_qid

def clean_text(text): 
    text = re.sub("<.*?>", "", text)
    text = text.strip() 
    return text 

# get the clusters from predictions 
def get_prediction_clusters(predictions_jsonl, questions, annotations, score_cls):
    generations_by_qid = read_generations(predictions_jsonl)
    anns_by_qid = defaultdict(list)
    answers_by_qid = defaultdict(list)
    for quest, ann in zip(questions, annotations):
        qid, i = quest['question_id'].split("_")
        generation = clean_text(generations_by_qid[quest['question_id']])
        anns_by_qid[qid].append(generation)
        answers_by_qid[qid].append(ann['answers'])

    scores_by_qid = {} 
    clusts_by_qid = {}
    # Get matrix of scores 
    answer_clusters = {}
    c = 0
    for qid, quest_list in tqdm(anns_by_qid.items()): 
        scores = np.zeros((len(quest_list), len(quest_list))) 
        done = []
        for i, q1 in enumerate(quest_list): 
            for j, q2 in enumerate(quest_list):
                if i == j: 
                    scores[i,j] = 0.0 
                    continue
                sim_score = score_cls.get_similarity(q1, q2) 
                scores[i,j] = 1/sim_score
                done.append((i,j))
                done.append((j,i))
        scores_by_qid[qid] = scores 
        scores = scipy.spatial.distance.squareform(scores)
        link = linkage(scores, method="average", metric="cosine")
        t = 1 
        clust = fcluster(link, t=t, criterion='inconsistent')
        clusts_by_qid[qid] = clust 
        answers_clustered = defaultdict(list)
        ans_list = answers_by_qid[qid]
        for i, idx in enumerate(clust):
            answer = ans_list[i]
            cluster_dict = {"answer": answer, "id": f"g{i}"} 
            answers_clustered[f"g{idx}"].append(cluster_dict)
        answer_clusters[qid] = answers_clustered

    return answer_clusters

In [94]:
from transformers.utils import logging
from BARTScore.abstract_class import BertSimilarityScore
logging.set_verbosity(50)
# logger = logging.get_logger("transformers")
# logger.info("INFO")
# logger.warning("WARN")

score_cls = BertSimilarityScore()
pred_path = "/brtx/602-nvme1/estengel/annotator_uncertainty/models/img2q_t5_small/output/predictions_forced.jsonl"
ans_clusts_by_qid = get_prediction_clusters(pred_path,
                                        questions, 
                                        annotations,
                                        score_cls=score_cls)

100%|██████████| 30/30 [00:21<00:00,  1.39it/s]
