In [1]:
import numpy as np 
import torch 
from collections import defaultdict
import json 
import pathlib 
import sys
from tqdm import tqdm



In [2]:
annotations = json.load(open("/home/estengel/annotator_uncertainty/jimena_work/cleaned_data/csv/dev_set/annotations.json"))['annotations']
questions = json.load(open("/home/estengel/annotator_uncertainty/jimena_work/cleaned_data/csv/dev_set/questions.json"))['questions']

In [3]:
# get the clusters from annotations 
def get_annotator_clusters(questions, annotations): 
    anns_by_qid = defaultdict(list)
    for quest, ann in zip(questions, annotations):

        qid, i = quest['question_id'].split("_")
        anns_by_qid[qid].append((quest, ann))

    clusters_by_qid = {}
    for qid, list_of_qas in anns_by_qid.items():
        clusters = defaultdict(list)
        for quest, ann in list_of_qas:
            rewritten = quest['new_question']
            answer = ann['answers'][0]['answer']
            answer_id = ann['answers'][0]['mturk_id']
            cluster_dict = {"answer": answer, "id": answer_id} 
            clusters[rewritten].append(cluster_dict)
        clusters_by_qid[qid] = clusters
    return clusters_by_qid

# get the clusters from kmeans preprocessing
def get_preprocessed_clusters(questions, annotations): 
    anns_by_qid = defaultdict(list)
    for quest, ann in zip(questions, annotations):

        qid, i = quest['question_id'].split("_")
        anns_by_qid[qid].append((quest, ann))

    clusters_by_qid = {}
    for qid, list_of_qas in anns_by_qid.items():
        clusters = defaultdict(list)
        for quest, ann in list_of_qas:
            answer = ann['answers'][0]['answer']
            answer_id = ann['answers'][0]['mturk_id']
            id_key, answer_id_suffix = answer_id.split(".")
            cluster_dict = {"answer": answer, "id": answer_id} 
            clusters[id_key].append(cluster_dict)
        clusters_by_qid[qid] = clusters
    return clusters_by_qid


In [19]:

curr_path = pathlib.Path('').resolve().parent
sys.path.insert(0, str(curr_path.joinpath("hit3.0").joinpath("results")))
from process_csv import f1_score

def preprocess(cluster_data):
    if type(cluster_data) in [dict, defaultdict]:
        # dealing with predicted clusters or preprocessed clusters
        return cluster_data.values()
    return cluster_data

def get_scores(clusters_by_qid_a, clusters_by_qid_b):
    scores = []
    for qid in clusters_by_qid_a.keys():
        cluster_a = preprocess(clusters_by_qid_a[qid])
        cluster_b = preprocess(clusters_by_qid_b[qid])
        f1_tuple = f1_score(cluster_a, cluster_b)
        f1_tuple = f1_tuple[0:-1]
        scores.append(f1_tuple)
    # print(scores)
    scores = np.array(scores)
    return np.mean(scores, axis=0)


In [20]:
from string_metrics import BertSimilarityScore
score_cls = BertSimilarityScore()

In [21]:
import re
from scipy.cluster.hierarchy import linkage, fcluster
import scipy 
np.set_printoptions(precision=2)

def read_generations(output_path):
    flat_data_by_qid = {}
    data = open(output_path).readlines()
    for line in data:
        batch_data = json.loads(line)
        for qid, generation in zip(batch_data['question_id'], batch_data['speaker_utterances'][0]):
            flat_data_by_qid[qid] = generation
    return flat_data_by_qid

def clean_text(text): 
    text = re.sub("<.*?>", "", text)
    text = text.strip() 
    return text 

# get the clusters from predictions 
def get_prediction_clusters(predictions_jsonl,
                            questions, 
                            annotations, 
                            score_cls, 
                            t = 1.06, 
                            criterion = "centroid", 
                            method = "distance"):
    generations_by_qid = read_generations(predictions_jsonl)
    anns_by_qid = defaultdict(list)
    answers_by_qid = defaultdict(list)
    for quest, ann in zip(questions, annotations):
        qid, i = quest['question_id'].split("_")
        generation = clean_text(generations_by_qid[quest['question_id']])
        anns_by_qid[qid].append(generation)
        answers_by_qid[qid].append(ann['answers'])

    scores_by_qid = {} 
    clusts_by_qid = {}
    # Get matrix of scores 
    answer_clusters = {}
    for qid, quest_list in tqdm(anns_by_qid.items()): 
        scores = np.zeros((len(quest_list), len(quest_list))) 
        done = []
        for i, q1 in enumerate(quest_list): 
            for j, q2 in enumerate(quest_list):
                if i == j: 
                    scores[i,j] = 0.0 
                    continue
                sim_score = score_cls.get_similarity(q1, q2) 
                scores[i,j] = 1/sim_score
                done.append((i,j))
                done.append((j,i))

        scores_by_qid[qid] = scores 
        scores = scipy.spatial.distance.squareform(scores)
        link = linkage(scores, method=method, metric="cosine")
        clust = fcluster(link, t=t, criterion=criterion)

        clusts_by_qid[qid] = clust 
        answers_clustered = defaultdict(list)
        ans_list = answers_by_qid[qid]
        for i, idx in enumerate(clust):
            answer = ans_list[i]
            orig_id = answer[0]['mturk_id']
            cluster_dict = {"answer": answer[0]['answer'], "question": quest_list[i], "id": orig_id} 
            answers_clustered[f"g{idx}"].append(cluster_dict)
        answer_clusters[qid] = answers_clustered

    return answer_clusters

In [22]:
from transformers.utils import logging
logging.set_verbosity(50)

pred_path = "/brtx/602-nvme1/estengel/annotator_uncertainty/models/img2q_t5_base_no_limit/output/dev_set_predictions.jsonl"
pred_clusters = get_prediction_clusters(pred_path,
                                        questions, 
                                        annotations,
                                        score_cls=score_cls,
                                        criterion="distance",
                                        method="centroid",
                                        t=1.06)

ann_clusters = get_annotator_clusters(questions, annotations)

pred_to_ann = get_scores(pred_clusters, ann_clusters)
print(f"P: {pred_to_ann[1]*100:.2f}, R: {pred_to_ann[2]*100:.2f}, F1: {pred_to_ann[0]*100:.2f}")

100%|██████████| 11/11 [00:26<00:00,  2.44s/it]

P: 65.91, R: 100.00, F1: 76.06





In [23]:
# BEST: Method: centroid, Crit: distance, t: 1.06, P: 69.21, R: 96.66, F1: 77.83
# Not really, it either predicts a single cluster for everything or puts everything in its own cluster, which sucks 
for method in ["centroid"]:
    # for crit in ['inconsistent', 'distance']: 
    for crit in ['distance']:
        if crit == "distance":
            t_choices = [1.00, 1.01, 1.02, 1.03,  1.04,  1.05,  1.06, 1.07, 0.08, 1.09, 1.10] 
            # t_choices = [1.00,   1.05,   1.10] 
        else:
            t_choices = [0.0, 1.0, 2.0, 3.0]
        for t in t_choices:
            pred_clusters = get_prediction_clusters(pred_path,
                                            questions, 
                                            annotations,
                                            score_cls=score_cls,
                                            criterion=crit,
                                            method = method,
                                            t=t)

            pred_to_ann = get_scores(pred_clusters, ann_clusters)
            print(f"Method: {method}, Crit: {crit}, t: {t}, P: {pred_to_ann[1]*100:.2f}, R: {pred_to_ann[2]*100:.2f}, F1: {pred_to_ann[0]*100:.2f}")    


100%|██████████| 11/11 [00:26<00:00,  2.37s/it]


Method: centroid, Crit: distance, t: 1.0, P: 90.91, R: 96.21, F1: 91.41


100%|██████████| 11/11 [00:27<00:00,  2.50s/it]


Method: centroid, Crit: distance, t: 1.01, P: 87.88, R: 98.48, F1: 91.11


100%|██████████| 11/11 [00:26<00:00,  2.39s/it]


Method: centroid, Crit: distance, t: 1.02, P: 81.06, R: 98.48, F1: 85.66


100%|██████████| 11/11 [00:26<00:00,  2.37s/it]


Method: centroid, Crit: distance, t: 1.03, P: 81.06, R: 98.48, F1: 85.66


100%|██████████| 11/11 [00:26<00:00,  2.37s/it]


Method: centroid, Crit: distance, t: 1.04, P: 81.06, R: 98.48, F1: 85.66


100%|██████████| 11/11 [00:25<00:00,  2.35s/it]


Method: centroid, Crit: distance, t: 1.05, P: 76.52, R: 100.00, F1: 83.64


100%|██████████| 11/11 [00:25<00:00,  2.33s/it]


Method: centroid, Crit: distance, t: 1.06, P: 65.91, R: 100.00, F1: 76.06


100%|██████████| 11/11 [00:25<00:00,  2.34s/it]


Method: centroid, Crit: distance, t: 1.07, P: 65.91, R: 100.00, F1: 76.06


100%|██████████| 11/11 [00:25<00:00,  2.34s/it]


Method: centroid, Crit: distance, t: 0.08, P: 100.00, R: 94.70, F1: 96.46


100%|██████████| 11/11 [00:25<00:00,  2.36s/it]


Method: centroid, Crit: distance, t: 1.09, P: 59.09, R: 100.00, F1: 70.61


 45%|████▌     | 5/11 [00:08<00:09,  1.66s/it]


KeyboardInterrupt: 