In [2]:
import numpy as np 
import torch 
from collections import defaultdict
import json 
import pathlib 
import sys
from tqdm import tqdm



In [3]:
# Directory holding the dev-set annotation/question dumps used by this notebook.
DEV_SET_DIR = pathlib.Path("/home/estengel/annotator_uncertainty/jimena_work/cleaned_data/csv/dev_set")


def _load_json_field(path, field):
    """Read the JSON file at `path` and return its top-level `field` entry.

    The file is opened in a `with` block so the handle is closed as soon as
    parsing finishes instead of lingering until garbage collection.
    """
    with open(path) as json_file:
        return json.load(json_file)[field]


# `questions` and `annotations` are zipped together by the clustering helpers
# in this notebook, so they are treated as parallel, index-aligned sequences.
annotations = _load_json_field(DEV_SET_DIR / "annotations.json", "annotations")
questions = _load_json_field(DEV_SET_DIR / "questions.json", "questions")

In [4]:
# get the clusters from annotations 
def get_annotator_clusters(questions, annotations):
    """Group answers by the annotator-rewritten question, per base question id.

    `questions` and `annotations` are walked in parallel. Each question's
    'question_id' must be of the form "<qid>_<suffix>" (exactly one underscore);
    everything sharing the same <qid> is grouped together.

    Returns:
        dict mapping qid -> defaultdict(list) mapping the 'new_question' text
        -> list of {"answer": ..., "id": ...} built from the first entry of
        the annotation's 'answers' list.
    """
    pairs_by_base_qid = defaultdict(list)
    for question, annotation in zip(questions, annotations):
        base_qid, _suffix = question['question_id'].split("_")
        pairs_by_base_qid[base_qid].append((question, annotation))

    result = {}
    for base_qid, pairs in pairs_by_base_qid.items():
        by_rewrite = defaultdict(list)
        for question, annotation in pairs:
            rewrite_key = question['new_question']
            # Only the first listed answer of each annotation is used.
            first_answer = annotation['answers'][0]
            entry = {"answer": first_answer['answer'], "id": first_answer['mturk_id']}
            by_rewrite[rewrite_key].append(entry)
        result[base_qid] = by_rewrite
    return result

# get the clusters from kmeans preprocessing
def get_preprocessed_clusters(questions, annotations):
    """Group answers by the cluster prefix stored in their 'mturk_id'.

    `questions` and `annotations` are walked in parallel and bucketed by the
    part of 'question_id' before the underscore. Within each bucket, the first
    answer's 'mturk_id' must look like "<key>.<suffix>" (exactly one dot); the
    part before the dot is used as the cluster key.

    Returns:
        dict mapping qid -> defaultdict(list) mapping cluster key
        -> list of {"answer": ..., "id": <full mturk_id>}.
    """
    pairs_by_base_qid = defaultdict(list)
    for question, annotation in zip(questions, annotations):
        base_qid, _suffix = question['question_id'].split("_")
        pairs_by_base_qid[base_qid].append((question, annotation))

    result = {}
    for base_qid, pairs in pairs_by_base_qid.items():
        by_prefix = defaultdict(list)
        for _question, annotation in pairs:
            # Only the first listed answer of each annotation is used.
            first_answer = annotation['answers'][0]
            full_id = first_answer['mturk_id']
            prefix, _id_suffix = full_id.split(".")
            by_prefix[prefix].append({"answer": first_answer['answer'], "id": full_id})
        result[base_qid] = by_prefix
    return result


In [5]:

# Resolve the parent of the current working directory and put its
# "hit3.0/results" subdirectory at the front of sys.path, so that the
# project-local `process_csv` module can be imported below.
# NOTE(review): this depends on the kernel's working directory — confirm the
# notebook is launched from the expected folder.
curr_path = pathlib.Path('').resolve().parent
sys.path.insert(0, str(curr_path.joinpath("hit3.0").joinpath("results")))
from process_csv import f1_score

def preprocess(cluster_data):
    """Return the cluster groups of `cluster_data` without their labels.

    Plain `dict` / `defaultdict` inputs (predicted or preprocessed clusters,
    keyed by cluster label) are reduced to their values view; any other input,
    including other dict subclasses, is returned unchanged.
    """
    data_type = type(cluster_data)
    # Exact type match on purpose: only dict and defaultdict are unwrapped.
    is_label_mapping = data_type is dict or data_type is defaultdict
    return cluster_data.values() if is_label_mapping else cluster_data

def get_scores(clusters_by_qid_a, clusters_by_qid_b):
    """Average the per-question clustering scores of A against B.

    For every qid in `clusters_by_qid_a`, both clusterings are passed through
    `preprocess` and compared with `f1_score` (imported from `process_csv`).
    The last element of each returned tuple is dropped, and the remaining
    values are averaged column-wise over all qids.

    Every qid of A must also be present in B (otherwise a KeyError is raised).
    Callers in this notebook read index 0 of the result as F1, 1 as precision
    and 2 as recall.
    """
    per_question = [
        f1_score(preprocess(clusters_a), preprocess(clusters_by_qid_b[qid]))[0:-1]
        for qid, clusters_a in clusters_by_qid_a.items()
    ]
    return np.mean(np.array(per_question), axis=0)


/home/estengel/annotator_uncertainty/analysis
['/home/estengel/annotator_uncertainty/analysis', '/home/estengel/annotator_uncertainty/hit3.0/results', '/home/estengel/annotator_uncertainty/analysis', '/home/estengel/.vscode-server/extensions/ms-toolsai.jupyter-2022.6.1201981810/pythonFiles', '/home/estengel/.vscode-server/extensions/ms-toolsai.jupyter-2022.6.1201981810/pythonFiles/lib/python', '/brtx/601-nvme1/estengel/miniconda3/envs/cert/lib/python38.zip', '/brtx/601-nvme1/estengel/miniconda3/envs/cert/lib/python3.8', '/brtx/601-nvme1/estengel/miniconda3/envs/cert/lib/python3.8/lib-dynload', '', '/brtx/601-nvme1/estengel/miniconda3/envs/cert/lib/python3.8/site-packages']


In [6]:
from string_metrics import BertSimilarityScore, BleuSimilarityScore
# score_cls = BertSimilarityScore(device="cuda:0")

In [7]:
import re
from scipy.cluster.hierarchy import linkage, fcluster
import scipy 
# Print numpy floats with 2 digits of precision; affects display only, not the stored values.
np.set_printoptions(precision=2)

def read_generations(output_path):
    """Load model generations from a JSON-lines prediction file.

    Each non-blank line must be a JSON object describing one batch, with
    'question_id' (a list of ids) and 'speaker_utterances' (a list whose first
    element is the list of generations aligned with 'question_id').

    Args:
        output_path: path to the .jsonl prediction file.

    Returns:
        dict mapping each question id to its generation. If an id occurs more
        than once, the last occurrence wins.
    """
    flat_data_by_qid = {}
    # Context manager so the file handle is closed deterministically.
    with open(output_path) as predictions_file:
        for line in predictions_file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            batch_data = json.loads(line)
            generations = batch_data['speaker_utterances'][0]
            for qid, generation in zip(batch_data['question_id'], generations):
                flat_data_by_qid[qid] = generation
    return flat_data_by_qid

def clean_text(text):
    """Remove every `<...>` span (non-greedy, single-line) from `text` and trim surrounding whitespace."""
    without_tags = re.sub("<.*?>", "", text)
    return without_tags.strip()

def normalize(scores):
    """Rescale a matrix of scores so its smallest positive entry maps to 0.

    Steps:
      1. Find the smallest strictly positive entry (`min_score`).
      2. Replace exact zeros (e.g. the diagonal of a distance matrix) with
         `min_score`, so they end up at 0 after shifting.
      3. Shift by `min_score` and divide by `max - min_score`. If all entries
         are equal after step 2, divide by `max` instead; the result is then
         all zeros and no division by zero occurs.

    The input is never modified; a float copy is normalized and returned.

    Args:
        scores: array-like of numbers.

    Returns:
        np.ndarray of floats with the same shape as `scores`. If `scores` has
        no strictly positive entry (including the empty and single-element
        all-zero cases), an all-zero array is returned.
    """
    # Work on a copy so the caller's array is left untouched.
    scores = np.array(scores, dtype=float)

    positive_scores = scores[scores > 0]
    if positive_scores.size == 0:
        # Nothing to scale against; np.min on an empty array would raise.
        return np.zeros_like(scores)

    min_score = np.min(positive_scores)
    # Lift exact zeros to the minimum so they become zero after the shift.
    scores[scores == 0] = min_score

    max_score = np.max(scores)
    denom = max_score if max_score == min_score else max_score - min_score
    return (scores - min_score) / denom


# get the clusters from predictions 
def get_prediction_clusters(predictions_jsonl,
                            questions,
                            annotations,
                            score_cls,
                            t = 1.06,
                            criterion = "distance",
                            method = "centroid"):
    """Cluster the answers of each question by similarity of the generated questions.

    For every base question id (the part of 'question_id' before the
    underscore), the cleaned model generations are compared pairwise with
    `score_cls.get_similarity`, turned into distances (1 / similarity),
    normalized with `normalize`, and clustered hierarchically with scipy's
    `linkage` + `fcluster`.

    Args:
        predictions_jsonl: path to the JSON-lines file read by `read_generations`.
        questions: sequence of question dicts with a 'question_id' of the form
            "<qid>_<suffix>".
        annotations: sequence parallel to `questions`; each item has an
            'answers' list whose first entry carries 'answer' and 'mturk_id'.
        score_cls: object exposing `get_similarity(a, b)`, returning a number
            or a list whose first element is used.
        t: threshold passed to `fcluster`.
        criterion: `fcluster` criterion (e.g. "distance", "inconsistent").
        method: `linkage` method (e.g. "centroid", "ward").

    Returns:
        dict mapping qid -> defaultdict(list) mapping "g<cluster index>" ->
        list of {"answer", "question", "id"} dicts.

    Notes:
        * The defaults for `criterion` and `method` used to be swapped
          ("centroid" is not an fcluster criterion and "distance" is not a
          linkage method), so relying on them raised inside scipy.
        * A condensed distance matrix is handed to `linkage`, for which the
          `metric` argument is ignored; it is therefore not passed.
        * NOTE(review): a similarity of exactly 0 makes `1 / sim_score` fail
          or become infinite, depending on the returned numeric type —
          confirm the scorer never returns 0.
    """
    generations_by_qid = read_generations(predictions_jsonl)
    anns_by_qid = defaultdict(list)
    answers_by_qid = defaultdict(list)
    for quest, ann in zip(questions, annotations):
        qid, _suffix = quest['question_id'].split("_")
        generation = clean_text(generations_by_qid[quest['question_id']])
        anns_by_qid[qid].append(generation)
        answers_by_qid[qid].append(ann['answers'])

    answer_clusters = {}
    for qid, quest_list in tqdm(anns_by_qid.items()):
        num_questions = len(quest_list)
        if num_questions < 2:
            # Hierarchical clustering needs at least two observations; a lone
            # question forms its own cluster (fcluster labels start at 1).
            clust = [1] * num_questions
        else:
            # Symmetric distance matrix with a zero diagonal.
            scores = np.zeros((num_questions, num_questions))
            for i in range(1, num_questions):
                for j in range(i):
                    # Each unordered pair is scored once, with the later
                    # question as first argument. This is the value the
                    # previous full double loop ended up storing, at half
                    # the number of scorer calls.
                    sim_score = score_cls.get_similarity(quest_list[i], quest_list[j])
                    if isinstance(sim_score, list):
                        # Scorer returned several values; use the first one.
                        sim_score = sim_score[0]
                    distance = 1 / sim_score
                    scores[i, j] = distance
                    scores[j, i] = distance

            scores = normalize(scores)
            # Convert the square matrix to the condensed form linkage expects.
            condensed = scipy.spatial.distance.squareform(scores)
            link = linkage(condensed, method=method)
            clust = fcluster(link, t=t, criterion=criterion)

        answers_clustered = defaultdict(list)
        ans_list = answers_by_qid[qid]
        for i, idx in enumerate(clust):
            # Only the first listed answer of each annotation is used.
            first_answer = ans_list[i][0]
            cluster_dict = {"answer": first_answer['answer'],
                            "question": quest_list[i],
                            "id": first_answer['mturk_id']}
            answers_clustered[f"g{idx}"].append(cluster_dict)
        answer_clusters[qid] = answers_clustered

    return answer_clusters

In [8]:
# Silence transformers logging: 50 is the numeric value of the CRITICAL level
# in Python's standard logging scheme.
from transformers.utils import logging
logging.set_verbosity(50)

# Reference clustering: answers grouped by the annotators' rewritten questions.
ann_clusters = get_annotator_clusters(questions, annotations)
# Spot-check a single question id.
print(ann_clusters['480779000'])

# Forced-decoding predictions of the img2q T5-base model on the dev set.
pred_path = "/brtx/602-nvme1/estengel/annotator_uncertainty/models/img2q_t5_base_no_limit/output/dev_set_predictions_forced.jsonl"
# Cluster the predictions using BERT-based similarity, ward linkage and a
# distance threshold of 1.13.
pred_clusters = get_prediction_clusters(pred_path,
                                        questions, 
                                        annotations,
                                        # score_cls=score_cls,
                                        score_cls = BertSimilarityScore(device="cuda:0"),
                                        criterion="distance",
                                        method="ward",
                                        t=1.13)


# Score predicted clusters against the annotator clusters.
# Index 0 is reported as F1, 1 as precision, 2 as recall.
pred_to_ann = get_scores(pred_clusters, ann_clusters)
print(f"P: {pred_to_ann[1]*100:.2f}, R: {pred_to_ann[2]*100:.2f}, F1: {pred_to_ann[0]*100:.2f}")

defaultdict(<class 'list'>, {'What is the bottom left food item with the green colors?': [{'answer': 'pasta', 'id': 'g4.0'}, {'answer': 'slaw', 'id': 'g3.0'}, {'answer': 'broccoli', 'id': 'g6.0'}, {'answer': 'salad', 'id': 'g2.0'}, {'answer': 'vegetables', 'id': 'g1.0'}, {'answer': 'lettuce', 'id': 'g0.0'}], 'What is the top right food item with the green colors?': [{'answer': 'herb', 'id': 'g7.0'}, {'answer': 'garnish', 'id': 'g5.0'}]})


KeyboardInterrupt: 

In [None]:
# BEST: Method: centroid, Crit: distance, t: 1.06, P: 69.21, R: 96.66, F1: 77.83
# Not really, it either predicts a single cluster for everything or puts everything in its own cluster, which sucks 
#
# Grid search over similarity scorer, linkage method, fcluster criterion and
# threshold t; prints P/R/F1 against the annotator clusters for each setting.
#
# Both criteria searched here ('inconsistent', 'distance') share the same
# threshold grid, so it is defined once up front. The previous per-criterion
# branch had an `else` (t_choices = [0.0, 0.1, 0.2]) that could never run
# because every searched criterion matched the `if`.
#
# Threshold grids tried previously:
#   [0.001, 0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.035, 0.04]
#   [0.0, 0.05, 0.07, 0.1, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0]
#   [0.90, 0.91, ..., 1.00], [1.00, 1.05, 1.10], [1.06], [1.16, 1.17, 1.18]
t_choices = [1.1, 1.11, 1.12, 1.13, 1.14, 1.15, 1.16, 1.17, 1.18, 1.19, 1.2]

for score_cls in [BleuSimilarityScore(), BertSimilarityScore(device="cuda:0")]:
    for method in ["centroid", "ward"]:
        for crit in ['inconsistent', 'distance']:
            for t in t_choices:
                pred_clusters = get_prediction_clusters(pred_path,
                                                        questions,
                                                        annotations,
                                                        score_cls=score_cls,
                                                        criterion=crit,
                                                        method=method,
                                                        t=t)

                pred_to_ann = get_scores(pred_clusters, ann_clusters)
                print(f"Score cls: {score_cls}, Method: {method}, Crit: {crit}, t: {t}, P: {pred_to_ann[1]*100:.2f}, R: {pred_to_ann[2]*100:.2f}, F1: {pred_to_ann[0]*100:.2f}")


In [11]:
# Mean number of annotator clusters (distinct rewritten questions) per question id.
clusters_per_question = [len(clusters) for clusters in ann_clusters.values()]
avg_num_clusters = np.mean(clusters_per_question)
print(avg_num_clusters)

2.6
