In [1]:
from typing import Any, Generator, Protocol, List, Tuple
import json
import pandas as pd
# import nltk
# nltk.download('punkt_tab')
from sklearn.metrics import balanced_accuracy_score, f1_score, precision_score, recall_score
from minicheck.minicheck import MiniCheck
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
if torch.cuda.is_available():
  dev = "cuda:0"
else:
  dev = "cpu"
device = torch.device(dev)
print(device)


  from .autonotebook import tqdm as notebook_tqdm


cpu


In [2]:
class MinicheckEval():
    def __init__(self, model_name):
        self.model = MiniCheck(model_name=model_name, cache_dir='./ckpts')
    def run_eval(self, source, summary):
        pred_label, raw_prob, _, _ = self.model.score(docs=[source], claims=[summary])
        # print(pred_label)
        return pred_label

In [9]:
detectors = ["hhemv1", "hhem-2.1", "hhem-2.1-english", "trueteacher", "true_nli", "gpt-3.5-turbo", "gpt-4-turbo", "gpt-4o"]
minicheck_models = ['roberta-large', 'deberta-v3-large', 'flan-t5-large']#, 'Bespoke-MiniCheck-7B']
# vllm does not work on CUDA 12.4 cannot run minicheck Bespoke-MiniCheck-7B
# for 'deberta-v3-large', need to change https://github.com/Liyan06/MiniCheck/blob/main/minicheck/inference.py#L60
# 'deberta-v3-large' does not support device_map = "auto"
predictions = {detector: [] for detector in ['human'] + detectors + minicheck_models}
minicheckFlanT5 = MinicheckEval('flan-t5-large')
minicheckRoberta = MinicheckEval('roberta-large')
minicheckDeberta = MinicheckEval('deberta-v3-large')
# minicheck7B = MinicheckEval('Bespoke-MiniCheck-7B')

def get_minicheck_results(result_files, selected_annotators={}, skip_sample_ids={}, skip_meta_sample_ids=[]):
    for file_path in result_files:    
        data = json.load(open(file_path))
        # print(data)
        if file_path in selected_annotators:
            selected_annotators = selected_annotators[file_path]
        
        for sample in data:
            sample_id = sample['sample_id']
            if file_path in skip_sample_ids and sample_id in skip_sample_ids[file_path]:
                continue
            meta_sample_id = sample['meta_sample_id']
            if meta_sample_id in skip_meta_sample_ids:
                continue
            source = sample['source']
            summary = sample['summary']
            annotations = sample['annotations']
            sample_annotations = []
            for annotation in annotations:
                if selected_annotators:
                    annotator = annotation['annotator'] if not annotation['annotator_name'] else annotation['annotator_name'].split()[0].lower()
                    if annotator in selected_annotators:
                        sample_annotations.extend(annotation['label'])
                else:
                    sample_annotations.extend(annotation['label'])
            sample_annotations = set(sample_annotations)
            # human annotation
            if "Unwanted" in sample_annotations or 'Questionable' in sample_annotations:
                predictions['human'].append(0)
            else:
                predictions['human'].append(1)

            for detector in detectors:
                detector_pred = sample[f"meta_{detector}"]
                if 'hhem' in detector:
                    detector_pred = 0 if sample[f"meta_{detector}"] < 0.5 else 1
                predictions[detector].append(detector_pred)
            for detector in minicheck_models:
                if 'flan' in detector:
                    predictions[detector].extend(minicheckFlanT5.run_eval(source, summary))
                elif 'deberta' in detector:
                    predictions[detector].extend(minicheckDeberta.run_eval(source, summary))
                elif 'roberta' in detector:
                    predictions[detector].extend(minicheckRoberta.run_eval(source, summary))
                # elif '7B' in detector:
                #     predictions[detector].extend(minicheck7B.run_eval(source, summary))
                        

    pred_df = pd.DataFrame(predictions)
    print(pred_df.shape)
    return pred_df

In [4]:
result_path = 'batch_5_src_no_sports/results'
result_files = [os.path.join(result_path, f"batch_{batch_id}_annotation.json") for batch_id in range(1,10)]
skip_sample_ids = {os.path.join(result_path, "batch_5_annotation.json"): list(range(40,50))}
selected_annotators = {
        # os.path.join(result_path, "batch_3_annotation.json"): ['yujia', 'rogger'],
        os.path.join(result_path, "batch_7_annotation.json"): ['yujia', 'manveer']
}

pred_df = get_minicheck_results(result_files, skip_sample_ids=skip_sample_ids, selected_annotators=selected_annotators)


Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.28it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.61it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.98it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  5.19it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.14it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.22it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  5.42it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.03it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.62it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  5.11it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.06it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.18it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  5.33it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.20it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.50it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  5.24it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.15it/s]
Evaluating: 10

(440, 12)





In [5]:
pred_df.corr(method='pearson')

Unnamed: 0,human,hhemv1,hhem-2.1,hhem-2.1-english,trueteacher,true_nli,gpt-3.5-turbo,gpt-4-turbo,gpt-4o,roberta-large,deberta-v3-large,flan-t5-large
human,1.0,0.137751,0.135904,0.06403,-0.077987,0.0038,0.018223,0.120192,0.106831,0.195163,-0.012644,0.075341
hhemv1,0.137751,1.0,0.13392,0.112113,0.016744,0.115613,-0.132587,-0.058737,-0.077226,0.099351,0.141767,0.135959
hhem-2.1,0.135904,0.13392,1.0,0.443883,0.288849,0.179807,0.030941,0.126204,0.084131,0.261072,0.157172,0.161223
hhem-2.1-english,0.06403,0.112113,0.443883,1.0,0.169221,0.070674,-0.016833,0.057026,0.09025,0.103483,0.079814,0.223085
trueteacher,-0.077987,0.016744,0.288849,0.169221,1.0,0.273148,0.113745,0.140291,0.120004,0.142603,0.155267,0.090694
true_nli,0.0038,0.115613,0.179807,0.070674,0.273148,1.0,0.110207,0.185854,0.194216,0.106355,0.096908,0.058548
gpt-3.5-turbo,0.018223,-0.132587,0.030941,-0.016833,0.113745,0.110207,1.0,0.15178,0.069475,0.047927,0.029825,0.188032
gpt-4-turbo,0.120192,-0.058737,0.126204,0.057026,0.140291,0.185854,0.15178,1.0,0.552003,0.04648,0.059966,0.064969
gpt-4o,0.106831,-0.077226,0.084131,0.09025,0.120004,0.194216,0.069475,0.552003,1.0,-0.001413,0.016756,0.007433
roberta-large,0.195163,0.099351,0.261072,0.103483,0.142603,0.106355,0.047927,0.04648,-0.001413,1.0,0.132813,0.257395


In [6]:
pred_df.corr(method='spearman')

Unnamed: 0,human,hhemv1,hhem-2.1,hhem-2.1-english,trueteacher,true_nli,gpt-3.5-turbo,gpt-4-turbo,gpt-4o,roberta-large,deberta-v3-large,flan-t5-large
human,1.0,0.137751,0.135904,0.06403,-0.077987,0.0038,0.018223,0.120192,0.106831,0.195163,-0.012644,0.075341
hhemv1,0.137751,1.0,0.13392,0.112113,0.016744,0.115613,-0.132587,-0.058737,-0.077226,0.099351,0.141767,0.135959
hhem-2.1,0.135904,0.13392,1.0,0.443883,0.288849,0.179807,0.030941,0.126204,0.084131,0.261072,0.157172,0.161223
hhem-2.1-english,0.06403,0.112113,0.443883,1.0,0.169221,0.070674,-0.016833,0.057026,0.09025,0.103483,0.079814,0.223085
trueteacher,-0.077987,0.016744,0.288849,0.169221,1.0,0.273148,0.113745,0.140291,0.120004,0.142603,0.155267,0.090694
true_nli,0.0038,0.115613,0.179807,0.070674,0.273148,1.0,0.110207,0.185854,0.194216,0.106355,0.096908,0.058548
gpt-3.5-turbo,0.018223,-0.132587,0.030941,-0.016833,0.113745,0.110207,1.0,0.15178,0.069475,0.047927,0.029825,0.188032
gpt-4-turbo,0.120192,-0.058737,0.126204,0.057026,0.140291,0.185854,0.15178,1.0,0.552003,0.04648,0.059966,0.064969
gpt-4o,0.106831,-0.077226,0.084131,0.09025,0.120004,0.194216,0.069475,0.552003,1.0,-0.001413,0.016756,0.007433
roberta-large,0.195163,0.099351,0.261072,0.103483,0.142603,0.106355,0.047927,0.04648,-0.001413,1.0,0.132813,0.257395


In [8]:
performance_results = {}
for detector in detectors + minicheck_models:
    detector_results = {
        "ba": round(balanced_accuracy_score(pred_df['human'], pred_df[detector])*100,2),
        "f1-macro": round(f1_score(pred_df['human'], pred_df[detector], pos_label=1, average="macro")*100,2),
        "f1-halu": round(f1_score(pred_df['human'], pred_df[detector], pos_label=0)*100,2),
        "pr-halu": round(precision_score(pred_df['human'], pred_df[detector], pos_label=0)*100,2),
        're-halu': round(recall_score(pred_df['human'], pred_df[detector], pos_label=0)*100,2),
        "f1-cons": round(f1_score(pred_df['human'], pred_df[detector], pos_label=1)*100,2),
        "pr-cons": round(precision_score(pred_df['human'], pred_df[detector], pos_label=1)*100,2),
        're-cons': round(recall_score(pred_df['human'], pred_df[detector], pos_label=1)*100,2)
    }
    performance_results[detector] = detector_results
pd.DataFrame.from_dict(performance_results, orient='index')

Unnamed: 0,ba,f1-macro,f1-halu,pr-halu,re-halu,f1-cons,pr-cons,re-cons
hhemv1,60.61,54.13,21.71,14.74,41.18,86.55,94.2,80.05
hhem-2.1,58.67,56.11,21.51,16.95,29.41,90.72,93.7,87.93
hhem-2.1-english,52.93,53.15,12.9,14.29,11.76,93.4,92.72,94.09
trueteacher,44.69,45.57,3.88,2.9,5.88,87.26,91.37,83.5
true_nli,50.12,49.54,4.35,8.33,2.94,94.72,92.29,97.29
gpt-3.5-turbo,51.63,45.08,13.76,8.39,38.24,76.41,92.63,65.02
gpt-4-turbo,58.42,54.62,20.37,14.86,32.35,88.86,93.72,84.48
gpt-4o,57.32,54.2,19.23,14.29,29.41,89.18,93.51,85.22
roberta-large,68.26,44.71,22.22,12.78,85.29,67.21,97.65,51.23
deberta-v3-large,48.86,32.51,13.33,7.47,61.76,51.68,91.82,35.96
