In [1]:
import jsonlines
import json
import pandas as pd
import numpy as np
from sklearn.metrics import balanced_accuracy_score, f1_score

In [2]:
detectors = {
    "HHEMv1":"HHEM-1", 
    "HHEM-2.1": "HHEM-2.1-Tri" , 
    "HHEM-2.1-English": "HHEM-2.1-English", 
    "HHEM-2.1-Open": "HHEM-2.1-Open",
    "alignscore-base": "AlignScore-BS",
    "alignscore-large": "AlignScore-LG",
    "trueteacher": "True-Teacher", 
    "true_nli": "True-NLI", 
    "gpt-3.5-turbo": "GPT-3.5-Turbo, zero-shot", 
    "gpt-4-turbo": "GPT-4-Turbo, zero-shot", 
    "gpt-4o": "GPT-4o, zero-shot", 
    "gpt-4": "GPT-4, zero-shot",
    "minicheck-roberta-large": "Minicheck-Roberta-LG",
    "minicheck-deberta-v3-large": "Minicheck-Deberta-LG",
    "minicheck-flan-t5-large": "Minicheck-Flan-T5-LG",
    "Ragas_gpt-4o": "Ragas-GPT-4o",
    "Trulens_gpt-4o_scores": "Trulens-GPT-4o",
}

In [3]:
def best_pooling(labels):
    if 'Consistent' in labels:
        return 1
    else:
        return 0

def worst_pooling(labels):
    if 'Unwanted' in labels or 'Questionable' in labels or 'Benign' in labels:
        return 0
    else:
        return 1

In [4]:
def load_predictions(method):
    '''
    method: `best pooling` or  `worst pooling`
    '''
    assert method in ['worst-pooling', 'best-pooling'], 'Only support \'worst-pooling\' or \'best-pooling\''
    predictions = {detector: [] for detector in ['human'] + list(detectors.keys())}
    for detector in predictions:
        # print(detector)
        if 'ragas' in detector.lower():
            with open('processed_ragas_claim_level_preds.jsonl') as reader:
                for record in jsonlines.Reader(reader):
                    for sent, sent_result in record['results'].items():
                        if len(sent_result['claims']) < 1: # no prediction
                            predictions[detector].append(1)
                        elif 0 in sent_result['claim_preds']:
                            predictions[detector].append(0)
                        else:
                            predictions[detector].append(1)
        elif 'trulens' in detector.lower():
            with open('processed_trulens_claim_level_preds.jsonl') as reader:
                for record in jsonlines.Reader(reader):
                        for sent, sent_result in record['results'].items():
                            if len(sent_result['claims']) < 1: # no prediction
                                predictions[detector].append(1)
                            elif np.mean(sent_result['claim_preds']) < 1:
                                predictions[detector].append(0)
                            else:
                                predictions[detector].append(1)
        else:
            with open('dectectors_claim_level_preds.json') as f:
                data = json.load(f)
                for meta_id in data:
                    record = data[meta_id]
                    for _, results in record.items():
                        if 'human' in detector.lower():
                            if method == 'best-pooling':
                                predictions[detector].append(best_pooling(results['labels']))
                            else:
                                predictions[detector].append(worst_pooling(results['labels']))
                        else:
                            if results[detector] is not None:
                                predictions[detector].append(int(results[detector] > 0.5))
                            else: # prediction is None. may occur for trueteacher/truenli
                                if method == 'best-pooling':
                                    predictions[detector].append(1-best_pooling(results['labels']))
                                else:
                                    predictions[detector].append(1-worst_pooling(results['labels']))
                                

    return predictions

In [5]:
def compute_performance(pred_df):
    performance_results = {}
    for detector in list(detectors.keys()):
        detector_results = {
            "ba": round(balanced_accuracy_score(pred_df['human'], pred_df[detector])*100,2),
            "f1-macro": round(f1_score(pred_df['human'], pred_df[detector], pos_label=1, average="macro")*100,2),
            # "f1-halu": round(f1_score(self.pred_df['human'], self.pred_df[detector], pos_label=0)*100,2),
            # "pr-halu": round(precision_score(self.pred_df['human'], self.pred_df[detector], pos_label=0)*100,2),
            # 're-halu': round(recall_score(self.pred_df['human'], self.pred_df[detector], pos_label=0)*100,2),
            # "f1-cons": round(f1_score(self.pred_df['human'], self.pred_df[detector], pos_label=1)*100,2),
            # "pr-cons": round(precision_score(self.pred_df['human'], self.pred_df[detector], pos_label=1)*100,2),
            # 're-cons': round(recall_score(self.pred_df['human'], self.pred_df[detector], pos_label=1)*100,2)
        }
        performance_results[detector] = detector_results
        df = pd.DataFrame.from_dict(performance_results, orient='index')
        df = df.rename(columns=detectors)
    return df

In [6]:
preds = load_predictions('best-pooling')
# print(preds)
# for detector in preds:
#     print(detector)
#     print(len(preds[detector]))
pred_df = pd.DataFrame(preds)
best_pooling_results = compute_performance(pred_df)
best_pooling_results

Unnamed: 0,ba,f1-macro
HHEMv1,49.96,49.02
HHEM-2.1,54.15,50.36
HHEM-2.1-English,52.72,47.36
HHEM-2.1-Open,54.36,50.78
alignscore-base,53.3,52.77
alignscore-large,55.96,55.84
trueteacher,51.38,48.39
true_nli,50.89,48.62
gpt-3.5-turbo,50.0,39.77
gpt-4-turbo,50.0,39.77


In [7]:
preds = load_predictions('worst-pooling')
pred_df = pd.DataFrame(preds)
worst_pooling_results = compute_performance(pred_df)
worst_pooling_results

Unnamed: 0,ba,f1-macro
HHEMv1,49.96,49.02
HHEM-2.1,54.15,50.36
HHEM-2.1-English,52.72,47.36
HHEM-2.1-Open,54.36,50.78
alignscore-base,53.3,52.77
alignscore-large,55.96,55.84
trueteacher,51.38,48.39
true_nli,50.89,48.62
gpt-3.5-turbo,50.0,39.77
gpt-4-turbo,50.0,39.77
