In [1]:
import json
import csv
import os
# Notebook configuration: where the evaluation results are read from and the
# base name (without extension) used for the exported selection.
# `_dh` is read from the kernel's globals — presumably IPython's directory
# history, whose first entry is the directory the kernel started in; confirm.
# This lookup fails with KeyError outside an environment that defines `_dh`.
current_folder = globals()['_dh'][0]
# The results folder is resolved relative to the PARENT of current_folder.
# NOTE(review): 'gooogle' is spelled with three o's — confirm it matches the on-disk folder name.
eval_results_folder = os.path.join(os.path.dirname(current_folder), 'hhem2/_eval/eval_output/leaderboard_3_hems_with_gooogle_2/')
# Input JSON file (loaded as a list of records by the aggregation cell).
file_name = 'leaderboard_summaries_with_predictions.json'
# Output base name; '.json' and '.csv' are appended when saving.
output_file = 'leaderboard_summaries_with_predictions_selected'

In [2]:
# Selection criteria: a record is selected only if ALL listed criteria hold.
# Each name must match a predicate function defined in this notebook;
# check_criteria looks the function up by name.
# valid_summary: summary contains at least 5 whitespace-separated words
# hhem2_hhems_disagree: hem_v1 == HHEM-2-free AND HHEM-2.1 != HHEM-2-free
# googles_disagree: google-t5_trueteacher != google-t5_true_nli
# hhem2_google_disagree: HHEM-2.1 != google-t5_trueteacher OR HHEM-2.1 != google-t5_true_nli

criteria = ['valid_summary', 'hhem2_hhems_disagree', 'googles_disagree', 'hhem2_google_disagree']

In [3]:
# Record keys holding each detector's prediction. The aggregation cell treats
# a value of 0 under one of these keys as "this detector flagged the record"
# (see `detected_halu`).
detectors = ['hem_v1', 'HHEM-2.1', 'HHEM-2-free', 'google-t5_trueteacher', 'google-t5_true_nli']

In [4]:
def valid_summary(record):
    '''
    input: a record with a 'summary' string
    return: True when the summary has at least 5 whitespace-separated words
    '''
    word_count = len(record['summary'].split())
    return word_count >= 5

def hhem2_hhems_disagree(record):
    '''
    input: a record containing the 'hem_v1', 'HHEM-2.1' and 'HHEM-2-free' predictions
    return: True when hem_v1 and HHEM-2-free agree with each other
            while HHEM-2.1 differs from them, otherwise False
    '''
    v1_prediction = record['hem_v1']
    free_prediction = record['HHEM-2-free']
    # The two older models must agree first; HHEM-2.1 is only read if they do.
    if v1_prediction != free_prediction:
        return False
    return record['HHEM-2.1'] != free_prediction

def googles_disagree(record):
    '''
    input: a record containing the two google-t5 predictions
    return: True when google-t5_trueteacher and google-t5_true_nli differ
    '''
    trueteacher_prediction = record['google-t5_trueteacher']
    true_nli_prediction = record['google-t5_true_nli']
    return trueteacher_prediction != true_nli_prediction

def hhem2_google_disagree(record):
    '''
    input: a record containing the 'HHEM-2.1' and the two google-t5 predictions
    return: True when HHEM-2.1 differs from at least one of the google models
    '''
    hhem21_prediction = record['HHEM-2.1']
    # Disagreeing with trueteacher is already enough; true_nli is only read otherwise.
    if hhem21_prediction != record['google-t5_trueteacher']:
        return True
    return hhem21_prediction != record['google-t5_true_nli']

def check_criteria(criteria, record):
    '''
    Check whether a record satisfies every criterion.

    Inputs:
        - criteria: list of criteria that needs to be checked. Each entry is
          either the name (str) of a predicate function defined at module
          level, or a callable taking the record and returning a truthy/falsy
          value.
        - record: a record containing scores from hhem1, hhem21, hhem2free, and google models
    Return:
        boolean: True when all criteria hold (also for an empty list),
        False as soon as one criterion fails.
    Raises:
        NameError: a criterion name does not refer to a module-level callable.
        TypeError: a criterion is neither a string nor a callable.
    '''
    for c in criteria:
        if callable(c):
            select_fn = c
        elif isinstance(c, str):
            # Plain namespace lookup instead of eval(): only an existing
            # function name is accepted, arbitrary expressions are not executed.
            select_fn = globals().get(c)
            if not callable(select_fn):
                raise NameError(f"criterion {c!r} is not defined as a function")
        else:
            raise TypeError(f"criterion must be a function name or a callable, got {type(c).__name__}")
        if not select_fn(record):
            return False
    return True

In [5]:
# Select the records that satisfy all criteria and tally, per LLM summarizer:
#   summarizer_count             -> number of selected records
#   summarizer_count_combination -> number of selected records per combination
#                                   of detectors whose prediction is 0
#   summarizer_count_each        -> 'total' plus, per detector, the number of
#                                   selected records whose prediction is 0
summarizer_count = {}
summarizer_count_combination = {}
summarizer_count_each = {}
selected_records = []

# Read the whole file first so the handle is closed before any processing.
with open(os.path.join(eval_results_folder, file_name)) as f:
    data = json.load(f)

for record in data:
    if not check_criteria(criteria, record):
        continue
    selected_records.append(record)

    # Detectors whose prediction for this record is 0, in `detectors` order.
    detected_halu = [detector for detector in detectors if record[detector] == 0]
    summarizer = record['LLM-summarizer']
    combination = ' + '.join(detected_halu)

    summarizer_count[summarizer] = summarizer_count.get(summarizer, 0) + 1

    combination_counts = summarizer_count_combination.setdefault(summarizer, {})
    combination_counts[combination] = combination_counts.get(combination, 0) + 1

    if summarizer not in summarizer_count_each:
        summarizer_count_each[summarizer] = {k: 0 for k in ['total'] + detectors}
    each_counts = summarizer_count_each[summarizer]
    each_counts['total'] += 1
    for detector in detected_halu:
        each_counts[detector] += 1

# Order every tally by number of selected records, largest first
# (sorted() is stable, so ties keep first-seen order).
summarizer_count = dict(sorted(summarizer_count.items(), key=lambda item: item[1], reverse=True))
summarizer_count_combination = dict(sorted(summarizer_count_combination.items(), key=lambda item: sum(item[1].values()), reverse=True))
summarizer_count_each = dict(sorted(summarizer_count_each.items(), key=lambda item: item[1]["total"], reverse=True))

In [6]:
# Save the selected records twice: as pretty-printed JSON and as CSV.
with open(output_file + '.json', 'w') as f:
    json.dump(selected_records, f, indent=2)

# Use a context manager so the CSV is flushed and closed when the cell ends
# (the handle was previously left open, so buffered rows might not reach disk).
with open(output_file + '.csv', 'w', newline='') as data_file:
    csv_writer = csv.writer(data_file)
    # The loop variable is deliberately not called `data`, so the dataset
    # loaded by the previous cell is not overwritten.
    for row_index, selected_record in enumerate(selected_records):
        if row_index == 0:
            # Header comes from the first record's keys; later rows are written
            # positionally, so all records are assumed to share the same keys
            # in the same order — TODO confirm.
            csv_writer.writerow(selected_record.keys())
        csv_writer.writerow(selected_record.values())

print(f'{len(selected_records)} selected examples are saved to {output_file} .json/.csv')

466 selected examples are saved to leaderboard_summaries_with_predictions_selected .json/.csv


In [7]:

# Number of selected records per LLM summarizer, largest count first.
print(json.dumps(summarizer_count, indent=2))

{
  "mistral 7b": 59,
  "cohere": 43,
  "google/gemma-1.1-2b-it": 37,
  "Mixtral 8x7B": 36,
  "cohere-chat": 31,
  "Amazon Titan Express": 21,
  "Phi-2": 18,
  "claude": 18,
  "microsoft/WizardLM-2-8x22B": 17,
  "Google Gemini Pro": 16,
  "meta-llama/Llama-3-8B-chat-hf": 16,
  "google/Gemini-1.5-Pro": 16,
  "llama2 7b": 15,
  "llama2 13b": 14,
  "palm-chat": 13,
  "CohereForAI/c4ai-command-r-plus": 13,
  "google/gemma-2-9b-it": 11,
  "google/gemma-1.1-7b-it": 10,
  "google/Gemini-1.5-flash": 10,
  "palm": 9,
  "apple/OpenELM-3B-Instruct": 7,
  "llama2 70b": 6,
  "mistralai/Mixtral-8x22B-Instruct-v0.1": 6,
  "Anthropic/claude-3-5-sonnet-20240620": 6,
  "databricks/dbrx-instruct": 5,
  "meta-llama/Llama-3-70B-chat-hf": 5,
  "gpt-4": 4,
  "gpt3.5": 2,
  "openai/GPT-4-Turbo-2024-04-09": 1,
  "snowflake/snowflake-arctic-instruct": 1
}


In [8]:
# Per summarizer: number of selected records for each combination of detectors
# whose prediction is 0 (names joined with ' + '); summarizers are ordered by
# their total number of selected records, largest first.
print(json.dumps(summarizer_count_combination, indent=2))

{
  "mistral 7b": {
    "HHEM-2.1 + google-t5_trueteacher": 56,
    "hem_v1 + HHEM-2-free + google-t5_trueteacher": 1,
    "HHEM-2.1 + google-t5_true_nli": 2
  },
  "cohere": {
    "HHEM-2.1 + google-t5_trueteacher": 40,
    "hem_v1 + HHEM-2-free + google-t5_true_nli": 1,
    "hem_v1 + HHEM-2-free + google-t5_trueteacher": 1,
    "HHEM-2.1 + google-t5_true_nli": 1
  },
  "google/gemma-1.1-2b-it": {
    "HHEM-2.1 + google-t5_true_nli": 3,
    "HHEM-2.1 + google-t5_trueteacher": 32,
    "hem_v1 + HHEM-2-free + google-t5_trueteacher": 2
  },
  "Mixtral 8x7B": {
    "HHEM-2.1 + google-t5_trueteacher": 33,
    "hem_v1 + HHEM-2-free + google-t5_true_nli": 1,
    "hem_v1 + HHEM-2-free + google-t5_trueteacher": 2
  },
  "cohere-chat": {
    "HHEM-2.1 + google-t5_trueteacher": 30,
    "hem_v1 + HHEM-2-free + google-t5_trueteacher": 1
  },
  "Amazon Titan Express": {
    "HHEM-2.1 + google-t5_trueteacher": 20,
    "HHEM-2.1 + google-t5_true_nli": 1
  },
  "Phi-2": {
    "HHEM-2.1 + google-t5_tru

In [9]:
# Per summarizer: 'total' selected records and, for each detector, how many of
# those records it predicted as 0; ordered by 'total', largest first.
print(json.dumps(summarizer_count_each, indent=2))

{
  "mistral 7b": {
    "total": 59,
    "hem_v1": 1,
    "HHEM-2.1": 58,
    "HHEM-2-free": 1,
    "google-t5_trueteacher": 57,
    "google-t5_true_nli": 2
  },
  "cohere": {
    "total": 43,
    "hem_v1": 2,
    "HHEM-2.1": 41,
    "HHEM-2-free": 2,
    "google-t5_trueteacher": 41,
    "google-t5_true_nli": 2
  },
  "google/gemma-1.1-2b-it": {
    "total": 37,
    "hem_v1": 2,
    "HHEM-2.1": 35,
    "HHEM-2-free": 2,
    "google-t5_trueteacher": 34,
    "google-t5_true_nli": 3
  },
  "Mixtral 8x7B": {
    "total": 36,
    "hem_v1": 3,
    "HHEM-2.1": 33,
    "HHEM-2-free": 3,
    "google-t5_trueteacher": 35,
    "google-t5_true_nli": 1
  },
  "cohere-chat": {
    "total": 31,
    "hem_v1": 1,
    "HHEM-2.1": 30,
    "HHEM-2-free": 1,
    "google-t5_trueteacher": 31,
    "google-t5_true_nli": 0
  },
  "Amazon Titan Express": {
    "total": 21,
    "hem_v1": 0,
    "HHEM-2.1": 21,
    "HHEM-2-free": 0,
    "google-t5_trueteacher": 20,
    "google-t5_true_nli": 1
  },
  "Phi-2": {
    