In [1]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification
from peft import PeftModel
from datasets import load_dataset
from typing import Any, Generator, Protocol, List, Tuple
import torch
import numpy as np
import pandas as pd
if torch.cuda.is_available():
  dev = "cuda:0"
else:
  dev = "cpu"
device = torch.device(dev)
print(device)


  from .autonotebook import tqdm as notebook_tqdm


cuda:0


In [2]:
def _create_batch(source: List[str], summary: List[str], batch_size: int) -> Generator:
    l = len(source)
    for ndx in range(0, l, batch_size):
        batch = []
        for i in range(ndx, min(ndx + batch_size, l)):
            batch.append([source[i], summary[i]])
        
        yield batch

In [3]:
def HHEMv1Eval(filename, col_name = 'HHEMv1', device=device, update=True, batch_size=20):
    df = pd.read_csv(filename, encoding='utf-8').fillna('')
    if (not update) and (col_name in df): # store HHEM scores
        return
    model = AutoModelForSequenceClassification.from_pretrained('vectara/hallucination_evaluation_model',revision = 'hhem-1.0-open').to(device)
    tokenizer = AutoTokenizer.from_pretrained('vectara/hallucination_evaluation_model', revision = 'hhem-1.0-open')
    scores = []
    for batch in _create_batch(df['source'].tolist(), df['summary'].tolist(), batch_size):
        inputs = tokenizer.batch_encode_plus(batch, return_tensors='pt', padding=True).to(device)
        model.eval()
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits.cpu().numpy()
            # convert logits to probabilities
            batch_scores = 1 / (1 + np.exp(-logits)).flatten()
            batch_scores = [round(x, 5) for x in batch_scores]
            scores += batch_scores
    # print(scores)
    if col_name in df:
        df[col_name] = scores
    else:
        df.insert(len(df.columns), col_name, scores)
    df.to_csv(filename, mode='w', index=False, header=True)
    print('HHEMv1 Scores have been saved')


In [4]:
def HHEM21Eval(filename, col_name = 'HHEM-2.1', device=device, update=True, batch_size=20):
    df = pd.read_csv(filename, encoding='utf-8').fillna('')
    if (not update) and (col_name in df): # store HHEM scores
        return
    prompt = "<pad> Determine if the hypothesis is true given the premise?\n\nPremise: {text1}\n\nHypothesis: {text2}"
    tokenizer = AutoTokenizer.from_pretrained('t5-base')
    config = AutoConfig.from_pretrained('google/flan-t5-large')
    model = AutoModelForTokenClassification.from_pretrained('vectara/HHEM-2.1', config=config).to(device)
    scores = []
    for batch in _create_batch(df['source'].tolist(), df['summary'].tolist(), batch_size):
        inputs = tokenizer([prompt.format(text1=pair[0], text2=pair[1]) for pair in batch], 
                            return_tensors='pt', padding='longest').to(device)
        model.eval()
        with torch.no_grad():
            output = model(**inputs)
        logits = output.logits

        logits = logits[:,0,:] # get the logits on the first token

        logits = torch.softmax(logits, dim=-1)
        batch_scores = [round(x, 5) for x in logits[:, 1].tolist()] # list of float
        scores += batch_scores
    # print(scores)
    print(len(scores))
    if col_name in df:
        df[col_name] = scores
    else:
        df.insert(len(df.columns), col_name, scores)
    df.to_csv(filename, mode='w', index=False, header=True)
    print('HHEM-2.1 Scores have been saved')

In [5]:
def HHEM21EnglishEval(filename, col_name = 'HHEM-2.1-English', device=device, update=True, batch_size=20):
    df = pd.read_csv(filename, encoding='utf-8').fillna('')
    if (not update) and (col_name in df): # store HHEM scores
        return
    prompt = "<pad> Determine if the hypothesis is true given the premise?\n\nPremise: {text1}\n\nHypothesis: {text2}"
    config = AutoConfig.from_pretrained('google/flan-t5-large')
    tokenizer = AutoTokenizer.from_pretrained('t5-base')
    model = AutoModelForTokenClassification.from_pretrained('vectara/HHEM-2.1-English', config=config).to(device)
    scores = []
    for batch in _create_batch(df['source'].tolist(), df['summary'].tolist(), batch_size):
        inputs = tokenizer([prompt.format(text1=pair[0], text2=pair[1]) for pair in batch], 
                            return_tensors='pt', padding='longest').to(device)
        model.eval()
        with torch.no_grad():
            output = model(**inputs)
        logits = output.logits

        logits = logits[:,0,:] # get the logits on the first token

        logits = torch.softmax(logits, dim=-1)
        batch_scores = [round(x, 5) for x in logits[:, 1].tolist()] # list of float
        scores += batch_scores
    # print(scores)
    if col_name in df:
        df[col_name] = scores
    else:
        df.insert(len(df.columns), col_name, scores)
    df.to_csv(filename, mode='w', index=False, header=True)
    print('HHEM-2.1-English Scores have been saved')

In [6]:
complete_df = pd.read_csv('../leaderboard_results/leaderboard_summaries.csv', encoding='utf-8')
models = set(complete_df['model'].values.tolist())
print(models)
print(len(models))
for idx, model_name in enumerate(models):
    if model_name not in ['openai/o1-mini', 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'google/Gemini-Pro','google/gemma-2-9b-it','CohereForAI/c4ai-command-r-plus','anthropic/Claude-2', '01-ai/Yi-1.5-6B-Chat', '01-ai/Yi-1.5-9B-Chat']:
        continue
    filename = model_name + '.csv'
    print(f"Processing file {str(idx)}: {filename} ......")
    HHEMv1Eval(filename, batch_size=15)
    HHEM21Eval(filename, batch_size=15)
    HHEM21EnglishEval(filename, batch_size=15)
    print(f"Finshed {filename}")
    print('='*20)

{'anthropic/Claude-3-sonnet', 'google/gemma-7b-it', 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'google/gemma-1.1-7b-it', 'meta-llama/Llama-3-70B-chat-hf', 'microsoft/Phi-3.5-MoE-instruct', 'microsoft/WizardLM-2-8x22B', 'microsoft/Phi-2', 'databricks/dbrx-instruct', 'google/gemma-2-9b-it', 'Anthropic/claude-3-5-sonnet-20240620', 'openai/GPT-4o-mini', 'google/PaLM-2', 'meta-llama/Llama-3-8B-chat-hf', 'deepseek/deepseek-chat', 'google/gemini-1.5-pro-001', 'meta-llama/Llama-2-13b-chat-hf', 'amazon/Titan-Express', '01-ai/Yi-1.5-34B-Chat', 'openai/o1-preview', 'cohere/command-r-plus-08-2024', '01-ai/Yi-1.5-9B-Chat', 'google/gemma-1.1-2b-it', 'meta-llama/Meta-Llama-3.1-70B-Instruct', 'anthropic/Claude-2', 'openai/GPT-3.5-Turbo', 'openai/gpt-4o', 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'microsoft/Phi-3-mini-128k-instruct', 'cohere/Cohere', 'google/Gemini-1.5-Pro', 'mistralai/Mistral-7B-Instruct-v0.3', 'Qwen/Qwen2-VL-2B-Instruct', 'google/Gemini-1.5-flash', 'google/gemini-pro-experimental', 'mist

Token indices sequence length is longer than the specified maximum sequence length for this model (813 > 512). Running this sequence through the model will result in indexing errors


HHEMv1 Scores have been saved


Some weights of T5ForTokenClassification were not initialized from the model checkpoint at google/flan-t5-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1006
HHEM-2.1 Scores have been saved


Some weights of T5ForTokenClassification were not initialized from the model checkpoint at google/flan-t5-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HHEM-2.1-English Scores have been saved
Finshed google/gemma-2-9b-it.csv
Processing file 21: 01-ai/Yi-1.5-9B-Chat.csv ......


Token indices sequence length is longer than the specified maximum sequence length for this model (778 > 512). Running this sequence through the model will result in indexing errors


HHEMv1 Scores have been saved


Some weights of T5ForTokenClassification were not initialized from the model checkpoint at google/flan-t5-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1006
HHEM-2.1 Scores have been saved


Some weights of T5ForTokenClassification were not initialized from the model checkpoint at google/flan-t5-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HHEM-2.1-English Scores have been saved
Finshed 01-ai/Yi-1.5-9B-Chat.csv
Processing file 24: anthropic/Claude-2.csv ......


Token indices sequence length is longer than the specified maximum sequence length for this model (1049 > 512). Running this sequence through the model will result in indexing errors


HHEMv1 Scores have been saved


Some weights of T5ForTokenClassification were not initialized from the model checkpoint at google/flan-t5-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


831
HHEM-2.1 Scores have been saved


Some weights of T5ForTokenClassification were not initialized from the model checkpoint at google/flan-t5-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HHEM-2.1-English Scores have been saved
Finshed anthropic/Claude-2.csv
Processing file 27: meta-llama/Meta-Llama-3.1-8B-Instruct.csv ......


Token indices sequence length is longer than the specified maximum sequence length for this model (770 > 512). Running this sequence through the model will result in indexing errors


HHEMv1 Scores have been saved


Some weights of T5ForTokenClassification were not initialized from the model checkpoint at google/flan-t5-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1006
HHEM-2.1 Scores have been saved


Some weights of T5ForTokenClassification were not initialized from the model checkpoint at google/flan-t5-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HHEM-2.1-English Scores have been saved
Finshed meta-llama/Meta-Llama-3.1-8B-Instruct.csv
Processing file 37: openai/o1-mini.csv ......


Token indices sequence length is longer than the specified maximum sequence length for this model (785 > 512). Running this sequence through the model will result in indexing errors


HHEMv1 Scores have been saved


Some weights of T5ForTokenClassification were not initialized from the model checkpoint at google/flan-t5-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1006
HHEM-2.1 Scores have been saved


Some weights of T5ForTokenClassification were not initialized from the model checkpoint at google/flan-t5-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HHEM-2.1-English Scores have been saved
Finshed openai/o1-mini.csv
Processing file 51: google/Gemini-Pro.csv ......


Token indices sequence length is longer than the specified maximum sequence length for this model (852 > 512). Running this sequence through the model will result in indexing errors


HHEMv1 Scores have been saved


Some weights of T5ForTokenClassification were not initialized from the model checkpoint at google/flan-t5-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1006
HHEM-2.1 Scores have been saved


Some weights of T5ForTokenClassification were not initialized from the model checkpoint at google/flan-t5-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HHEM-2.1-English Scores have been saved
Finshed google/Gemini-Pro.csv
Processing file 58: CohereForAI/c4ai-command-r-plus.csv ......


Token indices sequence length is longer than the specified maximum sequence length for this model (768 > 512). Running this sequence through the model will result in indexing errors


HHEMv1 Scores have been saved


Some weights of T5ForTokenClassification were not initialized from the model checkpoint at google/flan-t5-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1006
HHEM-2.1 Scores have been saved


Some weights of T5ForTokenClassification were not initialized from the model checkpoint at google/flan-t5-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HHEM-2.1-English Scores have been saved
Finshed CohereForAI/c4ai-command-r-plus.csv
Processing file 59: 01-ai/Yi-1.5-6B-Chat.csv ......


Token indices sequence length is longer than the specified maximum sequence length for this model (866 > 512). Running this sequence through the model will result in indexing errors


HHEMv1 Scores have been saved


Some weights of T5ForTokenClassification were not initialized from the model checkpoint at google/flan-t5-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1006
HHEM-2.1 Scores have been saved


Some weights of T5ForTokenClassification were not initialized from the model checkpoint at google/flan-t5-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HHEM-2.1-English Scores have been saved
Finshed 01-ai/Yi-1.5-6B-Chat.csv
