In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoTokenizer, AutoModelForSeq2SeqLM
from typing import Any, Generator, List
import torch
import numpy as np
import pandas as pd
import json
import sys
import os
# caution: path[0] is reserved for script path (or '' in REPL)
sys.path.insert(1, '../eval')
from utils import *

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Only one checkpoint may be loaded at a time -- loading both causes OOM.
# To evaluate with TrueTeacher instead, disable the TRUE-NLI block below and
# enable the following three lines:
# MODEL_NAME = 'trueteacher'
# tokenizer = T5Tokenizer.from_pretrained('google/t5_11b_trueteacher_and_anli')
# model = T5ForConditionalGeneration.from_pretrained('google/t5_11b_trueteacher_and_anli').to(device)

# MODEL_NAME is the key under which this detector's predictions are stored by
# the scoring cells further down.
MODEL_NAME = 'true_nli'
tokenizer = AutoTokenizer.from_pretrained('google/t5_xxl_true_nli_mixture')
model = AutoModelForSeq2SeqLM.from_pretrained('google/t5_xxl_true_nli_mixture')
model = model.to(device)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Loading checkpoint shards: 100%|██████████| 5/5 [00:00<00:00,  5.84it/s]


In [None]:
def _create_batch(source: List[str], summary: List[str], batch_size: int=1) -> Generator:
    l = len(source)
    for ndx in range(0, l, batch_size):
        batch = []
        for i in range(ndx, min(ndx + batch_size, l)):
            batch.append([source[i], summary[i]])
        
        yield batch

In [None]:
def _parse_binary_label(decoded):
    """Return 1 or 0 if the first whitespace-separated token of ``decoded`` is '1' or '0', else None."""
    tokens = decoded.split()
    if tokens and tokens[0] in ('1', '0'):
        return int(tokens[0])
    return None


def TrueTeacherEval(input_pairs):
    """Score ``[source, summary]`` pairs with the currently loaded checkpoint.

    Each pair is rendered as ``"premise: {source} hypothesis: {summary}"``. The
    batch is tokenized (padded to the longest item, truncated to 2048 tokens)
    and the module-level ``model`` generates at most 5 new tokens per pair.
    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        input_pairs: sequence of ``[source, summary]`` pairs.

    Returns:
        A list with one entry per input pair: ``1`` or ``0`` when the first
        whitespace-separated token of the decoded output is exactly ``'1'`` or
        ``'0'``, otherwise ``None``. An empty input yields an empty list.
        NOTE(review): presumably 1 means the summary is supported by the
        source -- confirm against the model card.
    """
    # Nothing to score: avoid sending an empty batch to the tokenizer/model.
    if len(input_pairs) == 0:
        return []

    prompt = "premise: {source} hypothesis: {summary}"
    prompts = [prompt.format(source=pair[0], summary=pair[1]) for pair in input_pairs]
    inputs = tokenizer(
        prompts,
        return_tensors='pt',
        truncation=True,
        padding="longest",
        max_length=2048,
    ).to(device)

    with torch.no_grad():
        # Pass the attention mask explicitly: with padding="longest" the shorter
        # prompts in a batch contain pad tokens that must not be attended to.
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=5,
        )

    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return [_parse_binary_label(text) for text in decoded]

## Sample-level Prediction

In [None]:
# Locate the data folder next to this notebook's directory (i.e. under the
# parent directory). `__file__` is not defined inside a Jupyter kernel, so fall
# back to the working directory, which the other cells already assume is the
# notebook's own directory (they use '../eval' and '../assign').
_this_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else os.getcwd()
data_folder = os.path.join(os.path.dirname(_this_dir), 'backup_data_with_detector_results')

# Collect every file that sits one level below data_folder (<provider>/<model>.csv).
subfolders = [f.path for f in os.scandir(data_folder) if f.is_dir()]
model_files = []
for subfolder in subfolders:
    model_files += [f.path for f in os.scandir(subfolder) if f.is_file()]
print(f"Found {len(model_files)} files under {data_folder}")

# Only result files whose '<provider>/<model>' key is listed here are scored.
selected_models = [
    "openai/GPT-3.5-Turbo",
    "openai/gpt-4o",
    "Qwen/Qwen2.5-7B-Instruct",
    "microsoft/Phi-3-mini-4k-instruct",
    "cohere/command-r-08-2024",
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "meta-llama/Meta-Llama-3.1-70B-Instruct",
    "google/gemini-1.5-flash-001",
    "Anthropic/claude-3-5-sonnet-20240620",
    "mistralai/Mistral-7B-Instruct-v0.3",
]
batch_size = 2
# Name of the CSV column that receives this detector's predictions.
column_name = 'google/t5_xxl_true_nli_mixture' if MODEL_NAME == 'true_nli' else 'google/t5_11b_trueteacher_and_anli'

for file_name in model_files:
    # Build the '<provider>/<model>' key from the path relative to data_folder.
    # relpath/splitext avoid depending on the OS path separator and only strip
    # a trailing '.csv' extension rather than any '.csv' substring.
    rel_path, ext = os.path.splitext(os.path.relpath(file_name, data_folder))
    model_key = rel_path.replace(os.sep, '/')
    if ext != '.csv' or model_key not in selected_models:
        continue
    print(model_key)

    df = pd.read_csv(file_name).fillna('')
    scores = []
    for batch in _create_batch(df['source'].tolist(), df['summary'].tolist(), batch_size):
        scores.extend(TrueTeacherEval(batch))

    # Assignment overwrites an existing column in place and appends a new one
    # at the end, so no explicit insert branch is needed.
    df[column_name] = scores
    # The file is rewritten in place with the extra prediction column.
    df.to_csv(file_name, mode='w', index=False, header=True)

    

## Sentence-level Prediction

In [None]:
# Accumulate the sentence-level annotation labels from every result file into a
# single dict. The scoring cell below indexes it by integer meta id and iterates
# each value as a mapping of sentence text -> labels.
sent_level_labels = {}
# NOTE(review): process_result_files / read_annotation are not defined in this
# notebook; presumably they come from `from utils import *` -- confirm.
result_files, skip_sample_ids, selected_annotators, num_annotators = process_result_files()
for file_path in result_files:
    # Only the fourth value returned by read_annotation is used here.
    _, _, _, batch_sent_level_labels = read_annotation(file_path, skip_sample_ids=skip_sample_ids)
    # Debug aid: uncomment to inspect the labels read from this file.
    # print(batch_sent_level_labels)
    # Later files overwrite earlier entries that share the same key.
    sent_level_labels.update(batch_sent_level_labels)

In [None]:
# Results file shared by the detectors. Each top-level key is a meta id (as a
# string) mapping sentence text -> {'labels': [...], '<detector name>': prediction}.
fname = '../eval/sent_level_results/detectors_sent_level_preds.json'

# Source documents, looked up below by position via int(meta_id).
df = pd.read_csv('../assign/examples_to_annotate.csv')
sources = df['source'].tolist()

# Load whatever has already been written so an interrupted run can resume.
data = {}
if os.path.exists(fname):
    with open(fname) as r:
        data = json.load(r)

# Meta ids that already carry a prediction from each checkpoint. As before,
# only the first sentence entry of a meta id is inspected; an empty entry is
# treated as "not scored yet" instead of raising IndexError.
t5_11b_trueteacher_and_anli_existing_meta_ids = []
t5_xxl_true_nli_mixture_existing_meta_ids = []
for meta_id, meta_item in data.items():
    first_entry = next(iter(meta_item.values()), {})
    if 'trueteacher' in first_entry:
        t5_11b_trueteacher_and_anli_existing_meta_ids.append(meta_id)
    if 'true_nli' in first_entry:
        t5_xxl_true_nli_mixture_existing_meta_ids.append(meta_id)

# Ids to skip for the checkpoint that is currently loaded (set for O(1) lookup).
existing_meta_ids = set({
    'trueteacher': t5_11b_trueteacher_and_anli_existing_meta_ids,
    'true_nli': t5_xxl_true_nli_mixture_existing_meta_ids,
}.get(MODEL_NAME, []))

# Make sure the output directory exists before the first write.
os.makedirs(os.path.dirname(fname), exist_ok=True)

for label_key, sent_map in sent_level_labels.items():
    meta_id = str(label_key)  # JSON object keys are strings
    if meta_id in existing_meta_ids:
        continue

    # Extend the stored entry if there is one so other detectors' scores are kept.
    item = data.get(meta_id, {})
    for sent, sent_labels in sent_map.items():
        if sent not in item:
            item[sent] = {'labels': sent_labels}
        # NOTE(review): TrueTeacherEval returns a list, so a one-element list is
        # stored here rather than a bare value -- confirm downstream readers expect that.
        item[sent][MODEL_NAME] = TrueTeacherEval([[sources[int(meta_id)], sent]])

    # Short progress line instead of dumping the whole (very large) entry.
    print(f"{meta_id}: scored {len(sent_map)} sentences with {MODEL_NAME}")

    # Re-read the file before every write so entries added to it since the
    # initial load (under other meta ids) are not discarded.
    if os.path.exists(fname):
        with open(fname, 'r') as f:
            json_data = json.load(f)
    else:
        json_data = {}
    json_data[meta_id] = item

    # Write to a temporary file and swap it in, so an interruption mid-write
    # cannot leave a truncated results file behind.
    tmp_fname = fname + '.tmp'
    with open(tmp_fname, 'w') as f:
        json.dump(json_data, f, indent=2)
    os.replace(tmp_fname, fname)

{"Here's a concise summary of the passage, covering the core pieces of information:\n\n": {'labels': ['Consistent'], 'alignscore-base': 0.0026, 'alignscore-large': 0.04955, 'HHEMv1': 0.36422, 'HHEM-2.1': 0.70906, 'HHEM-2.1-English': 0.74753, 'HHEM-2.1-Open': 0.79617, 'minicheck-roberta-large': 0, 'minicheck-deberta-v3-large': 1, 'minicheck-flan-t5-large': 0}, 'The passage contains two unrelated pieces of information:\n\n': {'labels': ['Consistent'], 'alignscore-base': 0.05622, 'alignscore-large': 0.86291, 'HHEMv1': 0.08295, 'HHEM-2.1': 0.71762, 'HHEM-2.1-English': 0.65195, 'HHEM-2.1-Open': 0.71545, 'minicheck-roberta-large': 0, 'minicheck-deberta-v3-large': 0, 'minicheck-flan-t5-large': 1}, '1. Anne Rice: \n- Born in New Orleans\n- Spent much of her early life in New Orleans\n- Later moved to Texas and then to San Francisco\n\n': {'labels': ['Consistent'], 'alignscore-base': 0.46265, 'alignscore-large': 0.50817, 'HHEMv1': 0.96417, 'HHEM-2.1': 0.98892, 'HHEM-2.1-English': 0.99657, 'HHEM