In [1]:
from entailment_bank.utils.nlp_agent import MultiAngleModel, NlpAgent
from llama_entailer import llama_Entailer
from entailer import Entailer
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import json
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm
INFO:datasets:PyTorch version 2.3.0 available.
INFO:datasets:TensorFlow version 2.16.1 available.


In [2]:
def get_scores(tree, entailer:Entailer, llama_entailer:llama_Entailer):
    """Score every node of an already-built proof tree with both entailers.

    Args:
        tree: Nested dict whose keys are statements; the value under each key
            is a dict of the same shape holding that statement's premises
            (an empty dict for a statement without premises).
        entailer: Object providing ``truthfulness_score(statement)`` and
            ``faithfulness_score(statement, premises)``; its results are
            stored under the ``t5_*`` keys.
        llama_entailer: Object with the same two methods; its results are
            stored under the ``llama_*`` keys.

    Returns:
        A list with one dict per key of ``tree`` (in key order). Each dict has
        the keys ``HYP``, ``t5_truth``, ``llama_truth``, ``t5_faith``,
        ``llama_faith`` and ``premises``; scores are stored as ``str(score)``
        and ``premises`` is the recursively scored sub-tree. An empty ``tree``
        yields an empty list.
    """
    res = []
    for premise, subtree in tree.items():
        info = {}
        info["HYP"] = premise
        info["t5_truth"] = str(entailer.truthfulness_score(premise))
        info["llama_truth"] = str(llama_entailer.truthfulness_score(premise))
        if subtree:
            # Hand the scorers a real list (as get_score_tree does) rather than
            # a dict_keys view, which supports neither indexing nor list
            # concatenation.
            child_premises = list(subtree)
            info["t5_faith"] = str(entailer.faithfulness_score(premise, child_premises))
            info["llama_faith"] = str(llama_entailer.faithfulness_score(premise, child_premises))
        else:
            # A statement without premises has nothing to be faithful to.
            info["t5_faith"] = str(0.0)
            info["llama_faith"] = str(0.0)

        info["premises"] = get_scores(subtree, entailer, llama_entailer)

        res.append(info)

    return res

def get_score_tree(hyp, entailer:Entailer, llama_entailer:llama_Entailer, prover_prefix=None, depth=3):
    """Recursively generate premises for ``hyp`` and score each node.

    Args:
        hyp: Statement to score and (if ``depth`` allows) to expand.
        entailer: Object providing ``truthfulness_score``, ``faithfulness_score``
            and ``prover``; its scores are stored under the ``t5_*`` keys and
            its ``prover`` output is split on ``"[PREMISE]"`` to obtain premises.
        llama_entailer: Object providing ``truthfulness_score`` and
            ``faithfulness_score``; its scores are stored under ``llama_*`` keys.
        prover_prefix: Optional text passed to the prover as the forced start
            of the proof (prefixed with ``"[PREMISE] "``). Only used for this
            node; child nodes are always expanded without a prefix.
        depth: Number of further expansion levels. Values ``<= 0`` produce a
            leaf without calling the prover.

    Returns:
        A dict with keys ``HYP``, ``t5_truth``, ``llama_truth``, ``t5_faith``,
        ``llama_faith`` and ``premises``. Scores are stored as ``str(score)``;
        ``premises`` is a list of dicts of the same shape. Nodes without
        premises (depth exhausted or empty proof) get faith scores ``"0.0"``.
    """
    info = {}
    info["HYP"] = hyp
    info["t5_truth"] = str(entailer.truthfulness_score(hyp))
    info["llama_truth"] = str(llama_entailer.truthfulness_score(hyp))

    premises = []
    # `depth > 0` (not `!= 0`) so that a negative depth cannot start a
    # recursion that is bounded only by the prover running out of premises.
    if depth > 0:
        if prover_prefix is not None:
            proof_prefix = "[PREMISE] "+prover_prefix
            proof = entailer.prover({"hypothesis": hyp},  options={"output_prefix": {"proof": proof_prefix}})
        else:
            proof = entailer.prover({"hypothesis": hyp})
        premises = [x.strip() for x in proof.split("[PREMISE]") if x.strip()]

    if not premises:
        # Leaf node: either expansion was not allowed or the prover produced
        # no premise. Use the same 0.0 convention as every other premise-less
        # node instead of scoring the hypothesis against an empty list.
        info["t5_faith"] = str(0.0)
        info["llama_faith"] = str(0.0)
        info["premises"] = []
        return info

    info["t5_faith"] = str(entailer.faithfulness_score(hyp, premises))
    info["llama_faith"] = str(llama_entailer.faithfulness_score(hyp, premises))
    info["premises"] = [
        get_score_tree(premise, entailer, llama_entailer, prover_prefix=None, depth=depth-1)
        for premise in premises
    ]

    return info

In [3]:
# GPUs that host the two TruthfulQA judge checkpoints (one device each).
truth_device, info_device = 'cuda:11', 'cuda:12'

# Judge loaded from the "info-judge" checkpoint, with its tokenizer.
info_judge = AutoModelForCausalLM.from_pretrained(
    "allenai/truthfulqa-info-judge-llama2-7B"
).to(info_device)
info_tokenizer = AutoTokenizer.from_pretrained(
    "allenai/truthfulqa-info-judge-llama2-7B",
    max_length=500,
)

# Judge loaded from the "truth-judge" checkpoint, with its tokenizer.
truth_judge = AutoModelForCausalLM.from_pretrained(
    "allenai/truthfulqa-truth-judge-llama2-7B"
).to(truth_device)
truth_tokenizer = AutoTokenizer.from_pretrained(
    "allenai/truthfulqa-truth-judge-llama2-7B",
    max_length=500,
)

# One shared entailer-11b model backs three agents that differ only in the
# output they request: a proof, an "implied" verdict, or a "valid" verdict.
ew_model = MultiAngleModel(
    model_path="allenai/entailer-11b",
    cuda_devices=[13, 14],
)
prover = NlpAgent(model=ew_model, default_outputs="proof")
entail_verifier = NlpAgent(
    model=ew_model,
    default_outputs=["implied"],
    default_options={"explicit_outputs": ['true', 'false']},
)
hyp_verifier = NlpAgent(
    model=ew_model,
    default_outputs=["valid"],
    default_options={"explicit_outputs": ['true', 'false']},
)

# Two scorers: one built only from the entailer agents, one that additionally
# receives the judge models, their tokenizers and their devices.
entailer = Entailer(ew_model, prover, entail_verifier, hyp_verifier)
llama_entailer = llama_Entailer(
    ew_model,
    prover,
    entail_verifier,
    truth_judge,
    truth_tokenizer,
    info_judge,
    info_tokenizer,
    truth_device,
    info_device,
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.35s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.17s/it]
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
# Single worked example: a question joined with one candidate answer, plus a
# supporting sentence that is forced as the first premise of the proof.
hyp = "If a moving object slows down, it will have _____ kinetic energy. more"
prover_prefix = "Anything that is moving has kinetic energy, and the faster it is moving, the more kinetic energy it has."

tree = get_score_tree(
    hyp,
    entailer,
    llama_entailer,
    prover_prefix=prover_prefix,
    depth=3,
)

In [5]:
# Pretty-print the nested score tree; a plain print() emits the whole
# structure as a single, very long line that is hard to inspect.
print(json.dumps(tree, indent=2, ensure_ascii=False))

{'HYP': 'If a moving object slows down, it will have _____ kinetic energy. more', 't5_truth': '0.0872523946270124', 'llama_truth': '0.9629448', 't5_faith': '0.5282286909037457', 'llama_faith': '0.9969072377640202', 'premises': [{'HYP': 'Anything that is moving has kinetic energy, and the faster it is moving, the more kinetic energy it has.', 't5_truth': '0.9980767806757831', 'llama_truth': '0.9996535', 't5_faith': '0.9996485765364971', 'llama_faith': '0.6022574278192963', 'premises': [{'HYP': 'Kinetic energy is a measure of the speed at which an object is moving.', 't5_truth': '0.15759111276743043', 'llama_truth': '0.9932254', 't5_faith': '0.99681995379518', 'llama_faith': '0.9462813350065744', 'premises': [{'HYP': 'Kinetic energy is a measure of the speed with which an object is moving.', 't5_truth': '0.16663328878398564', 'llama_truth': '0.9953239', 't5_faith': '0.0', 'llama_faith': '0.0', 'premises': []}, {'HYP': 'Speed is a kind of measure of kinetic energy.', 't5_truth': '0.904960

In [14]:
# Load the "allenai/quartz" dataset; the scoring loop that follows reads its
# "test" split. Note that `dataset` is rebound to a different dataset later on.
dataset = load_dataset("allenai/quartz")

In [18]:
# Build and score a proof tree for every (question, answer choice) pair of the
# QuaRTz test split, writing one JSON object per line.
# NOTE: the file is opened in append mode, so re-running this cell adds records
# to whatever an earlier run already wrote.
with open('fulldepth_quartz_scores.jsonl', 'a') as f:
    for data in tqdm(dataset["test"]):
        # The supporting paragraph is shared by all choices of a question and
        # is forced as the first premise of each proof.
        prover_prefix = data["para"]
        for choice in data["choices"]["text"]:
            hyp = data["question"]+" "+choice

            tree = get_score_tree(hyp, entailer, llama_entailer, prover_prefix, 3)

            json.dump(tree, f)
            f.write('\n')
            # Each record is expensive to compute and the whole run takes
            # hours; flush so finished records are not left sitting in the
            # write buffer if the kernel is killed.
            f.flush()

  0%|          | 0/784 [00:00<?, ?it/s]

100%|██████████| 784/784 [10:18:03<00:00, 47.30s/it] 


In [19]:
# Rebind `dataset` (previously the QuaRTz data) to the "multiple_choice"
# configuration of "truthful_qa"; the next loop reads its "validation" split.
dataset = load_dataset("truthful_qa", 'multiple_choice')

In [20]:
# Build and score a proof tree for the first (up to) three mc1 answer choices
# of every TruthfulQA validation question, writing one JSON object per line.
# No prover prefix is used here.
# NOTE: the file is opened in append mode, so re-running this cell adds records
# to whatever an earlier run already wrote.
with open('fulldepth_truthfulqa_scores.jsonl', 'a') as f:
    for data in tqdm(dataset["validation"]):
        for choice in data["mc1_targets"]["choices"][:3]:
            hyp = data["question"]+" "+choice

            tree = get_score_tree(hyp, entailer, llama_entailer, prover_prefix=None, depth=3)

            json.dump(tree, f)
            f.write('\n')
            # Each record is expensive to compute and the whole run takes
            # hours; flush so finished records are not left sitting in the
            # write buffer if the kernel is killed.
            f.flush()

100%|██████████| 817/817 [13:51:49<00:00, 61.09s/it]  
