In [1]:
from entailment_bank.utils.nlp_agent import MultiAngleModel, NlpAgent
from llama_entailer import llama_Entailer
from entailer import Entailer
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import json
from datasets import load_dataset
from truth_faith_score import get_score_tree

  from .autonotebook import tqdm as notebook_tqdm
INFO:datasets:PyTorch version 2.3.0 available.
INFO:datasets:TensorFlow version 2.16.1 available.


### Generating a full-depth score tree using T5 and Llama

In [2]:
# Devices that host the two TruthfulQA judge models.
truth_device, info_device = 'cuda:11', 'cuda:12'

# Hugging Face checkpoint ids; each id is used for both the model and its tokenizer.
info_judge_ckpt = "allenai/truthfulqa-info-judge-llama2-7B"
truth_judge_ckpt = "allenai/truthfulqa-truth-judge-llama2-7B"

# NOTE(review): `max_length` is forwarded to the tokenizer constructor unchanged;
# the documented init kwarg is `model_max_length` -- confirm this value is honored.
judge_max_length = 500

# "Info" judge: the model is moved to `info_device`, the tokenizer is loaded from the same id.
info_judge = (
    AutoModelForCausalLM
    .from_pretrained(info_judge_ckpt)
    .to(info_device)
)
info_tokenizer = AutoTokenizer.from_pretrained(info_judge_ckpt, max_length=judge_max_length)

# "Truth" judge: same pattern, placed on `truth_device`.
truth_judge = (
    AutoModelForCausalLM
    .from_pretrained(truth_judge_ckpt)
    .to(truth_device)
)
truth_tokenizer = AutoTokenizer.from_pretrained(truth_judge_ckpt, max_length=judge_max_length)

# One entailer model, handed CUDA devices 13 and 14, backs all three agents below.
ew_model = MultiAngleModel(
    model_path="allenai/entailer-11b",
    cuda_devices=[13, 14],
)

# Agent whose default output angle is "proof".
prover = NlpAgent(model=ew_model, default_outputs="proof")

# The two verifier agents differ only in their default output angle ("implied" vs
# "valid"); each gets its own options dict listing 'true'/'false' as explicit outputs.
entail_verifier = NlpAgent(
    model=ew_model,
    default_outputs=["implied"],
    default_options={"explicit_outputs": ['true', 'false']},
)
hyp_verifier = NlpAgent(
    model=ew_model,
    default_outputs=["valid"],
    default_options={"explicit_outputs": ['true', 'false']},
)

# T5-side entailer uses the prover plus both verifiers.
entailer = Entailer(ew_model, prover, entail_verifier, hyp_verifier)

# Llama-side entailer reuses the prover and entailment verifier, and additionally
# receives both judge models, their tokenizers and the devices they live on.
llama_entailer = llama_Entailer(
    ew_model,
    prover,
    entail_verifier,
    truth_judge,
    truth_tokenizer,
    info_judge,
    info_tokenizer,
    truth_device,
    info_device,
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.30s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.27s/it]
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [12]:
# Hypothesis to score.
hyp = (
    "Rich has a leaf with a small surface area. "
    "This adaptation likely causes less water vapor to evaporate"
)

# Build the score tree for `hyp` with both entailers, three levels deep and
# without a prover prefix.
tree = get_score_tree(
    hyp,
    entailer,
    llama_entailer,
    prover_prefix=None,
    depth=3,
)

In [13]:
# Dump the whole score tree as one line of text. Per the output below, every node
# is a dict with 'HYP', 't5_truth', 'llama_truth', 't5_faith', 'llama_faith'
# (scores rendered as strings) and a 'premises' list of child nodes.
print(tree)

{'HYP': 'Rich has a leaf with a small surface area. This adaptation likely causes less water vapor to evaporate', 't5_truth': '0.15019136867302618', 'llama_truth': '0.99533564', 't5_faith': '0.9219023463676145', 'llama_faith': '0.9994847355224437', 'premises': [{'HYP': 'A leaf with a small surface area will absorb less water vapor.', 't5_truth': '0.7671776399277875', 'llama_truth': '0.9975401', 't5_faith': '0.9966529393501569', 'llama_faith': '0.8766601781165875', 'premises': [{'HYP': 'As the surface area of a leaf decreases, the amount of water vapor absorbed by that leaf will decrease.', 't5_truth': '0.808694537717028', 'llama_truth': '0.9893165', 't5_faith': '0.9946293803544977', 'llama_faith': '3.344236080934953e-07', 'premises': [{'HYP': 'As the surface area of a substance decreases, the amount of that substance absorbed by that object will decrease.', 't5_truth': '0.784831773963933', 'llama_truth': '0.9945073', 't5_faith': '0.0', 'llama_faith': '0.0', 'premises': []}, {'HYP': 'A 

### Modifying the truth scores using a Llama-2-7B judge that is pre-trained on TruthfulQA

The dataset used for fine-tuning can be found at `truthfulqa_reeval/data/ARC+world_tree.jsonl`

The fine-tuning script can be found at `truthfulqa_reeval/scripts/finetune_judge.sh`

In [5]:
# Local output directory of the fine-tuned truth judge; used for both model and tokenizer.
finetuned_judge_dir = "truthfulqa_reeval/output/llama2_7B_truth_judge_final"

# Load the fine-tuned judge and move it to its own device.
finetuned_truth_judge = (
    AutoModelForCausalLM
    .from_pretrained(finetuned_judge_dir)
    .to('cuda:15')
)

# Matching tokenizer, loaded with the same max_length setting as the other judges.
finetuned_truth_tokenizer = AutoTokenizer.from_pretrained(
    finetuned_judge_dir,
    max_length=500,
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]
You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [14]:
from modify_scores import modify_truth_scores

# Re-score the tree with the fine-tuned truth judge. The device string matches
# the device the judge model was moved to when it was loaded.
modified_tree = modify_truth_scores(
    tree,
    finetuned_truth_judge,
    finetuned_truth_tokenizer,
    'cuda:15',
)

In [15]:
# Dump the re-scored tree as one line of text; compare its 'llama_truth' values
# with those in the tree printed earlier to see what the fine-tuned judge changed.
print(modified_tree)

{'HYP': 'Rich has a leaf with a small surface area. This adaptation likely causes less water vapor to evaporate', 't5_truth': '0.15019136867302618', 'llama_truth': '0.5601153', 't5_faith': '0.9219023463676145', 'llama_faith': '0.9994847355224437', 'premises': [{'HYP': 'A leaf with a small surface area will absorb less water vapor.', 't5_truth': '0.7671776399277875', 'llama_truth': '0.91927266', 't5_faith': '0.9966529393501569', 'llama_faith': '0.8766601781165875', 'premises': [{'HYP': 'As the surface area of a leaf decreases, the amount of water vapor absorbed by that leaf will decrease.', 't5_truth': '0.808694537717028', 'llama_truth': '0.8495511', 't5_faith': '0.9946293803544977', 'llama_faith': '3.344236080934953e-07', 'premises': [{'HYP': 'As the surface area of a substance decreases, the amount of that substance absorbed by that object will decrease.', 't5_truth': '0.784831773963933', 'llama_truth': '0.9945073', 't5_faith': '0.0', 'llama_faith': '0.0', 'premises': []}, {'HYP': 'A 