In [1]:
# Setup: imports, then load the entailment model/tokenizer and build the fact checker
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from factcheck import EntailmentFactChecker, EntailmentModel 
from transformers import AutoConfig

# Build the entailment pipeline: a tokenizer and a sequence-classification model
# loaded from the same checkpoint, wrapped by EntailmentModel and then by
# EntailmentFactChecker.
model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
tokenizer = ent_tokenizer = AutoTokenizer.from_pretrained(model_name)  # two names, one object
# NOTE: despite the variable name, the checkpoint loaded here is a DeBERTa-v3 model.
roberta_ent_model = AutoModelForSequenceClassification.from_pretrained(model_name)
ent_model = EntailmentModel(roberta_ent_model, ent_tokenizer)
fact_checker = EntailmentFactChecker(ent_model)

# Report the tokenizer class, its special tokens (with ids) and a few size settings.
print("Tokenizer Algorithm:", type(tokenizer).__name__)
print("Special Tokens:")
for _label, _attr in (
    ("Start Token", "bos_token"),
    ("End (EOS) Token", "eos_token"),
    ("Unknown Token", "unk_token"),
    ("Padding Token", "pad_token"),
    ("Mask Token", "mask_token"),
):
    # Each special token exposes its id under the same attribute name plus "_id".
    print(f"  {_label}:", getattr(tokenizer, _attr), "ID:", getattr(tokenizer, f"{_attr}_id"))
print("Vocabulary Size:", tokenizer.vocab_size)
print("Model Max Length:", tokenizer.model_max_length)
# Not every tokenizer class defines do_lower_case, hence the fallback text.
print("Is Lowercase:", getattr(tokenizer, "do_lower_case", "Not Applicable"))

# # If you want to see more attributes, you can print out the tokenizer's configuration
# print("\nTokenizer Configuration:")
# print(tokenizer.config)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dgturner01\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dgturner01\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Tokenizer Algorithm: DebertaV2TokenizerFast
Special Tokens:
  Start Token: [CLS] ID: 1
  End (EOS) Token: [SEP] ID: 2
  Unknown Token: [UNK] ID: 3
  Padding Token: [PAD] ID: 0
  Mask Token: [MASK] ID: 128000
Vocabulary Size: 128000
Model Max Length: 512
Is Lowercase: False


In [2]:
def test_fact_from_file(file_path, fact_name, threshold):
    with open(file_path, 'r') as file:
        lines = file.readlines()
        data = [json.loads(line) for line in lines]
        
    # Search for the fact based on the "sent" field
    fact_data = next((entry for entry in data if entry['sent'] == fact_name), None)
    
    if not fact_data:
        print(f"Error: No fact found with name '{fact_name}'.")
        return
    
    fact = fact_data['sent']
    passages = fact_data['passages']
    
    result = fact_checker.check_fact_whole_passage(fact_checker.clean_text(fact), passages, threshold=threshold)

    # Append the original passages and sentences to the result
    result['original_passages'] = passages

    return result

In [4]:
import html
import json
from IPython.display import display, HTML

# Inject a stylesheet once so that long lines inside <pre> elements wrap
# instead of overflowing the output area.
custom_css = """
<style>
    pre {
        white-space: pre-wrap;
    }
</style>
"""

display(HTML(data=custom_css))

def display_json(data):
    """Render ``data`` as pretty-printed JSON inside an HTML ``<pre>`` element.

    Args:
        data: Any JSON-serialisable object (``None`` is shown as ``null``).

    Raises:
        TypeError: If ``data`` contains a value ``json.dumps`` cannot serialise.
    """
    # ensure_ascii=False keeps non-ASCII characters readable in the output
    # instead of replacing them with \uXXXX escape sequences.
    formatted_json = json.dumps(data, indent=4, ensure_ascii=False)
    # Escape '<', '>' and '&' so markup-like text in the data is shown literally
    # rather than being interpreted by the browser.
    display(HTML(f"<pre>{html.escape(formatted_json)}</pre>"))

# Interactive check of a single fact: inputs first, then tokenizer round-trip,
# then the full fact-check against the passages stored in the file.
file_path = "data/passages_bm25_ChatGPT_humfacts.jsonl"
fact_name = "Florencia Bertotti is a singer."
t = 0.60  # threshold forwarded to test_fact_from_file

# Tokenize the fact and map the tokens back to text to inspect the round-trip.
tokens = ent_tokenizer.tokenize(fact_name)
back_to_text = ent_tokenizer.convert_tokens_to_string(tokens)
print("tokens for fact: ", tokens)
print("back to text: ", back_to_text)

# Run the fact check and show the result as formatted JSON.
result = test_fact_from_file(file_path, fact_name, threshold=t)
display_json(result)


tokens for fact:  ['▁Flor', 'encia', '▁Bert', 'otti', '▁is', '▁a', '▁singer', '.']
back to text:  Florencia Bertotti is a singer.


## TESTING SINGLE PASSAGE

In [6]:
# Score one fact against a single hand-pasted passage by calling the entailment
# model directly, without going through fact_checker.
# NOTE(review): file_path is assigned here but not used by any statement in this cell.
file_path = "data/passages_bm25_ChatGPT_humfacts.jsonl"
fact_name = "Florencia Bertotti is a singer."
# Plain (non-raw) triple-quoted literal: Python decodes the \uXXXX and \" escapes,
# the <s>/</s> markers stay in the text as-is, and the passage stops mid-sentence.
passage_text = """
<s>Florencia Bertotti Mar\u00eda Florencia Bertotti (born 15 March 1983), better known as Florencia Bertotti is an Argentine actress, singer, songwriter and businesswoman.</s><s>Biography. Florencia Bertotti parents are Gustavo Bertotti, a jeweler and Mar\u00eda Candelaria P\u00e9rez Colman, a psychologist and a teacher of children with disabilities. Her parents divorced when Florencia was seven years old. She has an older sister called Clara Bertotti. Her father passed away in 1999 when she was filming the series \"Verano del '98\". Florencia studied at the Colegio Nuestra Se\u00f1ora de la Misericordia in Recoleta, Buenos Aires, Argentina.</s><s>Personal life. On 2 December 2006 she got married in a religious ceremony with Guido Kaczka, whom she met in the recordings of \"Verano del '98\" and who was her boyfriend since then. On 10 July 2008, she gave birth to the couple's first child, a boy, whom they called Romeo Kaczka Bertottia.The couple divorced in March 2010. They both share custody of their son. Since 2010, Florencia Bertotti
"""

# NOTE(review): the argument order (which text is premise, which is hypothesis) and the
# meaning of each returned score are defined by EntailmentModel, not visible here — confirm.
results_list = ent_model.check_entailment(fact_name, passage_text)
print("Results= ", results_list)


Results=  [0.10194315016269684, 0.8856976628303528, 0.01235916931182146]


## TESTING CLEAN_TEXT

In [8]:
# Probe how a fact containing the Unicode replacement character (U+FFFD) is
# handled by clean_text and by the tokenizer.
test_text = "Jean Daull� died in 1763."

# Compute everything first, then report.
text_clean = fact_checker.clean_text(test_text)
# NOTE(review): the raw text, not text_clean, is what gets tokenized here.
tokens = tokenizer.tokenize(test_text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print("text_clean ", text_clean)
print(token_ids)
# Printed for comparison: an id equal to this one in the list above would be an unknown token.
print("Unknown Token ", tokenizer.unk_token_id)

text_clean  Jean Daull� died in 1763.
[6105, 41306, 436, 1675, 267, 84306, 260]
Unknown Token  3
