Test to evaluate the quality of the mini-facts from the evidence with BART-large-MNLI

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Local snapshot of facebook/bart-large-mnli. Written as a raw string because the Windows
# path contains backslash sequences ("\h", "\m", "\s", "\d") that are invalid escapes in a
# normal string literal and trigger a warning on newer Python versions.
bart_model_path = r"D:\huggingface\huggingface\hub\models--facebook--bart-large-mnli\snapshots\d7645e127eaf1aefc7862fd59a17a5aa8558b8ce"
# Use the GPU when one is available; `call_bart_model` moves its inputs to this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Loading BART model...")
# local_files_only=True: load strictly from the snapshot directory above.
bart_model = AutoModelForSequenceClassification.from_pretrained(bart_model_path, local_files_only=True)
bart_tokenizer = AutoTokenizer.from_pretrained(bart_model_path, local_files_only=True)
bart_model.to(device)

In [None]:
def split_source_to_fit_with_hypothesis(source, hypothesis, bart_tokenizer, max_length=1024):
    """Split ``source`` into text chunks that each fit next to ``hypothesis`` within ``max_length`` tokens.

    Args:
        source: Premise text to split.
        hypothesis: Statement that will be paired with every chunk.
        bart_tokenizer: Tokenizer providing ``encode``, ``decode``,
            ``num_special_tokens_to_add`` and a writable ``model_max_length``.
        max_length: Token budget for one (chunk, hypothesis) pair including special tokens.

    Returns:
        List of decoded text chunks (empty when ``source`` encodes to no tokens).

    Raises:
        ValueError: If the hypothesis plus special tokens leaves no room for source tokens.
    """
    original_max_length = bart_tokenizer.model_max_length
    # Lift the tokenizer's own length limit while measuring (presumably to avoid
    # over-length warnings on long sources). The `finally` restores it on every exit
    # path, including when `encode` itself raises.
    bart_tokenizer.model_max_length = int(1e12)
    try:
        hypothesis_length = len(bart_tokenizer.encode(hypothesis, add_special_tokens=False))
        num_special_tokens = bart_tokenizer.num_special_tokens_to_add(pair=True)
        max_source_length = max_length - hypothesis_length - num_special_tokens
        if max_source_length <= 0:
            raise ValueError("The hypothesis is too long to fit within the max_length limit.")
        source_tokens = bart_tokenizer.encode(source, add_special_tokens=False)
    finally:
        bart_tokenizer.model_max_length = original_max_length

    # Fixed-size windows over the token ids, decoded back to text.
    # NOTE(review): decoded text may re-tokenize to a slightly different length, so
    # callers should still tokenize with truncation enabled.
    return [
        bart_tokenizer.decode(source_tokens[start:start + max_source_length], skip_special_tokens=True)
        for start in range(0, len(source_tokens), max_source_length)
    ]
    
def call_bart_model(source, statement, max_length=1024):
    """Classify ``statement`` against ``source`` with the BART NLI model and return one label.

    The source is split into chunks that fit the token budget together with the
    statement; each chunk is classified separately and the chunk labels are combined
    by majority vote. "Neutral" votes are ignored whenever at least one chunk is
    non-neutral. Ties go to the label seen first in chunk order, so the result is
    deterministic.

    Args:
        source: Premise text (may be longer than the model's input limit).
        statement: Hypothesis to check against the source.
        max_length: Token budget per (chunk, statement) pair.

    Returns:
        "Contradiction", "Neutral" or "Entailment". "Neutral" is returned when the
        source yields no chunks (e.g. an empty source).

    Raises:
        ValueError: Propagated from the splitter when the statement alone exceeds
            ``max_length``.
    """
    # Index order is assumed to match the checkpoint's logits
    # (0=contradiction, 1=neutral, 2=entailment) — TODO confirm against
    # bart_model.config.id2label.
    class_names = ["Contradiction", "Neutral", "Entailment"]

    source_chunks = split_source_to_fit_with_hypothesis(
        source, statement, bart_tokenizer, max_length=max_length
    )

    pred_labels = []
    for chunk in source_chunks:
        inputs = bart_tokenizer(
            chunk,
            statement,
            return_tensors='pt',
            # Decoded chunks may re-tokenize slightly longer, so keep truncation on.
            truncation=True,
            max_length=max_length,
            add_special_tokens=True
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            logits = bart_model(**inputs).logits
        # argmax over logits equals argmax over softmax probabilities.
        pred_labels.append(class_names[logits.argmax(dim=1).item()])

    if not pred_labels:
        # No evidence text at all: nothing can entail or contradict the statement.
        return "Neutral"

    decisive_labels = [label for label in pred_labels if label != "Neutral"]
    candidates = decisive_labels or pred_labels

    # Count in first-seen order; max() returns the first maximal key, so ties are
    # broken by chunk order rather than by arbitrary set iteration order.
    counts = {}
    for label in candidates:
        counts[label] = counts.get(label, 0) + 1
    return max(counts, key=counts.get)

In [None]:
import pandas as pd


# Only the first `samples` rows of the pickled mini-fact frame are evaluated.
samples = 1000
df = pd.read_pickle("datasets_fever/mini_fact_fever.pkl").iloc[:samples]

# One group per generated evidence text; every mini fact in a group is checked
# against that group's evidence.
grouped_df = df.groupby("gen_evidence")
counter_entailment = 0

for name, group in grouped_df:
    gen_evidence = group["gen_evidence"].iloc[0]
    mini_facts = group["output_mini_fact"].tolist()
    for mini_fact in mini_facts:
        label = call_bart_model(gen_evidence, mini_fact)
        # Adds 1 for an "Entailment" verdict and 0 otherwise.
        counter_entailment += int(label == "Entailment")

In [None]:
# Display the number of mini facts for which call_bart_model returned "Entailment".
counter_entailment

Test Probs: the probability file and the processed test split should contain the same sentences

In [None]:
import pandas as pd


# Dataset key interpolated into both pickle paths below.
dataset = "hover"

# Two pickled frames whose `output_sentence` columns are compared in the following cells.
# Judging by the file names, df1 holds sentence-level probabilities and df2 the processed
# unbalanced test split — TODO confirm.
df1 = pd.read_pickle(f"probs_test_llama/df_new_{dataset}_probs_sentence.pkl")
df2 = pd.read_pickle(f"processed_datasets_with_bart_{dataset}_layer-1/sentence_{dataset}_test_unbalanced.pkl")

In [None]:
# Print every df1 sentence that is missing from df2.
# The lookup set is built once; the original rebuilt `df2[...].tolist()` and scanned it
# linearly on every iteration. Assumes the column holds hashable values (strings).
df2_sentences = set(df2["output_sentence"].tolist())
for gen_evidence in df1["output_sentence"].tolist():
    if gen_evidence not in df2_sentences:
        print(gen_evidence)

In [None]:
# Print every df2 sentence that is missing from df1.
# The lookup set is built once; the original rebuilt `df1[...].tolist()` and scanned it
# linearly on every iteration. Assumes the column holds hashable values (strings).
df1_sentences = set(df1["output_sentence"].tolist())
for gen_evidence in df2["output_sentence"].tolist():
    if gen_evidence not in df1_sentences:
        print(gen_evidence)

Test Mini Facts: check for overlap of mini facts and documents between train and test

In [None]:
import pandas as pd 
dataset = "fever"

# Load the layer-8 mini-fact splits one by one and report the row count of each.
df_train = pd.read_pickle(f"processed_datasets_with_bart_{dataset}_layer-8/mini_fact_{dataset}_train.pkl")
print(df_train.shape[0])

df_dev = pd.read_pickle(f"processed_datasets_with_bart_{dataset}_layer-8/mini_fact_{dataset}_dev.pkl")
print(df_dev.shape[0])

df_test = pd.read_pickle(f"processed_datasets_with_bart_{dataset}_layer-8/mini_fact_{dataset}_test_unbalanced.pkl")
print(df_test.shape[0])

In [None]:
# Layer-1 mini-fact splits (these replace the layer-8 frames bound to the same names).
df_train = pd.read_pickle(f"processed_datasets_with_bart_{dataset}_layer-1/mini_fact_{dataset}_train.pkl")
df_test = pd.read_pickle(f"processed_datasets_with_bart_{dataset}_layer-1/mini_fact_{dataset}_test_unbalanced.pkl")

# Flatten the per-row `docs` collections into one list per split.
all_docs1 = []
for train_docs in df_train['docs']:
    all_docs1.extend(train_docs)

all_docs2 = []
for test_docs in df_test['docs']:
    all_docs2.extend(test_docs)

# Mini-fact texts of each split, in row order.
mini_facts_train = df_train["output_mini_fact"].tolist()
mini_facts_test = df_test["output_mini_fact"].tolist()

In [None]:
# Print every training mini fact that also occurs in the test split.
# Membership is checked against a set built once instead of scanning the test list for
# each training item. Assumes mini facts are hashable (strings).
mini_facts_test_set = set(mini_facts_test)
for mini_fact in mini_facts_train:
    if mini_fact in mini_facts_test_set:
        print(mini_fact)

In [None]:
# Print every test mini fact that also occurs in the training split.
# Membership is checked against a set built once instead of scanning the training list
# for each test item. Assumes mini facts are hashable (strings).
mini_facts_train_set = set(mini_facts_train)
for mini_fact in mini_facts_test:
    if mini_fact in mini_facts_train_set:
        print(mini_fact)

In [None]:
# all_docs1 / all_docs2 are the flattened `docs` of the train and test frames.
set_docs1 = set(all_docs1)
set_docs2 = set(all_docs2)

# Documents present in both splits.
common_docs = set_docs1 & set_docs2

# The message names the splits actually compared (the original said "df1 and df2",
# which are unrelated frames in this notebook).
if common_docs:
    print("Common documents found:", common_docs)
else:
    print("No common documents found between train and test.")

Test Sentences

In [None]:
import pandas as pd

# NOTE(review): `dataset` is set to "fever", but the pickle loaded below is the hover
# gen-evidence file and its path does not use `dataset` — confirm which was intended.
dataset = "fever"

df = pd.read_pickle("datasets_hover_llama/gen_evidence_hover.pkl")

In [None]:
import pandas as pd

dataset = "fever"

# Sentence-level train / unbalanced-test splits from the llama layer-1 directory.
df_train = pd.read_pickle(f"processed_datasets_llama_{dataset}_layer-1/sentence_{dataset}_train.pkl")
df_test = pd.read_pickle(f"processed_datasets_llama_{dataset}_layer-1/sentence_{dataset}_test_unbalanced.pkl")
print(df_train.shape[0])

# Flatten the per-row `docs` collections into one list per split.
all_docs1 = []
for train_docs in df_train['docs']:
    all_docs1.extend(train_docs)

all_docs2 = []
for test_docs in df_test['docs']:
    all_docs2.extend(test_docs)

# Sentence texts of each split, in row order.
sentence_train = df_train["output_sentence"].tolist()
sentence_test = df_test["output_sentence"].tolist()

In [None]:
# Print every training sentence that also occurs in the test split.
# Membership is checked against a set built once instead of scanning the test list for
# each training item. Assumes sentences are hashable (strings).
sentence_test_set = set(sentence_test)
for sentence in sentence_train:
    if sentence in sentence_test_set:
        print(sentence)

In [None]:
# Print every test sentence that also occurs in the training split.
# Membership is checked against a set built once instead of scanning the training list
# for each test item. Assumes sentences are hashable (strings).
sentence_train_set = set(sentence_train)
for sentence in sentence_test:
    if sentence in sentence_train_set:
        print(sentence)

In [None]:
# all_docs1 / all_docs2 are the flattened `docs` of the train and test frames.
set_docs1 = set(all_docs1)
set_docs2 = set(all_docs2)

# Documents present in both splits.
common_docs = set_docs1 & set_docs2

# The message names the splits actually compared (the original said "df1 and df2",
# which are unrelated frames in this notebook).
if common_docs:
    print("Common documents found:", common_docs)
else:
    print("No common documents found between train and test.")

Mini Facts - Sentence Match

In [None]:
# Sentence-level and mini-fact-level unbalanced test splits from the same layer-1 directory.
# `dataset` is not set in this cell; it must already be defined by an earlier cell.
df_test_sentence = pd.read_pickle(f"processed_datasets_with_bart_{dataset}_layer-1/sentence_{dataset}_test_unbalanced.pkl")
df_test_mini_fact = pd.read_pickle(f"processed_datasets_with_bart_{dataset}_layer-1/mini_fact_{dataset}_test_unbalanced.pkl")


def balance_dataframe(df, label_name):
    """Downsample ``df`` so labels 1 and 0 occur equally often.

    Rows whose ``label_name`` column equals 1 and rows where it equals 0 are each
    sampled (fixed seed 42) down to the size of the smaller class. The result lists
    the label-1 rows first, then the label-0 rows, with a fresh 0..n-1 index. Rows
    with any other label value are dropped.
    """
    label_column = df[str(label_name)]
    positives = df[label_column == 1]
    negatives = df[label_column == 0]
    per_class = min(len(positives), len(negatives))
    sampled_parts = [
        subset.sample(per_class, random_state=42)
        for subset in (positives, negatives)
    ]
    return pd.concat(sampled_parts).reset_index(drop=True)

# Downsample the sentence test set to equal numbers of label-1 and label-0 rows.
df_test_sentence = balance_dataframe(df_test_sentence, 'label_sentence')
# Keep only the mini facts whose `gen_sentence` is still present after balancing.
df_test_mini_fact = df_test_mini_fact[df_test_mini_fact['gen_sentence'].isin(df_test_sentence['output_sentence'])]

In [None]:
# Print every mini-fact parent sentence that is missing from the balanced sentence set.
# The lookup set is built once; the original rebuilt the `.tolist()` on every iteration.
# Assumes sentences are hashable (strings).
balanced_sentences = set(df_test_sentence['output_sentence'].tolist())
for gen_s in df_test_mini_fact['gen_sentence'].tolist():
    if gen_s not in balanced_sentences:
        print(gen_s)

In [None]:
# Print every balanced sentence that has no mini fact pointing at it.
# The lookup set is built once; the original rebuilt the `.tolist()` on every iteration.
# Assumes sentences are hashable (strings).
mini_fact_parent_sentences = set(df_test_mini_fact['gen_sentence'].tolist())
for gen_s in df_test_sentence['output_sentence'].tolist():
    if gen_s not in mini_fact_parent_sentences:
        print(gen_s)

Manual check that each sentence still has all of its mini facts

In [None]:
# One group per parent sentence, for eyeballing which mini facts belong to it.
df_grouped = df_test_mini_fact.groupby("gen_sentence")

for name, group in df_grouped:
    gen_sentence = group["gen_sentence"].iloc[0]
    mini_facts = group["output_mini_fact"].tolist()
    # Sentence, its mini-fact list and a separator, each on its own line.
    print(gen_sentence, mini_facts, "###", sep="\n")
    

Test Phi

In [None]:
import pandas as pd

# Dataset key interpolated into both pickle paths below.
dataset = "hover"

# Two pickled frames whose `output_sentence` columns are compared in the following cells.
# Judging by the file names, df1 holds sentence-level probabilities and df2 the processed
# unbalanced test split for the phi model — TODO confirm.
df1 = pd.read_pickle(f"probs_test_phi/probs_sentence_{dataset}.pkl")
df2 = pd.read_pickle(f"processed_datasets_phi/sentence_{dataset}_layer-1_test_unbalanced.pkl")

In [None]:
# Print every df1 sentence that is missing from df2.
# The lookup set is built once; the original rebuilt `df2[...].tolist()` and scanned it
# linearly on every iteration. Assumes the column holds hashable values (strings).
df2_sentences = set(df2["output_sentence"].tolist())
for gen_evidence in df1["output_sentence"].tolist():
    if gen_evidence not in df2_sentences:
        print(gen_evidence)

In [None]:
# Print every df2 sentence that is missing from df1.
# The lookup set is built once; the original rebuilt `df1[...].tolist()` and scanned it
# linearly on every iteration. Assumes the column holds hashable values (strings).
df1_sentences = set(df1["output_sentence"].tolist())
for gen_evidence in df2["output_sentence"].tolist():
    if gen_evidence not in df1_sentences:
        print(gen_evidence)

In [None]:
# Train split is read from the llama layer-1 directory, the test split from
# `processed_datasets_phi_llama`.
df_train = pd.read_pickle(f"processed_datasets_llama_{dataset}_layer-1/mini_fact_{dataset}_train.pkl")
df_test = pd.read_pickle(f"processed_datasets_phi_llama/mini_fact_{dataset}_layer-1_test_unbalanced.pkl")

# Flatten the per-row `docs` collections into one list per split.
all_docs1 = []
for train_docs in df_train['docs']:
    all_docs1.extend(train_docs)

all_docs2 = []
for test_docs in df_test['docs']:
    all_docs2.extend(test_docs)

# Mini-fact texts of each split, in row order.
mini_facts_train = df_train["output_mini_fact"].tolist()
mini_facts_test = df_test["output_mini_fact"].tolist()

In [None]:
# Print every training mini fact that also occurs in the test split.
# Membership is checked against a set built once instead of scanning the test list for
# each training item. Assumes mini facts are hashable (strings).
mini_facts_test_set = set(mini_facts_test)
for mini_fact in mini_facts_train:
    if mini_fact in mini_facts_test_set:
        print(mini_fact)

In [None]:
# Print every test mini fact that also occurs in the training split.
# Membership is checked against a set built once instead of scanning the training list
# for each test item. Assumes mini facts are hashable (strings).
mini_facts_train_set = set(mini_facts_train)
for mini_fact in mini_facts_test:
    if mini_fact in mini_facts_train_set:
        print(mini_fact)

In [None]:
# all_docs1 / all_docs2 are the flattened `docs` of the train and test frames.
set_docs1 = set(all_docs1)
set_docs2 = set(all_docs2)

# Documents present in both splits.
common_docs = set_docs1 & set_docs2

# The message names the splits actually compared (the original said "df1 and df2",
# which are the unrelated probability/processed frames loaded earlier).
if common_docs:
    print("Common documents found:", common_docs)
else:
    print("No common documents found between train and test.")

In [None]:
import torch 
import os 
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import pandas as pd
import numpy as np

model_path = "/home/wombat_share/llms/llama/Meta-Llama-3-8B-Instruct"

# Restrict this process to physical GPU 1.
# NOTE(review): this only takes effect if CUDA has not been initialised yet in this
# kernel; an earlier cell that already touched the GPU makes it a no-op — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = "1"

# With a single visible GPU its index inside the process is 0; "cuda:1" would not exist.
# This also keeps `device` consistent with the `.to("cuda")` calls in the helper
# functions below, which target the current (index 0) device.
device = torch.device("cuda:0")

# 4-bit bitsandbytes quantization, passed through `quantization_config` below instead
# of the deprecated bare `load_in_4bit=True` keyword.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config,
    local_files_only=True,
)




def call_text_llm(new_prompt):
    """Greedily complete a plain-text prompt and report per-token probabilities.

    Args:
        new_prompt: Raw prompt string (no chat template is applied).

    Returns:
        A tuple ``(text, data)`` where ``text`` is the decoded completion without
        special tokens and ``data`` is a list of ``[token_string, probability]``
        pairs, one per generated token.
    """
    # Place inputs on the model's device instead of assuming the default CUDA device.
    inputs = tokenizer(new_prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=150,
        do_sample=False,
        return_dict_in_generate=True,
        output_scores=True,
        temperature=1.0,
        top_p=1.0,
    )

    # Log-probability of each generated token (normalised over the vocabulary).
    transition_scores = model.compute_transition_scores(
        outputs.sequences, outputs.scores, normalize_logits=True
    )

    # Drop the prompt tokens; only the continuation is returned.
    input_length = inputs.input_ids.shape[1]
    generated_tokens = outputs.sequences[:, input_length:]

    # Upcast before leaving torch: NumPy has no bfloat16, so `.numpy()` fails when the
    # scores are bfloat16. One transfer for the whole sequence instead of one per token.
    token_probs = np.exp(transition_scores[0].float().cpu().numpy())

    data = [
        [tokenizer.decode(tok), prob]
        for tok, prob in zip(generated_tokens[0], token_probs)
    ]

    return tokenizer.decode(generated_tokens[0], skip_special_tokens=True), data





def call_message_llm(messages):
    """Run a chat-formatted request through the model and return the reply text.

    ``messages`` is a list of ``{"role", "content"}`` dicts. The chat template is
    applied with a generation prompt, decoding is greedy for at most 256 new tokens,
    and generation stops at the tokenizer's EOS token or at ``<|eot_id|>``. Only the
    newly generated part is decoded, with special tokens stripped.
    """
    prompt_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to("cuda")

    # Either of these token ids ends the reply.
    stop_ids = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

    generated = model.generate(
        prompt_ids,
        max_new_tokens=256,
        eos_token_id=stop_ids,
        do_sample=False,
        temperature=1.0,
        top_p=1.0,
    )

    # Everything after the prompt length is the model's reply.
    prompt_length = prompt_ids.shape[-1]
    reply_ids = generated[0][prompt_length:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True)

# Plain-text few-shot prompt: eight "sentence -> independent facts" demonstrations followed
# by an open instruction to which a new sentence can be appended. In the visible code it is
# referenced only from commented-out lines. The line breaks and leading spaces inside the
# literal are part of the prompt text.
text_prompt = """ Please breakdown the following sentence into independent facts: He made his acting debut in the film The Moon is the Sun’s Dream (1992), and continued to
 appear in small and supporting roles throughout the 1990s. - He made his acting debut in the film. - He made his acting debut in The Moon is the Sun’s Dream. - The Moon is the Sun’s Dream is a film. - The Moon is the Sun’s Dream was released in 1992. - After his acting debut, he appeared in small and supporting roles. - After his acting debut, he appeared in small and supporting roles throughout the 1990s.
 Please breakdown the following sentence into independent facts: He is also a successful producer and engineer, having worked with a wide variety of artists,
 including Willie Nelson, Tim McGraw, and Taylor Swift. - He is successful. -He is aproducer. -He is a engineer. -He has worked with a wide variety of artists. - Willie Nelson is an artist. - He has worked with Willie Nelson. -Tim McGraw is an artist. - He has worked with Tim McGraw. - Taylor Swift is an artist. - He has worked with Taylor Swift.
 Please breakdown the following sentence into independent facts: In 1963, Collins became one of the third group of astronauts selected by NASA and he served
 as the back-up Command Module Pilot for the Gemini 7 mission. - Collins became an astronaut. - Collins became one of the third group of astronauts.- Collins became one of the third group of astronauts selected. - Collins became one of the third group of astronauts selected by NASA. - Collins became one of the third group of astronauts selected by NASA in 1963. - He served as the Command Module Pilot. - He served as the back-up Command Module Pilot. - He served as the Command Module Pilot for the Gemini 7 mission.
 Please breakdown the following sentence into independent facts: In addition to his acting roles, Bateman has written and directed two short films and is
 currently in development on his feature debut.- Bateman has acting roles. - Bateman has written two short films. - Bateman has directed two short films. - Bateman has written and directed two short films. - Bateman is currently in development on his feature debut.
 Please breakdown the following sentence into independent facts: Michael Collins (born October 31, 1930) is a retired American astronaut and test pilot who
 was the Command Module Pilot for the Apollo 11 mission in 1969. - Michael Collins was born on October 31, 1930. - Michael Collins is retired. - Michael Collins is an American. - Michael Collins was an astronaut. - Michael Collins was a test pilot. - Michael Collins was the Command Module Pilot. - Michael Collins was the Command Module Pilot for the Apollo 11 mission. - Michael Collins was the Command Module Pilot for the Apollo 11 mission in 1969.
 Please breakdown the following sentence into independent facts: He was an American composer, conductor, and musical director.- He was an American. - He was a composer. - He was a conductor. -He was a musical director.
 Please breakdown the following sentence into independent facts: She currently stars in the romantic comedy series, Love and Destiny, which premiered in 2019. - She currently stars in Love and Destiny. - Love and Destiny is a romantic comedy series. - Love and Destiny premiered in 2019.
 Please breakdown the following sentence into independent facts: During his professional career, McCoy played for the Broncos, the San Diego Chargers, the
 Minnesota Vikings, and the Jacksonville Jaguars. - McCoy played for the Broncos. - McCoy played for the Broncos during his professional career. - McCoy played for the San Diego Chargers. - McCoy played for the San Diego Chargers during his professional career. - McCoyplayed for the Minnesota Vikings. - McCoy played for the Minnesota Vikings during his professional career. - Mc Coy played for the Jacksonville Jaguars. - McCo yplayed for the Jacksonville Jaguars during his professional career. Please breakdown the following sentence into independent facts:"""


# System instruction used for the chat-style few-shot prompt.
instruct_atomic_facts_prompt = """Please breakdown the following sentence into independent facts:"""

# Few-shot demonstration sentences. samples[i] is answered by sample_atomic_facts[i],
# so both lists must stay index-aligned.
# Entry 2 is the "In 1963, Collins ..." sentence: the facts at the same index describe the
# 1963 NASA selection and the Gemini 7 mission, not the "Michael Collins (born ...)"
# sentence that was listed here before.
samples = [
    "He made his acting debut in the film The Moon is the Sun’s Dream (1992), and continued to appear in small and supporting roles throughout the 1990s.",
    "He is also a successful producer and engineer, having worked with a wide variety of artists, including Willie Nelson, Tim McGraw, and Taylor Swift.",
    "In 1963, Collins became one of the third group of astronauts selected by NASA and he served as the back-up Command Module Pilot for the Gemini 7 mission.",
    "He was an American composer, conductor, and musical director.",
    "She currently stars in the romantic comedy series, Love and Destiny, which premiered in 2019.",
    "During his professional career, McCoy played for the Broncos, the San Diego Chargers, the Minnesota Vikings, and the Jacksonville Jaguars.",
]

# Expected fact breakdown for each entry of `samples` (same order).
sample_atomic_facts = [
    """- He made his acting debut in The Moon is the Sun’s Dream. - The Moon is the Sun’s Dream is a film. - The Moon is the Sun’s Dream was released in 1992. - After his acting debut, he appeared in small and supporting roles. - After his acting debut, he appeared in small and supporting roles throughout the 1990s.""",
    """- He is successful. -He is aproducer. -He is a engineer. -He has worked with a wide variety of artists. - Willie Nelson is an artist. - He has worked with Willie Nelson. -Tim McGraw is an artist. - He has worked with Tim McGraw. - Taylor Swift is an artist. - He has worked with Taylor Swift.""",
    """- Collins became an astronaut. - Collins became one of the third group of astronauts.- Collins became one of the third group of astronauts selected. - Collins became one of the third group of astronauts selected by NASA. - Collins became one of the third group of astronauts selected by NASA in 1963. - He served as the Command Module Pilot. - He served as the back-up Command Module Pilot. - He served as the Command Module Pilot for the Gemini 7 mission.""",
    """- He was an American. - He was a composer. - He was a conductor. -He was a musical director.""",
    """- She currently stars in Love and Destiny. - Love and Destiny is a romantic comedy series. - Love and Destiny premiered in 2019.""",
    """- McCoy played for the Broncos. - McCoy played for the Broncos during his professional career. - McCoy played for the San Diego Chargers. - McCoy played for the San Diego Chargers during his professional career. - McCoyplayed for the Minnesota Vikings. - McCoy played for the Minnesota Vikings during his professional career. - Mc Coy played for the Jacksonville Jaguars. - McCo yplayed for the Jacksonville Jaguars during his professional career.""",
]

#text_prompt = text_prompt + "Stranger Than Fiction is a 2006 American fantasy comedy-drama film directed by Spike Jonze and written by Charlie Kaufman."


def get_messages(sample):
    """Build the few-shot chat conversation for decomposing ``sample`` into facts.

    The conversation starts with the system instruction, then one user/assistant
    turn per demonstration pair from ``samples`` / ``sample_atomic_facts``, and ends
    with ``sample`` as the final user turn.

    Args:
        sample: Sentence to break down into independent facts.

    Returns:
        List of ``{"role": ..., "content": ...}`` dicts in chat order.
    """
    messages = [{"role": "system", "content": instruct_atomic_facts_prompt}]
    # Pairing by position replaces the six hand-written index pairs, so the
    # conversation follows the demonstration lists if examples are added or removed.
    for example_sentence, example_facts in zip(samples, sample_atomic_facts):
        messages.append({"role": "user", "content": example_sentence})
        messages.append({"role": "assistant", "content": example_facts})
    messages.append({"role": "user", "content": sample})
    return messages


# Sentences to decompose with the few-shot chat prompt.
test_samples = [
    "Stranger Than Fiction is a 2006 American fantasy comedy-drama film directed by Spike Jonze and written by Charlie Kaufman.",
    "The Peloponnesian War was a devastating conflict between Athens and Sparta, which lasted from 431 to 404 BCE. The war was won by Athens, not Sparta.",
    "Ronald Reagan was the 40th President of the United States, serving from 1981 to 1989",
    "Debbie Reynolds starred in the 1989 revival of Irene, not 1994.",
]

# Build the conversation for each sentence, query the model, and show its reply.
for test in test_samples:
    messages = get_messages(test)
    output = call_message_llm(messages)
    print(output)