Test to evaluate the quality of the mini-facts from the evidence with BART-large-MNLI

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Local snapshot directory of the facebook/bart-large-mnli checkpoint.
# Raw string: the Windows path contains backslash sequences (\h, \m, \s, \d) that a
# normal string literal would treat as invalid escapes (SyntaxWarning on recent Pythons).
bart_model_path = r"D:\huggingface\huggingface\hub\models--facebook--bart-large-mnli\snapshots\d7645e127eaf1aefc7862fd59a17a5aa8558b8ce"
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Loading BART model...")
# local_files_only=True: load from the snapshot directory only.
bart_model = AutoModelForSequenceClassification.from_pretrained(bart_model_path, local_files_only=True)
bart_tokenizer = AutoTokenizer.from_pretrained(bart_model_path, local_files_only=True)
bart_model.to(device)

In [None]:
def split_source_to_fit_with_hypothesis(source, hypothesis, bart_tokenizer, max_length=1024):
    """Split `source` into text chunks that each fit next to `hypothesis` in one input.

    The per-chunk token budget is `max_length` minus the number of hypothesis
    tokens and the number of special tokens the tokenizer adds for a pair.

    Args:
        source: Text to split.
        hypothesis: Text that will be paired with every chunk.
        bart_tokenizer: Tokenizer exposing `encode`, `decode`,
            `num_special_tokens_to_add` and a writable `model_max_length`.
        max_length: Total token budget for one (chunk, hypothesis) pair.

    Returns:
        List of decoded text chunks in source order. Empty if `source`
        encodes to no tokens.

    Raises:
        ValueError: If the hypothesis leaves no room for any source token.
    """
    original_max_length = bart_tokenizer.model_max_length
    # Temporarily lift the tokenizer's length limit while encoding the full texts
    # (presumably to avoid over-length warnings — confirm).
    bart_tokenizer.model_max_length = int(1e12)
    try:
        hypothesis_length = len(bart_tokenizer.encode(hypothesis, add_special_tokens=False))
        num_special_tokens = bart_tokenizer.num_special_tokens_to_add(pair=True)
        max_source_length = max_length - hypothesis_length - num_special_tokens
        if max_source_length <= 0:
            raise ValueError("The hypothesis is too long to fit within the max_length limit.")
        source_tokens = bart_tokenizer.encode(source, add_special_tokens=False)
    finally:
        # The tokenizer is shared state: restore its limit on every exit path,
        # including exceptions raised by the tokenizer itself.
        bart_tokenizer.model_max_length = original_max_length
    token_chunks = [
        source_tokens[start:start + max_source_length]
        for start in range(0, len(source_tokens), max_source_length)
    ]
    return [bart_tokenizer.decode(chunk, skip_special_tokens=True) for chunk in token_chunks]
    
def _majority_label(pred_labels):
    """Return the most frequent non-"Neutral" label, or the most frequent label if all are "Neutral"."""
    candidates = [label for label in pred_labels if label != "Neutral"] or pred_labels
    # dict.fromkeys keeps first-seen order and max() returns the first maximum, so a
    # tie is resolved in favour of the label seen in the earliest chunk. Iterating a
    # set here would make ties depend on string-hash ordering.
    return max(dict.fromkeys(candidates), key=candidates.count)


def call_bart_model(source, statement):
    """Label `statement` against `source` with the BART NLI model, chunk by chunk.

    The source is split so that each chunk fits in one model input together
    with the statement. Every chunk is classified separately and the per-chunk
    labels are combined by majority vote, ignoring "Neutral" votes unless all
    chunks are "Neutral".

    Args:
        source: Premise text; may be longer than one model input.
        statement: Hypothesis text paired with every chunk.

    Returns:
        One of "Contradiction", "Neutral" or "Entailment".

    Raises:
        ValueError: If the statement is too long to leave room for the source,
            or if the source yields no chunks.
    """
    source_chunks = split_source_to_fit_with_hypothesis(source, statement, bart_tokenizer, max_length=1024)
    if not source_chunks:
        raise ValueError("The source produced no chunks to classify (is it empty?).")

    # Index order is assumed to match the checkpoint's label ids
    # (0=contradiction, 1=neutral, 2=entailment) — verify against bart_model.config.id2label.
    class_names = ["Contradiction", "Neutral", "Entailment"]
    pred_labels = []
    for chunk in source_chunks:
        inputs = bart_tokenizer(
            chunk,
            statement,
            return_tensors='pt',
            truncation=True,  # safety net in case re-tokenising the decoded chunk exceeds the budget
            max_length=1024,
            add_special_tokens=True
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            logits = bart_model(**inputs).logits
        # Softmax is monotonic, so the argmax of the logits is the predicted class.
        pred_labels.append(class_names[logits.argmax(dim=1).item()])

    return _majority_label(pred_labels)

In [None]:
import pandas as pd


samples = 1000
df = pd.read_pickle("datasets_fever/mini_fact_fever.pkl").iloc[:samples]
grouped_df = df.groupby("gen_evidence")

# Count the mini-facts that BART labels "Entailment" against the evidence they are
# grouped under; the group key is that evidence text.
counter_entailment = sum(
    call_bart_model(evidence_text, fact) == "Entailment"
    for evidence_text, evidence_rows in grouped_df
    for fact in evidence_rows["output_mini_fact"].tolist()
)

In [None]:
# Show how many mini-facts received the final label "Entailment".
counter_entailment

Test Probs

In [None]:
import pandas as pd


dataset = "hover"

# df1: sentence-level pickle from the probs_test_llama directory.
# df2: unbalanced sentence test split from the BART-processed layer-1 directory.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
df1 = pd.read_pickle(f"probs_test_llama/df_new_{dataset}_probs_sentence.pkl")
df2 = pd.read_pickle(f"processed_datasets_with_bart_{dataset}_layer-1/sentence_{dataset}_test_unbalanced.pkl")

In [None]:
# Print every df1 sentence that does not occur in df2.
# The lookup set is built once so each membership test is O(1) instead of a
# scan over a freshly materialised column list.
df2_sentences = set(df2["output_sentence"].tolist())
for gen_evidence in df1["output_sentence"].tolist():
    if gen_evidence not in df2_sentences:
        print(gen_evidence)

In [None]:
# Print every df2 sentence that does not occur in df1.
# The lookup set is built once so each membership test is O(1) instead of a
# scan over a freshly materialised column list.
df1_sentences = set(df1["output_sentence"].tolist())
for gen_evidence in df2["output_sentence"].tolist():
    if gen_evidence not in df1_sentences:
        print(gen_evidence)

Test Mini Facts

In [None]:
import pandas as pd 
dataset = "fever"

# Load the layer-8 mini-fact train / dev / unbalanced-test splits and print
# the row count of each one right after it is loaded.
df_train = pd.read_pickle(f"processed_datasets_with_bart_{dataset}_layer-8/mini_fact_{dataset}_train.pkl")
print(len(df_train))
df_dev = pd.read_pickle(f"processed_datasets_with_bart_{dataset}_layer-8/mini_fact_{dataset}_dev.pkl")
print(len(df_dev))
df_test = pd.read_pickle(f"processed_datasets_with_bart_{dataset}_layer-8/mini_fact_{dataset}_test_unbalanced.pkl")
print(len(df_test))

In [None]:
# Layer-1 mini-fact splits; `dataset` must already be defined by an earlier cell.
df_train = pd.read_pickle(f"processed_datasets_with_bart_{dataset}_layer-1/mini_fact_{dataset}_train.pkl")
df_test = pd.read_pickle(f"processed_datasets_with_bart_{dataset}_layer-1/mini_fact_{dataset}_test_unbalanced.pkl")

# Flatten the per-row `docs` entries of each split into one flat list.
all_docs1 = []
for row_docs in df_train['docs']:
    all_docs1.extend(row_docs)

all_docs2 = []
for row_docs in df_test['docs']:
    all_docs2.extend(row_docs)

# Mini-fact texts of each split, in row order.
mini_facts_train = df_train["output_mini_fact"].tolist()
mini_facts_test = df_test["output_mini_fact"].tolist()

In [None]:
# Print every training mini-fact that also appears in the test mini-facts.
# Set lookup keeps this linear instead of scanning the test list per item.
test_mini_fact_set = set(mini_facts_test)
for mini_fact in mini_facts_train:
    if mini_fact in test_mini_fact_set:
        print(mini_fact)

In [None]:
# Print every test mini-fact that also appears in the training mini-facts.
# Set lookup keeps this linear instead of scanning the training list per item.
train_mini_fact_set = set(mini_facts_train)
for mini_fact in mini_facts_test:
    if mini_fact in train_mini_fact_set:
        print(mini_fact)

In [None]:
set_docs1 = set(all_docs1)
set_docs2 = set(all_docs2)

# Documents that occur in both flattened doc lists (all_docs1 and all_docs2).
common_docs = set_docs1.intersection(set_docs2)

# The doc lists come from the train and test frames, so the message names those
# splits rather than the unrelated df1/df2 variables.
if not common_docs:
    print("No common documents found between the train and test splits.")
else:
    print("Common documents found:", common_docs)

Test Sentences

In [None]:
import pandas as pd

# NOTE(review): `dataset` is set to "fever" but the path below is hard-coded to the
# hover files and does not use it — confirm which dataset is intended.
dataset = "fever"

df = pd.read_pickle("datasets_hover_llama/gen_evidence_hover.pkl")

In [None]:
import pandas as pd

dataset = "fever"

# Sentence-level train / unbalanced-test splits from the llama layer-1 directory.
df_train = pd.read_pickle(f"processed_datasets_llama_{dataset}_layer-1/sentence_{dataset}_train.pkl")
df_test = pd.read_pickle(f"processed_datasets_llama_{dataset}_layer-1/sentence_{dataset}_test_unbalanced.pkl")
print(len(df_train))

# Flatten the per-row `docs` entries of each split into one flat list.
all_docs1 = []
for row_docs in df_train['docs']:
    all_docs1.extend(row_docs)

all_docs2 = []
for row_docs in df_test['docs']:
    all_docs2.extend(row_docs)

# Sentence texts of each split, in row order.
sentence_train = df_train["output_sentence"].tolist()
sentence_test = df_test["output_sentence"].tolist()

In [None]:
# Print every training sentence that also appears in the test sentences.
# Set lookup keeps this linear instead of scanning the test list per item.
test_sentence_set = set(sentence_test)
for sentence in sentence_train:
    if sentence in test_sentence_set:
        print(sentence)

In [None]:
# Print every test sentence that also appears in the training sentences.
# Set lookup keeps this linear instead of scanning the training list per item.
train_sentence_set = set(sentence_train)
for sentence in sentence_test:
    if sentence in train_sentence_set:
        print(sentence)

In [None]:
set_docs1 = set(all_docs1)
set_docs2 = set(all_docs2)

# Documents that occur in both flattened doc lists (all_docs1 and all_docs2).
common_docs = set_docs1.intersection(set_docs2)

# The doc lists come from the train and test frames, so the message names those
# splits rather than the unrelated df1/df2 variables.
if not common_docs:
    print("No common documents found between the train and test splits.")
else:
    print("Common documents found:", common_docs)

Mini Facts - Sentence Match

In [None]:
# Unbalanced test splits at sentence level and at mini-fact level (layer-1 directory).
# Relies on `pd` and `dataset` having been defined by an earlier cell.
df_test_sentence = pd.read_pickle(f"processed_datasets_with_bart_{dataset}_layer-1/sentence_{dataset}_test_unbalanced.pkl")
df_test_mini_fact = pd.read_pickle(f"processed_datasets_with_bart_{dataset}_layer-1/mini_fact_{dataset}_test_unbalanced.pkl")


def balance_dataframe(df, label_name, random_state=42):
    """Downsample a frame so that labels 0 and 1 occur equally often.

    Rows whose label is neither 0 nor 1 are dropped. The result lists the
    sampled label-1 rows first, then the sampled label-0 rows; it is not
    shuffled. The input frame is not modified.

    Args:
        df: Frame containing the label column.
        label_name: Name of the label column (converted with `str`).
        random_state: Seed passed to both `DataFrame.sample` calls, which
            makes the selection reproducible. Defaults to 42.

    Returns:
        A new frame with `2 * min(n_label_1, n_label_0)` rows and a fresh
        0..n-1 index.
    """
    label_column = df[str(label_name)]
    df_label_1 = df[label_column == 1]
    df_label_0 = df[label_column == 0]
    min_class_count = min(len(df_label_1), len(df_label_0))
    balanced_df = pd.concat([
        df_label_1.sample(min_class_count, random_state=random_state),
        df_label_0.sample(min_class_count, random_state=random_state),
    ])
    return balanced_df.reset_index(drop=True)

# Balance the sentence-level test set, then keep only the mini-facts whose
# source sentence is still present in the balanced set.
df_test_sentence = balance_dataframe(df_test_sentence, 'label_sentence')
kept_sentences = df_test_sentence['output_sentence']
belongs_to_kept_sentence = df_test_mini_fact['gen_sentence'].isin(kept_sentences)
df_test_mini_fact = df_test_mini_fact[belongs_to_kept_sentence]

In [None]:
# Print every mini-fact source sentence that is missing from the sentence frame.
# The lookup set is built once so each membership test is O(1) instead of a
# scan over a freshly materialised column list.
sentence_lookup = set(df_test_sentence['output_sentence'].tolist())
for gen_s in df_test_mini_fact['gen_sentence'].tolist():
    if gen_s not in sentence_lookup:
        print(gen_s)

In [None]:
# Print every sentence that has no mini-fact row pointing back to it.
# The lookup set is built once so each membership test is O(1) instead of a
# scan over a freshly materialised column list.
mini_fact_sentence_lookup = set(df_test_mini_fact['gen_sentence'].tolist())
for gen_s in df_test_sentence['output_sentence'].tolist():
    if gen_s not in mini_fact_sentence_lookup:
        print(gen_s)

Manual check that all mini-facts remain with their sentence

In [None]:
df_grouped = df_test_mini_fact.groupby("gen_sentence")

# For manual inspection: print each sentence, then the list of its mini-facts,
# then a "###" separator.
for _, sentence_rows in df_grouped:
    print(sentence_rows["gen_sentence"].iloc[0])
    print(sentence_rows["output_mini_fact"].tolist())
    print("###")
    

Test Phi

In [None]:
import pandas as pd

dataset = "hover"

# df1: sentence-level pickle from the probs_test_phi directory.
# df2: unbalanced sentence test split (layer-1) from the processed_datasets_phi directory.
df1 = pd.read_pickle(f"probs_test_phi/probs_sentence_{dataset}.pkl")
df2 = pd.read_pickle(f"processed_datasets_phi/sentence_{dataset}_layer-1_test_unbalanced.pkl")

In [None]:
# Print every df1 sentence that does not occur in df2.
# The lookup set is built once so each membership test is O(1) instead of a
# scan over a freshly materialised column list.
df2_sentences = set(df2["output_sentence"].tolist())
for gen_evidence in df1["output_sentence"].tolist():
    if gen_evidence not in df2_sentences:
        print(gen_evidence)

In [None]:
# Print every df2 sentence that does not occur in df1.
# The lookup set is built once so each membership test is O(1) instead of a
# scan over a freshly materialised column list.
df1_sentences = set(df1["output_sentence"].tolist())
for gen_evidence in df2["output_sentence"].tolist():
    if gen_evidence not in df1_sentences:
        print(gen_evidence)

In [None]:
# NOTE(review): the train split is read from the llama layer-1 directory while the
# test split is read from processed_datasets_phi_llama — confirm this pairing is intended.
df_train = pd.read_pickle(f"processed_datasets_llama_{dataset}_layer-1/mini_fact_{dataset}_train.pkl")
df_test = pd.read_pickle(f"processed_datasets_phi_llama/mini_fact_{dataset}_layer-1_test_unbalanced.pkl")

# Flatten the per-row `docs` entries of each split into one flat list.
all_docs1 = []
for row_docs in df_train['docs']:
    all_docs1.extend(row_docs)

all_docs2 = []
for row_docs in df_test['docs']:
    all_docs2.extend(row_docs)

# Mini-fact texts of each split, in row order.
mini_facts_train = df_train["output_mini_fact"].tolist()
mini_facts_test = df_test["output_mini_fact"].tolist()

In [None]:
# Print every training mini-fact that also appears in the test mini-facts.
# Set lookup keeps this linear instead of scanning the test list per item.
test_mini_fact_set = set(mini_facts_test)
for mini_fact in mini_facts_train:
    if mini_fact in test_mini_fact_set:
        print(mini_fact)

In [None]:
# Print every test mini-fact that also appears in the training mini-facts.
# Set lookup keeps this linear instead of scanning the training list per item.
train_mini_fact_set = set(mini_facts_train)
for mini_fact in mini_facts_test:
    if mini_fact in train_mini_fact_set:
        print(mini_fact)

In [None]:
set_docs1 = set(all_docs1)
set_docs2 = set(all_docs2)

# Documents that occur in both flattened doc lists (all_docs1 and all_docs2).
common_docs = set_docs1.intersection(set_docs2)

# The doc lists come from the train and test frames, so the message names those
# splits rather than the unrelated df1/df2 variables.
if not common_docs:
    print("No common documents found between the train and test splits.")
else:
    print("Common documents found:", common_docs)