In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score

# Run the probe on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class ProbeNN(nn.Module):
    """Feed-forward probe with three ReLU hidden layers (256, 128, 64 units) and one sigmoid output.

    The attribute names ``layer1``, ``layer2``, ``layer3`` and ``output`` determine
    the ``state_dict`` keys, so they must stay unchanged for saved probes to load.
    """

    def __init__(self, input_dim):
        """Build the linear layers for inputs with ``input_dim`` features."""
        super().__init__()
        self.layer1 = nn.Linear(input_dim, 256)
        self.layer2 = nn.Linear(256, 128)
        self.layer3 = nn.Linear(128, 64)
        self.output = nn.Linear(64, 1)

    def forward(self, x):
        """Return the sigmoid-activated output, one column per input row."""
        hidden = x
        # The three hidden layers share the same activation, so apply them in sequence.
        for dense in (self.layer1, self.layer2, self.layer3):
            hidden = torch.relu(dense(hidden))
        return torch.sigmoid(self.output(hidden))



def evaluate_unbalanced_testset(test_pred_probs, df_test, accuracy_threshold):
    """Score mini-fact predictions at the level of whole generated sentences.

    Rows of ``df_test`` are grouped by ``gen_sentence``. A group counts as a
    positive sentence when every ``label_mini_fact`` in it is 1, otherwise as a
    negative sentence. The group prediction is correct when:

    * positive sentence: every mini-fact is predicted 1;
    * negative sentence: at least one mini-fact is predicted 0.

    Args:
        test_pred_probs: Predicted probabilities, one per row of ``df_test``
            (assigned as the ``pred_prob`` column of a copy of ``df_test``).
        df_test: Frame with at least ``gen_sentence`` and ``label_mini_fact``
            columns. It is not modified.
        accuracy_threshold: A mini-fact is predicted 1 when its probability is
            strictly greater than this value.

    Returns:
        Tuple ``(accuracy, collect1s, collect0s, auc_roc_score)``: the fraction
        of correctly classified sentences, the number of positive and negative
        sentences, and the ROC-AUC of the per-sentence minimum probability
        against the per-sentence label.
    """
    df_pred = df_test.copy()
    df_pred['pred_prob'] = test_pred_probs
    # Honour the caller's threshold; it used to be ignored in favour of a hard-coded 0.5.
    df_pred['binary_pred'] = (df_pred['pred_prob'] > accuracy_threshold).astype('int64')

    # One aggregation pass yields everything needed per sentence, without
    # relying on side effects inside groupby.apply to count the groups.
    df_grouped = df_pred.groupby('gen_sentence').agg(
        n_facts=('label_mini_fact', 'size'),
        n_true_labels=('label_mini_fact', 'sum'),
        n_positive_preds=('binary_pred', 'sum'),
        pred_group_prob=('pred_prob', 'min'),
    )

    all_labels_true = df_grouped['n_true_labels'] == df_grouped['n_facts']
    all_preds_positive = df_grouped['n_positive_preds'] == df_grouped['n_facts']
    # Positive sentence: needs all predictions 1. Negative sentence: needs at
    # least one prediction 0. Both cases reduce to "the two flags agree".
    group_correct = all_labels_true == all_preds_positive

    accuracy = group_correct.mean()
    collect1s = int(all_labels_true.sum())
    collect0s = int((~all_labels_true).sum())

    # The weakest mini-fact decides the sentence score, mirroring the rule above.
    auc_roc_score = roc_auc_score(all_labels_true.astype(int), df_grouped['pred_group_prob'])
    return accuracy, collect1s, collect0s, auc_roc_score



def compute_roc_curve(test_labels, test_pred_prob):
    """Return ``(roc_auc, fpr, tpr)`` for the given labels and predicted scores.

    ``fpr`` and ``tpr`` come from ``roc_curve`` (its thresholds are discarded)
    and ``roc_auc`` is ``auc(fpr, tpr)``.
    """
    false_positive_rate, true_positive_rate, _thresholds = roc_curve(test_labels, test_pred_prob)
    area_under_curve = auc(false_positive_rate, true_positive_rate)
    return area_under_curve, false_positive_rate, true_positive_rate


def get_results(df_test, model_path, layer, probe_method, accuracy_threshold):
    """Load a saved probe, run it on the test embeddings and compute metrics.

    Args:
        df_test: Frame holding the embeddings in column
            ``embeddings{layer}_{probe_method}`` and the labels in column
            ``label_{probe_method}``.
        model_path: Path to the ``state_dict`` of a ``ProbeNN`` whose input size
            matches the embedding width.
        layer: Layer identifier used to build the embedding column name.
        probe_method: Suffix used to build the embedding and label column names.
        accuracy_threshold: A row is predicted 1 when its probability is
            strictly greater than this value.

    Returns:
        Tuple ``(test_pred_prob, test_accuracy, roc_auc, f1_score_positive,
        f1_score_negative)``. ``test_pred_prob`` is the raw NumPy output of the
        probe (one column per row, as produced by ``ProbeNN``).
    """
    test_embeddings = np.array(df_test[f'embeddings{layer}_{probe_method}'].tolist())
    test_labels = df_test[f'label_{probe_method}']

    model = ProbeNN(test_embeddings.shape[1]).to(device)
    # map_location remaps the stored tensors onto the active device. Without it a
    # checkpoint saved on a GPU cannot be loaded when `device` fell back to CPU.
    # NOTE: torch.load unpickles the file, so only load checkpoints you trust.
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    model.eval()
    with torch.no_grad():
        inputs = torch.tensor(test_embeddings, dtype=torch.float32).to(device)
        test_pred_prob = model(inputs).cpu().numpy()

    test_pred = (test_pred_prob > accuracy_threshold).astype(int)

    # Thresholded metrics use the hard predictions; ROC-AUC uses the raw probabilities.
    test_accuracy = accuracy_score(test_labels, test_pred)
    roc_auc = roc_auc_score(test_labels, test_pred_prob)

    # F1 is reported separately with label 1 and with label 0 as the positive class.
    f1_score_positive = f1_score(test_labels, test_pred, pos_label=1)
    f1_score_negative = f1_score(test_labels, test_pred, pos_label=0)

    return test_pred_prob, test_accuracy, roc_auc, f1_score_positive, f1_score_negative


In [2]:
# Accumulators for the per-layer result rows, filled by a later cell.
results_fever_test = []
results_hover_test = []

train_datasets = ["hover"]
test_datasets = ["hover"]

# Run configuration.
layer = -16
probe = "with_train_popularity_balanced"
test_real_samples = False
accuracy_threshold = 0.5  # single source for the decision threshold used below
shuffle_seed = 42  # fixes the shuffle of the real samples so the saved file is reproducible

df_test_mini_fact1 = pd.read_pickle("./test/test_llm_generations.pkl")

if test_real_samples:
    # Keep only the label-0 LLM generations and append the shuffled real samples.
    df_test_mini_fact1 = df_test_mini_fact1[df_test_mini_fact1['label_mini_fact'] == 0]
    #df_test_mini_fact2 = pd.read_pickle(f"./train_folder/mini_fact_hover_dev_with_popularity_unbalanced.pkl")
    #df_test_mini_fact2 = df_test_mini_fact2[df_test_mini_fact2['label_mini_fact'] == 0]
    df_test_mini_fact3 = pd.read_pickle("./test/test_all_popularity_real_samples.pkl")
    df_test_mini_fact3 = (
        df_test_mini_fact3
        .sample(frac=1, random_state=shuffle_seed)
        .reset_index(drop=True)
    )
    df_test_mini_fact = pd.concat([df_test_mini_fact1, df_test_mini_fact3], axis=0)
else:
    df_test_mini_fact = df_test_mini_fact1

model_path_mini_fact = f"./probes/{probe}.pth"

(
    test_pred_probs_mini_fact,
    test_accuracy_mini_fact,
    roc_auc_mini_fact,
    f1_score_positive,
    f1_score_negative,
) = get_results(
    df_test_mini_fact,
    model_path_mini_fact,
    layer=layer,
    probe_method="mini_fact",
    accuracy_threshold=accuracy_threshold,
)

# Store the predictions next to the inputs, using the same threshold as get_results.
df_test_mini_fact['pred_prob'] = test_pred_probs_mini_fact
df_test_mini_fact['binary_pred'] = (df_test_mini_fact['pred_prob'] > accuracy_threshold).astype('int64')

if test_real_samples:
    df_test_mini_fact.to_pickle(f"predictions/prediction_with_real_samples_{probe}.pkl")
else:
    df_test_mini_fact.to_pickle(f"predictions/prediction_{probe}.pkl")


In [None]:
import pandas as pd

# Sentence-level test set.
df = pd.read_pickle("test/sentence_hover_test_unbalanced.pkl")

# Keep one row per evidence text so the left join below cannot multiply rows of `df`.
df2 = (
    pd.read_pickle("test/test_with_popularity_llm_generations.pkl")
    .drop_duplicates(subset=['gen_evidence'])
)

df_merged = df.merge(df2, how='left', on='gen_evidence')

# NOTE(review): `df_merged` is built above but never used. The probe below runs on
# `df_test_mini_fact` with the mini-fact model path while asking for the "sentence"
# columns. Confirm which frame and which probe file were intended.
(
    test_pred_probs_sentences,
    test_accuracy_sentences,
    roc_auc_sentences,
    f1_score_positive,
    f1_score_negative,
) = get_results(
    df_test_mini_fact,
    model_path_mini_fact,
    layer=layer,
    probe_method="sentence",
    accuracy_threshold=0.5,
)

In [None]:
# Re-run the mini-fact evaluation with the probe trained on the unbalanced popularity data.
model_path_mini_fact = f"./probes/mini_fact_embeddings{layer}_hover_with_popularity_unbalanced.pth"
(
    test_pred_probs_mini_fact,
    test_accuracy_mini_fact,
    roc_auc_mini_fact,
    f1_score_positive,
    f1_score_negative,
) = get_results(
    df_test_mini_fact,
    model_path_mini_fact,
    layer=layer,
    probe_method="mini_fact",
    accuracy_threshold=0.5,
)

# Overwrite the prediction columns with this probe's output and persist the frame.
df_test_mini_fact['pred_prob'] = test_pred_probs_mini_fact
df_test_mini_fact['binary_pred'] = df_test_mini_fact['pred_prob'].gt(0.5).astype('int64')
df_test_mini_fact.to_pickle("train_folder/prediction_with_probe_popularity_unbalanced.pkl")
print(f1_score_positive)
print(f1_score_negative)

In [None]:
print(f1_score_positive)
print(f1_score_negative)

# NOTE(review): `test_dataset`, `train_dataset` and `balanced` are not assigned in any
# visible cell (only the lists `train_datasets` / `test_datasets` are). Confirm where
# they come from, otherwise this cell raises NameError on a fresh kernel.

#if not balanced:
#    correct_ratio, collect1s, collect0s, auc_roc_score = evaluate_unbalanced_testset(test_pred_probs_mini_fact, df_test_mini_fact, accuracy_threshold=0.5)

# Both test sets record the same row layout; only the destination list differs.
results_by_test_dataset = {"fever": results_fever_test, "hover": results_hover_test}
if test_dataset in results_by_test_dataset:
    print(f"Test dataset: {test_dataset}")
    results_by_test_dataset[test_dataset].append({
        "train_dataset": train_dataset,
        "test_dataset": test_dataset,
        "layer": str(33 + layer) + "th-layer",
        #"accuracySentence": test_accuracy_sentences if balanced else None,
        #"auc_sentences": roc_auc_sentences,
        "accuracyMiniFacts": test_accuracy_mini_fact if balanced else None,
        "auc_mini_facts": roc_auc_mini_fact,
        #"auc_mini_facts_sentences_match": auc_roc_score if not balanced else None
    })

In [None]:
# Column labels for the result tables: a layer offset is shown as 33 + offset (e.g. -16 -> "17th-layer").
# NOTE(review): `layers` is not assigned in any visible cell (only the scalar `layer` is) - confirm where it is defined.
layers_order = [f"{33 + layer}th-layer" for layer in layers]


# Function to prepare data
def prepare_data(df, value_vars, index_name):
    """Reshape per-layer result rows into one row per metric with one column per layer.

    Args:
        df: Frame with ``train_dataset``, ``test_dataset`` and ``layer`` columns
            plus one column per entry of ``value_vars``.
        value_vars: Names of the metric columns to include.
        index_name: Name of the output column that holds the metric name.

    Returns:
        Frame with columns ``index_name``, ``train_dataset``, ``test_dataset``
        followed by one column per distinct ``layer`` value. Rows that share the
        same metric, datasets and layer are combined by ``pivot_table``'s
        default aggregation.
    """
    id_columns = ['train_dataset', 'test_dataset', 'layer']
    wide = (
        df.melt(
            id_vars=id_columns,
            value_vars=value_vars,
            var_name=index_name,
            value_name='Value',
        )
        .pivot_table(
            values='Value',
            index=[index_name, 'train_dataset', 'test_dataset'],
            columns='layer',
        )
    )
    wide.columns.name = None  # drop the leftover 'layer' label on the column axis
    return wide.reset_index()

# Turn the collected result rows into frames, one per test set.
fever_df = pd.DataFrame(results_fever_test)
hover_df = pd.DataFrame(results_hover_test)

# NOTE(review): the cell that fills the result lists has 'accuracySentence',
# 'auc_sentences' and 'auc_mini_facts_sentences_match' commented out, so the
# columns requested below may be missing from these frames - confirm.
accuracy_columns = ['accuracySentence', 'accuracyMiniFacts']
auroc_columns = ['auc_sentences', 'auc_mini_facts', 'auc_mini_facts_sentences_match']

if not balanced:
    fever_auroc_pivot = prepare_data(fever_df, auroc_columns, 'AUROC')
    hover_auroc_pivot = prepare_data(hover_df, auroc_columns, 'AUROC')
else:
    fever_accuracy_pivot = prepare_data(fever_df, accuracy_columns, 'Accuracy')
    hover_accuracy_pivot = prepare_data(hover_df, accuracy_columns, 'Accuracy')

In [None]:
# Display the FEVER accuracy pivot (only assigned when `balanced` is truthy).
fever_accuracy_pivot

In [None]:
import numpy as np
import pandas as pd

# General function to generate LaTeX table lines with bold max value per dataset
def generate_table_lines(train_dataset_name, test_dataset_name, df_pivot, metrics, metric_names):
    """Build the LaTeX body rows for one train/test block of a results table.

    Reads the notebook-level names ``layers_order`` (layer columns to print, in
    order) and ``balanced``.

    Args:
        train_dataset_name: Display name of the training set (first row only).
        test_dataset_name: Display name of the test set (first row only).
        df_pivot: Pivot with a metric-name column (``'Accuracy'``, or otherwise
            its first column, e.g. ``'AUROC'``) and one column per layer label.
        metrics: Metric identifiers to look up in the metric-name column.
        metric_names: Display names, parallel to ``metrics``.

    Returns:
        List of LaTeX lines: one row per metric with the largest displayed value
        in bold, and ``\\addlinespace`` spacers.

    Raises:
        IndexError: If a metric is not present in ``df_pivot``.
    """
    # The pivots are built with either 'Accuracy' or 'AUROC' as the metric-name
    # column, so hard-coding 'Accuracy' breaks the AUROC tables.
    metric_column = 'Accuracy' if 'Accuracy' in df_pivot.columns else df_pivot.columns[0]
    last_idx = len(metrics) - 1

    lines = []
    for idx, metric in enumerate(metrics):
        metric_name = metric_names[idx]
        if metric_name == 'Sentences' and not balanced:
            lines.append('\\addlinespace[11pt]')
        train_test_label = f"\\textbf{{{train_dataset_name}}} / \\textbf{{{test_dataset_name}}}" if idx == 0 else ''

        metric_rows = df_pivot.loc[df_pivot[metric_column] == metric]
        if metric_rows.empty:
            raise IndexError(f"metric {metric!r} not found in column {metric_column!r} of the pivot table")

        # Take the maximum over exactly the values that are printed, ignoring NaN,
        # rather than over whatever happens to sit at column positions 3 onwards.
        values = [metric_rows[layer].item() for layer in layers_order]
        max_value_row = np.nanmax(np.asarray(values, dtype=float)) if values else float('nan')

        row_values = []
        for value in values:
            formatted_value = f"{value:.3f}"
            if np.isclose(float(value), max_value_row):  # Highlight max value
                formatted_value = f"\\textbf{{{formatted_value}}}"
            row_values.append(formatted_value)

        row = [train_test_label, metric_name] + row_values
        lines.append(' & '.join(row) + ' \\\\')

        # Compare positions, not values, so a repeated metric id cannot add extra spacers.
        if idx == last_idx:
            lines.append('\\addlinespace[5pt]')  # Add space after last metric
    return lines

# Function to create a complete LaTeX table
def create_latex_table(df_pivot_fever, df_pivot_hover, metrics, metric_names, caption, label):
    """Assemble a complete LaTeX ``table`` environment from two result pivots.

    Reads the notebook-level ``layers_order`` for the layer columns and calls
    ``generate_table_lines`` once per pivot.

    Args:
        df_pivot_fever: Pivot rendered as the first block of rows.
        df_pivot_hover: Pivot rendered as the second block of rows.
        metrics: Metric identifiers, passed through to ``generate_table_lines``.
        metric_names: Display names, passed through to ``generate_table_lines``.
        caption: Caption text.
        label: LaTeX label for cross-references.

    Returns:
        The table as a single newline-joined string.
    """
    def _display_name(pivot, column):
        # The first row decides the name; any value other than "fever" is shown as HoVer.
        return "FEVER" if pivot[column].tolist()[0] == "fever" else "HoVer"

    header = ['Train/Test Set', 'Probe'] + [str(layer) for layer in layers_order]
    table_lines = [' & '.join(header) + ' \\\\ \\hline']

    pivots = (df_pivot_fever, df_pivot_hover)
    for position, pivot in enumerate(pivots):
        table_lines.extend(generate_table_lines(
            _display_name(pivot, 'train_dataset'),
            _display_name(pivot, 'test_dataset'),
            pivot, metrics, metric_names,
        ))
        table_lines.append('\\hline')
        if position < len(pivots) - 1:
            table_lines.append('\\addlinespace[5pt]')  # gap between the two blocks only

    # One right-aligned column per displayed layer instead of a fixed five.
    column_spec = 'p{1.5cm} p{4.5cm} ' + 'r ' * len(layers_order)

    latex_table = [
        '\\begin{table}[h]',
        '\\centering',
        '\\small',
        '\\begin{tabular}{' + column_spec + '}',
        '\\hline',
    ]
    latex_table.extend(table_lines)
    latex_table.append('\\end{tabular}')
    latex_table.append(f'\\caption{{{caption}}}')
    # \label must follow \caption, otherwise \ref resolves to the enclosing
    # section counter instead of the table number.
    latex_table.append(f'\\label{{{label}}}')
    latex_table.append('\\end{table}')

    return '\n'.join(latex_table)


if balanced:
    # Create and print the Accuracy table
    metrics_accuracy = ['accuracyMiniFacts', 'accuracySentence']
    metric_names_accuracy = ['Mini Facts', 'Sentence']
    latex_table_accuracy = create_latex_table(
        fever_accuracy_pivot, hover_accuracy_pivot,
        metrics_accuracy, metric_names_accuracy,
        '\\textbf{Accuracy} Across Different Layers for balanced FEVER and HoVer test datasets',
        'tab:accuracy_probes'
    )
    print(latex_table_accuracy)

else:
    # Create and print the AUROC table
    metrics_auc = ['auc_mini_facts', 'auc_mini_facts_sentences_match', 'auc_sentences']
    metric_names_auc = ['Mini Facts -> Mini Facts Level', 'Mini Facts -> Sentence Level', 'Sentences']

    # Caption and label are chosen by the same condition. The label ternary used
    # to be inverted, tagging the in-domain table as the cross-test one.
    if train_datasets[0] == test_datasets[0]:
        caption_auc = """\\textbf{AUROC} scores across Different Layers for unbalanced FEVER and HoVer test datasets"""
        label_auc = 'tab:auroc_probes'
    else:
        caption_auc = """\\textbf{AUROC} scores for Cross Evaluation for HoVer probes on a FEVER test set and FEVER probes on a HoVer test set."""
        label_auc = 'tab:auroc_probes_cross_test'

    latex_table_auc = create_latex_table(
        fever_auroc_pivot, hover_auroc_pivot,
        metrics_auc, metric_names_auc,
        caption_auc,
        label_auc
    )
    print(latex_table_auc)




In [None]:
# NOTE(review): `df_pivot` only appears as a local or parameter name inside the
# functions above; no visible cell binds it at notebook level, so this cell
# likely raises NameError on a fresh kernel - confirm or remove.
df_pivot

Tests

In [None]:
# get_results returns five values (probabilities, accuracy, ROC-AUC and the two
# F1 scores); unpacking only three raised "too many values to unpack".
(
    test_pred_probs_mini_fact,
    test_accuracy_mini_fact,
    roc_auc_mini_fact,
    f1_score_positive,
    f1_score_negative,
) = get_results(
    df_test_mini_fact,
    model_path_mini_fact,
    layer=layer,
    probe_method="mini_fact",
    accuracy_threshold=0.5,
)

df_test_mini_fact['pred_prob'] = test_pred_probs_mini_fact
df_test_mini_fact['binary_pred'] = (df_test_mini_fact['pred_prob'] > 0.5).astype('int64')
true_count = 0

# NOTE(review): `df_test_sentence` is not assigned in any visible cell - confirm where it is loaded.
# Print each generated sentence's mini-fact labels next to its sentence-level label.
for name, group in df_test_mini_fact.groupby('gen_sentence'):
    print(name)
    print(group['label_mini_fact'].values)

    #print(group['pred_prob'].values)
    #print(group['binary_pred'].values)
    # Disabled per-sentence correctness tally (was kept as a string literal):
    #if 0 in group['label_mini_fact'].values:
    #    if 0 in group['binary_pred'].values:
    #        print("True")
    #        true_count += 1
    #    else:
    #        print("False")
    #else:
    #    if 0 in group['binary_pred'].values:
    #        print("False")
    #    else:
    #        print("True")
    #        true_count += 1

    # Filter the sentence frame once per group instead of once per looked-up column.
    sentence_rows = df_test_sentence[df_test_sentence['output_sentence'] == name]
    gen_sentence = sentence_rows['output_sentence'].values[0]
    label_sentence = sentence_rows['label_sentence'].values[0]
    print(gen_sentence)
    print(label_sentence)
    print("###")

#true_count / len(df_test_mini_fact['gen_sentence'].unique())

In [None]:
import pandas as pd
import ast

# Convert the HoVer sentence CSV to a pickle, restoring `docs` from its string form.
df = pd.read_csv("./datasets_with_bart_hover/sentences_hover.csv")

# CSV stores `docs` as the text of a Python literal; parse strings, leave other values untouched.
df['docs'] = df['docs'].map(lambda raw: raw if not isinstance(raw, str) else ast.literal_eval(raw))
#del df['concat_probs']

df.to_pickle("./datasets_with_bart_hover/sentence_hover.pkl")

In [None]:
# NOTE(review): saves whatever `df` currently holds. The only visible cell that
# fills `df` just before this one reads the HoVer sentences, so confirm the
# FEVER data was loaded before running this.
df.to_pickle("./datasets_with_bart_fever/sentence_fever.pkl")

In [None]:

# NOTE(review): saves whatever `df` currently holds under the mini-facts file
# name; no visible cell loads a mini-facts frame into `df` first - confirm.
df.to_pickle("./datasets_with_bart_hover/mini_facts_hover.pkl")

In [None]:
def correct_str(str_arr):
    """Strip the array wrapper text, whitespace and brackets from a printed-array string.

    The substitutions run in the listed order, each on the result of the
    previous one, and the cleaned string is returned.
    """
    substitutions = (
        ("[array(", ""),
        ("dtype=float32)]", ""),
        ("\n", ""),
        (" ", ""),
        ("],", "]"),
        ("[", ""),
        ("]", ""),
    )
    cleaned = str_arr
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned

In [None]:
import ast
import pandas as pd

# Convert the corrected mini-facts CSV to a pickle.
df = pd.read_csv("mini_facts_correction/atomic_facts_hover_gen_evidence_without_embeddings.csv")

# CSV stores `docs` as the text of a Python literal; parse strings, leave other values untouched.
df['docs'] = df['docs'].map(lambda raw: raw if not isinstance(raw, str) else ast.literal_eval(raw))

# Drop the two columns that are not kept in the pickle.
df = df.drop(columns=['docs_y', 'concat_probs'])

df.to_pickle("mini_facts_correction/mini_facts_hover_gen_evidence.pkl")

In [None]:
# Preview the first rows of whatever frame `df` currently holds.
df.head()

In [None]:
# NOTE(review): relies on `df` and `ast` from earlier cells; `dataset` and
# `probe_method` are not assigned at notebook level in any visible cell - confirm.
# Parse `docs` values that are still strings, leaving other values untouched.
df['docs'] = df['docs'].map(lambda raw: raw if not isinstance(raw, str) else ast.literal_eval(raw))

# The sentence embeddings are not stored in the processed file.
df = df.drop(columns=['embeddings_sentence'])

df.to_pickle(f"processed_datasets_{dataset}/{probe_method}_{dataset}.pkl")