In [5]:
import math
import torch
import pandas as pd
from sklearn.metrics import roc_auc_score
import numpy as np

datasets = ["fever", "hover"]
model_name = "llama"

# Columns of the per-sentence score table built by `process_dataframe`.
# `avg_prob` is included because the pivot and LaTeX cells further down list it.
RESULT_COLUMNS = ["label_sentence", "avg_prob", "min_prob", "token_weighted_score", "entropy"]


def calc_min_prob(probs):
    """Return the smallest value in ``probs``."""
    return min(probs)


def calc_avg_prob(probs):
    """Return the arithmetic mean of ``probs`` (callers guarantee it is non-empty)."""
    return sum(probs) / len(probs)


def calc_token_importance(token_importance, probs, pe=None):
    """Return the importance-weighted sum of ``-log(p) * p`` over ``probs``.

    Args:
        token_importance: 1-D torch tensor of per-token weights; it is
            normalised by its own sum before weighting.
        probs: iterable of probabilities, presumably one per token and aligned
            with ``token_importance`` (TODO confirm; nothing here checks it).
        pe: unused. Kept, now optional, so existing three-argument calls work.

    Returns:
        The weighted score as a Python float. It is NaN when
        ``token_importance`` sums to zero; such rows are later removed by the
        ``dropna()`` in ``process_dataframe``.
    """
    weights = token_importance / token_importance.sum()
    # NOTE(review): each term is -log(p) * p, not plain -log(p). The table
    # labels this score "TokenSAR"; confirm the extra factor p is intended.
    # p == 0 contributes 0 (the limit of -p*log(p)) instead of crashing in math.log.
    terms = [0.0 if p == 0 else -math.log(p) * p for p in probs]
    return (weights * torch.tensor(terms)).sum().item()


def calc_entropy(pe):
    """Return the arithmetic mean of the values in ``pe``."""
    return sum(pe) / len(pe)


def _extract_probs_and_entropy(concat_probs, model):
    """Pull the flat probability and entropy lists out of ``concat_probs``.

    For ``"llama"`` fields 2 and 3 of every entry are sequences and are
    flattened; for ``"phi"`` fields 1 and 2 of every entry are taken as-is.

    Raises:
        ValueError: if ``model`` is neither ``"llama"`` nor ``"phi"``.
    """
    if model == "llama":
        probs = [p for entry in concat_probs for p in entry[2]]
        pe = [e for entry in concat_probs for e in entry[3]]
    elif model == "phi":
        probs = [entry[1] for entry in concat_probs]
        pe = [entry[2] for entry in concat_probs]
    else:
        # Previously this fell through and failed later with a NameError.
        raise ValueError(f"Unsupported model_name: {model!r} (expected 'llama' or 'phi')")
    return probs, pe


def process_row(row):
    """Compute the uncertainty scores for one row of the sentence DataFrame.

    Reads ``concat_probs_sentence``, ``token_importance`` and
    ``label_sentence`` from ``row`` and the module-level ``model_name``.

    Returns:
        A ``pd.Series`` with the keys in ``RESULT_COLUMNS``, or ``None`` when
        the row has no probabilities.
    """
    probs, pe = _extract_probs_and_entropy(row['concat_probs_sentence'], model_name)
    if not probs:
        return None

    token_importance = torch.tensor(row['token_importance'])
    return pd.Series({
        "label_sentence": row['label_sentence'],
        "avg_prob": calc_avg_prob(probs),
        "min_prob": calc_min_prob(probs),
        "token_weighted_score": calc_token_importance(token_importance, probs, pe),
        "entropy": calc_entropy(pe)
    })


def process_dataframe(df):
    """Apply ``process_row`` to every row and return the scores as a DataFrame.

    Rows for which ``process_row`` returns ``None`` are skipped, and rows with
    any NaN score are dropped. The result keeps the index labels of the rows
    that survive.
    """
    # Rows are collected explicitly rather than via `df.apply`: apply infers
    # its output shape from the first result, so a leading `None` would turn
    # the whole result into an object Series instead of a DataFrame.
    kept_index, kept_rows = [], []
    for idx, row in df.iterrows():
        scores = process_row(row)
        if scores is not None:
            kept_index.append(idx)
            kept_rows.append(scores)
    return pd.DataFrame(kept_rows, index=kept_index, columns=RESULT_COLUMNS).dropna()


def calculate_auroc(df, col_name, inverse=False):
    """Return the AUROC of ``df[col_name]`` against ``df["label_sentence"]``.

    With ``inverse=True`` the labels are flipped (``1 - label``) before
    scoring; the loop below does this for entropy and the token-weighted score.
    """
    labels = 1 - df["label_sentence"] if inverse else df["label_sentence"]
    return roc_auc_score(labels, df[col_name])


# One record per (dataset, metric); turned into `df_metrics` once after the loop
# instead of concatenating DataFrames on every iteration.
metric_records = []

for dataset in datasets:
    print(dataset)
    # NOTE: unpickling can execute arbitrary code; only load files produced by a trusted pipeline.
    df_sentence_test = pd.read_pickle(f"probs_test_{model_name}/probs_sentence_{dataset}_with_token_importance.pkl")

    df_results = process_dataframe(df_sentence_test)

    # Calculate AUROC for each calculated metric
    auroc_avg_prob = calculate_auroc(df_results, "avg_prob")
    auroc_min_prob = calculate_auroc(df_results, "min_prob")
    auroc_entropy = calculate_auroc(df_results, "entropy", inverse=True)
    auroc_token_weighted_score = calculate_auroc(df_results, "token_weighted_score", inverse=True)

    metric_records.extend(
        {"auroc": auroc, "metric": metric, "dataset": dataset}
        for metric, auroc in [
            ("avg_prob", auroc_avg_prob),
            ("min_prob", auroc_min_prob),
            ("entropy", auroc_entropy),
            ("token_weighted_score", auroc_token_weighted_score),
        ]
    )

    # Results (avg_prob is recorded in df_metrics; the printed summary is unchanged)
    print(f"AUROC for minimum probability: {auroc_min_prob:.4f}")
    print(f"AUROC for token weighted score: {auroc_token_weighted_score:.4f}")
    print(f"AUROC for entropy: {auroc_entropy:.4f}")

df_metrics = pd.DataFrame(metric_records, columns=["auroc", "metric", "dataset"])

fever
AUROC for minimum probability: 0.6492
AUROC for token weighted score: 0.6076
AUROC for entropy: 0.6384
hover
AUROC for minimum probability: 0.6893
AUROC for token weighted score: 0.6493
AUROC for entropy: 0.6735


In [None]:
# Row order for the AUROC table; metrics absent from df_metrics simply produce no row.
metric_order = ['avg_prob', 'min_prob', 'entropy', 'token_weighted_score']
ordered_metric_dtype = pd.CategoricalDtype(categories=metric_order, ordered=True)
df_metrics['metric'] = df_metrics['metric'].astype(ordered_metric_dtype)

# Reshape to one row per metric and one column per dataset, with AUROC as the cell value.
df_pivot = df_metrics.pivot(
    index='metric',
    columns='dataset',
    values='auroc',
)

In [None]:
def create_custom_latex_table(df_pivot, metrics, metric_names, caption, label, dataset_headers=None):
    """Render selected rows of ``df_pivot`` as a LaTeX ``table`` environment.

    Args:
        df_pivot: object supporting ``df_pivot.loc[metric, column]`` lookups
            that return a number (e.g. a metric-by-dataset pivot DataFrame).
        metrics: row keys to look up in ``df_pivot``, in output order.
        metric_names: display names, one per entry of ``metrics``.
        caption: text placed inside ``\\caption{...}``.
        label: text placed inside ``\\label{...}``.
        dataset_headers: optional mapping of ``df_pivot`` column key to the
            header text of that column. Defaults to
            ``{'fever': 'Fever', 'hover': 'Hover'}``.

    Returns:
        The LaTeX source as one newline-joined string; values use 3 decimals.

    Raises:
        ValueError: if ``metrics`` and ``metric_names`` differ in length.
        KeyError: if a metric or dataset column is missing from ``df_pivot``.
    """
    if len(metrics) != len(metric_names):
        # zip() would silently drop the surplus entries and hide the mistake.
        raise ValueError(
            f"metrics ({len(metrics)}) and metric_names ({len(metric_names)}) must have the same length"
        )
    if dataset_headers is None:
        dataset_headers = {'fever': 'Fever', 'hover': 'Hover'}

    # l: left-align for the metric name, r: right-align for each value column
    column_spec = ' '.join(['l'] + ['r'] * len(dataset_headers))
    header = ' & '.join(['Metric', *dataset_headers.values()])

    table_lines = [
        '\\begin{table}[h]',
        '\\centering',
        '\\small',
        f'\\begin{{tabular}}{{{column_spec}}}',
        '\\hline',
        f'{header} \\\\ \\hline',
    ]

    # One table row per requested metric
    for metric, metric_name in zip(metrics, metric_names):
        cells = [f'{df_pivot.loc[metric, column]:.3f}' for column in dataset_headers]
        table_lines.append(' & '.join([metric_name, *cells]) + ' \\\\')

    table_lines.extend([
        '\\hline',
        '\\end{tabular}',
        f'\\caption{{{caption}}}',
        # \label must follow \caption; placed before it, \ref would pick up
        # the enclosing section number instead of the table number.
        f'\\label{{{label}}}',
        '\\end{table}',
    ])

    return '\n'.join(table_lines)

# Example usage
# Metric key -> display name, in the row order of the printed table. Keeping
# both in one mapping prevents the key list and the label list from drifting
# apart (previously 'entropy' was labelled "Minimum Probability" and
# 'min_prob' was labelled "Entropy").
metric_labels = {
    'avg_prob': 'Average Probability',
    'min_prob': 'Minimum Probability',
    'entropy': 'Entropy',
    'token_weighted_score': 'TokenSAR',
}
# Only tabulate metrics that are present in the pivot; asking for a row that
# was never computed would raise a KeyError inside the table builder.
metrics = [metric for metric in metric_labels if metric in df_pivot.index]
metric_names = [metric_labels[metric] for metric in metrics]
caption = 'Sentence Level Evaluation of Baselines on FEVER and HoVer Test Datasets'
label = 'tab:metric_performance'

# `df_pivot` is the metric-by-dataset pivot of `df_metrics`
latex_table_code = create_custom_latex_table(df_pivot, metrics, metric_names, caption, label)
print(latex_table_code)
