In [6]:
import math
import torch
import pandas as pd
from sklearn.metrics import roc_auc_score
import numpy as np

datasets = ["fever", "hover"]
model_name = "llama"

df_metrics = pd.DataFrame()

for dataset in datasets:
    print(dataset)
    #df_probs_importance_sentences = pd.read_pickle(f"datasets_{dataset}/probs_token_importance_sentence_with_bart_{dataset}.pkl")
    #df_sentence_test = pd.read_pickle(f"probs_test_llama/df_new_{dataset}_probs_sentence_with_token_importance.pkl")
    df_sentence_test = pd.read_pickle(f"probs_test_{model_name}/probs_sentence_{dataset}_with_token_importance.pkl")

    #df_merged = pd.merge(df_probs_importance_sentences, df_sentence_test, on="output_sentence", how="inner")
    #df_merged.rename(columns={"label_sentence_x": "label_sentence"}, inplace=True)
    #df_merged_clean = df_merged[['output_sentence', 'label_sentence', 'concat_probs_sentence', 'token_importance']]

    # Define a helper function to flatten the nested lists of probabilities
    def flatten_probs(concat_probs):
        """Flatten the nested lists of probabilities in concat_probs."""
        return [prob for _, prob_list in concat_probs for prob in prob_list]

    # Define functions for specific calculations
    def calc_min_prob(probs):
        """Calculate the minimum probability from concat_probs."""
        #flat_probs = flatten_probs(concat_probs)
        log_probs = [-math.log(p) for p in probs]
        # Return the maximum value from the log probabilities
        return max(log_probs)

    def calc_avg_prob(probs):
        """Calculate the average probability from concat_probs."""
        #flat_probs = flatten_probs(concat_probs)
        log_probs = [-math.log(p) for p in probs]
        # Compute the average of the log probabilities
        avg_log_prob = sum(log_probs) / len(log_probs) if log_probs else 0.0
        return avg_log_prob

    def calc_token_importance(token_importance, probs, pe):
        """Calculate the weighted score using token importance and probabilities."""
        #flat_probs = flatten_probs(concat_probs)
        log_probs = [-math.log(prob) * prob for prob in probs]
        weighted_score = (token_importance / token_importance.sum()) * torch.tensor(pe)
        return weighted_score.sum().item()

    def calc_entropy(pe):
        """Calculate the entropy from concat_probs."""
        #flat_probs = flatten_probs(concat_probs)
        #return -sum([prob * math.log(prob) for prob in probs])
        return max(pe) 

    # Helper function to process each row of the DataFrame
    def process_row(row):
        concat_probs = row['concat_probs_sentence']
       #print(concat_probs)

        # llama
        if model_name == "llama":
            probs = [probs[2] for probs in concat_probs]
            probs = [item for sublist in probs for item in sublist]
            pe = [probs[3] for probs in concat_probs]
            pe = [item for sublist in pe for item in sublist]

        # phi 
        elif model_name == "phi":
            probs = [probs[1] for probs in concat_probs]
            pe = [probs[2] for probs in concat_probs]

        #print(pe)
        token_importance = torch.tensor(row['token_importance'])
        label = row['label_sentence']

        if not probs:
            return None
        else:
            # Calculate necessary values using the defined functions
            min_prob = calc_min_prob(probs)
            avg_prob = calc_avg_prob(probs)
            entropy = calc_entropy(pe)
            weighted_score = calc_token_importance(token_importance, probs, pe)

        

        return pd.Series({
            "label_sentence": label,
            "avg_prob": avg_prob,
            "min_prob": min_prob,
            "token_weighted_score": weighted_score,
            "entropy": entropy
        })

    # Main function to process the DataFrame using .apply()
    def process_dataframe(df):
        return df.apply(process_row, axis=1).dropna()

    # Main execution
    df_results = df_sentence_test.copy()

    # Apply the processing function to each row of the DataFrame and store the results in a new DataFrame
    df_results = process_dataframe(df_results)


    def calculate_auroc(df, col_name, inverse=False):
        if inverse:
            labels = 1 - df["label_sentence"]
        else:
            labels = df["label_sentence"]
        prob_scores = df[col_name]
        auroc = roc_auc_score(labels, prob_scores)
        return auroc
    



    # Calculate AUROC for each calculated metric
    auroc_avg_prob = calculate_auroc(df_results, "avg_prob", inverse=True)
    auroc_max_prob = calculate_auroc(df_results, "min_prob", inverse=True)
    auroc_entropy = calculate_auroc(df_results, "entropy", inverse=True)
    auroc_token_weighted_score = calculate_auroc(df_results, "token_weighted_score", inverse=True)


    new_metrics_df = pd.DataFrame({
        "auroc": [auroc_avg_prob, auroc_max_prob, auroc_entropy, auroc_token_weighted_score],
        "metric": ["avg_prob", "max_prob", "entropy", "token_weighted_score"],
        "dataset": [dataset] * 4
    })

    df_metrics = pd.concat([df_metrics, new_metrics_df], ignore_index=True)

    # Results
    print(f"AUROC for average probability: {auroc_avg_prob:.4f}")
    print(f"AUROC for maximum probability: {auroc_max_prob:.4f}")


    print(f"AUROC for token weighted score: {auroc_token_weighted_score:.4f}")
    print(f"AUROC for entropy: {auroc_entropy:.4f}")


    df_results['prob_label_avg'] = df_results.apply(lambda row: 1 if row['avg_prob'] >= 0.2 else 0, axis=1)
    df_results['prob_label_min'] = df_results.apply(lambda row: 1 if row['min_prob'] >= 0.2 else 0, axis=1)

    # Accuracy
    #accuracy_avg = (df_results['prob_label_avg'] == df_results['label_sentence']).mean()
    #accuracy_min = (df_results['prob_label_min'] == df_results['label_sentence']).mean()

    # Results
    #print(f"Accuracy for average probability: {accuracy_avg:.4f}") 
    #print(f"Accuracy for minimum probability: {accuracy_min:.4f}") 
    
    
    



fever
AUROC for average probability: 0.6245
AUROC for maximum probability: 0.6492
AUROC for token weighted score: 0.6518
AUROC for entropy: 0.6921
hover
AUROC for average probability: 0.6630
AUROC for maximum probability: 0.6893
AUROC for token weighted score: 0.6993
AUROC for entropy: 0.7155


In [43]:
metric_order = ['avg_prob', 'min_prob', 'entropy', 'token_weighted_score']
df_metrics['metric'] = pd.Categorical(df_metrics['metric'], categories=metric_order, ordered=True)

df_pivot = df_metrics.pivot(index='metric', columns='dataset', values='auroc')

In [44]:
df_pivot

dataset,fever,hover
metric,Unnamed: 1_level_1,Unnamed: 2_level_1
avg_prob,0.518684,0.627768
min_prob,0.647431,0.725818
entropy,0.591875,0.68729
token_weighted_score,0.518436,0.639406


In [45]:
def create_custom_latex_table(df_pivot, metrics, metric_names, caption, label):
    # Start constructing the table with LaTeX syntax
    table_lines = [
        '\\begin{table}[h]',
        '\\centering',
        '\\small',
        f'\\label{{{label}}}',
        '\\begin{tabular}{l r r}',  # l: left-align for metric, r: right-align for values
        '\\hline',
        'Metric & Fever & Hover \\\\ \\hline'
    ]
    
    # Loop through metrics and manually add rows
    for metric, metric_name in zip(metrics, metric_names):
        fever_value = df_pivot.loc[metric, 'fever']
        hover_value = df_pivot.loc[metric, 'hover']
        row = f'{metric_name} & {fever_value:.3f} & {hover_value:.3f} \\\\'
        table_lines.append(row)
    
    # Finalize the table with LaTeX syntax
    table_lines.append('\\hline')
    table_lines.append('\\end{tabular}')
    table_lines.append(f'\\caption{{{caption}}}')
    table_lines.append('\\end{table}')
    
    return '\n'.join(table_lines)

# Example usage
metrics = ['avg_prob', 'entropy', 'min_prob', 'token_weighted_score']
metric_names = ['Average Probability', 'Minimum Probability', 'Entropy', 'TokenSAR']
caption = 'Sentence Level Evaluation of Baselines on FEVER and HoVer Test Datasets'
label = 'tab:metric_performance'

# Assuming `df_pivot` is your pivoted DataFrame as created earlier
latex_table_code = create_custom_latex_table(df_pivot, metrics, metric_names, caption, label)
print(latex_table_code)


\begin{table}[h]
\centering
\small
\label{tab:metric_performance}
\begin{tabular}{l r r}
\hline
Metric & Fever & Hover \\ \hline
Average Probability & 0.519 & 0.628 \\
Minimum Probability & 0.592 & 0.687 \\
Entropy & 0.647 & 0.726 \\
TokenSAR & 0.518 & 0.639 \\
\hline
\end{tabular}
\caption{Sentence Level Evaluation of Baselines on FEVER and HoVer Test Datasets}
\end{table}


In [23]:
import pandas as pd 
dataset = "hover"

df = pd.read_pickle(f"probs_test_llama/probs_sentence_{dataset}_with_token_importance.pkl")
#df_sentence = pd.read_pickle(f"processed_datasets_with_bart_{dataset}_layer-1/sentence_{dataset}_test_unbalanced.pkl")

In [24]:
for index, row in df.iterrows():
    concat = row['concat_probs_sentence']
    sentence = row['output_sentence']
    #concat = [item for sublist in concat for item in sublist]
    #print(concat)
    tokens = [tokens[0] for tokens in concat]
    words = [tokens[1] for tokens in concat]
    #tokens = [item for sublist in tokens for item in sublist]
    print(tokens)

    print("".join(tokens))
    print(sentence.replace(tokens[1], ""))
    break

['The', 'song', '"After', 'the', 'News"', 'is', 'by', 'the', 'band', 'The', 'Alan', 'Parsons', 'Project,', 'from', 'their', '1985', 'album', '"Stereotomy".']
Thesong"AftertheNews"isbythebandTheAlanParsonsProject,fromtheir1985album"Stereotomy".
The  "After the News" is by the band The Alan Parsons Project, from The Alan Parsons Project's 1985 album "Stereotomy".


In [None]:
print(len(df_sentence))
df_sentence.drop_duplicates(subset=['output_sentence'], inplace=True)
print(len(df_sentence))

In [None]:
for gen_e in df['output_sentence'].tolist():
    if gen_e not in df_sentence['output_sentence'].tolist():
        print(gen_e)

In [None]:
for gen_e in df_sentence['output_sentence'].tolist():
    if gen_e not in df['output_sentence'].tolist():
        print(gen_e)