In [1]:
import os
import sys
import torch
from scripts.utils.utils import summarize_attributions

In [2]:
ds = "liar"
totals = {}
for load in ["ig", "lime", "shap"]:
    with torch.no_grad():
        models = {}
        for item in os.listdir(f"./pt/{ds}/{load}/"):
            models[item] = []
            for idx, pt in enumerate(os.listdir(f"./pt/{ds}/{load}/{item}")):
                pt: torch.Tensor = torch.load(f"./pt/{ds}/{load}/{item}/{pt}")[0]
                if load != "ig":
                    models[item].append(pt.detach())
                else:
                    models[item].append(summarize_attributions(pt.detach()))
    totals[load] = models

In [3]:
totals.keys()

dict_keys(['ig', 'lime', 'shap'])

In [4]:
totals['ig'].keys()

dict_keys(['Llama-2-7b-chat-hf', 'Orca-2-7b', 'falcon-7b-instruct', 'Mistral-7B-Instruct-v0.2'])

In [5]:
# for model in totals['ig'].keys():
#     assert len(totals['ig'][model]) == len(totals['lime'][model])
#     assert len(totals['ig'][model]) == len(totals['shap'][model])

In [6]:
cats = {}
for exp in totals:
    cats[exp]  = torch.cat([torch.cat(totals[exp][model]) for model in totals[exp]])
cats.keys()

dict_keys(['ig', 'lime', 'shap'])

In [7]:
print(f"abs means:  ig: {cats['ig'].abs().mean():.3f}, lime: {cats['lime'].abs().mean():.3f}, shap: {cats['shap'].abs().mean():.3f}")
print(f"std:  ig: {cats['ig'].std():.3f}, lime: {cats['lime'].std():.3f}, shap: {cats['shap'].std():.3f}")

abs means:  ig: 0.026, lime: 0.008, shap: 0.030
std:  ig: 0.048, lime: 0.018, shap: 0.043


In [8]:
# try calculating (mean, std) per-sample before averaging them
per_sample_means = {}
per_sample_std = {}
for exp in totals:
    per_sample_means[exp] = torch.tensor([torch.tensor([el.abs().mean() for el in totals[exp][model]]).mean() for model in totals[exp]]).mean()
    per_sample_std[exp] = torch.tensor([torch.tensor([el.std() for el in totals[exp][model]]).mean() for model in totals[exp]]).mean()
print(f"abs means:  ig: {per_sample_means['ig']:.3f}, lime: {per_sample_means['lime']:.3f}, shap: {per_sample_means['shap']:.3f}")
print(f"std:  ig: {per_sample_std['ig']:.3f}, lime: {per_sample_std['lime']:.3f}, shap: {per_sample_std['shap']:.3f}")

abs means:  ig: 0.026, lime: 0.007, shap: 0.029
std:  ig: 0.042, lime: 0.015, shap: 0.039


In [9]:
# Find the average number of tokens that together add up to the absolute value of the sample
# basically, how many tokens on average it takes to mostly represent a sample
def num_representative(t: torch.Tensor, thresh:float=0.8)->int:
    t= t.abs()
    s = t.sum()

    max_sorted = t.sort(descending=True)
    cur_sum = 0
    n = 0
    while cur_sum < thresh * s:
        cur_sum += max_sorted.values[n]
        n += 1
    return n


In [10]:
num_representative(torch.tensor([0.9, 0.8, 0.1, -0.5, -11.]))

1

In [11]:
avg_num_representative_percent = {}
for exp in totals:
    avg_num_representative_percent[exp] = torch.cat([torch.tensor([num_representative(sample, 0.85) / sample.shape[0] for sample in totals[exp][model]], dtype=torch.float) for model in totals[exp]]).mean()
print(f"avg percentage of tokens needed to make up 85% of a sample's magnitude:  ig: {avg_num_representative_percent['ig']:.3f}, lime: {avg_num_representative_percent['lime']:.3f}, shap: {avg_num_representative_percent['shap']:.3f}")

avg percentage of tokens needed to make up 85% of a sample's magnitude:  ig: 0.476, lime: 0.403, shap: 0.560


In [12]:
avg_num_representative = {}
for exp in totals:
    avg_num_representative[exp] = torch.cat([torch.tensor([num_representative(sample, 0.85) for sample in totals[exp][model]], dtype=torch.float) for model in totals[exp]]).mean()
print(f"avg number of tokens needed to make up 85% of a sample's magnitude:  ig: {avg_num_representative['ig']:.3f}, lime: {avg_num_representative['lime']:.3f}, shap: {avg_num_representative['shap']:.3f}")

avg number of tokens needed to make up 85% of a sample's magnitude:  ig: 41.042, lime: 35.220, shap: 48.423


In [24]:
average_tokens = {}
for exp in totals:
    average_tokens[exp] = {model: torch.tensor([num_representative(sample, 0.85) for sample in totals[exp][model]], dtype=torch.float) for model in totals[exp]}

# sum rows:
summed_rows = {exp: torch.cat([val for val in average_tokens[exp].values()]).mean() for exp in average_tokens}
print(summed_rows)
# sum cols
summed_cols = {model: torch.cat([average_tokens[exp][model] for exp in totals]).mean() for model in average_tokens['ig']}
print(summed_cols)

print()
# models on top
# types on left
# so each row will print out the models' results for that type, as well as the total result for that type

result_str = "Experiment Type"
for model in average_tokens['ig']:
    result_str += f" & {model}"
result_str += " & Average\n"
for exp in totals:
    result_str += f"{exp} & "
    for model in average_tokens[exp]:
        result_str += f"{average_tokens[exp][model].mean():.3f} & "
    result_str += f" {summed_rows[exp]:.3f}\n"

result_str += "Average & "
for model in average_tokens['ig']:
    result_str += f"{summed_cols[model]:.3f} & "

print(result_str)

{'ig': tensor(41.0425), 'lime': tensor(35.2200), 'shap': tensor(48.4231)}
{'Llama-2-7b-chat-hf': tensor(42.1250), 'Orca-2-7b': tensor(42.4780), 'falcon-7b-instruct': tensor(36.8300), 'Mistral-7B-Instruct-v0.2': tensor(43.7600)}

Experiment Type & Llama-2-7b-chat-hf & Orca-2-7b & falcon-7b-instruct & Mistral-7B-Instruct-v0.2 & Average
ig & 40.130 & 41.230 & 38.680 & 44.130 &  41.042
lime & 38.960 & 41.200 & 23.820 & 36.900 &  35.220
shap & 49.280 & 48.375 & 46.140 & 49.880 &  48.423
Average & 42.125 & 42.478 & 36.830 & 43.760 & 


In [26]:
average_tokens = {}
for exp in totals:
    average_tokens[exp] = {model: torch.tensor([num_representative(sample, 0.85) / sample.shape[0] for sample in totals[exp][model]], dtype=torch.float) for model in totals[exp]}

# sum rows:
summed_rows = {exp: torch.cat([val for val in average_tokens[exp].values()]).mean() for exp in average_tokens}
print(summed_rows)
# sum cols
summed_cols = {model: torch.cat([average_tokens[exp][model] for exp in totals]).mean() for model in average_tokens['ig']}
print(summed_cols)

print()
# models on top
# types on left
# so each row will print out the models' results for that type, as well as the total result for that type

result_str = "Experiment Type"
for model in average_tokens['ig']:
    result_str += f" & {model}"
result_str += " & Average\n"
for exp in totals:
    result_str += f"{exp} & "
    for model in average_tokens[exp]:
        result_str += f"{average_tokens[exp][model].mean()*100:.1f}\\% & "
    result_str += f" {summed_rows[exp]*100:.1f}\\%\n"

result_str += "Average & "
for model in average_tokens['ig']:
    result_str += f"{summed_cols[model]*100:.1f}\\% & "

print(result_str)

{'ig': tensor(0.4758), 'lime': tensor(0.4034), 'shap': tensor(0.5605)}
{'Llama-2-7b-chat-hf': tensor(0.4754), 'Orca-2-7b': tensor(0.4802), 'falcon-7b-instruct': tensor(0.4472), 'Mistral-7B-Instruct-v0.2': tensor(0.5055)}

Experiment Type & Llama-2-7b-chat-hf & Orca-2-7b & falcon-7b-instruct & Mistral-7B-Instruct-v0.2 & Average
ig & 45.5\% & 46.6\% & 47.0\% & 51.2\% &  47.6\%
lime & 43.8\% & 46.2\% & 29.0\% & 42.4\% &  40.3\%
shap & 55.4\% & 55.1\% & 55.9\% & 57.5\% &  56.0\%
Average & 47.5\% & 48.0\% & 44.7\% & 50.5\% & 
