In [3]:
import json
import numpy as np
import calibration_metric
from calibration_metric.vis.calibration_plot import plot_df, get_df_from_file


## Qualitative analysis 1: what's up with the spikes?
On CalFlow and TreeDST, there is one spike in error at relatively high confidence for large models with a BPE tokenizer (Bart-large, Codegen).
What's going on here?

In [10]:
# (label, path) pairs: one ``test_all.logits`` file per model under study.
models_and_paths = [
    (
        "bart-large-calflow",
        "/brtx/603-nvme1/estengel/calflow_calibration/benchclamp/text_to_calflow/1.0/bart-large_calflow_last_user_all_0.0001/checkpoint-10000/outputs/test_all.logits",
    ),
    (
        "bart-large-treedst",
        "/brtx/605-nvme1/estengel/calflow_calibration/benchclamp/text_to_treedst/1.0/bart-large_tree_dst_last_user_all_0.0001/checkpoint-10000/outputs/test_all.logits",
    ),
    (
        "codegen-350M-calflow",
        "/brtx/604-nvme2/estengel/calflow_calibration/benchclamp/codegen-350M_calflow/outputs/test_all.logits",
    ),
    (
        "codegen-2B-calflow",
        "/brtx/604-nvme2/estengel/calflow_calibration/benchclamp/codegen-2B_calflow/outputs/test_all.logits",
    ),
    (
        "codegen-6B-calflow",
        "/brtx/604-nvme2/estengel/calflow_calibration/benchclamp/codegen-6B_calflow/outputs/test_all.logits",
    ),
]

# Two views of every logits file, both keyed by model label:
#   dfs_by_model  -> the binned frame returned by get_df_from_file
#   data_by_model -> the raw records, one parsed JSON object per line
dfs_by_model = {}
data_by_model = {}
for model, path in models_and_paths:
    # The second return value is kept for unpacking only; nothing below reads it.
    df, ece = get_df_from_file(path, binning_strategy="adaptive")
    dfs_by_model[model] = df
    with open(path) as logits_file:
        data_by_model[model] = [json.loads(raw_line) for raw_line in logits_file]


In [48]:
from collections import defaultdict

def detect_bins(df, threshold=0.20):
    """Select the rows where ``prob_model`` and ``prob_correct`` disagree strongly.

    Args:
        df: DataFrame with numeric ``prob_model`` and ``prob_correct`` columns
            (presumably one row per calibration bin -- confirm against
            ``get_df_from_file``).
        threshold: Minimum absolute gap ``|prob_model - prob_correct|`` for a
            row to be returned. The comparison is inclusive.

    Returns:
        A new DataFrame containing only the rows whose gap is at least
        ``threshold``, with the gap stored in an extra ``diff`` column.
        ``df`` itself is not modified.
    """
    # ``assign`` builds a copy, so the caller's frame does not silently gain a
    # ``diff`` column every time this function runs.
    with_gap = df.assign(diff=(df['prob_model'] - df['prob_correct']).abs())
    return with_gap[with_gap['diff'] >= threshold]

def get_tokens_of_prob(prob_range, data):
    """Collect the ids of top-1 candidates whose score falls strictly inside a range.

    Args:
        prob_range: Pair ``(low, high)``; both bounds are exclusive.
        data: Iterable of records, each with ``'top_logits'`` and
            ``'top_logit_idxs'`` entries that convert to 2-D arrays of the
            same shape (one row per position, one column per candidate).
            NOTE(review): the values are named logits but compared against a
            probability range -- presumably they are already normalised;
            confirm against the code that writes the file.

    Returns:
        A flat list with one id per selected position, in input order.
    """
    selected_ids = []
    for record in data:
        scores = np.array(record['top_logits'])
        candidate_ids = np.array(record['top_logit_idxs'])

        # Column of the best-scoring candidate in each row, as a (rows, 1)
        # index so it can be used with take_along_axis on axis 1.
        best_col = np.argmax(scores, axis=-1).reshape((-1, 1))

        best_scores = np.take_along_axis(scores, best_col, axis=1).reshape(-1)
        best_ids = np.take_along_axis(candidate_ids, best_col, axis=1).reshape(-1)

        # Keep only positions whose best score is strictly between the bounds.
        inside = (best_scores > prob_range[0]) & (best_scores < prob_range[1])
        selected_ids.extend(best_ids[inside])

    return selected_ids

def get_prob_range(prob, df):
    """Return the ``prob_model`` values of the rows neighbouring ``prob``.

    The neighbours are chosen by value: the largest ``prob_model`` strictly
    below ``prob`` and the smallest one strictly above it. The result
    therefore does not depend on how ``df`` is sorted.

    Args:
        prob: The ``prob_model`` value of the row of interest.
        df: DataFrame with a numeric ``prob_model`` column.

    Returns:
        Tuple ``(prev_prob, next_prob)``. When there is no row below (or
        above) ``prob``, that side falls back to ``prob`` itself.
    """
    probs = df['prob_model']
    below = probs[probs < prob]
    above = probs[probs > prob]

    # Positional picks (first row below / last row above) only give the
    # adjacent rows when the frame is sorted descending; max/min give the
    # adjacent rows for any ordering.
    prev_prob = below.max() if len(below) else prob
    next_prob = above.min() if len(above) else prob
    return (prev_prob, next_prob)


# model label -> {prob_model of an outlier row -> token ids found near it}
tokens_by_prob_and_model = defaultdict(lambda: defaultdict(list))

# Gap threshold handed to detect_bins; one model uses a lower value than the rest.
default_thresh = 0.19
thresh_overrides = {"codegen-2B-calflow": 0.12}

for model, df in dfs_by_model.items():
    thresh = thresh_overrides.get(model, default_thresh)
    outlier_lines = detect_bins(df, threshold=thresh)
    # Walk the outlier rows as records so columns can be read by name.
    for line in outlier_lines.to_records(index=False):
        model_prob = line['prob_model']
        tokens_by_prob_and_model[model][model_prob] = get_tokens_of_prob(
            get_prob_range(model_prob, df),
            data_by_model[model],
        )


In [50]:
from transformers import AutoTokenizer

def get_model_name(model):
    """Map a model label to the local directory passed to the tokenizer loader.

    Args:
        model: Model label; it is matched by substring.

    Returns:
        The directory path for the first family name found in ``model``, or
        ``None`` when no family matches. Every label containing ``"codegen"``
        maps to the same codegen-350M directory.
    """
    # Order matters: the first family whose name occurs in the label wins.
    family_dirs = (
        ("bart", "/brtx/601-nvme1/estengel/.cache/bart-large/"),
        ("codegen", "/brtx/601-nvme1/estengel/.cache/codegen-350M/"),
    )
    for family, directory in family_dirs:
        if family in model:
            return directory
    return None

# For every model, decode the token ids gathered for each outlier row and print
# the distinct token strings in sorted order.
for model, ids_by_prob in tokens_by_prob_and_model.items():
    tokenizer = AutoTokenizer.from_pretrained(get_model_name(model))
    for prob, token_ids in ids_by_prob.items():
        distinct_tokens = sorted(set(tokenizer.convert_ids_to_tokens(token_ids)))
        print(f"{model} {prob} {distinct_tokens}")


bart-large-calflow 0.9596818642975404 ['!"', '"', '")', '"))', '(', ')', '))', ')))', '))))', ',', '.', '0', '2', '</s>', '=', '==', '>', '?', 'AM', 'ARA', 'Accept', 'After', 'All', 'Am', 'And', 'Anything', 'At', 'Att', 'Bar', 'Before', 'Bell', 'Between', 'But', 'C', 'Cal', 'Car', 'Ch', 'Choose', 'Cl', 'Co', 'Coach', 'Com', 'Con', 'Create', 'D', 'Dan', 'Date', 'David', 'Day', 'Delete', 'Do', 'Dr', 'During', 'E', 'Event', 'Exec', 'F', 'Find', 'For', 'From', 'Fu', 'Full', 'G', 'Generic', 'H', 'Here', 'Hol', 'Hour', 'Howard', 'Is', 'J', 'Jackson', 'John', 'L', 'Location', 'M', 'MD', 'Marsh', 'Max', 'Military', 'Min', 'Month', 'Mr', 'Multi', 'N', 'Ne', 'Need', 'New', 'Next', 'Now', 'Num', 'Number', 'Of', 'On', 'One', 'Other', 'P', 'PM', 'People', 'Person', 'Ph', 'Place', 'Query', 'Rec', 'Rem', 'Res', 'Rev', 'Role', 'Ron', 'S', 'Saturday', 'Scope', 'Small', 'Spec', 'Sports', 'Star', 'Stone', 'Sum', 'T', 'Temperature', 'This', 'Time', 'Today', 'Tomorrow', 'Trivia', 'UC', 'Update', 'User', 'W

In [None]:
# Pairwise overlap of the outlier token ids between models. Each line printed is:
#   model_a model_b |intersection| |union| intersection/union

# Flatten every model's per-row token lists into one set, once, instead of
# rebuilding both sets for every pair.
token_sets_by_model = {
    model: {tok for toks in toks_by_prob.values() for tok in toks}
    for model, toks_by_prob in tokens_by_prob_and_model.items()
}

# Pair each model only with the models that come after it, so every unordered
# pair is reported exactly once and a model is never compared with itself.
model_names = list(token_sets_by_model)
for position, model_a in enumerate(model_names):
    all_toks_a = token_sets_by_model[model_a]
    for model_b in model_names[position + 1:]:
        all_toks_b = token_sets_by_model[model_b]

        intersect = all_toks_a & all_toks_b
        union = all_toks_a | all_toks_b
        # The ratio is undefined when neither model has any tokens; report
        # nan rather than dividing by zero.
        overlap = len(intersect) / len(union) if union else float("nan")
        print(f"{model_a} {model_b} {len(intersect)} {len(union)} {overlap:.2f}")
