In [1]:
import os 
import re 
import pandas as pd 
import numpy as np
from transformers import AutoTokenizer
import Rbeast
from tqdm import tqdm
from src.utils import convert_to_llama_prompt, read_json


tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-3.2-3B-Instruct",
    padding_side="left"
    )


def determine_answer(sentence):
    options = ['A', 'B', 'C', 'D', 'E']
    last_option = None

    for word in sentence:
        if word in options:
            last_option = word

    return last_option


def one_hot_encode(option):
    options = ['A', 'B', 'C', 'D', 'E']
    encoding = np.zeros(len(options))
    if option in options:
        encoding[options.index(option)] = 1
    return encoding

  from .autonotebook import tqdm as notebook_tqdm


## Load all datas

In [2]:
# original data
data_paths = os.listdir('./src/data/original_data')
datas = [pd.read_csv(os.path.join('./src/data/original_data', p)) for p in data_paths]

processed_datas = []
for data in datas:
    new_data = data.copy()
    new_data['input'] = convert_to_llama_prompt(data['input'])
    processed_datas.append(new_data)

input_prompts = list(processed_datas[0]['input'])
answers = list(processed_datas[0]['ground_truth'])

# base_path + top_k replace
all_data_path = [os.path.join('./src/data/base_path', p) for p in os.listdir('./src/data/base_path')]
all_data = []
for p in all_data_path:
    all_data.append(read_json(p))

# resampling 
forking_data_path = [os.path.join('./src/data/forking_path/aqua', p) for p in os.listdir('./src/data/forking_path/aqua')]
forking_data = []
forking_data_index = []

for p in forking_data_path:
    forking_data.append(read_json(p))
    forking_data_index.append(int(re.sub(r'[^0-9]', '', p.split('aqua')[-1])))

## Make outcome distributions

In [4]:
list_dist_o_t_w = []
list_dist_o_t = []

list_base_path = []

list_answer = []
list_answer_score = []

ground_truth_answer = []

for e_i, index in enumerate(forking_data_index):
    
    all_o_t_w = [] 
    all_o_t_w_s =[]
    all_o_t_w_prob = []

    for t_s, idx, t_prob in zip(forking_data[e_i]['all_path'], forking_data[e_i]['t_index'], forking_data[e_i]['prob']):
        
        temp_o_t = []
        temp_o_t_s = []
        temp_o_t_prob = []
        
        for i in range(int(len(t_s)//30)):
            
            temp_o_t_w = []
            o_t_w_prob = []
            for j in range(30):
                answer = determine_answer(t_s[i*30+j])
                encoded_answer = one_hot_encode(answer)
                temp_o_t_w.append(encoded_answer)
                o_t_w_prob.append(t_prob[i*30+j])
                
            temp_o_t_s.append(all_data[0]['replace_token_score'][index][idx][i])
            temp_o_t.append(temp_o_t_w)
            temp_o_t_prob.append(o_t_w_prob)
        
        all_o_t_w.append(temp_o_t)
        all_o_t_w_s.append(temp_o_t_s)
        all_o_t_w_prob.append(temp_o_t_prob)
            
    new = []
    for i in range(len(all_o_t_w_prob[0][0])):
        new.append(all_o_t_w_prob[0][0][i] * all_o_t_w[0][0][i])

    dist_o_t_w = []
    dist_o_t = []

    for o_t_w, o_t_w_prob in zip(all_o_t_w, all_o_t_w_prob): 
        temp_dist_o_t_w = []
        for t_w, t_w_prob in zip(o_t_w, o_t_w_prob):
            new = []
            for i in range(len(t_w)):
                new.append(t_w_prob[i] * t_w[i])
            temp_dist_o_t_w.append(np.mean(new, axis=0))
        dist_o_t_w.append(temp_dist_o_t_w)

    for o_t_w, o_t_w_s in zip(dist_o_t_w, all_o_t_w_s):
        new = []
        for i in range(len(o_t_w)):
            new.append(o_t_w[i] * o_t_w_s[i])
        dist_o_t.append(np.mean(new, axis=0))
    
    list_dist_o_t_w.append(dist_o_t_w)
    list_dist_o_t.append(dist_o_t)
    
    list_base_path.append(all_data[0]['base_path'][index])
    list_answer.append(all_data[0]['replace_token'][index])
    list_answer_score.append(all_data[0]['replace_token_score'][index])

## Run Bayesian Change Point Detection (BCPD)

In [5]:
def preprocess_y(list_dist_o_t):
    o_0 = np.array(list_dist_o_t[0][0])

    y = []
    time = []

    for t, o_t in enumerate(list_dist_o_t):
        y.append(np.linalg.norm(o_0 - np.array(o_t)))  
        time.append(t)  

    return np.array(y), np.array(time)


def run_beast_for_cpd(y, time, alpha2_str='lambda range_y: 2.0 + (1000 ** (1.0 - range_y))',
                      tcp_minmax=[0, 6], tseg_minlength=10, mcmc_chains=10, mcmc_burnin=1000, 
                      mcmc_samples=20000, mcmc_thin=5, prec_value=10, alpha1=0.01):
    
    alpha2_fn = eval(alpha2_str)

    range_y = y.max() - y.min()
    alpha2 = alpha2_fn(range_y)

    result = Rbeast.beast(
        y,
        time=time,
        season='none',
        tcp_minmax=tcp_minmax,
        torder_minmax=[1, 1],
        tseg_minlength=tseg_minlength,
        mcmc_seed=0,
        mcmc_chains=mcmc_chains,
        mcmc_burnin=mcmc_burnin,
        mcmc_samples=mcmc_samples,
        mcmc_thin=mcmc_thin,
        print_progress=False,
        print_options=False,
        quiet=True,
        precPriorType='constant',
        precValue=prec_value,
        alpha1=alpha1,
        alpha2=alpha2
    )

    return result.trend.__dict__

In [6]:
available_answers = ['A', 'B', 'C', 'D', 'E', ' A', ' B', ' C', ' D', 'E', '(A', '(B', '(C', '(D', '(E']
available_answers_index = [tokenizer.encode(t, add_special_tokens=False)[0] for t in available_answers]

In [7]:
def find_answer_logit(list_base_path, list_answer_score):
    all_score = []
    answer_list =[]
    for j, sent in enumerate(list_base_path):
        answer = 'None'
        last_answer_index = -1
        for i, token in enumerate(sent):
            if token in available_answers_index:
                last_answer_index = i 
                answer = tokenizer.decode(token).strip()
                if '(' in answer:
                    answer = answer.repace('(', '')
        
        s = list_answer_score[j][last_answer_index]
        all_score.append(s)
        answer_list.append(answer)
    return all_score, answer_list

all_score, answer_list = find_answer_logit(list_base_path, list_answer_score)

In [8]:
critical_token_index = []

critical_token_score = []
last_answer_token_score = []

ground_truth_answer = []

for i in range(len(list_dist_o_t)):
    y, time = preprocess_y(list_dist_o_t[i])
    cp = run_beast_for_cpd(y=y, time=time)['cp'][0]
    #print(tokenizer.decode(list_base_path[i][forking_data[i]['t_index'][int(cp)]]))
    critical_token_score.append(np.max(list_answer_score[i][forking_data[i]['t_index'][int(cp)]]))
    last_answer_token_score.append(np.max(all_score[i]))
    ground_truth_answer.append(int(answers[i]==answer_list[i]))

## Show results

In [9]:
df = pd.DataFrame()
df['critical_token_score'] = critical_token_score
df['ground_truth_answer'] = ground_truth_answer
df['last_answer_token_score'] = last_answer_token_score
df['answer_list']   = answer_list

In [11]:
def calculate_ece(scores, ground_truth, num_bins=10):
    bins = np.linspace(0, 1, num_bins + 1)
    bin_indices = np.digitize(scores, bins, right=True)
    
    ece = 0
    for bin_lower in range(1, num_bins + 1):
        bin_scores = scores[bin_indices == bin_lower]
        bin_truth = ground_truth[bin_indices == bin_lower]
        if len(bin_scores) > 0:
            bin_accuracy = np.mean(bin_truth)
            bin_confidence = np.mean(bin_scores)
            ece += (len(bin_scores) / len(scores)) * abs(bin_accuracy - bin_confidence)
    return ece

critical_scores = df["critical_token_score"].values
last_scores = df["last_answer_token_score"].values
ground_truth = df["ground_truth_answer"].values

critical_ece = calculate_ece(critical_scores, ground_truth)
answer_token_ece = calculate_ece(last_scores, ground_truth)

critical_ece, answer_token_ece

(0.4633710563182831, 0.7258704992135366)