In [1]:
from utils import *

In [2]:
import torch, transformers
device = "cuda"

model_name_or_path = "google/gemma-2-2b-it"
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name_or_path, torch_dtype=torch.bfloat16, device_map=device)

# get tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_name_or_path, model_max_length=2048, 
    padding_side="right", use_fast=False)



Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


In [3]:
# Run random baseline to get these files first!
n_training_qIDs = "train_qIDs.json"
n_testing_qIDs = "test_qIDs.json"

# demographic group and output type
demographic_group = "POLPARTY"
demographic = "Republican"
output_type = "sequence"

In [4]:
def apply_chat_template(row):
    messages = [{"role": "user", "content": row["input"]}]
    nobos = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True)[1:]
    return tokenizer.decode(nobos)

test_pool = get_test_questions_with_distributions(
    seen_qIDs={}, 
    demographic_group=demographic_group,
    demographic=demographic,
)
test_qIDs = json.load(open(n_testing_qIDs))

k = 1
success_rates = []
probabilities_list = []
for test_qID in test_qIDs:
    print("Evaluating:", test_qID)
    # test_qID = "ECON5_d_W54"
    n = (sum(test_pool[test_qID][demographic].values()))
    MC_options = list(test_pool[test_qID][demographic].keys())
    all_options, probs = [], []
    for i, option in enumerate(MC_options):
        all_options.append(options[i])
        probs.append(test_pool[test_qID][demographic][option]/n)
    golden_dist = dict(zip(all_options, probs))
    # print("Golden dist:")
    # print(golden_dist)

    instruction = get_icl_prompt_opinionqa(
        test_qID,
        demographic_group=demographic_group,
        demographic=demographic,
        output_type=output_type
    )
    
    instruction = apply_chat_template({"input": instruction})
    model_inputs = tokenizer(instruction, return_tensors="pt").to(device)

    successful_parsings = 0
    total_attempts = 0
    while successful_parsings < k:
        outputs = model.generate(
            **model_inputs, max_new_tokens=36, do_sample=True, 
            eos_token_id=tokenizer.eos_token_id, early_stopping=True
        )
        response = tokenizer.decode(outputs[0][model_inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
        # print(response)
        success, result = parse_answers(response, all_options, answer_tag=False)
        total_attempts += 1
        if success:
            successful_parsings += 1
            probabilities_list.append([golden_dist, result["probabilities"]])
        success_rate = successful_parsings / total_attempts
        success_rates += [success_rate]
success_rate = np.array(success_rates).mean()
print("Success rate:", success_rate)

The 'max_batch_size' argument of HybridCache is deprecated and will be removed in v4.46. Use the more precisely named 'batch_size' argument instead.


Evaluating: WHYNOTBIZF2G_W36


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Evaluating: GAP21Q33_r_W82
Evaluating: NEIGHINTERA_W32
Evaluating: FUTRCLASSc_W41
Evaluating: TRAITPOLMF1B_W36


  icl_values = np.array(icl_values)/np.sum(icl_values)


Evaluating: FUD37A_W34
Evaluating: HIGHEDWRNGB_W36
Evaluating: WHYNOTPOLF1C_W36
Evaluating: GAP21Q4_f_W82
Evaluating: ESSENPOLF1B_W36
Evaluating: RQ4_F1Ba_W42
Evaluating: RACESURV14_W43
Evaluating: INFOCREATEa_W45
Evaluating: GAP21Q19_a_W82
Evaluating: GROWUPVIOL_W26
Evaluating: FAMSURV23e_W50
Evaluating: GUNTYPEOWNC_W26
Evaluating: ROMRELDUR_W50
Evaluating: GAP21Q31_W82
Evaluating: BILLION_W92
Success rate: 1.0


In [5]:
distances = compute_l1_values(probabilities_list)
json.dump(distances, open("distance_icl.json", "w"))
np.mean(distances)

0.8153693915522485