In [1]:
!pip install -q transformers accelerate bitsandbytes lexicalrichness sentence-transformers datasets scikit-learn

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/97.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.8/97.8 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for lexicalrichness (setup.py) ... [?25l[?25hdone


In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from lexicalrichness import LexicalRichness
import numpy as np
import pandas as pd
from collections import Counter

In [6]:
model_name = "Qwen/Qwen2.5-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [7]:
dataset = load_dataset("commonsense_qa", split="validation[:100]")
questions = [item['question'] for item in dataset]
choices = [item['choices'] for item in dataset]
answers = [item['answerKey'] for item in dataset]

In [8]:
few_shot_examples = """Question: What do people typically do when they are tired?
Choices: (A) rest (B) run (C) jump (D) work (E) exercise
Answer: Let me think step by step. When people are tired, they need to recover energy. Resting helps recover energy. Running, jumping, working, and exercising all require energy. The answer is (A) rest.

Question: Where would you find a fish that is not alive?
Choices: (A) ocean (B) aquarium (C) plate (D) river (E) lake
Answer: Let me think step by step. A fish that is not alive would be dead. Dead fish can be cooked and served on a plate. Ocean, aquarium, river, and lake contain live fish. The answer is (C) plate.

"""

In [9]:
def format_question(question, choices_dict):
    labels = choices_dict['label']
    texts = choices_dict['text']
    choices_str = " ".join([f"({labels[i]}) {texts[i]}" for i in range(len(labels))])
    return f"{question}\nChoices: {choices_str}"

def generate_responses(question, choices_dict, num_samples=5):
    responses = []
    formatted_q = format_question(question, choices_dict)

    for _ in range(num_samples):
        prompt = few_shot_examples + f"Question: {formatted_q}\nAnswer: Let me think step by step."
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        response = response.split("Answer:")[-1].strip()
        responses.append(response)

    return responses

In [10]:
all_responses = []
for i in range(len(questions)):
    print(f"Processing {i+1}/100: {questions[i][:50]}...")
    responses = generate_responses(questions[i], choices[i])
    all_responses.append(responses)

Processing 1/100: A revolving door is convenient for two direction t...
Processing 2/100: What do people aim to do at work?...
Processing 3/100: Where would you find magazines along side many oth...
Processing 4/100: Where are  you likely to find a hamburger?...
Processing 5/100: James was looking for a good place to buy farmland...
Processing 6/100: What island country is ferret popular?...
Processing 7/100: In what Spanish speaking North American country ca...
Processing 8/100: What do animals do when an enemy is approaching?...
Processing 9/100: Reading newspaper one of many ways to practice you...
Processing 10/100: What do people typically do while playing guitar?...
Processing 11/100: What would vinyl be an odd thing to replace?...
Processing 12/100: If you want harmony, what is something you should ...
Processing 13/100: Where does a heifer's master live?...
Processing 14/100: Aside from water and nourishment what does your do...
Processing 15/100: Janet was watching the film be

In [11]:
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

def semantic_entropy(responses):
    if len(responses) < 2:
        return 0.0

    embeddings = embedding_model.encode(responses)
    clustering = DBSCAN(eps=0.5, min_samples=1, metric='cosine').fit(embeddings)
    labels = clustering.labels_

    cluster_counts = Counter(labels)
    total = len(labels)
    probs = [count / total for count in cluster_counts.values()]

    entropy = -sum(p * np.log(p) for p in probs if p > 0)
    return entropy

def self_consistency(responses):
    if len(responses) < 2:
        return 1.0

    normalized = [r.strip().lower()[:50] for r in responses]
    most_common = Counter(normalized).most_common(1)[0][1]
    consistency = most_common / len(responses)

    return 1 - consistency

def token_entropy_approx(responses):
    all_tokens = []
    for response in responses:
        tokens = response.lower().split()
        all_tokens.extend(tokens)

    if len(all_tokens) == 0:
        return 0.0

    token_counts = Counter(all_tokens)
    total = len(all_tokens)
    probs = [count / total for count in token_counts.values()]

    entropy = -sum(p * np.log(p) for p in probs if p > 0)
    return entropy

def perplexity_approx(responses):
    token_ent = token_entropy_approx(responses)
    return np.exp(token_ent)

def lexical_diversity(responses):
    combined_text = " ".join(responses)

    if len(combined_text.strip()) == 0:
        return 0.0

    lex = LexicalRichness(combined_text)
    try:
        ttr = lex.ttr
    except:
        ttr = 0.0

    return ttr

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
results = []

for i, responses in enumerate(all_responses):
    se = semantic_entropy(responses)
    sc = self_consistency(responses)
    te = token_entropy_approx(responses)
    ppl = perplexity_approx(responses)
    ld = lexical_diversity(responses)

    results.append({
        'question': questions[i],
        'semantic_entropy': se,
        'self_consistency': sc,
        'token_entropy': te,
        'perplexity': ppl,
        'lexical_diversity': ld
    })

df = pd.DataFrame(results)

In [13]:
import re

def extract_answer_letter(response):
    patterns = [
        r'\(([A-E])\)',
        r'answer is \(?([A-E])\)?',
        r'answer: \(?([A-E])\)?',
        r'^([A-E])\b'
    ]

    for pattern in patterns:
        match = re.search(pattern, response, re.IGNORECASE)
        if match:
            return match.group(1).upper()

    return None

def check_correctness(response, ground_truth):
    pred_letter = extract_answer_letter(response)

    if pred_letter is None:
        return False

    return pred_letter == ground_truth.upper()

correctness = []
for i, responses in enumerate(all_responses):
    correct = any(check_correctness(r, answers[i]) for r in responses)
    correctness.append(correct)

df['correct'] = correctness
print(f"Accuracy: {sum(correctness)/len(correctness):.2%}")

Accuracy: 32.00%


In [14]:
def calculate_ece(predictions, correctness, n_bins=10):
    df_temp = pd.DataFrame({
        'pred': predictions,
        'correct': correctness
    })

    df_temp['bin'] = pd.cut(df_temp['pred'], bins=n_bins, labels=False, duplicates='drop')

    ece = 0.0
    for bin_idx in df_temp['bin'].unique():
        if pd.isna(bin_idx):
            continue

        bin_data = df_temp[df_temp['bin'] == bin_idx]

        if len(bin_data) == 0:
            continue

        avg_confidence = bin_data['pred'].mean()
        avg_accuracy = bin_data['correct'].mean()
        bin_weight = len(bin_data) / len(df_temp)

        ece += bin_weight * abs(avg_confidence - avg_accuracy)

    return ece

metric_names = ['semantic_entropy', 'self_consistency', 'token_entropy', 'perplexity', 'lexical_diversity']

print("\nCommonsenseQA (Reasoning Category) ECE Results:")
for metric in metric_names:
    normalized = (df[metric] - df[metric].min()) / (df[metric].max() - df[metric].min() + 1e-8)
    ece = calculate_ece(normalized, df['correct'])
    print(f"{metric}: ECE = {ece:.4f}")


CommonsenseQA (Reasoning Category) ECE Results:
semantic_entropy: ECE = 0.6127
self_consistency: ECE = 0.6900
token_entropy: ECE = 0.4141
perplexity: ECE = 0.2502
lexical_diversity: ECE = 0.4681


In [15]:
df.to_csv('commonsenseqa_results.csv', index=False)