In [None]:
!pip install -q transformers accelerate bitsandbytes lexicalrichness sentence-transformers datasets scikit-learn

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/97.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.8/97.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for lexicalrichness (setup.py) ... [?25l[?25hdone


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from lexicalrichness import LexicalRichness
import numpy as np
import pandas as pd
from collections import Counter
import re

In [None]:
model_name = "Qwen/Qwen2.5-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [None]:
dataset = load_dataset("gsm8k", "main", split="test[:100]")
questions = [item['question'] for item in dataset]
answers = [item['answer'].split('####')[1].strip() for item in dataset]

README.md: 0.00B [00:00, ?B/s]

main/train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

main/test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [None]:
few_shot_examples = """Question: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
Answer: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.

Question: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
Answer: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.

Question: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?
Answer: Originally, Leah had 32 chocolates. Her sister had 42. So in total they had 32 + 42 = 74. After eating 35, they had 74 - 35 = 39. The answer is 39.

"""

In [None]:
def generate_responses(question, num_samples=5):
    responses = []
    for _ in range(num_samples):
        prompt = few_shot_examples + f"Question: {question}\nAnswer:"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        response = response.split("Answer:")[-1].strip()
        responses.append(response)

    return responses

In [None]:
all_responses = []
for i, q in enumerate(questions):
    print(f"Processing {i+1}/100: {q[:50]}...")
    responses = generate_responses(q)
    all_responses.append(responses)

Processing 1/100: Janet’s ducks lay 16 eggs per day. She eats three ...
Processing 2/100: A robe takes 2 bolts of blue fiber and half that m...
Processing 3/100: Josh decides to try flipping a house.  He buys a h...
Processing 4/100: James decides to run 3 sprints 3 times a week.  He...
Processing 5/100: Every day, Wendi feeds each of her chickens three ...
Processing 6/100: Kylar went to the store to buy glasses for his new...
Processing 7/100: Toulouse has twice as many sheep as Charleston. Ch...
Processing 8/100: Carla is downloading a 200 GB file. Normally she c...
Processing 9/100: John drives for 3 hours at a speed of 60 mph and t...
Processing 10/100: Eliza's rate per hour for the first 40 hours she w...
Processing 11/100: A new program had 60 downloads in the first month....
Processing 12/100: Toula went to the bakery and bought various types ...
Processing 13/100: Carlos is planting a lemon tree. The tree will cos...
Processing 14/100: Melanie is a door-to-door saleswoman. She

In [None]:
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

def semantic_entropy(responses):
    if len(responses) < 2:
        return 0.0

    embeddings = embedding_model.encode(responses)
    clustering = DBSCAN(eps=0.5, min_samples=1, metric='cosine').fit(embeddings)
    labels = clustering.labels_

    cluster_counts = Counter(labels)
    total = len(labels)
    probs = [count / total for count in cluster_counts.values()]

    entropy = -sum(p * np.log(p) for p in probs if p > 0)
    return entropy

def self_consistency(responses):
    if len(responses) < 2:
        return 1.0

    normalized = [r.strip().lower()[:50] for r in responses]
    most_common = Counter(normalized).most_common(1)[0][1]
    consistency = most_common / len(responses)

    return 1 - consistency

def token_entropy_approx(responses):
    all_tokens = []
    for response in responses:
        tokens = response.lower().split()
        all_tokens.extend(tokens)

    if len(all_tokens) == 0:
        return 0.0

    token_counts = Counter(all_tokens)
    total = len(all_tokens)
    probs = [count / total for count in token_counts.values()]

    entropy = -sum(p * np.log(p) for p in probs if p > 0)
    return entropy

def perplexity_approx(responses):
    token_ent = token_entropy_approx(responses)
    return np.exp(token_ent)

def lexical_diversity(responses):
    combined_text = " ".join(responses)

    if len(combined_text.strip()) == 0:
        return 0.0

    lex = LexicalRichness(combined_text)
    try:
        ttr = lex.ttr
    except:
        ttr = 0.0

    return ttr

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
results = []

for i, responses in enumerate(all_responses):
    se = semantic_entropy(responses)
    sc = self_consistency(responses)
    te = token_entropy_approx(responses)
    ppl = perplexity_approx(responses)
    ld = lexical_diversity(responses)

    results.append({
        'question': questions[i],
        'semantic_entropy': se,
        'self_consistency': sc,
        'token_entropy': te,
        'perplexity': ppl,
        'lexical_diversity': ld
    })

df = pd.DataFrame(results)

In [None]:
def extract_number(text):
    numbers = re.findall(r'-?\d+\.?\d*', text)
    if numbers:
        return numbers[-1]
    return None

def check_correctness(response, ground_truth):
    pred_number = extract_number(response)
    truth_number = ground_truth.strip()

    if pred_number is None:
        return False

    try:
        return abs(float(pred_number) - float(truth_number)) < 0.01
    except:
        return False

correctness = []
for i, responses in enumerate(all_responses):
    correct = any(check_correctness(r, answers[i]) for r in responses)
    correctness.append(correct)

df['correct'] = correctness
print(f"Accuracy: {sum(correctness)/len(correctness):.2%}")

Accuracy: 73.00%


In [None]:
def calculate_ece(predictions, correctness, n_bins=10):
    df_temp = pd.DataFrame({
        'pred': predictions,
        'correct': correctness
    })

    df_temp['bin'] = pd.cut(df_temp['pred'], bins=n_bins, labels=False, duplicates='drop')

    ece = 0.0
    for bin_idx in df_temp['bin'].unique():
        if pd.isna(bin_idx):
            continue

        bin_data = df_temp[df_temp['bin'] == bin_idx]

        if len(bin_data) == 0:
            continue

        avg_confidence = bin_data['pred'].mean()
        avg_accuracy = bin_data['correct'].mean()
        bin_weight = len(bin_data) / len(df_temp)

        ece += bin_weight * abs(avg_confidence - avg_accuracy)

    return ece

metric_names = ['semantic_entropy', 'self_consistency', 'token_entropy', 'perplexity', 'lexical_diversity']

print("\nGSM8K (Mathematical Category) ECE Results:")
for metric in metric_names:
    normalized = (df[metric] - df[metric].min()) / (df[metric].max() - df[metric].min() + 1e-8)
    ece = calculate_ece(normalized, df['correct'])
    print(f"{metric}: ECE = {ece:.4f}")


GSM8K (Mathematical Category) ECE Results:
semantic_entropy: ECE = 0.7095
self_consistency: ECE = 0.1875
token_entropy: ECE = 0.3152
perplexity: ECE = 0.3839
lexical_diversity: ECE = 0.3732


In [None]:
df.to_csv('gsm8k_results.csv', index=False)