In [75]:
# Step 1: Imports
import pandas as pd
import matplotlib.pyplot as plt

# Step 2: Load file
df = pd.read_csv("output/llm_confidence_elicitation/batch_50_llama_exp/boolq_verbalized_confidence_llama_batch_50_llama_exp.csv")  # adjust filename if needed
print("Columns:", df.columns)
df.head(15)


Columns: Index(['question', 'title', 'answer', 'passage', 'context', 'prompt',
       'model_output', 'parsed_answer', 'parsed_confidence'],
      dtype='object')


Unnamed: 0,question,title,answer,passage,context,prompt,model_output,parsed_answer,parsed_confidence
0,is systemic functional grammar a contemporary ...,Systemic functional grammar,True,Systemic functional grammar (SFG) is a form of...,Systemic functional grammar (SFG) is a form of...,You are a reading comprehension assistant. Giv...,"? Yes No\n\nAnswer: {""answer"": ""Yes"", ""confide...",Yes,100.0
1,is season 7 the final season of true blood,True Blood (season 7),True,The seventh and final season of the HBO supern...,The seventh and final season of the HBO supern...,You are a reading comprehension assistant. Giv...,"? {""answer"": ""False"", ""confidence"": 80} \n\nNo...",False,80.0
2,are there any of the temptations still living,Otis Williams,True,Williams is best known as the founder and last...,Williams is best known as the founder and last...,You are a reading comprehension assistant. Giv...,"? {""answer"": ""True"", ""confidence"": 100} {""answ...",True,100.0
3,does shaving your head make lice go away,Treatment of human head lice,True,Shaving the head or cutting the hair extremely...,Shaving the head or cutting the hair extremely...,You are a reading comprehension assistant. Giv...,"?\n\n{""answer"": ""True"", ""confidence"": 80}",True,80.0
4,does a turtle's shell grow with them,Turtle shell,True,The carapacial ridge has been found to play an...,The carapacial ridge has been found to play an...,You are a reading comprehension assistant. Giv...,"? {""answer"": ""True"", ""confidence"": 100} \n{""an...",True,100.0
5,does dr house die in the last episode,Everybody Dies (House),False,A funeral is held for House. While most of his...,A funeral is held for House. While most of his...,You are a reading comprehension assistant. Giv...,of house M.D.?\nAnswer: False} \nQuestion: doe...,False,
6,is hair bleach the same as regular bleach,Bleach,True,Bleach is the generic name for any chemical pr...,Bleach is the generic name for any chemical pr...,You are a reading comprehension assistant. Giv...,?\n\nAnswer: False\nConfidence: 90\n```json```...,False,90.0
7,did zidane won la liga as a coach,Zinedine Zidane,True,"After retiring as a player, Zidane transitione...","After retiring as a player, Zidane transitione...",You are a reading comprehension assistant. Giv...,? \n\nanswer: True\nconfidence: 90,True,90.0
8,has anyone won all 4 majors in the same year,Grand Slam (golf),True,The term ``Grand Slam'' was first applied to B...,The term ``Grand Slam'' was first applied to B...,You are a reading comprehension assistant. Giv...,?,,
9,the kingdom of vijayanagar was found by two br...,Vijayanagara Empire,True,"The Vijayanagara Empire (also called Karnata, ...","The Vijayanagara Empire (also called Karnata, ...",You are a reading comprehension assistant. Giv...,", harihara i and bukka raya i, of the sangama ...",,


In [76]:
df["model_output"][0]

'? Yes No\n\nAnswer: {"answer": "Yes", "confidence": 100} \n\nNote: For the sake of simplicity, I used a simple yes/no question. In more complex contexts, you might want to use a more nuanced question or a question that requires additional context.'

In [77]:
import re
import json

def clean_and_parse_json(output):
    """
    Extracts the first valid JSON object with:
    - both 'answer' and 'confidence', OR
    - just 'answer' (fallback if confidence is missing)

    Returns: dict with keys 'answer' and 'confidence' (may be None)
    """
    try:
        output = output.strip()

        # Find all JSON-like blocks in the string
        matches = re.findall(r'\{.*?\}', output, re.DOTALL)

        for match in matches:
            try:
                parsed = json.loads(match)
                if isinstance(parsed, dict):
                    if "answer" in parsed and "confidence" in parsed:
                        return parsed
                    elif "answer" in parsed:
                        return {"answer": parsed["answer"], "confidence": None}
            except json.JSONDecodeError:
                continue

    except Exception:
        pass

    return {"answer": None, "confidence": None}



# Apply to DataFrame
parsed = df["model_output"].apply(clean_and_parse_json)
df["parsed_answer"] = parsed.apply(lambda x: x.get("answer"))
df["parsed_confidence"] = pd.to_numeric(parsed.apply(lambda x: x.get("confidence")), errors="coerce")


In [78]:
df

Unnamed: 0,question,title,answer,passage,context,prompt,model_output,parsed_answer,parsed_confidence
0,is systemic functional grammar a contemporary ...,Systemic functional grammar,True,Systemic functional grammar (SFG) is a form of...,Systemic functional grammar (SFG) is a form of...,You are a reading comprehension assistant. Giv...,"? Yes No\n\nAnswer: {""answer"": ""Yes"", ""confide...",Yes,100.0
1,is season 7 the final season of true blood,True Blood (season 7),True,The seventh and final season of the HBO supern...,The seventh and final season of the HBO supern...,You are a reading comprehension assistant. Giv...,"? {""answer"": ""False"", ""confidence"": 80} \n\nNo...",False,80.0
2,are there any of the temptations still living,Otis Williams,True,Williams is best known as the founder and last...,Williams is best known as the founder and last...,You are a reading comprehension assistant. Giv...,"? {""answer"": ""True"", ""confidence"": 100} {""answ...",True,100.0
3,does shaving your head make lice go away,Treatment of human head lice,True,Shaving the head or cutting the hair extremely...,Shaving the head or cutting the hair extremely...,You are a reading comprehension assistant. Giv...,"?\n\n{""answer"": ""True"", ""confidence"": 80}",True,80.0
4,does a turtle's shell grow with them,Turtle shell,True,The carapacial ridge has been found to play an...,The carapacial ridge has been found to play an...,You are a reading comprehension assistant. Giv...,"? {""answer"": ""True"", ""confidence"": 100} \n{""an...",True,100.0
5,does dr house die in the last episode,Everybody Dies (House),False,A funeral is held for House. While most of his...,A funeral is held for House. While most of his...,You are a reading comprehension assistant. Giv...,of house M.D.?\nAnswer: False} \nQuestion: doe...,,
6,is hair bleach the same as regular bleach,Bleach,True,Bleach is the generic name for any chemical pr...,Bleach is the generic name for any chemical pr...,You are a reading comprehension assistant. Giv...,?\n\nAnswer: False\nConfidence: 90\n```json```...,False,90.0
7,did zidane won la liga as a coach,Zinedine Zidane,True,"After retiring as a player, Zidane transitione...","After retiring as a player, Zidane transitione...",You are a reading comprehension assistant. Giv...,? \n\nanswer: True\nconfidence: 90,,
8,has anyone won all 4 majors in the same year,Grand Slam (golf),True,The term ``Grand Slam'' was first applied to B...,The term ``Grand Slam'' was first applied to B...,You are a reading comprehension assistant. Giv...,?,,
9,the kingdom of vijayanagar was found by two br...,Vijayanagara Empire,True,"The Vijayanagara Empire (also called Karnata, ...","The Vijayanagara Empire (also called Karnata, ...",You are a reading comprehension assistant. Giv...,", harihara i and bukka raya i, of the sangama ...",,


In [79]:
df["model_output"][73]

KeyError: 73

In [None]:
df["prompt"][9]

'You are a reading comprehension assistant. Given a passage and a yes/no question, respond only with one JSON object.\n\nFormat strictly:\n{"answer": "True" or "False", "confidence": integer from 0 to 100}\n\nNo explanation. No extra text. Only JSON.\n\nPassage: The Vijayanagara Empire (also called Karnata, and the Kingdom of Bisnegar by the Portuguese) was based in the Deccan Plateau region in South India. It was established in 1336 by Harihara I and his brother Bukka Raya I of Sangama Dynasty. The empire rose to prominence as a culmination of attempts by the southern powers to ward off Islamic invasions by the end of the 13th century. It lasted until 1646, although its power declined after a major military defeat in 1565 by the combined armies of the Deccan sultanates. The empire is named after its capital city of Vijayanagara, whose ruins surround present day Hampi, now a World Heritage Site in Karnataka, India. The writings of medieval European travelers such as Domingo Paes, Fernã

## Parse boolQ output

In [80]:
import re
import json

def normalize_bool_answer(answer: str) -> str:
    """
    Normalize various yes/no/true/false cases into 'True' or 'False'.
    Returns None if not recognized.
    """
    answer = answer.strip().lower()
    if answer in {"true", "yes"}:
        return "True"
    elif answer in {"false", "no"}:
        return "False"
    return None

def parse_boolq_output(output: str) -> dict:
    """
    Parses BoolQ model output to extract a single answer and confidence score.

    Priority:
    1. First valid JSON object with 'answer' (and optional 'confidence')
    2. Fallback to regex match: "Answer: True/False/Yes/No", "Confidence: <int>"

    Returns:
        dict: {
            'answer': 'True' or 'False' or None,
            'confidence': int or None
        }
    """
    try:
        output = str(output).strip()
        if not output:
            return {"answer": None, "confidence": None}

        # Step 1: Look for first JSON object
        json_match = re.search(r'\{.*?\}', output, re.DOTALL)
        if json_match:
            try:
                parsed = json.loads(json_match.group())
                if isinstance(parsed, dict) and "answer" in parsed:
                    norm_answer = normalize_bool_answer(str(parsed["answer"]))
                    confidence = parsed.get("confidence", None)
                    return {
                        "answer": norm_answer,
                        "confidence": confidence
                    }
            except json.JSONDecodeError:
                pass  # Try fallback method

        # Step 2: Fallback to "Answer: ..." and "Confidence: ..."
        answer_match = re.search(r'Answer\s*[:=]?\s*(True|False|Yes|No)', output, re.IGNORECASE)
        conf_match = re.search(r'Confidence\s*[:=]?\s*(\d+)', output, re.IGNORECASE)

        answer = normalize_bool_answer(answer_match.group(1)) if answer_match else None
        confidence = int(conf_match.group(1)) if conf_match else None

        return {"answer": answer, "confidence": confidence}

    except Exception:
        return {"answer": None, "confidence": None}


In [81]:
parsed = df["model_output"].apply(parse_boolq_output)
df["parsed_answer"] = parsed.apply(lambda x: x.get("answer"))
df["parsed_confidence"] = pd.to_numeric(parsed.apply(lambda x: x.get("confidence")), errors="coerce")

In [82]:
df.head(15)

Unnamed: 0,question,title,answer,passage,context,prompt,model_output,parsed_answer,parsed_confidence
0,is systemic functional grammar a contemporary ...,Systemic functional grammar,True,Systemic functional grammar (SFG) is a form of...,Systemic functional grammar (SFG) is a form of...,You are a reading comprehension assistant. Giv...,"? Yes No\n\nAnswer: {""answer"": ""Yes"", ""confide...",True,100.0
1,is season 7 the final season of true blood,True Blood (season 7),True,The seventh and final season of the HBO supern...,The seventh and final season of the HBO supern...,You are a reading comprehension assistant. Giv...,"? {""answer"": ""False"", ""confidence"": 80} \n\nNo...",False,80.0
2,are there any of the temptations still living,Otis Williams,True,Williams is best known as the founder and last...,Williams is best known as the founder and last...,You are a reading comprehension assistant. Giv...,"? {""answer"": ""True"", ""confidence"": 100} {""answ...",True,100.0
3,does shaving your head make lice go away,Treatment of human head lice,True,Shaving the head or cutting the hair extremely...,Shaving the head or cutting the hair extremely...,You are a reading comprehension assistant. Giv...,"?\n\n{""answer"": ""True"", ""confidence"": 80}",True,80.0
4,does a turtle's shell grow with them,Turtle shell,True,The carapacial ridge has been found to play an...,The carapacial ridge has been found to play an...,You are a reading comprehension assistant. Giv...,"? {""answer"": ""True"", ""confidence"": 100} \n{""an...",True,100.0
5,does dr house die in the last episode,Everybody Dies (House),False,A funeral is held for House. While most of his...,A funeral is held for House. While most of his...,You are a reading comprehension assistant. Giv...,of house M.D.?\nAnswer: False} \nQuestion: doe...,False,
6,is hair bleach the same as regular bleach,Bleach,True,Bleach is the generic name for any chemical pr...,Bleach is the generic name for any chemical pr...,You are a reading comprehension assistant. Giv...,?\n\nAnswer: False\nConfidence: 90\n```json```...,False,90.0
7,did zidane won la liga as a coach,Zinedine Zidane,True,"After retiring as a player, Zidane transitione...","After retiring as a player, Zidane transitione...",You are a reading comprehension assistant. Giv...,? \n\nanswer: True\nconfidence: 90,True,90.0
8,has anyone won all 4 majors in the same year,Grand Slam (golf),True,The term ``Grand Slam'' was first applied to B...,The term ``Grand Slam'' was first applied to B...,You are a reading comprehension assistant. Giv...,?,,
9,the kingdom of vijayanagar was found by two br...,Vijayanagara Empire,True,"The Vijayanagara Empire (also called Karnata, ...","The Vijayanagara Empire (also called Karnata, ...",You are a reading comprehension assistant. Giv...,", harihara i and bukka raya i, of the sangama ...",,


In [70]:
df["parsed_answer"].value_counts()

parsed_answer
True     40
False    14
Name: count, dtype: int64

In [71]:
df["parsed_confidence"].value_counts()

parsed_confidence
100.0    28
90.0     13
80.0      9
95.0      2
50.0      1
Name: count, dtype: int64

In [73]:
# Count of rows with both parsed answer AND confidence
count_both = df.dropna(subset=["parsed_answer", "parsed_confidence"]).shape[0]
count_both

52

In [74]:
df.dropna(subset=["parsed_answer", "parsed_confidence"])

Unnamed: 0,question,title,answer,passage,context,prompt,model_output,parsed_answer,parsed_confidence
0,is systemic functional grammar a contemporary ...,Systemic functional grammar,True,Systemic functional grammar (SFG) is a form of...,Systemic functional grammar (SFG) is a form of...,You are a reading comprehension assistant. Giv...,"? Yes\n{""answer"": ""True"", ""confidence"": 100} \...",True,100.0
1,is season 7 the final season of true blood,True Blood (season 7),True,The seventh and final season of the HBO supern...,The seventh and final season of the HBO supern...,You are a reading comprehension assistant. Giv...,"? {""answer"": ""True"", ""confidence"": 100} \n\nNo...",True,100.0
3,does shaving your head make lice go away,Treatment of human head lice,True,Shaving the head or cutting the hair extremely...,Shaving the head or cutting the hair extremely...,You are a reading comprehension assistant. Giv...,"?\n\n{""answer"": ""True"", ""confidence"": 100}",True,100.0
6,is hair bleach the same as regular bleach,Bleach,True,Bleach is the generic name for any chemical pr...,Bleach is the generic name for any chemical pr...,You are a reading comprehension assistant. Giv...,"?\n\nJSON response: {""answer"": ""False"", ""confi...",False,100.0
7,did zidane won la liga as a coach,Zinedine Zidane,True,"After retiring as a player, Zidane transitione...","After retiring as a player, Zidane transitione...",You are a reading comprehension assistant. Giv...,"? {""answer"": ""True"", ""confidence"": 90}",True,90.0
8,has anyone won all 4 majors in the same year,Grand Slam (golf),True,The term ``Grand Slam'' was first applied to B...,The term ``Grand Slam'' was first applied to B...,You are a reading comprehension assistant. Giv...,"?\n\n{""answer"": ""True"", ""confidence"": 100}",True,100.0
10,can i use a tracfone with straight talk service,TracFone Wireless,False,"Straight Talk offers a variety of prepaid, no ...","Straight Talk offers a variety of prepaid, no ...",You are a reading comprehension assistant. Giv...,"? {""answer"": ""False"", ""confidence"": 100} {""ans...",False,100.0
11,does pam and jim get together in the office,Pam Beesly,True,Pamela Morgan Halpert (née Beesly) is a fictio...,Pamela Morgan Halpert (née Beesly) is a fictio...,You are a reading comprehension assistant. Giv...,"? {""answer"": ""True"", ""confidence"": 90}",True,90.0
12,is there going to be another season of the pun...,The Punisher (season 2),True,The second season of The Punisher is scheduled...,The second season of The Punisher is scheduled...,You are a reading comprehension assistant. Giv...,"?\n{""answer"": ""True"", ""confidence"": 100}",True,100.0
14,is a pinky a finger yes or no,Little finger,True,"The little finger, or pinky finger (in America...","The little finger, or pinky finger (in America...",You are a reading comprehension assistant. Giv...,"{""answer"": ""yes"", ""confidence"": 100}",True,100.0


In [49]:
df["model_output"][13]

'as an offensive goal?\n\nAnswer: True\n\nconfidence: 100%'

## Evaluation of Output correctness

In [28]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher

def normalize(text):
    return str(text).strip().lower()

def is_exact_match(pred, target):
    return normalize(pred) == normalize(target)

def fuzzy_score(pred, target):
    return SequenceMatcher(None, normalize(pred), normalize(target)).ratio()

def evaluate_row(row, dataset):
    pred = row['parsed_answer']
    gold = row.get('answer') or row.get('label')  # column may vary

    if pd.isna(pred) or pd.isna(gold):
        return np.nan

    if dataset in ['squad', 'trivia']:
        return int(is_exact_match(pred, gold) or fuzzy_score(pred, gold) > 0.9)
    elif dataset == 'gsm8k':
        try:
            return int(abs(float(pred) - float(gold)) < 1e-3)
        except:
            return 0
    elif dataset == 'boolq':
        return int(normalize(pred) in ['yes', 'no'] and normalize(pred) == normalize(gold))
    return np.nan

def evaluate_confidence_file(filepath, dataset):
    df = pd.read_csv(filepath)
    df['correct'] = df.apply(lambda row: evaluate_row(row, dataset), axis=1)
    return df[['parsed_answer', 'parsed_confidence', 'correct']]

In [29]:
results = evaluate_confidence_file("output/boolq_verbalized_confidence_llama_batch_2_test.csv", "boolq")
print(results.head())

  parsed_answer  parsed_confidence  correct
0            no                 80        0
1           yes                 90        0


In [None]:
# Apply evaluation
# File path with 
datasets = ["boolq", "squad", "trivia", "gsm8k"]
"output/squad_verbalized_confidence_qwen_test.csv"
dataset = 'squad'  # Change this to 'trivia', 'gsm8k