In [5]:
# Step 1: Imports
import pandas as pd
import matplotlib.pyplot as plt

# Step 2: Load the saved elicitation results and preview the first rows
results_path = "output/llm_confidence_elicitation/test/squad_verbalized_confidence_cogito_test.csv"  # adjust filename if needed
df = pd.read_csv(results_path)
print("Columns:", df.columns)
df.head()


Columns: Index(['id', 'title', 'question', 'context', 'answers', 'is_impossible',
       'prompt', 'model_output', 'parsed_answer', 'parsed_confidence'],
      dtype='object')


Unnamed: 0,id,title,question,context,answers,is_impossible,prompt,model_output,parsed_answer,parsed_confidence
0,57269ef4f1498d1400e8e52d,The_Sun_(United_Kingdom),What's another thing the paper showed hostilit...,Despite its initial opposition to the closures...,['promotion of right-wing ministers to the cab...,False,You are an expert QA assistant. Your task is t...,"The EU and approval of public spending cuts, t...",The EU,90
1,56dfbd5e231d4119001abd5a,Internet_service_provider,what type of technology is used to connect to ...,"ISPs provide Internet access, employing a ran...",['wireless Ethernet (wi-fi)'],False,You are an expert QA assistant. Your task is t...,"{""answer"": ""wireless Ethernet (wi-fi)"", ""confi...",wireless Ethernet (wi-fi),99
2,5730d22faca1c71400fe5af0,United_States_Air_Force,What is one of the important functions of the ...,"Offensive Counterair (OCA) is defined as ""offe...",['suppression/destruction of enemy air defense'],False,You are an expert QA assistant. Your task is t...,Let me provide an answer based on the given co...,"Destroy, disrupt, or neutralize enemy aircraft...",95


In [5]:
# Display the raw `model_output` column to see what the model generated.
df["model_output"]

0    Based on the context provided, the paper showe...
1    Based on the context given, what is the answer...
2    (Select one of the following: Attack operation...
3    {"answer": "1604", "confidence": 100}\nOkay, l...
4    The options are: (A) WAV (B) FLAC (C) MP3 (D) ...
5    Answer:\n{"answer": "1254", "confidence": 100}...
6    (based on context)\n{"answer": "Through altern...
Name: model_output, dtype: object

In [6]:
import re
import json

def clean_and_parse_json(output):
    """
    Extract the first JSON object in `output` that carries an answer.

    The text is scanned left to right. At every '{' a real JSON decode is
    attempted, so objects whose strings contain braces (e.g. "a}b") or that
    contain nested objects are parsed correctly; a regex such as `\\{.*?\\}`
    would cut them off at the first '}'.

    The first decoded dict that qualifies wins:
    - both 'answer' and 'confidence' present -> the dict is returned as-is
      (extra keys are kept);
    - only 'answer' present -> {'answer': <value>, 'confidence': None}.

    Parameters
    ----------
    output : str
        Raw model generation. Any non-string value (None, NaN, numbers, ...)
        is treated as "nothing to parse".

    Returns
    -------
    dict
        Always has the keys 'answer' and 'confidence'; both are None when no
        qualifying JSON object is found.
    """
    # Missing generations come back from pandas as float NaN; handle that
    # explicitly rather than relying on a swallowed AttributeError.
    if not isinstance(output, str):
        return {"answer": None, "confidence": None}

    text = output.strip()
    decoder = json.JSONDecoder()

    start = text.find("{")
    while start != -1:
        try:
            candidate, end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            # Not valid JSON from this brace; retry from the next one.
            start = text.find("{", start + 1)
            continue

        if isinstance(candidate, dict):
            if "answer" in candidate and "confidence" in candidate:
                return candidate
            if "answer" in candidate:
                return {"answer": candidate["answer"], "confidence": None}

        # Decoded fine but unusable: resume after the object just consumed.
        start = text.find("{", end)

    return {"answer": None, "confidence": None}



# Re-parse every raw generation, then overwrite the two parsed columns on df
parsed = df["model_output"].apply(clean_and_parse_json)
parsed_answers = parsed.apply(lambda record: record.get("answer"))
parsed_confidences = parsed.apply(lambda record: record.get("confidence"))
df["parsed_answer"] = parsed_answers
# Non-numeric or missing confidences become NaN
df["parsed_confidence"] = pd.to_numeric(parsed_confidences, errors="coerce")


In [7]:
# Display the DataFrame, including the parsed_answer / parsed_confidence columns.
df

Unnamed: 0,id,title,question,context,answers,is_impossible,prompt,model_output,parsed_answer,parsed_confidence
0,57269ef4f1498d1400e8e52d,The_Sun_(United_Kingdom),What's another thing the paper showed hostilit...,Despite its initial opposition to the closures...,['promotion of right-wing ministers to the cab...,False,You are an expert QA assistant. Your task is t...,"Based on the context provided, the paper showe...",the EU,95
1,56dfbd5e231d4119001abd5a,Internet_service_provider,what type of technology is used to connect to ...,"ISPs provide Internet access, employing a ran...",['wireless Ethernet (wi-fi)'],False,You are an expert QA assistant. Your task is t...,"Based on the context given, what is the answer...",Wireless Ethernet (wi-fi),90
2,5730d22faca1c71400fe5af0,United_States_Air_Force,What is one of the important functions of the ...,"Offensive Counterair (OCA) is defined as ""offe...",['suppression/destruction of enemy air defense'],False,You are an expert QA assistant. Your task is t...,(Select one of the following: Attack operation...,Suppression/destruction of enemy air defense,90
3,57266715dd62a815002e83be,British_Empire,When did England first establish a colony in G...,The Caribbean initially provided England's mos...,['1604'],False,You are an expert QA assistant. Your task is t...,"{""answer"": ""1604"", ""confidence"": 100}\nOkay, l...",1604,100
4,5706371452bb89140068996f,MP3,What would an example of lossy audio encoding be?,"When performing lossy audio encoding, such as ...",['creating an MP3 file'],False,You are an expert QA assistant. Your task is t...,The options are: (A) WAV (B) FLAC (C) MP3 (D) ...,C,98
5,56de81824396321400ee2994,Plymouth,In what year was Plymouth recognized as a town?,The first record of the existence of a settlem...,['1254'],False,You are an expert QA assistant. Your task is t...,"Answer:\n{""answer"": ""1254"", ""confidence"": 100}...",1254,100
6,56f8b7f99e9bad19000a0395,Gene,How do genes encode multiple proteins?,Early work in molecular genetics suggested the...,['by alternative splicing and coding sequences'],False,You are an expert QA assistant. Your task is t...,"(based on context)\n{""answer"": ""Through altern...",Through alternative splicing and trans-splicing,95


In [8]:
# Inspect one complete raw generation (entry 5) without display truncation.
df["model_output"][5]

'Answer:\n{"answer": "1254", "confidence": 100} The answer is 1254 because the context explicitly states that Plymouth gained status as a town in 1254. The confidence is 100 as this is a direct factual statement from the provided text. \n\n{"answer": "1254", "confidence": 100} \n\nThe answer is 1254 because the context explicitly states that Plymouth gained status as a town in 1254. The confidence is 100 as this is a direct factual statement from the provided text. \n\n{"answer": "1254", "confidence": 100} \n\nThe answer is 1254 because the context explicitly states that Plymouth gained status as a town in 1254. The confidence is 100 as this is a direct factual statement from the provided text. \n\n{"answer": "1254", "confidence": 100} \n\nThe answer is 1254 because the context explicitly states that Plymouth gained status as a town in 1254. The confidence is 100 as this is a direct factual statement from the provided text. \n\n{"answer": "1254", "confidence": 100} \n\nThe answer is 12

In [125]:
# Inspect the full prompt text that was sent to the model for entry 9.
df["prompt"][9]

'I want you to answer the question.\nQuestion: What "moderately famous" ship left Southampton, England, before stopping at Cherbourg, France and Queenstown, Ireland before stopping floating?\nYour entire reply must be a single-line, valid JSON object matching this schema: {"answer": string, "confidence": integer}. Confidence must be an integer 0–100. Only provide a single, best answer—do NOT list multiple possibilities or alternatives. Do NOT include code fences, explanations, or line breaks—only the JSON.Example 1:\nQuestion: What is the capital of France?\nAnswer: {"answer": "Paris", "confidence": 95}  # One answer only\n\nExample 2:\nQuestion: Who painted the Mona Lisa?\nAnswer: {"answer": "Leonardo da Vinci", "confidence": 98}  # Do not list alternatives\n\n'

## Evaluation of output correctness

In [None]:
import ast
from difflib import SequenceMatcher

import numpy as np
import pandas as pd

def normalize(text):
    """Return `text` as a string with surrounding whitespace removed, lower-cased."""
    as_string = str(text)
    trimmed = as_string.strip()
    return trimmed.lower()

def is_exact_match(pred, target):
    """Return True when `pred` and `target` are identical after `normalize`."""
    predicted = normalize(pred)
    expected = normalize(target)
    return predicted == expected

def fuzzy_score(pred, target):
    """Return the difflib similarity ratio (0.0-1.0) of the normalized inputs."""
    predicted = normalize(pred)
    expected = normalize(target)
    matcher = SequenceMatcher(None, predicted, expected)
    return matcher.ratio()

def _is_missing(value):
    """Return True for None and for scalar NA values (NaN, pd.NA, NaT)."""
    return value is None or (pd.api.types.is_scalar(value) and pd.isna(value))


def _gold_candidates(gold):
    """Expand a gold cell into the list of usable reference answers."""
    if isinstance(gold, str):
        text = gold.strip()
        # A list column round-tripped through CSV arrives as its Python repr,
        # e.g. "['1604']"; recover the actual list of references.
        if text.startswith('[') and text.endswith(']'):
            try:
                gold = ast.literal_eval(text)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                pass  # merely bracketed text; keep it as a plain answer string
    if isinstance(gold, (list, tuple, np.ndarray)):
        return [item for item in gold if not _is_missing(item)]
    return [] if _is_missing(gold) else [gold]


def evaluate_row(row, dataset):
    """
    Score one prediction row against its gold answer(s).

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must contain 'parsed_answer'. The gold value is read from the first
        of the columns 'answer', 'label', 'answers' that is present. The gold
        cell may hold a single value, a list/tuple/array of references, or
        the string repr of a list.
    dataset : str
        One of 'squad', 'trivia', 'gsm8k', 'boolq'.

    Returns
    -------
    int or float
        1 if the prediction matches any reference, 0 if it matches none, and
        np.nan when the prediction or the gold value is missing or `dataset`
        is not recognised.
    """
    pred = row['parsed_answer']

    # Pick the first gold column that exists. An explicit `is not None` test
    # is used because `a or b` would discard legitimate falsy gold values
    # such as 0 (gsm8k) or False (boolq).
    gold = None
    for column in ('answer', 'label', 'answers'):  # column may vary
        value = row.get(column)
        if value is not None:
            gold = value
            break

    golds = _gold_candidates(gold)
    if _is_missing(pred) or not golds:
        return np.nan

    if dataset in ['squad', 'trivia']:
        # Correct when any reference matches exactly or near-exactly.
        return int(any(
            is_exact_match(pred, ref) or fuzzy_score(pred, ref) > 0.9
            for ref in golds
        ))
    elif dataset == 'gsm8k':
        for ref in golds:
            try:
                if abs(float(pred) - float(ref)) < 1e-3:
                    return 1
            except (TypeError, ValueError, OverflowError):
                # Non-numeric prediction or reference counts as a miss.
                continue
        return 0
    elif dataset == 'boolq':
        answer = normalize(pred)
        if answer not in ['yes', 'no']:
            return 0
        # Gold labels may be booleans rather than yes/no text; map their
        # normalized form so they stay comparable with the prediction.
        # NOTE(review): the boolq file schema is not visible here - confirm.
        aliases = {'true': 'yes', 'false': 'no'}
        return int(any(
            aliases.get(normalize(ref), normalize(ref)) == answer
            for ref in golds
        ))
    return np.nan

def evaluate_confidence_file(filepath, dataset):
    """
    Load a results CSV and score every row with `evaluate_row`.

    Parameters
    ----------
    filepath : path accepted by `pd.read_csv`
        CSV file to read.
    dataset : str
        Dataset name forwarded to `evaluate_row` for each row.

    Returns
    -------
    pandas.DataFrame
        The columns 'parsed_answer', 'parsed_confidence' and the newly
        computed 'correct'.
    """
    scored = pd.read_csv(filepath)
    # `args` hands `dataset` to evaluate_row as its second positional argument.
    scored['correct'] = scored.apply(evaluate_row, axis=1, args=(dataset,))
    wanted_columns = ['parsed_answer', 'parsed_confidence', 'correct']
    return scored[wanted_columns]

In [None]:
# Apply evaluation
# Dataset names that evaluate_row has a scoring branch for.
datasets = ["boolq", "squad", "trivia", "gsm8k"]
# File path with the results to evaluate.
# NOTE(review): the string below is a bare expression - it is not bound to any
# name, so it has no effect as written. TODO: assign it to a variable and pass
# it to evaluate_confidence_file.
"output/squad_verbalized_confidence_qwen_test.csv"
dataset = 'squad'  # Change this to 'trivia', 'gsm8k' or 'boolq' to match the file being evaluated