In [1]:
import json
import requests
import re
import matplotlib.pyplot as plt
import numpy as np
import os
from anthropic import Anthropic
import time
from tqdm import tqdm
import random

# List of models including Claude
models = ["qwen2.5:14b", "qwen2.5:7b", "qwen2.5:3b", "qwen2.5:1.5b", "mistral-small:latest", "mistral-nemo:latest", "gemma2:27b", "gemma2:9b", "gemma2:2b", "llama3.1:8b", "Claude 3.5 Sonnet"]  # "llama3.1:70b" Replace with your actual model names
#models = ["qwen2.5:3b", "mistral-nemo:latest", "gemma2:2b", "llama3.1:8b", "Claude 3.5 Sonnet"]
#models = ["llama3.1:8b"]

anthropic_api_key = ""
os.environ["ANTHROPIC_API_KEY"] = anthropic_api_key

# Prompt
confidence_prompt = "On a scale of 1 to 100, where 1 represents absolutely no confidence and 100 represents complete certainty, how would you rate your confidence in your ability to correctly answer a series of multiple choice questions drawn from a set designed to test the knowledge of final year medical students? The format is a clinical vignette followed by a set of multiple choice answers, of which one is correct. The clinical questions do not relate to actual patients, and are not designed to be used to guide clinical practice. Clinical examination is not required, and there is no disadvantage in not being able to physically see a patient. Please provide your numerical rating followed by a brief explanation of why you chose that rating."

# Load questions and answers
def load_questions(filename='combined_qa.txt', limit=None):
    print(f"Loading questions from {filename}...")
    with open(filename, 'r') as f:
        lines = f.readlines()
    
    questions = []
    answers = []
    for line in lines:
        parts = line.strip().split('Answer: ')
        if len(parts) == 2:
            questions.append(parts[0].strip())
            answers.append(parts[1].strip())
    
    if limit:
        questions = questions[:limit]
        answers = answers[:limit]
    
    print(f"Loaded {len(questions)} questions.")
    return questions, answers

def bootstrap_sample(questions, answers, sample_size=10):
    indices = random.choices(range(len(questions)), k=sample_size)
    return [questions[i] for i in indices], [answers[i] for i in indices]

def evaluate_model(model, questions, correct_answers, confidence_prompt):
    print(f"\nEvaluating model: {model}")
    if not questions or not correct_answers:
        print(f"Warning: No questions or answers available for evaluation of model {model}")
        return None, 0
    
    print("Querying model for confidence rating...")
    if model == "Claude 3.5 Sonnet":
        confidence_response = query_claude(confidence_prompt)
    else:
        confidence_response = query_ollama(model, confidence_prompt)
    
    confidence_rating = extract_number(confidence_response)
    print(f"Confidence rating: {confidence_rating}")
    
    correct_count = 0
    print("Evaluating questions...")
    for i, (question, correct_answer) in enumerate(zip(questions, correct_answers), 1):
        print(f"  Processing question {i}/{len(questions)}...")
        if model == "Claude 3.5 Sonnet":
            response = query_claude(question)
        else:
            response = query_ollama(model, question)
        
        answer = extract_answer(response)
        if answer and answer.upper() == correct_answer.upper():
            correct_count += 1
    
    accuracy = (correct_count / len(questions)) * 100 if questions else 0
    print(f"Evaluation complete. Accuracy: {accuracy:.2f}%")
    return confidence_rating, accuracy

def run_bootstrap_evaluation(models, n_iterations=4, sample_size=10):
    print("Starting bootstrap evaluation...")
    all_questions, all_answers = load_questions()
    if not all_questions or not all_answers:
        print("Error: No questions or answers available for evaluation.")
        return {}
    
    results = {model: {'confidence': [], 'accuracy': []} for model in models}
    
    actual_sample_size = min(sample_size, len(all_questions))
    print(f"Using sample size of {actual_sample_size}")
    
    for iteration in range(n_iterations):
        print(f"\nStarting bootstrap iteration {iteration + 1}/{n_iterations}")
        questions, answers = bootstrap_sample(all_questions, all_answers, actual_sample_size)
        
        for model in models:
            print(f"\nProcessing model: {model}")
            confidence, accuracy = evaluate_model(model, questions, answers, confidence_prompt)
            if confidence is not None:
                results[model]['confidence'].append(confidence)
                results[model]['accuracy'].append(accuracy)
                print(f"Results for {model}: Confidence = {confidence}, Accuracy = {accuracy:.2f}%")
    
    return results

# Function to interact with Ollama model
def query_ollama(model, prompt, temperature=0.6, timeout=120):
    print(f"Querying Ollama model: {model}")
    url = "http://localhost:11434/api/generate"
    data = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "temperature": temperature
    }
    try:
        response = requests.post(url, json=data, timeout=timeout)
        response.raise_for_status()
        return response.json()["response"]
    except requests.exceptions.RequestException as e:
        print(f"Error querying Ollama model {model}: {e}")
        return None

# Function to interact with Claude 3.5 Sonnet
def query_claude(prompt):
    print("Querying Claude 3.5 Sonnet")
    client = Anthropic()
    try:
        message = client.messages.create(
            model="claude-3-sonnet-20240229",
            max_tokens=1000,
            temperature=0,
            system="You are a helpful AI assistant.",
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        return message.content[0].text
    except Exception as e:
        print(f"Error querying Claude 3.5 Sonnet: {e}")
        return None

# Function to extract number from response
def extract_number(response):
    if response:
        match = re.search(r'\b(\d+)\b', response)
        return int(match.group(1)) if match else None
    return None

# Function to extract answer from response
def extract_answer(response):
    if response:
        match = re.search(r'\b([A-E])\b', response)
        return match.group(1) if match else None
    return None

print("Starting evaluation process...")
# Run bootstrap evaluation
bootstrap_results = run_bootstrap_evaluation(models, n_iterations=4, sample_size=50)

if not bootstrap_results:
    print("Evaluation could not be completed due to lack of questions or answers.")
else:
    print("\nCalculating summary results...")
    # Calculate mean and confidence intervals
    summary_results = {}
    for model, data in bootstrap_results.items():
        if data['confidence'] and data['accuracy']:
            confidence_mean = np.mean(data['confidence'])
            confidence_ci = np.percentile(data['confidence'], [2.5, 97.5])
            accuracy_mean = np.mean(data['accuracy'])
            accuracy_ci = np.percentile(data['accuracy'], [2.5, 97.5])
            
            summary_results[model] = {
                'confidence_mean': confidence_mean,
                'confidence_ci': confidence_ci.tolist(),
                'accuracy_mean': accuracy_mean,
                'accuracy_ci': accuracy_ci.tolist()
            }
        else:
            print(f"Warning: No valid data for model {model}")

    # Save results
    print("Saving results to bootstrap_results.json...")
    with open("bootstrap_results.json", "w") as f:
        json.dump(summary_results, f, indent=2)

    print("Bootstrap results have been saved to bootstrap_results.json")

    # Print mean accuracies and number of questions used
    n_questions = len(load_questions()[0])
    print(f"\nNumber of questions available: {n_questions}")
    print(f"Sample size used: {min(2, n_questions)}")
    for model, data in summary_results.items():
        print(f"{model}: Mean accuracy = {data['accuracy_mean']:.2f}%, Confidence = {data['confidence_mean']:.2f}")

print("\nEvaluation process complete.")

# Plotting code remains the same

Starting evaluation process...
Starting bootstrap evaluation...
Loading questions from combined_qa.txt...
Loaded 100 questions.
Using sample size of 50

Starting bootstrap iteration 1/4

Processing model: qwen2.5:14b

Evaluating model: qwen2.5:14b
Querying model for confidence rating...
Querying Ollama model: qwen2.5:14b
Confidence rating: 60
Evaluating questions...
  Processing question 1/50...
Querying Ollama model: qwen2.5:14b
  Processing question 2/50...
Querying Ollama model: qwen2.5:14b
  Processing question 3/50...
Querying Ollama model: qwen2.5:14b
  Processing question 4/50...
Querying Ollama model: qwen2.5:14b
  Processing question 5/50...
Querying Ollama model: qwen2.5:14b
  Processing question 6/50...
Querying Ollama model: qwen2.5:14b
  Processing question 7/50...
Querying Ollama model: qwen2.5:14b
  Processing question 8/50...
Querying Ollama model: qwen2.5:14b
  Processing question 9/50...
Querying Ollama model: qwen2.5:14b
  Processing question 10/50...
Querying Ollama

In [2]:
import json
import matplotlib.pyplot as plt
import numpy as np

# Load the bootstrap results
with open("bootstrap_results.json", "r") as f:
    results = json.load(f)

# Prepare data for plotting
models = list(results.keys())
confidence_means = [results[model]['confidence_mean'] for model in models]
confidence_errors = [
    [results[model]['confidence_mean'] - results[model]['confidence_ci'][0],
     results[model]['confidence_ci'][1] - results[model]['confidence_mean']]
    for model in models
]
accuracy_means = [results[model]['accuracy_mean'] for model in models]
accuracy_errors = [
    [results[model]['accuracy_mean'] - results[model]['accuracy_ci'][0],
     results[model]['accuracy_ci'][1] - results[model]['accuracy_mean']]
    for model in models
]

# Plot
plt.figure(figsize=(12, 8))
plt.errorbar(confidence_means, accuracy_means, xerr=np.array(confidence_errors).T, 
             yerr=np.array(accuracy_errors).T, fmt='o', capsize=5, ecolor='gray', markersize=8)

for i, model in enumerate(models):
    plt.annotate(model, (confidence_means[i], accuracy_means[i]), xytext=(5, 5), 
                 textcoords='offset points', fontsize=8)

plt.title("LLM Confidence vs Actual Performance (with 95% CIs)", fontsize=16)
plt.xlabel("Confidence Rating", fontsize=12)
plt.ylabel("Accuracy (%)", fontsize=12)
plt.grid(True, which="both", ls="--", c='0.7')

# Add a diagonal line representing perfect calibration
min_val = min(min(confidence_means), min(accuracy_means))
max_val = max(max(confidence_means), max(accuracy_means))
plt.plot([min_val, max_val], [min_val, max_val], 'k--', alpha=0.5, label='Perfect Calibration')

plt.legend()
plt.tight_layout()
plt.savefig("bootstrap_confidence_vs_performance.png", dpi=300)
plt.close()

print("Plot has been saved as 'bootstrap_confidence_vs_performance.png'")

Plot has been saved as 'bootstrap_confidence_vs_performance.png'


In [8]:
import json
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

def plot_performance_vs_confidence(data_file='bootstrap_results_run2.json'):
    # Load data
    with open(data_file, 'r') as f:
        data = json.load(f)
    
    # Prepare data for plotting, excluding mistral-nemo
    models = [model for model in data.keys() if model != "mistral-nemo:latest"]
    confidences = [data[model]['confidence_mean'] for model in models]
    confidence_errors = [
        [data[model]['confidence_mean'] - data[model]['confidence_ci'][0],
         data[model]['confidence_ci'][1] - data[model]['confidence_mean']]
        for model in models
    ]
    performances = [data[model]['accuracy_mean'] for model in models]
    performance_errors = [
        [data[model]['accuracy_mean'] - data[model]['accuracy_ci'][0],
         data[model]['accuracy_ci'][1] - data[model]['accuracy_mean']]
        for model in models
    ]
    
    # Create the plot
    fig, ax = plt.subplots(figsize=(12, 8))
    
    # Plot points with error bars (95% confidence intervals)
    ax.errorbar(confidences, performances, 
                xerr=np.array(confidence_errors).T, 
                yerr=np.array(performance_errors).T, 
                fmt='o', capsize=5, capthick=1, ecolor='gray', alpha=0.5)
    
    # Add labels for each point
    for i, model in enumerate(models):
        ax.annotate(model, (confidences[i], performances[i]), 
                    textcoords="offset points", xytext=(0,10), 
                    ha='center', fontsize=8, rotation=45)
    
    # Calculate and plot linear regression
    slope, intercept, r_value, p_value, std_err = stats.linregress(confidences, performances)
    line = slope * np.array([min(confidences), max(confidences)]) + intercept
    ax.plot([min(confidences), max(confidences)], line, 'g-', label=f'Linear regression (r={r_value:.2f})')
    
    # Set labels and title
    ax.set_xlabel('Confidence (Mean)')
    ax.set_ylabel('Performance (Mean Accuracy %)')
    ax.set_title('Performance vs Confidence with 95% Confidence Intervals and Linear Regression')
    
    # Add grid
    ax.grid(True, linestyle='--', alpha=0.7)
    
    # Set axis limits
    ax.set_xlim(0, 100)
    ax.set_ylim(0, 100)
    
    # Add diagonal line y=x
    ax.plot([0, 100], [0, 100], 'r--', alpha=0.5, label='y=x')
    
    # Add legend
    ax.legend()
    
    # Adjust layout and save
    plt.tight_layout()
    plt.savefig('performance_vs_confidence_with_ci_and_regression.png', dpi=300)
    plt.close()

    print("Performance vs Confidence plot with 95% confidence intervals and linear regression has been saved as performance_vs_confidence_with_ci_and_regression.png")
    print(f"Linear regression results:")
    print(f"Slope: {slope:.4f}")
    print(f"Intercept: {intercept:.4f}")
    print(f"R-squared: {r_value**2:.4f}")
    print(f"P-value: {p_value:.4f}")

# Run the function
plot_performance_vs_confidence()

Performance vs Confidence plot with 95% confidence intervals and linear regression has been saved as performance_vs_confidence_with_ci_and_regression.png
Linear regression results:
Slope: -0.1668
Intercept: 61.6962
R-squared: 0.1571
P-value: 0.2569


In [9]:
import json
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

def get_parameter_sizes():
    return {
        "qwen2.5:14b": 14,
        "qwen2.5:7b": 7,
        "qwen2.5:3b": 3,
        "qwen2.5:1.5b": 1.5,
        "mistral-small:latest": 7,
        "mistral-nemo:latest": 7,
        "gemma2:27b": 27,
        "gemma2:9b": 9,
        "gemma2:2b": 2,
        "llama3.1:8b": 8,
        "Claude 3.5 Sonnet": 175  # Estimated
    }

def plot_performance_vs_confidence(data, output_file):
    models = [model for model in data.keys() if model != "mistral-nemo:latest"]
    confidences = [data[model]['confidence_mean'] for model in models]
    confidence_errors = [
        [data[model]['confidence_mean'] - data[model]['confidence_ci'][0],
         data[model]['confidence_ci'][1] - data[model]['confidence_mean']]
        for model in models
    ]
    performances = [data[model]['accuracy_mean'] for model in models]
    performance_errors = [
        [data[model]['accuracy_mean'] - data[model]['accuracy_ci'][0],
         data[model]['accuracy_ci'][1] - data[model]['accuracy_mean']]
        for model in models
    ]
    
    fig, ax = plt.subplots(figsize=(12, 8))
    
    ax.errorbar(confidences, performances, 
                xerr=np.array(confidence_errors).T, 
                yerr=np.array(performance_errors).T, 
                fmt='o', capsize=5, capthick=1, ecolor='gray', alpha=0.5)
    
    for i, model in enumerate(models):
        ax.annotate(model, (confidences[i], performances[i]), 
                    textcoords="offset points", xytext=(0,10), 
                    ha='center', fontsize=8, rotation=45)
    
    slope, intercept, r_value, p_value, std_err = stats.linregress(confidences, performances)
    line = slope * np.array([min(confidences), max(confidences)]) + intercept
    ax.plot([min(confidences), max(confidences)], line, 'g-', label=f'Linear regression (r={r_value:.2f})')
    
    ax.set_xlabel('Confidence (Mean)')
    ax.set_ylabel('Performance (Mean Accuracy %)')
    ax.set_title('Performance vs Confidence with 95% Confidence Intervals and Linear Regression')
    
    ax.grid(True, linestyle='--', alpha=0.7)
    ax.set_xlim(0, 100)
    ax.set_ylim(0, 100)
    ax.plot([0, 100], [0, 100], 'r--', alpha=0.5, label='y=x')
    ax.legend()
    
    plt.tight_layout()
    plt.savefig(output_file, dpi=300)
    plt.close()

    print(f"Performance vs Confidence plot saved as {output_file}")
    print(f"Linear regression results:")
    print(f"Slope: {slope:.4f}")
    print(f"Intercept: {intercept:.4f}")
    print(f"R-squared: {r_value**2:.4f}")
    print(f"P-value: {p_value:.4f}")

def plot_vs_parameters(data, y_key, y_label, output_file):
    parameter_sizes = get_parameter_sizes()
    models = [model for model in data.keys() if model != "mistral-nemo:latest"]
    parameters = [parameter_sizes[model] for model in models]
    y_values = [data[model][f'{y_key}_mean'] for model in models]
    y_errors = [
        [data[model][f'{y_key}_mean'] - data[model][f'{y_key}_ci'][0],
         data[model][f'{y_key}_ci'][1] - data[model][f'{y_key}_mean']]
        for model in models
    ]
    
    fig, ax = plt.subplots(figsize=(12, 8))
    
    ax.errorbar(parameters, y_values, 
                yerr=np.array(y_errors).T, 
                fmt='o', capsize=5, capthick=1, ecolor='gray', alpha=0.5)
    
    for i, model in enumerate(models):
        ax.annotate(model, (parameters[i], y_values[i]), 
                    textcoords="offset points", xytext=(0,10), 
                    ha='center', fontsize=8, rotation=45)
    
    slope, intercept, r_value, p_value, std_err = stats.linregress(np.log10(parameters), y_values)
    x_line = np.logspace(np.log10(min(parameters)), np.log10(max(parameters)), 100)
    y_line = slope * np.log10(x_line) + intercept
    ax.plot(x_line, y_line, 'g-', label=f'Log-linear regression (r={r_value:.2f})')
    
    ax.set_xlabel('Parameter Count (Billions)')
    ax.set_ylabel(y_label)
    ax.set_title(f'{y_label} vs Parameter Count with 95% Confidence Intervals')
    
    ax.grid(True, linestyle='--', alpha=0.7)
    ax.set_xscale('log')
    ax.set_xlim(1, 200)
    ax.legend()
    
    plt.tight_layout()
    plt.savefig(output_file, dpi=300)
    plt.close()

    print(f"{y_label} vs Parameter Count plot saved as {output_file}")
    print(f"Log-linear regression results:")
    print(f"Slope: {slope:.4f}")
    print(f"Intercept: {intercept:.4f}")
    print(f"R-squared: {r_value**2:.4f}")
    print(f"P-value: {p_value:.4f}")

def main(data_file='bootstrap_results_run2.json'):
    with open(data_file, 'r') as f:
        data = json.load(f)
    
    plot_performance_vs_confidence(data, 'performance_vs_confidence.png')
    plot_vs_parameters(data, 'accuracy', 'Performance (Mean Accuracy %)', 'performance_vs_parameters.png')
    plot_vs_parameters(data, 'confidence', 'Confidence (Mean)', 'confidence_vs_parameters.png')

if __name__ == "__main__":
    main()

Performance vs Confidence plot saved as performance_vs_confidence.png
Linear regression results:
Slope: -0.1668
Intercept: 61.6962
R-squared: 0.1571
P-value: 0.2569
Performance (Mean Accuracy %) vs Parameter Count plot saved as performance_vs_parameters.png
Log-linear regression results:
Slope: 18.0158
Intercept: 37.1551
R-squared: 0.7845
P-value: 0.0006
Confidence (Mean) vs Parameter Count plot saved as confidence_vs_parameters.png
Log-linear regression results:
Slope: -30.3159
Intercept: 74.7114
R-squared: 0.3933
P-value: 0.0523


In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

def load_results(filename='bootstrap_results.json'):
    with open(filename, 'r') as f:
        return json.load(f)

def add_trend_line(ax, x, y):
    z = np.polyfit(x, y, 1)
    p = np.poly1d(z)
    ax.plot(x, p(x), "r--", alpha=0.8)

def plot_results(results):
    models = list(results.keys())
    confidences = [results[model]['confidence_mean'] for model in models]
    accuracies = [results[model]['accuracy_mean'] for model in models]
    
    # Estimate parameter sizes (you may need to adjust these or provide actual values)
    parameter_sizes = {
        "qwen2.5:14b": 14, "qwen2.5:7b": 7, "qwen2.5:3b": 3, "qwen2.5:1.5b": 1.5,
        "mistral-small:latest": 7, "mistral-nemo:latest": 8,
        "gemma2:27b": 27, "gemma2:9b": 9, "gemma2:2b": 2,
        "llama3.1:8b": 8, "Claude 3.5 Sonnet": 175  # Estimate for Claude
    }
    sizes = [parameter_sizes.get(model, 1) for model in models]  # Default to 1 if unknown
    
    # 1. Confidence vs Performance
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(confidences, accuracies)
    for i, model in enumerate(models):
        ax.annotate(model, (confidences[i], accuracies[i]), textcoords="offset points", xytext=(0,5), ha='center')
    add_trend_line(ax, confidences, accuracies)
    ax.set_xlabel('Confidence')
    ax.set_ylabel('Accuracy (%)')
    ax.set_title('Confidence vs Performance')
    plt.savefig('confidence_vs_performance.png')
    plt.close()

    # 2. Confidence vs Parameter Size
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(sizes, confidences)
    for i, model in enumerate(models):
        ax.annotate(model, (sizes[i], confidences[i]), textcoords="offset points", xytext=(0,5), ha='center')
    add_trend_line(ax, np.log10(sizes), confidences)  # Use log of sizes for trend line
    ax.set_xlabel('Parameter Size (billions)')
    ax.set_ylabel('Confidence')
    ax.set_title('Confidence vs Parameter Size')
    ax.set_xscale('log')
    plt.savefig('confidence_vs_parameter_size.png')
    plt.close()

    # 3. Performance vs Parameter Size
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(sizes, accuracies)
    for i, model in enumerate(models):
        ax.annotate(model, (sizes[i], accuracies[i]), textcoords="offset points", xytext=(0,5), ha='center')
    add_trend_line(ax, np.log10(sizes), accuracies)  # Use log of sizes for trend line
    ax.set_xlabel('Parameter Size (billions)')
    ax.set_ylabel('Accuracy (%)')
    ax.set_title('Performance vs Parameter Size')
    ax.set_xscale('log')
    plt.savefig('performance_vs_parameter_size.png')
    plt.close()

# Load and plot results
results = load_results()
plot_results(results)
print("Plots with trend lines have been saved as PNG files in the current directory.")

Plots with trend lines have been saved as PNG files in the current directory.


In [14]:
import json
import requests
import time
from tqdm import tqdm
from anthropic import Anthropic
import os
import random

def query_confidence_distribution(models, confidence_prompt, n_iterations=20):
    results = {model: [] for model in models}
    
    for model in models:
        print(f"\nQuerying model: {model}")
        for _ in tqdm(range(n_iterations)):
            if model == "Claude 3.5 Sonnet":
                confidence = query_claude(confidence_prompt)
            else:
                confidence = query_ollama(model, confidence_prompt)
            
            if confidence is not None:
                results[model].append(confidence)
            
            # Add a small delay to avoid rate limiting
            time.sleep(random.uniform(0.5, 1.5))
    
    return results

def query_ollama(model, prompt, temperature=0.7, timeout=120):
    url = "http://localhost:11434/api/generate"
    data = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "temperature": temperature
    }
    try:
        response = requests.post(url, json=data, timeout=timeout)
        response.raise_for_status()
        return extract_number(response.json()["response"])
    except requests.exceptions.RequestException as e:
        print(f"Error querying Ollama model {model}: {e}")
        return None

def query_claude(prompt):
    client = Anthropic()
    try:
        message = client.messages.create(
            model="claude-3-sonnet-20240229",
            max_tokens=1000,
            temperature=0.7,
            system="You are a helpful AI assistant.",
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        return extract_number(message.content[0].text)
    except Exception as e:
        print(f"Error querying Claude 3.5 Sonnet: {e}")
        return None

def extract_number(response):
    if response:
        match = re.search(r'\b(\d+)\b', response)
        return int(match.group(1)) if match else None
    return None

# Example usage
models = ["qwen2.5:14b", "qwen2.5:7b", "qwen2.5:3b", "qwen2.5:1.5b", "mistral-small:latest", "mistral-nemo:latest", "gemma2:27b", "gemma2:9b", "gemma2:2b", "llama3.1:8b", "Claude 3.5 Sonnet"]
confidence_prompt = "On a scale of 1 to 100, where 1 represents absolutely no confidence and 100 represents complete certainty, how would you rate your confidence in your ability to correctly answer a series of multiple choice questions drawn from a set designed to test the knowledge of final year medical students? The format is a clinical vignette followed by a set of multiple choice answers, of which one is correct. The clinical questions do not relate to actual patients, and are not designed to be used to guide clinical practice. Clinical examination is not required, and there is no disadvantage in not being able to physically see a patient. Please provide your numerical rating followed by a brief explanation of why you chose that rating."

confidence_distribution = query_confidence_distribution(models, confidence_prompt, n_iterations=20)

# Save results
with open("confidence_distribution.json", "w") as f:
    json.dump(confidence_distribution, f, indent=2)

print("Confidence distribution results have been saved to confidence_distribution.json")


Querying model: qwen2.5:14b


100%|██████████| 20/20 [01:54<00:00,  5.70s/it]



Querying model: qwen2.5:7b


100%|██████████| 20/20 [00:54<00:00,  2.70s/it]



Querying model: qwen2.5:3b


100%|██████████| 20/20 [01:00<00:00,  3.01s/it]



Querying model: qwen2.5:1.5b


100%|██████████| 20/20 [00:43<00:00,  2.17s/it]



Querying model: mistral-small:latest


100%|██████████| 20/20 [01:16<00:00,  3.83s/it]



Querying model: mistral-nemo:latest


100%|██████████| 20/20 [01:30<00:00,  4.52s/it]



Querying model: gemma2:27b


100%|██████████| 20/20 [02:04<00:00,  6.21s/it]



Querying model: gemma2:9b


100%|██████████| 20/20 [01:12<00:00,  3.62s/it]



Querying model: gemma2:2b


100%|██████████| 20/20 [00:54<00:00,  2.73s/it]



Querying model: llama3.1:8b


100%|██████████| 20/20 [01:00<00:00,  3.01s/it]



Querying model: Claude 3.5 Sonnet


100%|██████████| 20/20 [01:55<00:00,  5.78s/it]

Confidence distribution results have been saved to confidence_distribution.json





In [16]:
import json
import matplotlib.pyplot as plt
import numpy as np

def plot_median_confidence_vs_performance(confidence_file='confidence_distribution.json', performance_file='bootstrap_results.json'):
    # Load confidence distribution data
    with open(confidence_file, 'r') as f:
        confidence_data = json.load(f)
    
    # Load performance data
    with open(performance_file, 'r') as f:
        performance_data = json.load(f)
    
    # Calculate median confidence and IQR for each model
    median_confidences = {}
    confidence_iqrs = {}
    for model, ratings in confidence_data.items():
        median_confidences[model] = np.median(ratings)
        q1, q3 = np.percentile(ratings, [25, 75])
        confidence_iqrs[model] = q3 - q1
    
    # Extract performance (accuracy) for each model
    performances = {model: data['accuracy_mean'] for model, data in performance_data.items()}
    
    # Prepare data for plotting
    models = list(set(median_confidences.keys()) & set(performances.keys()))
    confidences = [median_confidences[model] for model in models]
    accuracies = [performances[model] for model in models]
    iqrs = [confidence_iqrs[model] for model in models]
    
    # Create the scatter plot
    fig, ax = plt.subplots(figsize=(12, 8))
    scatter = ax.scatter(confidences, accuracies, s=100, alpha=0.6)
    
    # Add error bars for IQR
    ax.errorbar(confidences, accuracies, xerr=np.array(iqrs)/2, fmt='none', ecolor='gray', alpha=0.5)
    
    # Add labels for each point
    for i, model in enumerate(models):
        ax.annotate(model, (confidences[i], accuracies[i]), textcoords="offset points", xytext=(0,5), ha='center')
    
    # Add a trend line
    z = np.polyfit(confidences, accuracies, 1)
    p = np.poly1d(z)
    ax.plot(confidences, p(confidences), "r--", alpha=0.8)
    
    # Calculate correlation coefficient
    correlation = np.corrcoef(confidences, accuracies)[0, 1]
    
    # Set labels and title
    ax.set_xlabel('Median Confidence')
    ax.set_ylabel('Accuracy (%)')
    ax.set_title(f'Median Confidence vs Performance\nCorrelation: {correlation:.2f}')
    
    # Add grid lines
    ax.grid(True, linestyle='--', alpha=0.7)
    
    # Add a legend explaining the error bars
    ax.plot([], [], 'k-', label='Confidence IQR')
    ax.legend()
    
    # Adjust layout and save
    plt.tight_layout()
    plt.savefig('median_confidence_vs_performance_with_iqr.png', dpi=300)
    plt.close()

    print("Median Confidence vs Performance plot with IQR has been saved as median_confidence_vs_performance_with_iqr.png")

# Example usage
plot_median_confidence_vs_performance()

Median Confidence vs Performance plot with IQR has been saved as median_confidence_vs_performance_with_iqr.png
