In [None]:
import re
import pandas as pd

In [None]:
def extract_option(answer):
    """Extract the chosen option letter (A-E, case-insensitive) from an LLM's answer.

    The value is converted with ``str()`` and stripped first, so non-string
    inputs are accepted (a float NaN becomes ``"nan"`` and yields ``None``).

    Rules, tried in order:

    1. A letter a-e/A-E at the very start of the text that is either the
       whole answer or is immediately followed by whitespace, ``.`` or ``)``
       -- e.g. ``"B"``, ``"b is right"``, ``"A. Paris"``, ``"A) Paris"``.
    2. Anywhere in the text: a letter in parentheses, e.g. ``"(a)"``, or a
       letter preceded by whitespace and followed by ``)``, e.g.
       ``"text d) more"`` (but not ``"textd)"``).

    Args:
        answer: The raw answer; any object with a meaningful ``str()``.

    Returns:
        The option as a single upper-case letter, or ``None`` when no rule
        matches.
    """
    text = str(answer).strip()

    # Rule 1: option letter at the start of the answer. ``re.match`` anchors at
    # position 0. ``)`` is accepted as a delimiter so that answers such as
    # "A) Paris" are recognised; the whitespace-preceded rule below cannot see
    # them because nothing precedes the letter.
    # NOTE(review): an answer that starts with the English word "A " is also
    # read as option A by this rule.
    leading = re.match(r'([a-eA-E])(?:\Z|[\s.)])', text)
    if leading:
        return leading.group(1).upper()

    # Rule 2: "(x)" anywhere, or " x)" where the letter is preceded by
    # whitespace so that a word ending in a-e before ")" is not mistaken for
    # an option.
    embedded = re.search(r'\(([a-eA-E])\)|\s([a-eA-E])\)', text)
    if embedded:
        # Exactly one of the two groups participated in the match.
        return (embedded.group(1) or embedded.group(2)).upper()

    return None

def evaluate_answers(file_path):
    """Score the zero-shot / one-shot answer columns of a results CSV.

    The CSV is read with ``pd.read_csv`` and must contain the columns
    ``question`` and ``real``. For every column whose name starts with
    ``zero_shot`` or ``one_shot`` the option is extracted with
    ``extract_option`` and compared with ``real``; the accuracy of each such
    column is printed.

    Args:
        file_path: Path of the CSV file to evaluate.

    Returns:
        A DataFrame holding ``question``, ``correct_answer`` and, for each
        evaluated column ``<col>``, the columns ``<col>_choice`` and
        ``<col>_is_correct``.
    """
    frame = pd.read_csv(file_path)
    frame['correct_answer'] = frame['real']

    kept = ['question', 'correct_answer']

    # Snapshot the answer columns first: the loop adds new columns to `frame`.
    answer_columns = [
        name for name in frame.columns
        if name.startswith(("zero_shot", "one_shot"))
    ]

    for name in answer_columns:
        choice_col = f'{name}_choice'
        verdict_col = f'{name}_is_correct'

        frame[choice_col] = frame[name].apply(extract_option)
        # Rows where no option could be extracted (None) compare unequal.
        frame[verdict_col] = frame[choice_col] == frame['correct_answer']

        print(f'Accuracy for {name}: {frame[verdict_col].mean():.2%}')
        kept += [choice_col, verdict_col]

    return frame[kept]

# Model identifiers; each one is substituted into a results file name below.
models = [
    "mistral",
    "llama3.1:8b",
    "llama2:7b",
    "gemma:7b",
    "gemma2:9b",
    "phi4:14b",
    "qwen2.5:7b",
    "medllama2:7b",
]

# Print the per-column accuracies for each model's results file; the returned
# DataFrame is not kept.
for model in models:
    print(f"Results for {model}")
    evaluate_answers(f"/home/cc/PHD/ragkg/results_quiz_v2_{model}.csv")

In [None]:
# Same evaluation as the earlier loop, but over each model's "baseline" results
# file; `models` is the list defined in an earlier cell. The returned DataFrame
# is discarded -- only the printed accuracies are used.
for model in models:
    print(f"Results for {model}")
    evaluate_answers(f"/home/cc/PHD/ragkg/results_quiz_baseline_v2_{model}.csv")

In [None]:
# Results for mistral
# Accuracy for zero_shot: 59.49%
# Accuracy for zero_shot_rag: 62.55%
# Results for llama3.1:8b
# Accuracy for zero_shot: 63.60%
# Accuracy for zero_shot_rag: 69.34%
# Results for llama2:7b
# Accuracy for zero_shot: 20.53%
# Accuracy for zero_shot_rag: 41.85%
# Results for gemma:7b
# Accuracy for zero_shot: 58.26%
# Accuracy for zero_shot_rag: 64.90%
# Results for gemma2:9b
# Accuracy for zero_shot: 66.81%
# Accuracy for zero_shot_rag: 71.00%
# Results for qwen2.5:7b
# Accuracy for zero_shot: 65.58%
# Accuracy for zero_shot_rag: 64.72%