In [None]:
import re
import pandas as pd
from collections import Counter
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Apply matplotlib's "ggplot" style sheet to every figure created after this point.
plt.style.use("ggplot")

In [None]:
def extract_option(answer):
    """Extract the chosen option letter (A-E) from an LLM's free-text answer.

    The answer is converted to ``str``, stripped, and cleaned of markdown bold
    markers (``**``). A fixed, ordered list of regex rules is then tried; the
    first rule that matches decides the result.

    Args:
        answer: The raw model output. Any object is accepted because it is
            passed through ``str()`` first (so ``NaN`` becomes ``"nan"``).

    Returns:
        The selected option as a single uppercase letter ``"A"``-``"E"``, or
        ``None`` when no rule matches.
    """
    # Strip a second time: removing "**" can expose padding ("** A **" -> " A ").
    answer = str(answer).strip().replace("**", "").strip()

    # Rules are ordered by precedence; do not reorder without re-checking results.
    rules = (
        # The whole answer is a single option letter.
        (r'\A([a-eA-E])\Z', 0),
        # Leading letter followed by whitespace ("B because ...").
        (r'^([a-eA-E])\s', 0),
        # Leading letter followed by a period ("A.", "b. text").
        (r'^([a-eA-E])\.', 0),
        # Letter in parentheses anywhere ("(a)"), or preceded by whitespace and
        # followed by ")" ("text d) more") -- but not glued to a word ("textd)").
        (r'\(([a-eA-E])\)|\s([a-eA-E])\)', 0),
        # Leading uppercase A-E immediately followed by another uppercase letter.
        (r'^([A-E])[A-Z]', 0),
        # Leading letter followed by any non-letter character ("a:", "C,").
        (r'^([a-eA-E])[^a-zA-Z]', 0),
        # Phrases such as "the correct answer is:a" / "The correct answer is (b)".
        (r'the correct answer is[:\s]*\(?([a-eA-E])\)?', re.IGNORECASE),
        # Leading letter of either case followed by an uppercase letter.
        (r'^([a-eA-E])[A-Z]', 0),
    )

    for pattern, flags in rules:
        match = re.search(pattern, answer, flags)
        if match:
            # Exactly one capture group participates in any successful match.
            letter = next(group for group in match.groups() if group is not None)
            # Always normalise to uppercase so the result compares equal to the
            # answer key regardless of which rule fired.
            return letter.upper()

    return None

def evaluate_answers(file_path, model):
    """Score every prompt-variant column of a results CSV against the answer key.

    The CSV is read with pandas; its ``real`` column is taken as the answer
    key. For each column whose name starts with ``zero_shot`` or ``one_shot``
    the raw answers are reduced to an option letter with ``extract_option``,
    compared to the key, and the mean of the matches is printed as a
    percentage.

    Args:
        file_path: Path of the results CSV.
        model: Label placed at the front of the returned list.

    Returns:
        ``[model, accuracy_1, accuracy_2, ...]`` with one accuracy per evaluated
        column, in the order the columns appear in the file.
    """
    df = pd.read_csv(file_path)
    df['correct_answer'] = df['real']

    # Snapshot the column names first: derived columns are added inside the loop.
    answer_columns = [name for name in df.columns if name.startswith(("zero_shot", "one_shot"))]

    results = [model]
    for col in answer_columns:
        choice_col = f'{col}_choice'
        flag_col = f'{col}_is_correct'

        df[choice_col] = df[col].apply(extract_option)
        df[flag_col] = df[choice_col] == df['correct_answer']

        accuracy = df[flag_col].mean()
        print(f'Accuracy for {col}: {accuracy:.2%}')
        results.append(accuracy)

    return results

# Model identifiers to evaluate; each one is interpolated into the names of the
# result CSV files read in the cells below.
models = ["mistral", "llama3.1:8b", "llama2:7b", "gemma:7b", "gemma2:9b", "qwen2.5:7b", "phi4:14b"]

In [None]:
# One accuracy row ([model, accuracy, ...]) per model for the main quiz results.
bench = [
    evaluate_answers(f"/home/cc/PHD/HealthBranches/results_quiz_{model}.csv", model)
    for model in models
]

# The same evaluation applied to the baseline result files.
baseline = [
    evaluate_answers(f"/home/cc/PHD/HealthBranches/results_quiz_baseline_{model}.csv", model)
    for model in models
]

In [None]:
# Print the per-column accuracies of the baseline result files, with a header
# line per model. The returned accuracy rows are discarded here.
for model in models:
    print(f"Results for {model}")
    evaluate_answers(f"/home/cc/PHD/HealthBranches/results_quiz_baseline_{model}.csv", model)

In [None]:
# Print the per-column accuracies of the main result files, with a header line
# per model. The returned accuracy rows are discarded here.
for model in models:
    print(f"Results for {model}")
    evaluate_answers(f"/home/cc/PHD/HealthBranches/results_quiz_{model}.csv", model)

In [None]:
def show_chart(bench, baseline, bar1, bar2, bar3, xl, yl, title):
    """Draw a grouped bar chart with three bars per label.

    Rows of ``bench`` are joined with ``baseline`` on their first element;
    bench rows whose label has no baseline entry are dropped. Groups are
    ordered by the first bench value, highest first. The merged rows are
    printed before plotting.

    Args:
        bench: List of lists ``[label, value1, value2, ...]``. ``value1`` and
            ``value2`` become the first and second bar of each group.
        baseline: List of sequences ``[label, value, ...]``; only index 1 is
            used, and it becomes the third bar of each group.
        bar1: Legend label of the first bar series.
        bar2: Legend label of the second bar series.
        bar3: Legend label of the third (baseline) bar series.
        xl: X-axis label.
        yl: Y-axis label.
        title: Chart title.
    """
    # Build a lookup from the baseline list for fast access by label.
    baseline_dict = {item[0]: item[1] for item in baseline}

    # Append the baseline value to every bench row that has one.
    merged_list = [item + [baseline_dict[item[0]]] for item in bench if item[0] in baseline_dict]

    # Sort by the first value (item[1]), highest first.
    merged_list.sort(key=lambda x: x[1], reverse=True)

    print(merged_list)

    # Split the rows into tick labels and the three bar series.
    labels = [x[0] for x in merged_list]
    values1 = [x[1] for x in merged_list]
    values2 = [x[2] for x in merged_list]
    # The baseline was appended last, so read it from the end of the row; a
    # fixed index would pick up a bench value when a row has more than two.
    values3 = [x[-1] for x in merged_list]

    # Spread the group centres out so neighbouring groups do not touch.
    x = np.arange(len(labels)) * 1.3
    width = 0.35  # width of a single bar

    # Use a larger-than-default figure.
    fig, ax = plt.subplots(figsize=(14, 8))
    ax.bar(x - width, values1, width, label=bar1)
    ax.bar(x, values2, width, label=bar2)
    ax.bar(x + width, values3, width, label=bar3)

    # Axis labels, title, ticks and legend.
    ax.set_xlabel(xl)
    ax.set_ylabel(yl)
    ax.set_title(title)
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.legend()

    # Render the figure.
    plt.show()

# Plot the first two accuracies of every bench row next to the baseline accuracy.
# NOTE(review): the "no RAG" / "RAG" legend labels assume the first two evaluated
# CSV columns are the no-RAG and RAG prompts, in that order — confirm against the files.
show_chart(bench, baseline, "no RAG", "RAG", "Baseline", "Models", "Accuracy", "Benchmark")

In [None]:
import pandas as pd
import ast
import random
import string
from collections import Counter

def balance_correct_answer(csv_file, output_file, seed=None):
    """Shuffle each question's options so correct answers spread evenly over A-E.

    Reads ``csv_file``, and for every row picks a new letter for the correct
    answer. The letter is chosen at random among those that have been assigned
    fewer than ``len(df) // 5`` times so far, or among all five once every
    letter has reached that quota. The options are shuffled and the correct
    answer is moved to the chosen position. The ``options`` and
    ``correct_option`` columns are overwritten and the frame is written to
    ``output_file`` without the index.

    Args:
        csv_file: Input CSV with an ``options`` column (a stringified list of
            option texts) and a ``correct_option`` column (a letter A-E).
        output_file: Path the rebalanced CSV is written to.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the result is reproducible; when ``None`` the global
            ``random`` module state is used.

    Returns:
        The rebalanced ``DataFrame``.

    Raises:
        ValueError: If a ``correct_option`` value is not one of A-E.
        IndexError: If a row has fewer options than the position of its
            correct letter.
    """
    df = pd.read_csv(csv_file)

    # Option letters A-E.
    option_letters = list(string.ascii_uppercase[:5])

    # Quota of correct answers per letter.
    target_count = len(df) // len(option_letters)

    # How many times each letter has been assigned so far.
    assigned_counts = {letter: 0 for letter in option_letters}

    rng = random if seed is None else random.Random(seed)

    new_options = []
    new_letters = []

    # Explicit loop instead of DataFrame.apply: the per-row step mutates the
    # counters and consumes random numbers, so it must run exactly once per row.
    for raw_options, correct_letter in zip(df['options'], df['correct_option']):
        # Rewrite the outer single quotes as double quotes before parsing.
        # NOTE(review): presumably a workaround for apostrophes inside option
        # texts, which would otherwise break literal_eval — confirm with the data.
        options = ast.literal_eval(raw_options.replace("['", '["').replace("']", '"]').replace("', '", '", "'))
        correct_answer = options[option_letters.index(correct_letter)]

        # Letters still under quota; fall back to all letters once none are left.
        available_letters = [letter for letter in option_letters if assigned_counts[letter] < target_count]
        new_correct_letter = rng.choice(available_letters) if available_letters else rng.choice(option_letters)
        new_correct_index = option_letters.index(new_correct_letter)

        # Shuffle everything, then move the correct answer to its new slot.
        rng.shuffle(options)
        options.remove(correct_answer)
        options.insert(new_correct_index, correct_answer)

        assigned_counts[new_correct_letter] += 1

        new_options.append(str(options))
        new_letters.append(new_correct_letter)

    df['options'] = new_options
    df['correct_option'] = new_letters

    df.to_csv(output_file, index=False)

    return df


# Load the question file and print how often each letter is the correct option,
# i.e. the distribution before any rebalancing.
df = pd.read_csv("/home/cc/PHD/HealthBranches/questions_pro/ultimate_questions_v3_full.csv")
print("Distribuzione prima:")
print(Counter(df["correct_option"]))

# # Run the rebalancing on the question file (currently disabled)
# bal_df = balance_correct_answer("/home/cc/PHD/HealthBranches/questions_pro/ultimate_questions_v3_full.csv", "/home/cc/PHD/HealthBranches/questions_pro/ultimate_questions_v3_full_balanced.csv")

# # df = balance_correct_options(df)
# print("Distribuzione dopo:")
# print(Counter(bal_df["correct_option"]))

# df.to_csv("/home/cc/PHD/HealthBranches/questions_pro/ultimate_questions_v3_full_balanced.csv", index=False)