In [None]:
import re
import pandas as pd
from collections import Counter
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from classes.utils import check_options, extract_option

In [None]:
def evaluate_answers(file_path, model):
    """Compute the accuracy of every prompt-strategy column in a results CSV.

    The CSV is expected to contain a ``real`` column with the ground-truth
    option and one or more answer columns whose names start with
    ``zero_shot`` or ``one_shot``. For each such column the chosen option is
    extracted with ``extract_option`` and compared against the ground truth.

    Args:
        file_path: Path of the results CSV to evaluate.
        model: Label placed at the front of the returned list.

    Returns:
        A list ``[model, accuracy_1, accuracy_2, ...]`` with one accuracy per
        evaluated column, in the column order of the CSV.
    """
    results = pd.read_csv(file_path)

    # Optional filter (currently disabled): drop questions listed in a
    # separate "to check" file before scoring.
    # to_remove = pd.read_csv("/home/cc/PHD/HealthBranches/questions_to_check.csv")
    # results = results[~results["question"].isin(to_remove["question"])]

    results['correct_answer'] = results['real']

    # Snapshot the answer columns up front: the loop adds helper columns that
    # share the same prefixes and must not be scored themselves.
    answer_columns = [
        name for name in results.columns
        if name.startswith(("zero_shot", "one_shot"))
    ]

    summary = [model]
    for name in answer_columns:
        choice_col = f'{name}_choice'
        correct_col = f'{name}_is_correct'

        results[choice_col] = results[name].apply(extract_option)
        results[correct_col] = results[choice_col] == results['correct_answer']

        accuracy = results[correct_col].mean()
        print(f'Accuracy for {name}: {accuracy:.2%}')
        summary.append(accuracy)

    return summary

In [None]:
# Quick sanity check of the imported extract_option helper on a sample answer
# in which the option letter is immediately followed by more text ("A.Given").
extract_option("The correct answer is A.Given the presence")

In [None]:
# Run check_options on the Mistral baseline results and keep only its second
# return value (presumably the answers from which no option could be
# extracted; confirm against classes.utils).
_, nonelist = check_options(pd.read_csv(f"/home/cc/PHD/HealthBranches/results/results_quiz_baseline_mistral.csv"))
print(len(nonelist))

# Bare expression so the notebook displays the collection itself.
nonelist

In [None]:
# Model identifiers; each one is interpolated into the results file names below.
models = ["mistral", "llama3.1:8b", "llama2:7b", "gemma:7b", "gemma2:9b", "qwen2.5:7b", "phi4:14b", "Llama-3.3-70B-Instruct-Turbo-Free"]

# One row per model, shaped [model, accuracy per evaluated column, ...]:
# `bench` reads results_quiz_<model>.csv, `baseline` reads results_quiz_baseline_<model>.csv.
bench = [evaluate_answers(f"/home/cc/PHD/HealthBranches/results/results_quiz_{model}.csv", model) for model in models]
baseline = [evaluate_answers(f"/home/cc/PHD/HealthBranches/results/results_quiz_baseline_{model}.csv", model) for model in models]

In [None]:
# One-off evaluation of a single baseline results file.
# NOTE(review): the label "405-B" does not match the file name, which refers to
# Llama-3.3-70B; confirm which one is intended.
evaluate_answers(f"/home/cc/PHD/HealthBranches/results/results_quiz_baseline_Llama-3.3-70B-Instruct-Turbo-Free.csv", "405-B")

In [None]:
# Print the per-column accuracies of every model's baseline results file.
# The return value of evaluate_answers is discarded; only its printed output is used.
print("### BASELINE ###")
for model in models:
    print(f"Results for {model}")
    evaluate_answers(f"/home/cc/PHD/HealthBranches/results/results_quiz_baseline_{model}.csv", model)

In [None]:
# Print the per-column accuracies of every model's benchmark results file.
# The return value of evaluate_answers is discarded; only its printed output is used.
print("### BENCH ###")
for model in models:
    print(f"Results for {model}")
    evaluate_answers(f"/home/cc/PHD/HealthBranches/results/results_quiz_{model}.csv", model)

### BENCH ###
Results for mistral
Accuracy for zero_shot: 54.63%
Accuracy for zero_shot_rag: 60.10%
Results for llama3.1:8b
Accuracy for zero_shot: 59.32%
Accuracy for zero_shot_rag: 66.20%
Results for llama2:7b
Accuracy for zero_shot: 18.91%
Accuracy for zero_shot_rag: 43.79%
Results for gemma:7b
Accuracy for zero_shot: 57.16%
Accuracy for zero_shot_rag: 64.11%
Results for gemma2:9b
Accuracy for zero_shot: 63.45%
Accuracy for zero_shot_rag: 67.95%
Results for qwen2.5:7b
Accuracy for zero_shot: 62.31%
Accuracy for zero_shot_rag: 62.41%
Results for phi4:14b
Accuracy for zero_shot: 60.29%
Accuracy for zero_shot_rag: 65.32%

### BENCH (remove questions) ###
Results for mistral
Accuracy for zero_shot: 67.36%
Accuracy for zero_shot_rag: 73.71%
Results for llama3.1:8b
Accuracy for zero_shot: 76.84%
Accuracy for zero_shot_rag: 81.91%
Results for llama2:7b
Accuracy for zero_shot: 20.83%
Accuracy for zero_shot_rag: 50.48%
Results for gemma:7b
Accuracy for zero_shot: 69.18%
Accuracy for zero_shot_rag: 75.94%
Results for gemma2:9b
Accuracy for zero_shot: 80.99%
Accuracy for zero_shot_rag: 83.56%
Results for qwen2.5:7b
Accuracy for zero_shot: 79.55%
Accuracy for zero_shot_rag: 78.07%
Results for phi4:14b
Accuracy for zero_shot: 79.68%
Accuracy for zero_shot_rag: 82.05%

In [None]:
def show_chart(bench, baseline, bar1, bar2, bar3, xl, yl, title):
    """Draw a grouped bar chart with three bars per model.

    Args:
        bench: List of rows ``[model, value_1, value_2]``.
        baseline: List of rows whose first element is the model name and
            whose second element is the baseline value.
        bar1: Legend label for the first bar (``value_1`` of ``bench``).
        bar2: Legend label for the second bar (``value_2`` of ``bench``).
        bar3: Legend label for the third bar (the baseline value).
        xl: X-axis label.
        yl: Y-axis label.
        title: Chart title.

    Models present in ``bench`` but missing from ``baseline`` are skipped.
    Groups are ordered by ``value_1`` in descending order. The merged rows
    are printed before the figure is shown.
    """
    # Index the baseline values by model name for quick lookup.
    baseline_by_model = {}
    for entry in baseline:
        baseline_by_model[entry[0]] = entry[1]

    # Append the baseline value to each bench row that has one.
    merged_list = []
    for entry in bench:
        name = entry[0]
        if name in baseline_by_model:
            merged_list.append(entry + [baseline_by_model[name]])

    # Highest first value on the left.
    merged_list = sorted(merged_list, key=lambda entry: entry[1], reverse=True)

    print(merged_list)

    labels = [entry[0] for entry in merged_list]
    series = [[entry[position] for entry in merged_list] for position in (1, 2, 3)]

    # Spread the groups out (factor 1.3) so neighbouring groups do not touch.
    width = 0.35
    centers = np.arange(len(labels)) * 1.3

    # Larger-than-default figure for readability.
    fig, ax = plt.subplots(figsize=(14, 8))
    for offset, heights, legend_label in zip((-width, 0.0, width), series, (bar1, bar2, bar3)):
        ax.bar(centers + offset, heights, width, label=legend_label)

    # Axis labels, title, tick labels and legend.
    ax.set_xlabel(xl)
    ax.set_ylabel(yl)
    ax.set_title(title)
    ax.set_xticks(centers)
    ax.set_xticklabels(labels)
    ax.legend()

    plt.show()

# Plot the rows in `bench` next to the matching value from `baseline`, one group per model.
show_chart(bench, baseline, "no RAG", "RAG", "Baseline", "Models", "Accuracy", "Benchmark")

In [None]:
import pandas as pd
import ast
import random
import string
from collections import Counter

def balance_correct_answer(csv_file, output_file, seed=None):
    """Rebalance which option letter holds the correct answer and save the result.

    Reads ``csv_file``, which must have an ``options`` column (a stringified
    list of answer texts) and a ``correct_option`` column (a letter A-E).
    For every row the options are shuffled and the correct answer is moved to
    a letter that has not yet reached its share of ``len(df) // 5`` rows.
    Once all candidate letters are full, a letter is picked at random.

    Args:
        csv_file: Path of the input CSV.
        output_file: Path the rebalanced CSV is written to (without index).
        seed: Optional seed for a private random generator, making the result
            reproducible. With the default ``None`` the module-level
            ``random`` state is used, as before.

    Returns:
        The rebalanced DataFrame, with ``options`` and ``correct_option``
        replaced in place.

    Raises:
        ValueError: If a ``correct_option`` value is not one of A-E.
        IndexError: If ``correct_option`` points past the end of a row's options.
    """
    df = pd.read_csv(csv_file)

    # Letters for the option slots (A, B, C, D, E).
    option_letters = list(string.ascii_uppercase[:5])

    # Number of rows each letter should receive as the correct answer.
    target_count = len(df) // len(option_letters)

    # Running count of how many rows have been assigned to each letter.
    assigned_counts = {letter: 0 for letter in option_letters}

    # A private generator keeps seeded runs reproducible without touching the
    # global random state; unseeded calls keep using the global module.
    rng = random if seed is None else random.Random(seed)

    new_options = []
    new_letters = []

    # An explicit loop is used because the per-row logic updates shared
    # counters, so each row must be processed exactly once and in order.
    for raw_options, correct_letter in zip(df['options'], df['correct_option']):
        # Rewrite the single-quote list delimiters as double quotes before
        # parsing (presumably so option texts containing apostrophes parse).
        options = ast.literal_eval(
            raw_options.replace("['", '["').replace("']", '"]').replace("', '", '", "')
        )
        correct_answer = options[option_letters.index(correct_letter)]

        # Only letters that map to an existing slot of this row are eligible;
        # otherwise the recorded letter could point past the end of the list.
        row_letters = option_letters[:len(options)]
        available_letters = [
            letter for letter in row_letters
            if assigned_counts[letter] < target_count
        ]

        # Prefer letters still under their share; otherwise pick any valid one.
        new_correct_letter = rng.choice(available_letters) if available_letters else rng.choice(row_letters)
        new_correct_index = option_letters.index(new_correct_letter)

        # Shuffle the options, then move the correct answer to its new slot.
        rng.shuffle(options)
        options.remove(correct_answer)
        options.insert(new_correct_index, correct_answer)

        assigned_counts[new_correct_letter] += 1
        new_options.append(str(options))
        new_letters.append(new_correct_letter)

    df['options'] = new_options
    df['correct_option'] = new_letters

    df.to_csv(output_file, index=False)

    return df


# Show the current distribution of correct-answer letters in the full question set.
df = pd.read_csv("/home/cc/PHD/HealthBranches/questions_pro/ultimate_questions_v3_full.csv")
print("Distribuzione prima:")
print(Counter(df["correct_option"]))

# The rebalancing run below is currently disabled.
# # Run the function on a sample file
# bal_df = balance_correct_answer("/home/cc/PHD/HealthBranches/questions_pro/ultimate_questions_v3_full.csv", "/home/cc/PHD/HealthBranches/questions_pro/ultimate_questions_v3_full_balanced.csv")

# # df = balance_correct_options(df)
# print("Distribuzione dopo:")
# print(Counter(bal_df["correct_option"]))

# df.to_csv("/home/cc/PHD/HealthBranches/questions_pro/ultimate_questions_v3_full_balanced.csv", index=False)