In [None]:
import pandas as pd
import numpy as np
from RAGs.advanced_rag import AdvancedRAG, EmbeddingModelType, ChainType
from RAG_Database.prompts import PROMPT_PRESETS
from RAGs.basic_rag import answer_query as basic_answer_query
from sentence_transformers import SentenceTransformer, util
from datetime import datetime
import os
from collections import defaultdict
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import seaborn as sns
import json

In [None]:
def embedding_model_evaluate_answer(pred: str, gold: str, semantic_model) -> float:
    """Score a predicted answer by embedding similarity with the gold answer.

    Both texts are encoded with ``semantic_model.encode(..., convert_to_tensor=True)``
    (prediction first, then gold) and compared with ``util.cos_sim``.

    Args:
        pred: Answer text produced by the system under evaluation.
        gold: Reference answer text.
        semantic_model: Object exposing ``encode(text, convert_to_tensor=True)``.

    Returns:
        The cosine similarity between the two embeddings, unwrapped with ``.item()``.
    """
    pred_vec, gold_vec = (
        semantic_model.encode(text, convert_to_tensor=True) for text in (pred, gold)
    )
    return util.cos_sim(pred_vec, gold_vec).item()

def llm_evaluate_answer(pred: str, gold: str, llm, llm_prompt_list: list) -> float:
    """Score a predicted answer by asking an LLM judge, averaged over several prompts.

    Each template has its literal ``{gold_answer}`` and ``{predicted_answer}``
    markers substituted with ``str.replace`` (not ``str.format``, so other braces
    in the template or in the answers are left alone). The filled prompt is sent
    to ``llm.get_direct_llm_answer`` and the reply is parsed as a float.

    Args:
        pred: Answer text produced by the system under evaluation.
        gold: Reference answer text.
        llm: Object exposing ``get_direct_llm_answer(prompt)``.
        llm_prompt_list: Prompt templates containing the two markers.

    Returns:
        The mean of the parsed scores, or ``np.nan`` when the prompt list is
        empty or any reply cannot be parsed as a number.
    """
    # Nothing to average: report "no score" instead of dividing by zero.
    if not llm_prompt_list:
        return np.nan

    total = 0.0
    for template in llm_prompt_list:
        prompt = template.replace("{gold_answer}", gold).replace("{predicted_answer}", pred)
        reply = llm.get_direct_llm_answer(prompt)
        try:
            total += float(reply)
        except (TypeError, ValueError):
            # A single unparsable reply invalidates the whole score.
            return np.nan
    return total / len(llm_prompt_list)

def compute_accuracy(results_df, group_col):
    """Return the mean of the ``Correct`` column for each value of ``group_col``.

    Args:
        results_df: DataFrame holding ``group_col`` and a ``Correct`` column.
        group_col: Name of the column to group by.

    Returns:
        A two-column DataFrame labelled ``[group_col, "Accuracy"]``.
    """
    mean_per_group = results_df.groupby(group_col)["Correct"].mean()
    return mean_per_group.reset_index().set_axis([group_col, "Accuracy"], axis=1)

def evaluate_rag(rag_query_fn, questions_df, semantic_model, self_llm, llm_prompts_list, direct_llm=False):
    """Run every question through ``rag_query_fn`` and score the answers.

    For each row of ``questions_df`` (columns ``Question``, ``Answer``,
    ``Category`` and ``no``) the answer is generated, printed, and scored twice:
    with ``embedding_model_evaluate_answer`` and with ``llm_evaluate_answer``.
    Only the text before the first ``"Sources: "`` marker is printed and scored;
    the full prediction is what gets stored.

    Args:
        rag_query_fn: Callable taking the question string and returning the answer string.
        questions_df: Questions with their gold answers.
        semantic_model: Embedding model forwarded to the embedding scorer.
        self_llm: Judge LLM forwarded to the LLM scorer.
        llm_prompts_list: Judge prompt templates forwarded to the LLM scorer.
        direct_llm: When true, prefix each question with a hint naming the book,
            since no retrieved context is available.

    Returns:
        A DataFrame with one row per question: ``question_no``,
        ``Predicted Answer``, ``Category``, ``embeddings_models_score``, ``llm_score``.
    """
    book_hint = "Please answer based on the book '1984' written by George Orwell. "
    records = []

    for _, question_row in questions_df.iterrows():
        question = question_row["Question"]
        gold = question_row["Answer"]
        category = question_row["Category"]
        number = question_row["no"]

        prediction = rag_query_fn(book_hint + question if direct_llm else question)

        # Score only the answer itself, not the trailing list of sources.
        answer_text = prediction.split("Sources: ")[0]
        print(answer_text)

        records.append({
            "question_no": number,
            "Predicted Answer": prediction,
            "Category": category,
            "embeddings_models_score": embedding_model_evaluate_answer(answer_text, gold, semantic_model),
            "llm_score": llm_evaluate_answer(answer_text, gold, self_llm, llm_prompts_list),
        })

    return pd.DataFrame(records)

def create_specific_rag_eval_figure(results_path, figure_path):
    """Render one two-panel bar chart per JSON results file in ``results_path``.

    Each results file is expected to hold a list of records with the keys
    ``Category``, ``embeddings_models_score`` and ``llm_score``. For every
    category the individual scores are drawn as adjacent bars, followed by one
    bar for their mean. The top panel shows the embedding scores, the bottom
    panel the LLM scores. Scores equal to ``None`` (JSON ``null``) are ignored.

    Args:
        results_path: Directory containing the ``*.json`` result files.
            Entries that do not end in ``.json`` are skipped.
        figure_path: Directory receiving one PNG per result file; created if missing.
    """
    os.makedirs(figure_path, exist_ok=True)

    color_individual = "C0"
    color_mean = "C3"
    bar_width = 0.1
    panels = (
        ("embedding", "Embedding Scoring Method"),
        ("llm", "LLM Comparison Scoring Method"),
    )

    for file in sorted(os.listdir(results_path)):
        # Ignore stray entries (hidden files, sub-folders, ...) that json.load would choke on.
        if not file.endswith(".json"):
            continue

        with open(os.path.join(results_path, file), "r", encoding="utf-8") as f:
            data = json.load(f)

        # Group scores by category
        categories = defaultdict(lambda: {"embedding": [], "llm": []})
        for item in data:
            per_category = categories[item["Category"]]
            if item["embeddings_models_score"] is not None:
                per_category["embedding"].append(item["embeddings_models_score"])
            if item["llm_score"] is not None:
                per_category["llm"].append(item["llm_score"])

        cat_names = list(categories)
        x = np.arange(len(cat_names))

        fig, axes = plt.subplots(2, 1, figsize=(10, 10), sharex=True)
        try:
            for ax, (score_key, panel_title) in zip(axes, panels):
                for i, per_category in enumerate(categories.values()):
                    scores = per_category[score_key]
                    # A category without any valid score has no mean; draw nothing
                    # rather than a NaN bar labelled "nan".
                    if not scores:
                        continue

                    # The mean goes last so it is drawn as the right-most bar.
                    cat_scores = scores + [np.mean(scores)]
                    half_span = bar_width * (len(cat_scores) - 1) / 2
                    offsets = np.linspace(-half_span, half_span, len(cat_scores))
                    mean_index = len(cat_scores) - 1

                    for j, val in enumerate(cat_scores):
                        color = color_mean if j == mean_index else color_individual
                        ax.bar(x[i] + offsets[j], val, width=bar_width, color=color)
                        ax.text(
                            x[i] + offsets[j],
                            val + 0.02,
                            f"{val:.2f}",
                            ha='center',
                            va='bottom',
                            fontsize=8,
                            rotation=90
                        )

                ax.set_title(panel_title, fontsize=14)
                ax.set_ylabel("Score")
                ax.set_ylim(0, 1.1)
                ax.grid(axis="y", linestyle="--", alpha=0.6)

            # Add legend using proxy artists with correct colors
            ind_bar = plt.Rectangle((0, 0), 1, 1, color=color_individual)
            mean_bar = plt.Rectangle((0, 0), 1, 1, color=color_mean)
            axes[0].legend([ind_bar, mean_bar], ["Individual Scores", "Mean"], loc="upper right")

            # Common X labels
            axes[-1].set_xticks(x)
            axes[-1].set_xticklabels(cat_names, rotation=30, ha="right")

            stem = file.split('_eval_results.json')[0]
            if len(file) > 50:
                # Long names encode the run configuration: keep what follows "rag_"
                # and break the line before the chunk count so the title fits.
                # Computed outside the f-string: reusing the enclosing quote type
                # inside an f-string is a syntax error before Python 3.12.
                name_parts = stem.split("rag_")
                label = name_parts[1] if len(name_parts) > 1 else stem
                title = f"RAG evaluation results for \n{label}"
                title = "\n_nb_chunks".join(title.split("_nbChunks"))
            else:
                title = file
            fig.suptitle(title)
            fig.savefig(f"{figure_path}/{stem}.png")
        finally:
            # Release the figure even if plotting failed, so a long batch does
            # not accumulate open figures.
            plt.close("all")

def plot_subplots(data: pd.DataFrame, group_label: str, mode: str, save_path: str, params=None):
    """Create a multi-subplot figure: vertical boxplots of ``llm_score``, one per parameter.

    Args:
        data: Results to plot; must contain ``llm_score`` and one column per parameter.
        group_label: Label of the plotted subset, shown in the figure title.
        mode: Kind of grouping (e.g. ``"category"``, ``"question"``); capitalised in the title.
        save_path: Destination file of the figure.
        params: Column names to draw, one subplot each. When omitted, the
            notebook-level ``parameters`` list is used, as before.

    Raises:
        ValueError: If there is no parameter to plot.
    """
    # Keep the historical behaviour of reading the notebook global by default.
    if params is None:
        params = parameters
    params = list(params)
    if not params:
        raise ValueError("plot_subplots needs at least one parameter to plot")

    n_cols = 3
    n_rows = (len(params) + n_cols - 1) // n_cols  # ceiling division
    # squeeze=False always yields a 2-D array, whatever the grid shape.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, param in zip(axes, params):
        sns.boxplot(
            data=data,
            x=param,
            y="llm_score",
            hue=param,
            ax=ax,
            palette="Set2"
        )
        ax.set_title(param)
        ax.set_xlabel(param)
        ax.set_ylabel("LLM Score")

    # Hide the unused cells of the grid.
    for ax in axes[len(params):]:
        ax.axis("off")

    fig.suptitle(f"LLM Score Distributions — {mode.capitalize()}: {group_label}", fontsize=14, y=1.02)
    fig.tight_layout()
    fig.savefig(save_path, bbox_inches="tight")
    plt.close(fig)

General constants

In [None]:
# Project root, read from the environment (optionally populated from a .env file).
load_dotenv()
PWD = os.environ.get("PROJECT_WORKING_DIRECTORY")
if not PWD:
    # Without this guard every path below silently becomes "None/evaluation/...".
    raise RuntimeError(
        "PROJECT_WORKING_DIRECTORY is not set; define it in the environment or in a .env file."
    )

# LLM used as the judge when scoring answers (passed to evaluate_rag as self_llm).
self_llm = AdvancedRAG()
# Sentence-embedding model used for the cosine-similarity score.
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
# Test questions; evaluate_rag reads the columns "Question", "Answer", "Category" and "no".
questions_df = pd.read_json(f"{PWD}/evaluation/1984_test_questions.json")

# Judge prompts for llm_evaluate_answer: each template is filled by replacing the
# literal "{gold_answer}" and "{predicted_answer}" markers, and the numeric replies
# to all templates are averaged into a single score per answer.
# NOTE(review): the prompt wording contains typos ("additionnary", "aôf", "th score",
# the letter "O" where the digit 0 seems intended). They are left untouched here
# because editing the text changes what the judge LLM receives — confirm before fixing.
llm_prompts_list = [
    "Please score the level of correctness of the predicted answer based on the gold answer. You should give a high score to an answer that contains at least the elements contained in the gold answer. If a few additionnary elements are present in the predicted answer, you can still give a high score for the answer. If elements of the predicted answer are contradicting the gold answer, the final score should be low. 1 is a perfect score (all elements aôf the gold answer a in the predicted answer). O is a very low score (None of the elements of the gold answer are in the predicted answer).\n Gold answer : {gold_answer}. Predicted answer : {predicted_answer}. Please only give your score with no comments, explanations, side notes or post scriptum. Please do not write anything else than th score. (for example: '0.7').",

    "Please score the level of correctness of the predicted answer based on the gold answer. You should give a high score to an answer that contains at least the elements contained in the gold answer. Do not penalize additionnary elements that are present in the answer but not in the answer. If elements of the predicted answer are contradicting the gold answer, the final score should be low. 1 is a perfect score (all elements of the gold answer a in the predicted answer). O is a very low score (None of the elements of the gold answer are in the predicted answer).\n Gold answer : {gold_answer}. Predicted answer : {predicted_answer}. Please only give your score with no comments, explanations, side notes or post scriptum. Please do not write anything else than th score. (for example: '0.7')."
]

Evaluate basic RAG

In [None]:
# Create the output folder up front: discovering that it is missing only after
# the whole evaluation has run would throw the results away.
os.makedirs(f"{PWD}/evaluation/basicRAG_eval_results", exist_ok=True)

results_df = evaluate_rag(
    rag_query_fn=basic_answer_query,
    questions_df=questions_df.copy(),
    semantic_model=semantic_model,
    self_llm=self_llm,
    llm_prompts_list=llm_prompts_list,
)

results_df.to_json(f"{PWD}/evaluation/basicRAG_eval_results/basic_rag_eval_results.json", orient="records", indent=2)

In [None]:
# One figure per results file of the basic RAG run.
create_specific_rag_eval_figure(
    results_path=f"{PWD}/evaluation/basicRAG_eval_results",
    figure_path=f"{PWD}/evaluation/basicRAG_eval_results_figures",
)

Evaluate direct LLM

In [None]:
# Create the output folder up front: discovering that it is missing only after
# the whole evaluation has run would throw the results away.
os.makedirs(f"{PWD}/evaluation/directLLM_eval_results", exist_ok=True)

# Baseline without retrieval: direct_llm=True makes evaluate_rag prefix each
# question with a hint naming the book.
results_df = evaluate_rag(
    rag_query_fn=AdvancedRAG().get_direct_llm_answer,
    questions_df=questions_df.copy(),
    semantic_model=semantic_model,
    self_llm=self_llm,
    llm_prompts_list=llm_prompts_list,
    direct_llm=True,
)

results_df.to_json(f"{PWD}/evaluation/directLLM_eval_results/directLLM_eval_results.json", orient="records", indent=2)

In [None]:
# One figure per results file of the direct-LLM baseline.
create_specific_rag_eval_figure(
    results_path=f"{PWD}/evaluation/directLLM_eval_results",
    figure_path=f"{PWD}/evaluation/directLLM_eval_results_figures",
)

Evaluate Advanced RAG with various parameters

In [None]:
import itertools

# Create the output folder before the sweep: otherwise every to_json call fails,
# the broad except below swallows the error, and the whole run is lost.
os.makedirs(f"{PWD}/evaluation/advancedRAG_eval_results", exist_ok=True)

# Full grid over the AdvancedRAG settings. itertools.product varies the last
# list fastest, i.e. the same visiting order as six nested for-loops.
for temperature, embedding_model, chain_type, compression, nb_chunks, role_name in itertools.product(
    [0.1, 0.2, 0.8, 1.2],
    [EmbeddingModelType.HuggingFace, EmbeddingModelType.Ollama],
    [ChainType.STUFF, ChainType.REFINE, ChainType.MAP_REDUCE],
    [True, False],
    [1, 3, 5],
    ["default", "academic", "debate", "psychology"],
):
    print("---------------------------------------")
    print(datetime.now())
    print(temperature, embedding_model.value, chain_type.value, compression, nb_chunks, role_name)
    prompt = PROMPT_PRESETS[role_name]

    Advanced_rag = AdvancedRAG(compression=compression,
                               embedding_model_type=embedding_model,
                               chain_type=chain_type,
                               nb_chunks=nb_chunks,
                               prompt=prompt,
                               llm_temperature=temperature)
    try:
        results_df = evaluate_rag(rag_query_fn=Advanced_rag.answer_query,
                                  questions_df=questions_df.copy(),
                                  semantic_model=semantic_model,
                                  self_llm=self_llm,
                                  llm_prompts_list=llm_prompts_list)

        # Tag every row with the configuration that produced it.
        results_df["compression"] = compression
        results_df["embedding_model"] = embedding_model.value
        results_df["chain_type"] = chain_type.value
        results_df["nb_chunks"] = nb_chunks
        results_df["role_name"] = role_name
        results_df["temperature"] = temperature

        # NOTE: "_role{role_name}" has no underscore after "role"; kept as-is so
        # the file names stay identical to those written by earlier runs.
        results_df.to_json(f"{PWD}/evaluation/advancedRAG_eval_results/advanced_rag_compression_{str(compression)}_embModel_{embedding_model.value}_chainType_{chain_type.value}_nbChunks_{str(nb_chunks)}_role{role_name}_LLMtemp_{str(temperature)}_eval_results.json", orient="records", indent=2)
    except Exception as e:
        # Best effort: one failing configuration must not abort the whole sweep,
        # but say which one failed and with what kind of error.
        print(
            "Evaluation failed for "
            f"temperature={temperature}, embedding_model={embedding_model.value}, "
            f"chain_type={chain_type.value}, compression={compression}, "
            f"nb_chunks={nb_chunks}, role_name={role_name}: {e!r}"
        )

Results visualization

In [None]:
# One figure per results file of the advanced RAG parameter sweep.
create_specific_rag_eval_figure(
    results_path=f"{PWD}/evaluation/advancedRAG_eval_results",
    figure_path=f"{PWD}/evaluation/advancedRAG_eval_results_figures",
)

In [None]:
base_path = f"{PWD}/evaluation/advancedRAG_eval_results"
# Sorted so that the row order (and hence the box order in the plots) does not
# depend on the arbitrary order returned by os.listdir.
files = sorted(os.listdir(base_path))

# Configuration columns compared in the figures; plot_subplots reads this list.
parameters = ["compression", "embedding_model", "chain_type", "nb_chunks", "role_name"]

# Load and merge all result files of the temperature-0.1 runs.
dfs = []
for f in files:
    # The trailing underscore keeps a value such as 0.15 from matching as well.
    if "LLMtemp_0.1_" not in f:
        continue
    with open(os.path.join(base_path, f)) as fp:
        try:
            df = pd.DataFrame(json.load(fp))
        except (ValueError, TypeError) as e:
            # Malformed JSON or an unexpected structure: skip the file, keep going.
            print(f"Skipping {f}: {e}")
            continue
    df["source_file"] = f
    dfs.append(df)

if not dfs:
    raise ValueError(f"No readable 'LLMtemp_0.1' result file found in {base_path}")
all_results = pd.concat(dfs, ignore_index=True)

# Drop rows that cannot be plotted. The copy makes the column assignment below
# act on an independent frame.
all_results = all_results.dropna(subset=["llm_score", "Category", "question_no"]).copy()
# Cast the parameters to strings so they are treated as categories.
for col in parameters:
    all_results[col] = all_results[col].astype(str)

folders = {
    "parameter": f"{PWD}/evaluation/advancedRAG_per_parameter_figures",
    "category": f"{PWD}/evaluation/advancedRAG_per_question_category_figures",
    "question": f"{PWD}/evaluation/advancedRAG_per_single_question_figures",
}
for p in folders.values():
    os.makedirs(p, exist_ok=True)


# Per parameter figures: score distribution per question category, split by the parameter
for param in parameters:
    plt.figure(figsize=(10, 6))
    sns.boxplot(
        data=all_results,
        x="Category",
        y="llm_score",
        hue=param,
        palette="Set2"
    )
    plt.title(f"LLM Score by Question Category and {param}")
    plt.xticks(rotation=45, ha="right")
    plt.xlabel("Question Category")
    plt.ylabel("LLM Score")
    plt.legend(title=param, bbox_to_anchor=(1.05, 1), loc="upper left")
    plt.tight_layout()
    plt.savefig(folders["parameter"] + f"/boxplot_{param}.png", bbox_inches="tight")
    plt.close()

# Per question category figures
for cat in all_results["Category"].unique():
    subset = all_results[all_results["Category"] == cat]
    save_path = os.path.join(folders["category"], f"subplots_category_{cat.replace(' ', '_')}.png")
    plot_subplots(subset, group_label=cat, mode="category", save_path=save_path)

# Per single question figures
for q in sorted(all_results["question_no"].unique()):
    subset = all_results[all_results["question_no"] == q]
    cat = subset["Category"].iloc[0] if len(subset) > 0 else "Unknown"
    save_path = os.path.join(folders["question"], f"subplots_question_{q}.png")
    plot_subplots(subset, group_label=f"Q{q} ({cat})", mode="question", save_path=save_path)