In [None]:
from groq import Groq
from llama_index.core import Settings, ServiceContext, StorageContext, SimpleDirectoryReader, PropertyGraphIndex
from llama_index.llms.groq import Groq as Groq_llamaindex
from llama_index.llms.openai import OpenAI
from llama_index.llms.replicate import Replicate
#from llama_index.llms.ollama import Ollama as Ollama_llamaindex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.graph_stores.neo4j import Neo4jGraphStore, Neo4jPropertyGraphStore
#from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import VectorStoreIndex
from llama_index.core.agent import ReActAgent, FunctionCallingAgentWorker, AgentRunner
from llama_index.core.tools import BaseTool, FunctionTool
from milvus import default_server
from dotenv import load_dotenv
import json
import os
import numpy as np
import nest_asyncio

nest_asyncio.apply()


# Retrieve API keys and credentials securely
GROQ_API_KEY = os.getenv('GROQ_API_KEY')
OPEN_AI_API_KEY = os.getenv('OPEN_AI_API_KEY')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_URL = os.getenv('NEO4J_URL', 'bolt://localhost:7687')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE', 'neo4j')
REPLICATE_API_KEY = os.getenv('REPLICATE_API_KEY')
os.environ["REPLICATE_API_TOKEN"] = REPLICATE_API_KEY


#Initialize the Replicate class
llm = Replicate(
    model="meta/meta-llama-3-70b-instruct",
    api_key=REPLICATE_API_KEY,
    
)



Settings.llm = llm
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)


ServiceContext.llm = llm

import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

username = NEO4J_USERNAME
password = NEO4J_PASSWORD
url = NEO4J_URL
database = NEO4J_DATABASE

property_graph_store = Neo4jPropertyGraphStore(
    username=username,
    password=password,
    url=url,
    database=database,
)
storage_context = StorageContext.from_defaults(property_graph_store=property_graph_store)


index = PropertyGraphIndex.from_existing(
    property_graph_store=property_graph_store,
    llm=llm,
    embed_model=Settings.embed_model,
)



INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5
Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5
INFO:sentence_transformers.SentenceTransformer:2 prompts are loaded, with the keys: ['query', 'text']
2 prompts are loaded, with the keys: ['query', 'text']
INFO:neo4j.notifications:Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Schema.IndexOrConstraintAlreadyExists} {category: SCHEMA} {title: `CREATE CONSTRAINT IF NOT EXISTS FOR (e:__Node__) REQUIRE (e.id) IS UNIQUE` has no effect.} {description: `CONSTRAINT constraint_ec67c859 FOR (e:__Node__) REQUIRE (e.id) IS UNIQUE` already exists.} {position: None} for query: 'CREATE CONSTRAINT IF NOT EXISTS FOR (n:`__Node__`)\n            REQUIRE n.id IS UNIQUE;'
Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Schema.IndexOrConstraintAlreadyExists} {category: SCHEMA} {title: `CREATE CONSTRA

In [None]:

#Initialize the Replicate class
llm = Replicate(
    model="meta/meta-llama-3-70b-instruct"
    #model="meta/meta-llama-3.1-405b-instruct"
)#l

llm.complete('hi')
#llmr.complete('hi')

INFO:httpx:HTTP Request: POST https://api.replicate.com/v1/models/meta/meta-llama-3-70b-instruct/predictions "HTTP/1.1 201 Created"
HTTP Request: POST https://api.replicate.com/v1/models/meta/meta-llama-3-70b-instruct/predictions "HTTP/1.1 201 Created"
HTTP Request: POST https://api.replicate.com/v1/models/meta/meta-llama-3-70b-instruct/predictions "HTTP/1.1 201 Created"


CompletionResponse(text="\n\nHi! It's nice to meet you. Is there something I can help you with or would you like to chat?", additional_kwargs={}, raw=None, logprobs=None, delta=None)

In [6]:

from llama_index.core.indices.property_graph import VectorContextRetriever
similarity_top_k = 4    
path_depth = 3
vector_retriever = VectorContextRetriever(
    index.property_graph_store,
    # only needed when the graph store doesn't support vector queries
    # vector_store=index.vector_store,
    embed_model=Settings.embed_model,
    # include source chunk text with retrieved paths
    include_text=True,
    # the number of nodes to fetch
    similarity_top_k=similarity_top_k,
    # the depth of relations to follow after node retrieval
    path_depth=path_depth,
    
  
)

#retriever = index.as_retriever(sub_retrievers=[vector_retriever])

index_query_engine = index.as_query_engine(sub_retrievers=[vector_retriever])
index_retriever = index.as_retriever(sub_retrievers=[vector_retriever])



In [8]:
import pandas as pd
import nest_asyncio
import time

from pathlib import Path

# Apply asyncio patch for Jupyter notebooks
# Read questions from file selected_questions_set_2.csv
questions = pd.read_csv("oia_files_and_questions.csv")["Question"].tolist()
# Load the already completed answers from temp_answers.csv
# try to rad temp_answers.csv if it does not exist create a blank df with column 'Answer'
try:
    df_answers = pd.read_csv("temp_answers.csv")
except FileNotFoundError:
    df_answers = pd.DataFrame(columns=["Question", "Answer"])

# Ensure the answers list matches the length of the questions list
answers = df_answers["Answer"].tolist() + [None] * (len(questions) - len(df_answers))

# Loop over all questions where the answer is None or 'none'
for idx, (question, answer) in enumerate(zip(questions, answers)):
    if answer is None or answer.strip().lower() == 'none':
        print(f"Processing question {idx}")
        print(question)

        try:
            response = index_query_engine.query(question)

            answers[idx] = response.response

            # Create a new row as a DataFrame
            new_row = pd.DataFrame({'Question': [question], 'Answer': [response.response]})
            # Concatenate the new row to the existing DataFrame
            df_answers = pd.concat([df_answers, new_row], ignore_index=True)
            # Save the updated dataframe to 'temp_answers.csv'
            df_answers.to_csv("temp_answers.csv", index=False)
            
        except Exception as e:
            answers[idx] = None
            print(f"Error: {e}")
            
        print("\n#######################\n")
        time.sleep(0)  # Pause for 20 seconds






The chunk bellow is not used any more other that to format the answer the way we need them

In [17]:
import pandas as pd

# Load the questions and answers DataFrames
questions = pd.read_csv("oia_files_and_questions.csv")  # Contains correct answers in 'Answer' column
answers = pd.read_csv("temp_answers.csv")  # Contains model-generated answers

# Ensure that the model-generated answers are in a column named 'Model Answer'
if 'Answer' in answers.columns:
    answers.rename(columns={'Answer': 'Model Answer'}, inplace=True)

# Add the 'Model Answer' from the answers DataFrame to the questions DataFrame
questions["Model Answer"] = answers["Model Answer"]

# Now, we will compare the correct answers in questions['Answer'] with the model's answers in questions['Model Answer']
evaluation_results = []

for idx, row in questions.iterrows():
    # Create a comparison prompt
    prompt = f"""
    The question is: {row['Question']}
    The correct answer is: {row['Answer']}
    The model's answer is: {row['Model Answer']}
    
    Based on the model's answer, respond with one of the following:
    - correct
    - wrong
    - did not find

    do not use any other words in your response
    """
    
    # Call the language model to complete the prompt
    evaluation = "correct"

    # Append the result to the evaluation_results list
    #evaluation_results.append(evaluation.text.strip())  # Strip any whitespace
    evaluation_results.append(evaluation)

    # Print or store the evaluation result
    #print(f"Question {idx+1}: {evaluation.text.strip()}")

# Optionally, add the evaluation results as a new column in the questions DataFrame
questions["Evaluation"] = evaluation_results

# Save the updated DataFrame with the evaluation results
questions.to_csv("evaluated_answers.csv", index=False)


# make directoyr if it does not exist
Path("results_temp").mkdir(parents=True, exist_ok=True)
questions.to_csv(f"results_temp/evaluated_unsorted_k{similarity_top_k}_d{path_depth}.csv", index=False)

In [19]:
#!pip install nltk rouge-score datasets
!pip install evaluate


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Using cached evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Using cached dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Using cached xxhash-3.5.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Using cached multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting pyarrow>=15.0.0 (from datasets>=2.0.0->evaluate)
  Downloading pyarrow-18.0.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting dill (from evaluate)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Using cached multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Using cached evaluate-0.4.3-py3-none-any.whl (84 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
Using cached dill-0.3.8-py3-none-any.wh

In [18]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/felipenavarro/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/felipenavarro/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
import os
import re
import pandas as pd
import nltk
import evaluate
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
from nltk.translate.bleu_score import sentence_bleu
from tqdm import tqdm  # For progress bars

# Ensure that NLTK has the necessary package for BLEU
nltk.download('punkt')

# Load the BGE model and tokenizer
MODEL_NAME = "BAAI/bge-small-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
bge_model = AutoModel.from_pretrained(MODEL_NAME)
bge_model.eval()  # Set model to evaluation mode

# Function to generate embeddings using BGE with caching
class EmbeddingCache:
    def __init__(self, tokenizer, model, device='cpu', batch_size=32):
        self.tokenizer = tokenizer
        self.model = model
        self.device = device
        self.batch_size = batch_size
        self.cache = {}
    
    def get_embeddings(self, texts):
        """
        Generates embeddings for a list of texts with caching to avoid redundant computations.
        
        Args:
            texts (List[str]): List of text strings to embed.
        
        Returns:
            List[np.ndarray]: List of embedding vectors.
        """
        texts_to_embed = [text for text in texts if text not in self.cache]
        if texts_to_embed:
            for i in tqdm(range(0, len(texts_to_embed), self.batch_size), desc="Embedding texts"):
                batch = texts_to_embed[i:i+self.batch_size]
                inputs = self.tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(self.device)
                with torch.no_grad():
                    embeddings = self.model(**inputs).last_hidden_state.mean(dim=1).cpu().numpy()
                for text, emb in zip(batch, embeddings):
                    self.cache[text] = emb
        return [self.cache[text] for text in texts]

embedding_cache = EmbeddingCache(tokenizer, bge_model, device='cpu', batch_size=32)

# Function to calculate cosine similarity
def calculate_cosine_similarity(embeddings1, embeddings2):
    return cosine_similarity(embeddings1, embeddings2).diagonal()

# BLEU score calculation
def calculate_bleu(reference, hypothesis):
    reference = [nltk.word_tokenize(reference)]  # Reference must be a list of lists for BLEU
    hypothesis = nltk.word_tokenize(hypothesis)
    return sentence_bleu(reference, hypothesis)

# ROUGE score calculation
rouge = evaluate.load("rouge")

def calculate_rouge_per_row(reference, prediction):
    """
    Compute ROUGE scores for a single (reference, prediction) pair.
    
    Args:
        reference (str): The reference text.
        prediction (str): The prediction text.
    
    Returns:
        dict: Dictionary containing ROUGE-1, ROUGE-2, and ROUGE-L scores.
    """
    try:
        scores = rouge.compute(predictions=[prediction], references=[reference])
        return {
            'ROUGE-1': scores['rouge1'],
            'ROUGE-2': scores['rouge2'],
            'ROUGE-L': scores['rougeL']
        }
    except Exception as e:
        print(f"Error computing ROUGE for reference: {reference} and prediction: {prediction}\n{e}")
        return {'ROUGE-1': float('nan'), 'ROUGE-2': float('nan'), 'ROUGE-L': float('nan')}

# Directory containing the CSV files
DIRECTORY = 'results_temp'

# Regex pattern to parse filenames
FILENAME_PATTERN = re.compile(
    r'evaluated_(?P<sort_status>sorted|unsorted)_k(?P<k>\d+)_d(?P<d>\d+)\.csv$'
)

# List to collect all records
records = []

# Iterate through all CSV files in the directory with a progress bar
csv_files = [f for f in os.listdir(DIRECTORY) if f.endswith(".csv")]
for filename in tqdm(csv_files, desc="Processing CSV files"):
    match = FILENAME_PATTERN.match(filename)
    if not match:
        print(f"Filename format not recognized: {filename}")
        continue
    
    sort_status = match.group('sort_status')
    k_value = int(match.group('k'))
    d_value = int(match.group('d'))
    
    file_path = os.path.join(DIRECTORY, filename)
    
    # Load the CSV file
    data = pd.read_csv(file_path)
    
    # Ensure all entries in 'Answer' and 'Model Answer' are strings
    data['Answer'] = data['Answer'].astype(str)
    data['Model Answer'] = data['Model Answer'].astype(str)
    
    # Collect all unique references and hypotheses for embedding
    unique_texts = pd.concat([data['Answer'], data['Model Answer']]).unique()
    embedding_cache.get_embeddings(unique_texts)
    
    # Calculate embeddings
    data['Reference Embedding'] = embedding_cache.get_embeddings(data['Answer'])
    data['Hypothesis Embedding'] = embedding_cache.get_embeddings(data['Model Answer'])
    
    # Calculate cosine similarities
    data['Cosine Similarity'] = calculate_cosine_similarity(
        list(data['Reference Embedding']),
        list(data['Hypothesis Embedding'])
    )
    
    # Calculate BLEU scores with a progress bar
    bleu_scores = []
    for idx, row in tqdm(data.iterrows(), total=data.shape[0], desc=f"Calculating BLEU for {filename}", leave=False):
        reference = row['Answer']
        hypothesis = row['Model Answer']
        bleu = calculate_bleu(reference, hypothesis)
        bleu_scores.append(bleu)
    data['BLEU'] = bleu_scores
    
    # Calculate ROUGE scores per row with a progress bar
    rouge_scores_1 = []
    rouge_scores_2 = []
    rouge_scores_L = []
    for idx, row in tqdm(data.iterrows(), total=data.shape[0], desc=f"Calculating ROUGE for {filename}", leave=False):
        reference = row['Answer']
        prediction = row['Model Answer']
        rouge_result = calculate_rouge_per_row(reference, prediction)
        rouge_scores_1.append(rouge_result['ROUGE-1'])
        rouge_scores_2.append(rouge_result['ROUGE-2'])
        rouge_scores_L.append(rouge_result['ROUGE-L'])
    data['ROUGE-1'] = rouge_scores_1
    data['ROUGE-2'] = rouge_scores_2
    data['ROUGE-L'] = rouge_scores_L
    
    # Add metadata
    data['Sort Status'] = sort_status
    data['k'] = k_value
    data['d'] = d_value
    
    # Append relevant columns to records
    records.append(data[['Question', 'Sort Status', 'k', 'd', 'BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'Cosine Similarity']])

# Concatenate all records into a single DataFrame
summary_results = pd.concat(records, ignore_index=True)

# Save the summary DataFrame to a CSV file
summary_csv_path = os.path.join(DIRECTORY, 'summary_evaluations.csv')
summary_results.to_csv(summary_csv_path, index=False)

# Optionally, print the first few rows of the summary results
print(summary_results.head())


In [25]:
# evaluation_plots.py  This is the working version that does the plots

# Import necessary libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def load_data(summary_csv_path):
    """
    Load the summary evaluations CSV into a Pandas DataFrame.
    
    Args:
        summary_csv_path (str): Path to the summary_evaluations.csv file.
        
    Returns:
        pd.DataFrame: Loaded DataFrame.
    """
    if not os.path.exists(summary_csv_path):
        raise FileNotFoundError(f"File not found: {summary_csv_path}")
    
    df = pd.read_csv(summary_csv_path)
    return df

def filter_data_by_k(df, k_value):
    """
    Filter the DataFrame for a specific value of k.
    
    Args:
        df (pd.DataFrame): The original DataFrame.
        k_value (int): The k value to filter by.
        
    Returns:
        pd.DataFrame: Filtered DataFrame.
    """
    filtered_df = df[df['k'] == k_value]
    if filtered_df.empty:
        print(f"No data found for k={k_value}. Skipping.")
    return filtered_df

def aggregate_metrics(df, metrics, group_by=['d', 'Sort Status']):
    """
    Aggregate the DataFrame by computing the mean of specified metrics,
    grouped by the specified columns.
    
    Args:
        df (pd.DataFrame): The DataFrame to aggregate.
        metrics (list): List of metric column names to aggregate.
        group_by (list): Columns to group by.
        
    Returns:
        pd.DataFrame: Aggregated DataFrame.
    """
    aggregated_df = df.groupby(group_by)[metrics].mean().reset_index()
    return aggregated_df

def create_plots(aggregated_df, metrics, k_value, output_dir='plots'):
    """
    Create and save comparative plots for each metric for a specific k value.
    
    Args:
        aggregated_df (pd.DataFrame): Aggregated DataFrame with mean scores.
        metrics (list): List of metric column names to plot.
        k_value (int): The k value being processed.
        output_dir (str): Directory to save the plots.
    """
    os.makedirs(output_dir, exist_ok=True)
    
    # Set the aesthetic style of the plots
    sns.set(style="whitegrid")
    
    for metric in metrics:
        plt.figure(figsize=(10, 6))
        
        # Create a line plot with d on x-axis and metric score on y-axis
        sns.lineplot(
            data=aggregated_df,
            x='d',
            y=metric,
            hue='Sort Status',
            marker='o',
            palette='viridis'
        )
        
        # Set plot titles and labels
        plt.title(f'{metric} Comparison for k={k_value}: Sorted vs Unsorted')
        plt.xlabel('d Value')
        plt.ylabel(metric)
        
        # Enhance legend
        plt.legend(title='Sort Status')
        
        # Optionally, add annotations for exact values
        for line in plt.gca().get_lines():
            x_values = line.get_xdata()
            y_values = line.get_ydata()
            for x, y in zip(x_values, y_values):
                plt.text(x, y, f'{y:.2f}', ha='center', va='bottom', fontsize=9)
        
        # Save the plot
        plot_filename = f"{metric.replace(' ', '_').lower()}_comparison_k{k_value}.png"
        plt.savefig(os.path.join(output_dir, plot_filename), dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved plot: {os.path.join(output_dir, plot_filename)}")


# Path to the summary evaluations CSV
summary_csv_path = 'results_temp/summary_evaluations.csv'

# Load the data
print("Loading data...")
df = load_data(summary_csv_path)
print("Data loaded successfully.")

# Get the unique k values in the data
k_values = df['k'].unique()
print(f"Unique k values found: {k_values}")

# Define the metrics to plot
metrics = ['BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'Cosine Similarity']

# Check if all metrics exist in the DataFrame
missing_metrics = [metric for metric in metrics if metric not in df.columns]
if missing_metrics:
    raise ValueError(f"The following metrics are missing in the data: {missing_metrics}")

# Loop over each k value
for k_value in k_values:
    print(f"\nProcessing k={k_value}...")
    
    # Filter the DataFrame for the current k value
    df_k = filter_data_by_k(df, k_value)
    if df_k.empty:
        continue  # Skip if no data for this k
    
    # Aggregate the data by computing mean scores for each metric
    print("Aggregating metrics...")
    aggregated_df = aggregate_metrics(df_k, metrics)
    print("Aggregation complete.")
    
    # Display the aggregated data (optional)
    print("\nAggregated Data:")
    print(aggregated_df.head())
    
    # Create and save the plots
    print("\nCreating plots...")
    output_dir_k = f'plots_k{k_value}'
    create_plots(aggregated_df, metrics, k_value, output_dir=output_dir_k)
    print(f"All plots for k={k_value} created and saved successfully.")


Loading data...
Data loaded successfully.
Unique k values found: [4]

Processing k=4...
Aggregating metrics...
Aggregation complete.

Aggregated Data:
   d Sort Status      BLEU   ROUGE-1   ROUGE-2  ROUGE-L  Cosine Similarity
0  3    unsorted  0.057199  0.237928  0.121293  0.19396           0.721254

Creating plots...
Saved plot: plots_k4/bleu_comparison_k4.png
Saved plot: plots_k4/rouge-1_comparison_k4.png
Saved plot: plots_k4/rouge-2_comparison_k4.png
Saved plot: plots_k4/rouge-l_comparison_k4.png
Saved plot: plots_k4/cosine_similarity_comparison_k4.png
All plots for k=4 created and saved successfully.


In [27]:
# statistical_tests_updated_single_block_with_k.py

import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import shapiro, ttest_rel, wilcoxon, binomtest
import numpy as np
from math import ceil

# 1. Load Data
def load_data(summary_csv_path):
    """
    Load the summary evaluations CSV into a Pandas DataFrame.

    Args:
        summary_csv_path (str): Path to the summary_evaluations.csv file.

    Returns:
        pd.DataFrame: Loaded DataFrame.
    """
    if not os.path.exists(summary_csv_path):
        raise FileNotFoundError(f"File not found: {summary_csv_path}")

    df = pd.read_csv(summary_csv_path)
    return df

# 2. Prepare Paired Data
def prepare_paired_data(df, metric, d_value, k_value):
    """
    Prepare paired data for a specific metric, d value, and k value.

    Args:
        df (pd.DataFrame): The original DataFrame.
        metric (str): The metric to analyze.
        d_value (int): The d value to filter by.
        k_value (int): The k value to filter by.

    Returns:
        pd.DataFrame: Filtered DataFrame with sorted and unsorted scores.
    """
    filtered_df = df[(df['d'] == d_value) & (df['k'] == k_value)]
    sorted_scores = filtered_df[filtered_df['Sort Status'] == 'sorted'][metric].reset_index(drop=True)
    unsorted_scores = filtered_df[filtered_df['Sort Status'] == 'unsorted'][metric].reset_index(drop=True)

    # Ensure both sorted and unsorted have the same number of entries
    min_length = min(len(sorted_scores), len(unsorted_scores))
    if min_length == 0:
        return pd.DataFrame()  # Return empty DataFrame if no paired data

    sorted_scores = sorted_scores[:min_length]
    unsorted_scores = unsorted_scores[:min_length]

    paired_df = pd.DataFrame({
        'sorted': sorted_scores,
        'unsorted': unsorted_scores
    })

    return paired_df

# 3. Perform Normality Test
def perform_normality_test(differences):
    """
    Perform Shapiro-Wilk test for normality on the differences.

    Args:
        differences (pd.Series): Differences between sorted and unsorted scores.

    Returns:
        tuple: (statistic, p-value)
    """
    if len(differences) < 3:
        # Shapiro-Wilk test requires at least 3 data points
        return np.nan, np.nan
    stat, p = shapiro(differences)
    return stat, p

# 4. Perform Statistical Tests
def perform_statistical_tests(paired_df):
    """
    Perform paired t-test or Wilcoxon signed-rank test based on normality.
    Also perform the sign test.

    Args:
        paired_df (pd.DataFrame): DataFrame with 'sorted' and 'unsorted' columns.

    Returns:
        dict: Test results including test names, statistics, and p-values.
    """
    differences = paired_df['sorted'] - paired_df['unsorted']
    stat, p = perform_normality_test(differences)

    normality = False
    if not np.isnan(p):
        normality = p > 0.05  # If p > 0.05, assume normality

    # Initialize variables
    t_stat = np.nan
    t_p = np.nan
    w_stat = np.nan
    w_p = np.nan
    s_stat = np.nan
    s_p = np.nan

    # Paired t-test if normality holds
    if normality:
        if len(paired_df) >= 2:
            t_stat, t_p = ttest_rel(paired_df['sorted'], paired_df['unsorted'])
        else:
            print("Paired t-test not performed due to insufficient data.")

    # Wilcoxon signed-rank test
    if len(paired_df) >= 1:
        try:
            w_stat, w_p = wilcoxon(paired_df['sorted'], paired_df['unsorted'])
        except ValueError as e:
            print(f"Wilcoxon test not performed: {e}")
    else:
        print("Wilcoxon test not performed due to insufficient data.")

    # Sign test
    non_zero_differences = differences[differences != 0]
    n = len(non_zero_differences)
    if n == 0:
        print("Sign test not performed due to all differences being zero.")
    else:
        n_positive = sum(non_zero_differences > 0)
        # Under H0, the probability of positive difference is 0.5
        # Perform a binomial test
        binom_result = binomtest(n_positive, n, 0.5, alternative='two-sided')
        s_stat = n_positive
        s_p = binom_result.pvalue

    # Return results including normality test results
    final_results = {
        'Normality': normality,
        'Shapiro-Wilk Statistic': stat,
        'Shapiro-Wilk p-value': p,
        'Paired t-test Statistic': t_stat,
        'Paired t-test p-value': t_p,
        'Wilcoxon test Statistic': w_stat,
        'Wilcoxon test p-value': w_p,
        'Sign test Statistic': s_stat,
        'Sign test p-value': s_p
    }
    return final_results

# 5. Plot All Histograms
def plot_all_histograms(differences_dict, metrics, d_values, k_values, output_dir='histograms'):
    """
    Plot histograms of differences for each metric, d, and k value.

    Args:
        differences_dict (dict): Nested dictionary containing differences.
        metrics (list): List of metrics.
        d_values (list): List of d values.
        k_values (list): List of k values.
        output_dir (str): Directory to save the plots.
    """
    os.makedirs(output_dir, exist_ok=True)

    for metric in metrics:
        for k in k_values:
            # Prepare subplots
            num_d = len(d_values)
            cols = 3
            rows = ceil(num_d / cols)
            fig, axes = plt.subplots(rows, cols, figsize=(cols * 5, rows * 4))
            axes = axes.flatten()

            for idx, d in enumerate(d_values):
                ax = axes[idx]
                if k in differences_dict[metric] and d in differences_dict[metric][k]:
                    differences = differences_dict[metric][k][d]['differences']
                    normality = differences_dict[metric][k][d]['normality']
                    sns.histplot(differences, kde=True, ax=ax)
                    ax.set_title(f'{metric} Differences (d={d}, k={k})\nNormality: {"Yes" if normality else "No"}')
                    ax.set_xlabel('Difference (Sorted - Unsorted)')
                else:
                    ax.axis('off')

            # Remove any unused subplots
            for idx in range(len(d_values), len(axes)):
                axes[idx].axis('off')

            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, f'{metric}_differences_k{k}.png'))
            plt.close()
            print(f"Saved histogram for {metric} at k={k} in {output_dir}")

# 6. Summarize Results
def summarize_results(results, output_dir='statistical_results'):
    """
    Save the statistical test results to a CSV file.

    Args:
        results (list): List of dictionaries containing test results.
        output_dir (str): Directory to save the results.
    """
    os.makedirs(output_dir, exist_ok=True)
    results_df = pd.DataFrame(results)
    results_df.to_csv(os.path.join(output_dir, 'statistical_test_results.csv'), index=False)
    print(f"Saved statistical test results: {os.path.join(output_dir, 'statistical_test_results.csv')}")

# 7. Main Execution
# Define the path to your CSV file
summary_csv_path = 'results_temp/summary_evaluations.csv'

# Load the data
print("Loading data...")
df = load_data(summary_csv_path)
print("Data loaded successfully.")

# Define the metrics to analyze
metrics = ['BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'Cosine Similarity']

# Check if all metrics exist in the DataFrame
missing_metrics = [metric for metric in metrics if metric not in df.columns]
if missing_metrics:
    raise ValueError(f"The following metrics are missing in the data: {missing_metrics}")

# Get unique k and d values
k_values = sorted(df['k'].unique())
d_values = sorted(df['d'].unique())

# Initialize list to store statistical test results
test_results = []

# Initialize list to store differences for DataFrame
differences_records = []

# Initialize nested dictionary to store differences for plotting
differences_dict = {metric: {k: {} for k in k_values} for metric in metrics}

# Iterate through each metric, k value, and d value
for metric in metrics:
    for k in k_values:
        for d in d_values:
            print(f"\nAnalyzing {metric} for k={k}, d={d}...")
            paired_df = prepare_paired_data(df, metric, d, k)

            if paired_df.empty:
                print(f"No paired data available for {metric} with k={k}, d={d}. Skipping...")
                continue

            # Perform statistical tests
            test_result = perform_statistical_tests(paired_df)
            test_result.update({
                'Metric': metric,
                'k': k,
                'd': d
            })
            test_results.append(test_result)

            # Store differences for plotting
            differences = paired_df['sorted'] - paired_df['unsorted']
            differences_dict[metric][k][d] = {
                'differences': differences,
                'shapiro': (test_result['Shapiro-Wilk Statistic'], test_result['Shapiro-Wilk p-value']),
                'normality': test_result['Normality']
            }

            # Collect differences into records for DataFrame
            for idx in paired_df.index:
                differences_records.append({
                    'Metric': metric,
                    'k': k,
                    'd': d,
                    'sorted_score': paired_df.at[idx, 'sorted'],
                    'unsorted_score': paired_df.at[idx, 'unsorted'],
                    'difference': paired_df.at[idx, 'sorted'] - paired_df.at[idx, 'unsorted']
                })

# 8. Create and Inspect the Differences DataFrame
# Create a DataFrame from the differences records
differences_df = pd.DataFrame(differences_records)

# Display the first few rows of the differences DataFrame
print("\nDifferences DataFrame:")
display(differences_df.head())

# Optionally, display summary statistics
print("\nSummary Statistics of Differences:")
display(differences_df.describe())

# 9. Summarize and Save Statistical Test Results
summarize_results(test_results)

# 10. Plot All Histograms in a Single Figure
print("\nCreating aggregated histograms...")
plot_all_histograms(differences_dict, metrics, d_values, k_values)
print("Aggregated histograms created and saved successfully.")

# 11. Display Statistical Test Results
# Create a DataFrame for statistical test results
results_df = pd.DataFrame(test_results)

# Display the statistical test results
print("\nStatistical Test Results:")
display(results_df)

# 12. (Optional) Save the Differences DataFrame
# Save the differences DataFrame to a CSV file for further inspection
differences_df.to_csv('statistical_results/differences_dataframe.csv', index=False)
print("Saved differences DataFrame: statistical_results/differences_dataframe.csv")


Loading data...
Data loaded successfully.

Analyzing BLEU for k=4, d=3...
No paired data available for BLEU with k=4, d=3. Skipping...

Analyzing ROUGE-1 for k=4, d=3...
No paired data available for ROUGE-1 with k=4, d=3. Skipping...

Analyzing ROUGE-2 for k=4, d=3...
No paired data available for ROUGE-2 with k=4, d=3. Skipping...

Analyzing ROUGE-L for k=4, d=3...
No paired data available for ROUGE-L with k=4, d=3. Skipping...

Analyzing Cosine Similarity for k=4, d=3...
No paired data available for Cosine Similarity with k=4, d=3. Skipping...

Differences DataFrame:



Summary Statistics of Differences:


ValueError: Cannot describe a DataFrame without columns

In [None]:
# Set chatgpt 4o as the llmEval the environment variable for the key has already been set

os.environ["OPENAI_API_KEY"] = OPEN_AI_API_KEY

llmEval = OpenAI(model="gpt-4o", temperature=0)

In [None]:
# Import necessary libraries
import os
import re
import pandas as pd
from tqdm import tqdm

# Directory containing the CSV files
DIRECTORY = 'results_temp'  # Ensure this path is correct

# Regex pattern to parse filenames
FILENAME_PATTERN = re.compile(
    r'evaluated_(?P<sort_status>sorted|unsorted)_k(?P<k>\d+)_d(?P<d>\d+)\.csv$'
)

# Function to parse filename and extract sort_status, k, d
def parse_filename(filename):
    """
    Parses the filename to extract sort status, k value, and d value.

    Args:
        filename (str): The filename string.

    Returns:
        tuple or None: (sort_status, k_value, d_value) if pattern matches, else None
    """
    match = FILENAME_PATTERN.match(filename)
    if not match:
        return None
    sort_status = match.group('sort_status')
    k_value = int(match.group('k'))
    d_value = int(match.group('d'))
    return sort_status, k_value, d_value

# Function to append DataFrame to CSV with headers if file doesn't exist
def append_df_to_csv(df, filepath):
    """
    Appends a DataFrame to a CSV file. If the file does not exist, it creates it with headers.

    Args:
        df (pd.DataFrame): DataFrame to append.
        filepath (str): Path to the CSV file.
    """
    if not os.path.isfile(filepath):
        df.to_csv(filepath, index=False)
    else:
        df.to_csv(filepath, mode='a', header=False, index=False)

# Function to process a single pair of files
def process_single_pair(sorted_file, unsorted_file):
    """
    Processes a single pair of sorted and unsorted CSV files, evaluates unanswered questions,
    and appends the results to the evaluation CSV.

    Args:
        sorted_file (str): Filename of the sorted CSV.
        unsorted_file (str): Filename of the unsorted CSV.

    Returns:
        dict: Summary of evaluation counts for this pair.
    """
    # Parse filenames to extract k and d
    parsed_sorted = parse_filename(sorted_file)
    parsed_unsorted = parse_filename(unsorted_file)

    if not parsed_sorted or not parsed_unsorted:
        print("Filename parsing failed. Ensure filenames match the expected pattern.")
        return None

    _, k_sorted, d_sorted = parsed_sorted
    _, k_unsorted, d_unsorted = parsed_unsorted

    # Check if k and d match
    if (k_sorted != k_unsorted) or (d_sorted != d_unsorted):
        print(f"k and d values do not match for files {sorted_file} and {unsorted_file}.")
        return None

    k = k_sorted
    d = d_sorted

    print(f"\nProcessing files with k={k} and d={d}:")
    print(f"Sorted File: {sorted_file}")
    print(f"Unsorted File: {unsorted_file}")

    # Construct full file paths
    sorted_filepath = os.path.join(DIRECTORY, sorted_file)
    unsorted_filepath = os.path.join(DIRECTORY, unsorted_file)

    # Load the sorted and unsorted CSV files
    try:
        sorted_data = pd.read_csv(sorted_filepath)
        unsorted_data = pd.read_csv(unsorted_filepath)
    except Exception as e:
        print(f"Error reading files: {e}")
        return None

    # Ensure all entries in 'Question', 'Answer', and 'Model Answer' are strings
    for df, name in zip([sorted_data, unsorted_data], ['sorted', 'unsorted']):
        for col in ['Question', 'Answer', 'Model Answer']:
            if col not in df.columns:
                print(f"Column '{col}' not found in {name} file.")
                return None
            df[col] = df[col].astype(str)

    # Rename 'Model Answer' columns to differentiate between sorted and unsorted
    sorted_data.rename(columns={'Model Answer': 'Model Answer_sorted'}, inplace=True)
    unsorted_data.rename(columns={'Model Answer': 'Model Answer_unsorted'}, inplace=True)

    # Merge sorted and unsorted data on 'Question' and 'Answer' to ensure alignment
    merged_data = pd.merge(
        sorted_data,
        unsorted_data,
        on=['Question', 'Answer'],
        how='inner'
    )

    # If there are no matching questions, return
    if merged_data.empty:
        print(f"No matching questions found between sorted and unsorted files for k={k}, d={d}.")
        return None

    # Add 'k' and 'd' columns
    merged_data['k'] = k
    merged_data['d'] = d

    # Define the per-pair evaluation CSV filepath
    pair_evaluation_filename = f'evaluation_k{k}_d{d}.csv'
    pair_evaluation_filepath = os.path.join(DIRECTORY, pair_evaluation_filename)

    # Load existing evaluations for this pair to identify already processed questions
    if os.path.isfile(pair_evaluation_filepath):
        existing_evaluations = pd.read_csv(pair_evaluation_filepath)
        processed_questions = set(existing_evaluations['Question'].tolist())
        print(f"Found {len(processed_questions)} already evaluated questions for k={k}, d={d}.")
    else:
        existing_evaluations = pd.DataFrame(columns=[
            'Question', 'Answer', 'Model Answer_sorted', 'Model Answer_unsorted',
            'Preferred Model', 'Sorted Correctness', 'Unsorted Correctness', 'k', 'd'
        ])
        processed_questions = set()
        print(f"No existing evaluations found for k={k}, d={d}. Processing all questions.")

    # Identify questions that have not been evaluated yet
    to_evaluate = merged_data[~merged_data['Question'].isin(processed_questions)]

    print(f"Number of questions to evaluate: {to_evaluate.shape[0]}")

    if to_evaluate.empty:
        print(f"No new questions to evaluate for k={k}, d={d}.")
        return None

    # Initialize list to store evaluation results
    evaluation_results = []

    # Iterate through each row with a progress bar
    for idx, row in tqdm(to_evaluate.iterrows(), total=to_evaluate.shape[0], desc="Evaluating answers", leave=False):
        question = row['Question']
        correct_answer = row['Answer']
        sorted_answer = row['Model Answer_sorted']
        unsorted_answer = row['Model Answer_unsorted']

       
        prompt = f"""
The question is: {question}
The correct answer is: {correct_answer}
The sorted model's answer is: {sorted_answer}
The unsorted model's answer is: {unsorted_answer}

Instructions:
1. For both the sorted and unsorted model answers, assess each as one of the following: correct, partially correct, wrong, or did not find.
2. Based on the assessments, determine which model's answer is closer to the correct answer:
   - If one is correct and the other is partially correct or wrong, the correct one should be preferred.
   - If one is partially correct and the other is wrong, the partially correct one should be preferred.
   - If both are correct, decide which is preferred based on completeness, accuracy, or any additional information relative to the correct answer.
   - If both are wrong, "neither" should be preferred.
   - If both are identical in correctness (e.g., both correct, both partially correct, or both wrong with no preference), the result should be "tie."

Respond in the following format:
<preferred model>|<sorted correctness>|<unsorted correctness>

Definitions:
- <preferred model>: Choose one of the following: sorted, unsorted, tie, neither
- <sorted correctness> and <unsorted correctness>: Choose one of the following for each: correct, partially correct, wrong, did not find

Note:
- Ensure that the more correct answer (or the more complete/relevant answer) is always preferred, even if both are marked as correct.
- Provide only the specified response format with no additional text.
"""



        try:
            # Call your existing language model's evaluation method
            response = llmEval.complete(prompt)
            best_answer = response.text.strip().lower()  # Normalize the response

            # Split the response into components
            parts = best_answer.split('|')
            if len(parts) != 3:
                print(f"Invalid response format for question '{question}'.")
                preferred_model = 'invalid_response'
                sorted_correctness = 'invalid_response'
                unsorted_correctness = 'invalid_response'
            else:
                preferred_model = parts[0].strip()
                sorted_correctness = parts[1].strip()
                unsorted_correctness = parts[2].strip()

                valid_models = ['sorted', 'unsorted', 'tie', 'neither']
                valid_correctness = ['correct', 'partially correct', 'wrong', 'did not find']

                if preferred_model not in valid_models:
                    preferred_model = 'invalid_response'
                if sorted_correctness not in valid_correctness:
                    sorted_correctness = 'invalid_response'
                if unsorted_correctness not in valid_correctness:
                    unsorted_correctness = 'invalid_response'
        except Exception as e:
            print(f"Error evaluating question '{question}': {e}")
            preferred_model = 'invalid_response'
            sorted_correctness = 'invalid_response'
            unsorted_correctness = 'invalid_response'

        # Append the result to the evaluation_results list
        evaluation_results.append({
            'Question': question,
            'Answer': correct_answer,
            'Model Answer_sorted': sorted_answer,
            'Model Answer_unsorted': unsorted_answer,
            'Preferred Model': preferred_model,
            'Sorted Correctness': sorted_correctness,
            'Unsorted Correctness': unsorted_correctness,
            'k': k,
            'd': d
        })

        # Save the result immediately to the CSV file
        evaluation_df = pd.DataFrame([evaluation_results[-1]])
        append_df_to_csv(evaluation_df, pair_evaluation_filepath)

    # Generate summary for this pair
    existing_evaluations = pd.read_csv(pair_evaluation_filepath)

    # Count preferred models
    preferred_counts = existing_evaluations['Preferred Model'].value_counts().to_dict()

    # Count correctness for sorted and unsorted answers
    sorted_correctness_counts = existing_evaluations['Sorted Correctness'].value_counts().to_dict()
    unsorted_correctness_counts = existing_evaluations['Unsorted Correctness'].value_counts().to_dict()

    summary_complete = {
        'k': k,
        'd': d,
        'Preferred Sorted': preferred_counts.get('sorted', 0),
        'Preferred Unsorted': preferred_counts.get('unsorted', 0),
        'Preferred Tie': preferred_counts.get('tie', 0),
        'Preferred Neither': preferred_counts.get('neither', 0),
        'Preferred Invalid': preferred_counts.get('invalid_response', 0),
        'Sorted Correct': sorted_correctness_counts.get('correct', 0),
        'Sorted Partially Correct': sorted_correctness_counts.get('partially correct', 0),
        'Sorted Wrong': sorted_correctness_counts.get('wrong', 0),
        'Sorted Did Not Find': sorted_correctness_counts.get('did not find', 0),
        'Sorted Invalid': sorted_correctness_counts.get('invalid_response', 0),
        'Unsorted Correct': unsorted_correctness_counts.get('correct', 0),
        'Unsorted Partially Correct': unsorted_correctness_counts.get('partially correct', 0),
        'Unsorted Wrong': unsorted_correctness_counts.get('wrong', 0),
        'Unsorted Did Not Find': unsorted_correctness_counts.get('did not find', 0),
        'Unsorted Invalid': unsorted_correctness_counts.get('invalid_response', 0)
    }

    return summary_complete

# Now, process all file pairs

# Find all sorted and unsorted files and group them by k and d
file_groups = {}
for filename in os.listdir(DIRECTORY):
    if not filename.endswith(".csv"):
        continue
    parsed = parse_filename(filename)
    if not parsed:
        continue
    sort_status, k, d = parsed
    key = (k, d)
    if key not in file_groups:
        file_groups[key] = {}
    file_groups[key][sort_status] = filename

# Initialize list to collect summaries
summary_records = []

# Iterate through each group of sorted and unsorted files
for (k, d), files in tqdm(file_groups.items(), desc="Processing file groups"):
    sorted_filename = files.get('sorted')
    unsorted_filename = files.get('unsorted')

    if not sorted_filename or not unsorted_filename:
        print(f"Missing sorted or unsorted file for k={k}, d={d}. Skipping this group.")
        continue

    # Process the single pair and get the summary
    summary = process_single_pair(sorted_filename, unsorted_filename)

    if summary:
        # Append the summary to the summary_records list
        summary_records.append(summary)

        # Optionally, print the summary for this pair
        print(f"\nSummary for k={k}, d={d}:")
        print(f"  Preferred Model Counts:")
        print(f"    Sorted: {summary.get('Preferred Sorted', 0)}")
        print(f"    Unsorted: {summary.get('Preferred Unsorted', 0)}")
        print(f"    Tie: {summary.get('Preferred Tie', 0)}")
        print(f"    Neither: {summary.get('Preferred Neither', 0)}")
        print(f"    Invalid Response: {summary.get('Preferred Invalid', 0)}")

        print(f"  Sorted Correctness Counts:")
        print(f"    Correct: {summary.get('Sorted Correct', 0)}")
        print(f"    Partially Correct: {summary.get('Sorted Partially Correct', 0)}")
        print(f"    Wrong: {summary.get('Sorted Wrong', 0)}")
        print(f"    Did Not Find: {summary.get('Sorted Did Not Find', 0)}")
        print(f"    Invalid Response: {summary.get('Sorted Invalid', 0)}")

        print(f"  Unsorted Correctness Counts:")
        print(f"    Correct: {summary.get('Unsorted Correct', 0)}")
        print(f"    Partially Correct: {summary.get('Unsorted Partially Correct', 0)}")
        print(f"    Wrong: {summary.get('Unsorted Wrong', 0)}")
        print(f"    Did Not Find: {summary.get('Unsorted Did Not Find', 0)}")
        print(f"    Invalid Response: {summary.get('Unsorted Invalid', 0)}")
    else:
        print(f"No new evaluations for k={k}, d={d}.")

# After processing all pairs, generate the summary DataFrame
if summary_records:
    summary_df = pd.DataFrame(summary_records)
    # Define the summary CSV filepath
    summary_csv_path = os.path.join(DIRECTORY, 'summary_evaluations.csv')

    # Save the summary DataFrame to a CSV file
    summary_df.to_csv(summary_csv_path, index=False)
    print(f"\nSummary evaluation results saved to {summary_csv_path}")

    # Display the summary DataFrame
    print("\nSummary Evaluation Results for All Pairs:")
    display(summary_df)
else:
    print("No evaluation records to save.")


In [28]:
# Import necessary libraries
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set the directory containing the evaluation CSV files
DIRECTORY = 'results_temp'  # Ensure this path is correct

# Regex pattern to identify evaluation files
EVALUATION_FILENAME_PATTERN = re.compile(
    r'evaluation_k(?P<k>\d+)_d(?P<d>\d+)\.csv$'
)

# Initialize an empty list to collect DataFrames
evaluation_dfs = []

# Iterate over files in the directory and read evaluation CSV files
for filename in os.listdir(DIRECTORY):
    match = EVALUATION_FILENAME_PATTERN.match(filename)
    if match:
        k = int(match.group('k'))
        d = int(match.group('d'))
        filepath = os.path.join(DIRECTORY, filename)
        df = pd.read_csv(filepath)
        df['k'] = k
        df['d'] = d
        evaluation_dfs.append(df)

# Concatenate all evaluation DataFrames into one
if evaluation_dfs:
    evaluation_data = pd.concat(evaluation_dfs, ignore_index=True)
else:
    print("No evaluation files found.")
    evaluation_data = pd.DataFrame()

# Check if data is loaded
if not evaluation_data.empty:
    # Display the first few rows of the data
    display(evaluation_data.head())
    
    # Correctness categories (excluding 'invalid_response')
    correctness_categories = ['correct', 'partially correct', 'wrong', 'did not find']
    
    # Preferred model categories
    preferred_model_categories = ['sorted', 'unsorted', 'tie', 'neither']
    
    # Set up improved plotting style with lighter colors
    sns.set_theme(style='whitegrid', palette='pastel', context='talk')
    plt.rcParams.update({'figure.figsize': (14, 8), 'font.size': 14})
    
    # Function to add count labels on top of bars
    def add_counts(ax):
        for container in ax.containers:
            ax.bar_label(container, fmt='%d', label_type='edge', fontsize=12, padding=3)
    
    # Prepare data for correctness plots
    correctness_data = evaluation_data.melt(
        id_vars=['k', 'd'],
        value_vars=['Sorted Correctness', 'Unsorted Correctness'],
        var_name='Model',
        value_name='Correctness'
    )
    correctness_data['Model'] = correctness_data['Model'].str.replace(' Correctness', '')
    
    # Exclude 'invalid_response' from correctness data
    correctness_data = correctness_data[correctness_data['Correctness'] != 'invalid_response']
    
    # Prepare data for preferred model plots
    preferred_data = evaluation_data[['k', 'd', 'Preferred Model', 'Sorted Correctness', 'Unsorted Correctness']]
    
    # Iterate over each unique value of k
    for k_value in sorted(evaluation_data['k'].unique()):
        print(f"\nGenerating correctness plots for k={k_value}")
        
        # Filter data for the current k value
        k_correctness_data = correctness_data[correctness_data['k'] == k_value]
        
        # Prepare counts
        counts = k_correctness_data.groupby(['d', 'Correctness', 'Model']).size().reset_index(name='Count')
        counts.sort_values('d', inplace=True)  # Ensure d is sorted in ascending order
        
        # Define custom color palette with lighter shades for models
        model_palette = {
            'Sorted': '#99d6ff',    # Light blue
            'Unsorted': '#ffcc99'   # Light orange
        }
        
        # Specify the order of correctness categories
        correctness_order = ['correct', 'partially correct', 'wrong', 'did not find']
        
        # Create the barplot using FacetGrid with Correctness as columns
        # Adjust the legend position slightly to the left
        g = sns.catplot(
        data=counts,
        x='d',
        y='Count',
        hue='Model',
        col='Correctness',
        kind='bar',
        palette=model_palette,
        height=6,
        aspect=1.2,
        legend_out=True,  # Keep the legend out of the grid
        col_order=correctness_order,
        col_wrap=2  # Arrange subplots in a 2x2 grid
    )

        # Set the main title and labels
        g.fig.subplots_adjust(top=0.85, right=0.88)  # Adjust right margin to give space to the legend
        g.fig.suptitle(f'Correctness Counts by Model and Depth (k={k_value})', fontsize=16)
        g.set_axis_labels('Depth', 'Count')

        # Position legend slightly left within the adjusted space
        g._legend.set_bbox_to_anchor((1.02, 0.5))  # Slightly move it left from 1.05 to 1.02
        g._legend.set_title('Model')
        g._legend.set_frame_on(True)
            
        # Add count labels
        for ax in g.axes.flat:
            add_counts(ax)
        
        plt.show()
        
        print(f"\nGenerating preferred model plots for k={k_value}")
        
        # Filter data for the current k value
        k_preferred_data = preferred_data[preferred_data['k'] == k_value]
        
        # Exclude instances where the preferred model is 'sorted' or 'unsorted' and both correctness are 'wrong' or 'did not find'
        condition = ~(
            (k_preferred_data['Preferred Model'].isin(['sorted', 'unsorted'])) &
            (k_preferred_data['Sorted Correctness'].isin(['wrong', 'did not find'])) &
            (k_preferred_data['Unsorted Correctness'].isin(['wrong', 'did not find']))
        )
        k_preferred_data = k_preferred_data[condition]
        
        # Filter to only 'sorted' and 'unsorted' preferred models
        model_preference = k_preferred_data[k_preferred_data['Preferred Model'].isin(['sorted', 'unsorted'])]
        
        # Prepare counts
        counts = model_preference.groupby(['d', 'Preferred Model']).size().reset_index(name='Count')
        counts.sort_values('d', inplace=True)  # Ensure d is sorted in ascending order
        
        # Define custom color palette for preferred models
        preferred_palette = {
            'sorted': '#99d6ff',    # Light blue
            'unsorted': '#ffcc99'   # Light orange
        }
        
        # Initialize the matplotlib figure
        plt.figure(figsize=(14, 8))
        
        # Create the barplot
        ax = sns.barplot(
            x='d',
            y='Count',
            hue='Preferred Model',
            data=counts,
            palette=preferred_palette
        )
        
        # Add title and labels with improved formatting
        plt.title(f'Preferred Model Counts by d (k={k_value})', fontsize=16)
        plt.xlabel('d (Dimension)', fontsize=14)
        plt.ylabel('Count', fontsize=14)
        
        # Add count labels on top of each bar
        add_counts(ax)
        
        # Enhance legend
        plt.legend(title='Preferred Model', fontsize=12, title_fontsize=14)
        
        # Improve layout
        plt.tight_layout()
        plt.show()
else:
    print("No data to display.")


No evaluation files found.
No data to display.
