In [8]:
import toml
import os
from openai import OpenAI
import pandas as pd
from datasets import Dataset

from langchain_chroma import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from phoenix.otel import register

from ragas.metrics import (
    FactualCorrectness,
    Faithfulness,
    SemanticSimilarity,
    LLMContextRecall,
)
from ragas import evaluate
from ragas import EvaluationDataset
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper


import streamlit as st

# All paths are resolved relative to the directory the kernel was started in.
BASE_DIR = os.getcwd()

secrets_path = os.path.join(BASE_DIR, "..", "..", "config", "secrets.toml")
docs_folder = os.path.join(BASE_DIR, "docs")
prompt_folder = os.path.join(BASE_DIR, "prompts")

queries_path = os.path.join(BASE_DIR, "..", "eval-ds", "Queries.xlsx")
output_folder = os.path.join(BASE_DIR, "..", "eval-ds")

# Parse the secrets file once and read both keys from the same mapping
# instead of loading and parsing the file separately for each key.
secrets_config = toml.load(secrets_path)
API_KEY = secrets_config["OPENAI_API_KEY"]
GROQ_KEY = secrets_config["GROQ_KEY"]

embedding_model = "text-embedding-3-large"
embeddings = OpenAIEmbeddings(model=embedding_model, api_key=API_KEY)

# The OpenAI client is pointed at the Groq endpoint and authenticated with the Groq key.
client = OpenAI(api_key=GROQ_KEY, base_url="https://api.groq.com/openai/v1")

chroma_dir = os.path.join(BASE_DIR, "..", "chroma")
chroma_dir

'/Users/suryaganesan/Documents/GitHub/polly-server/pollyServer/app/RAG/eval-module/../chroma'

In [9]:

# Open the persisted Chroma collection that holds the embedded documents.
vector_store = Chroma(
    collection_name="polly-rag",
    embedding_function=embeddings,
    persist_directory=chroma_dir,
)

# The judge objects below are built without an explicit key, so the key is
# exported first (presumably they pick it up from the environment).
os.environ["OPENAI_API_KEY"] = API_KEY

# LLM judge and embeddings used by the ragas metrics.
judge_chat_model = ChatOpenAI(model="gpt-4o")
evaluator_llm = LangchainLLMWrapper(judge_chat_model)
judge_embedding_model = OpenAIEmbeddings()
evaluator_embeddings = LangchainEmbeddingsWrapper(judge_embedding_model)


def query_db(query: str, k: int = 3):
    """Return the ``k`` most similar documents to ``query`` from the vector store.

    Args:
        query: Text to search for.
        k: Number of documents to retrieve. Defaults to 3, the value that was
            previously hard-coded.

    Returns:
        Whatever ``vector_store.similarity_search`` returns for the query.
    """
    return vector_store.similarity_search(query=query, k=k)


# Prepare chat function
def _read_prompt(filename):
    """Return the text of the prompt template ``filename`` inside ``prompt_folder``."""
    with open(os.path.join(prompt_folder, filename), "r", encoding="utf-8") as file:
        return file.read()


def generate_response(question, model="llama-3.2-11b-vision-preview"):
    """Answer ``question`` with the chat client, grounded on retrieved context.

    The documents returned by ``query_db`` are joined with newlines and
    substituted into the ``system.md`` template as ``context``; the question is
    substituted into the ``user.md`` template as ``question``. Both templates
    are filled with ``str.format``, so literal braces inside the template
    files must be doubled.

    Args:
        question: The user question to answer.
        model: Name of the chat model to request. Defaults to the model that
            was previously hard-coded, so existing callers are unaffected.

    Returns:
        tuple: ``(response, context)`` where ``response`` is the message
        content of the first completion choice and ``context`` is the list of
        ``page_content`` strings of the retrieved documents.
    """
    # Query VectorDB
    source = query_db(question)
    context = [page.page_content for page in source]
    string_context = "\n".join(context)

    # Templates are read on every call, so edits to the prompt files take
    # effect on the next question without re-running earlier cells.
    system_prompt = _read_prompt("system.md")
    user_prompt = _read_prompt("user.md")

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": system_prompt.format(context=string_context),
            },
            {"role": "user", "content": user_prompt.format(question=question)},
        ],
    )
    response = completion.choices[0].message.content

    return response, context



In [10]:
# Spot-check retrieval: show the top 3 matches for one of the evaluation questions.
sample_question = "What percentage of respondents strongly or somewhat agreed with the statement 'I am aware of disciplinary issues where action should have been taken but has not'?"
vector_store.similarity_search(sample_question, k=3)

[Document(metadata={'figure_title': 'Figure 1: To what extent do you agree with the statement that “the civil service takes ', 'page_no': 15, 'source': '/Users/suryaganesan/Documents/GitHub/polly-server/pollyServer/app/RAG/docs/Making-the-grade.pdf'}, page_content='Source document: Making-the-grade.pdf \nPage no: 15 \nFigure 1: To what extent do you agree with the statement that “the civil service takes \n\nFigure Description: \nCertainly! Here\'s a detailed description of the data presented in the figures:\n\n### Figure 1: Agreement with the Statement "The civil service takes talent and performance management seriously"\n\nThis figure is a bar graph representing survey respondents\' levels of agreement with the statement.\n\n- **Strongly agree**: 3%\n- **Somewhat agree**: 26%\n- **Neither agree nor disagree**: 13%\n- **Somewhat disagree**: 30%\n- **Strongly disagree**: 27%\n- **Don\'t know**: 1%\n\n### Figure 2: Agreement with the Statement "I am aware of disciplinary issues where act

In [11]:

# Evaluation
# Prepare dataset - one row per query with the columns user_input, response, retrieved_contexts, reference
# Metrics: Context recall, Factual correctness, Faithfulness, Semantic similarity

# Iterate through queries - make dataframe with reference, query, response


def build_eval_ds(queries_df, evals_df=None):
    """Generate a response for every query and package the results for ragas.

    Args:
        queries_df: DataFrame with a ``"Queries"`` column (the questions) and a
            ``"Ground truth"`` column (the reference answers).
        evals_df: DataFrame that receives the ``user_input``, ``response``,
            ``retrieved_contexts`` and ``reference`` columns. It is modified
            in place. When omitted, a new empty DataFrame is created.

    Returns:
        The result of ``EvaluationDataset.from_pandas`` on the filled DataFrame.
    """
    print("Building evaluation dataset...")
    if evals_df is None:
        evals_df = pd.DataFrame()

    total = len(queries_df)  # loop-invariant, so computed once
    questions = []
    answers = []
    contexts = []
    ground_truth = []
    # Count rows with enumerate rather than the index label: the label is only
    # a valid 0-based position when the DataFrame has a default RangeIndex.
    for position, (_, row) in enumerate(queries_df.iterrows(), start=1):
        print(f"Working on question: {position}/{total}")
        question = row["Queries"]
        questions.append(question)
        ground_truth.append(row["Ground truth"])
        answer, context = generate_response(question)
        answers.append(answer)
        contexts.append(context)

    evals_df["user_input"] = questions
    evals_df["response"] = answers
    evals_df["retrieved_contexts"] = contexts
    # References are stringified so numeric ground truths become text.
    evals_df["reference"] = [str(truth) for truth in ground_truth]

    return EvaluationDataset.from_pandas(evals_df)


def run_eval(df, llm=evaluator_llm, emd=evaluator_embeddings):
    """Score an evaluation dataset with the ragas metrics.

    Args:
        df: Dataset handed to ``evaluate`` (the notebook passes the dataset
            returned by ``build_eval_ds``).
        llm: Judge LLM given to the context-recall, factual-correctness and
            faithfulness metrics. Defaults to the module-level ``evaluator_llm``.
        emd: Embeddings given to the semantic-similarity metric. Defaults to
            the module-level ``evaluator_embeddings``.

    Returns:
        The evaluation result converted with ``.to_pandas()``.
    """
    print("Running evaluation...")
    metrics = [
        LLMContextRecall(llm=llm),
        FactualCorrectness(llm=llm),
        Faithfulness(llm=llm),
        # Honour the ``emd`` argument; it used to be ignored in favour of the
        # global ``evaluator_embeddings``.
        SemanticSimilarity(embeddings=emd),
    ]
    results = evaluate(dataset=df, metrics=metrics).to_pandas()

    return results



In [12]:


eval_df = pd.DataFrame()
query_df = pd.read_excel(queries_path)

# Initiating test

response_ds = build_eval_ds(queries_df=query_df, evals_df=eval_df)
eval_results = run_eval(response_ds)

# Results are written to <output_folder>/output/<name>.xlsx; create the folder
# up front so a missing directory cannot fail the save after the evaluation ran.
results_dir = os.path.join(output_folder, "output")
os.makedirs(results_dir, exist_ok=True)

filename = input("Save table as: ").strip()

Saved = False
while not Saved:
    if filename:
        print("Filename exists...")
        target_path = os.path.join(results_dir, f"{filename}.xlsx")
        if not os.path.exists(target_path):
            print("Filename is valid...")
            eval_results.to_excel(target_path, index=False)
            print("Saved file...")
            print(f"Save {filename}.xlsx successfully")
            Saved = True

        else:
            print("File already exists")

    else:
        print("Please enter a valid name to save the file.")

    if not Saved:
        # Ask again: without a new prompt, an empty or already-used name would
        # make this loop spin forever on the same value.
        filename = input("Save table as: ").strip()


Building evaluation dataset...
Working on question: 1/12
Working on question: 2/12
Working on question: 3/12
Working on question: 4/12
Working on question: 5/12
Working on question: 6/12
Working on question: 7/12
Working on question: 8/12
Working on question: 9/12
Working on question: 10/12
Working on question: 11/12
Working on question: 12/12
Running evaluation...


Evaluating:  27%|██▋       | 13/48 [00:06<00:19,  1.75it/s]Exception raised in Job[17]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
Evaluating:  35%|███▌      | 17/48 [00:09<00:22,  1.38it/s]Exception raised in Job[9]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
Evaluating:  71%|███████   | 34/48 [00:15<00:07,  1.96it/s]Exception raised in Job[37]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
Evaluating: 100%|██████████| 48/48 [00:44<00:00,  1.07it/s]


Filename exists...
Filename is valid...
Saved file...
Save 2411-llama3.211b-fixedimgs.xlsx successfully


In [13]:
# Show the inputs next to the four metric scores.
display_columns = [
    "user_input",
    "response",
    "reference",
    "context_recall",
    "factual_correctness",
    "faithfulness",
    "semantic_similarity",
]
eval_results[display_columns]

Unnamed: 0,user_input,response,reference,context_recall,factual_correctness,faithfulness,semantic_similarity
0,"What is the purpose of the ""General Power of C...","The ""General Power of Competence"" is a law tha...","The ""General Power of Competence,"" as outlined...",0.75,0.73,0.5,0.971718
1,What are the causes for concern in England's l...,The causes for concern in England's local gove...,The main concerns with England's local governm...,1.0,0.45,1.0,0.958417
2,What percentage of line managers somewhat disa...,"According to the data, 41% of line managers so...",0.41,1.0,,0.857143,0.771277
3,How does the Department of Health and Social C...,The Department of Health and Social Care's res...,The response to the FOI request is not clear o...,1.0,0.89,1.0,0.908341
4,What percentage of line managers strongly disa...,"According to the results, 21% of line managers...",0.21,1.0,,1.0,0.797666
5,What contributed to the Windrush scandal?,"According to the provided text, the operationa...",“Operational and organisational failing” at th...,1.0,0.0,0.4,0.898321
6,What are the delivery and administrative power...,"In England, local government is divided into s...","In England, local governments possess specific...",0.8,0.41,0.75,0.93673
7,What is the primary focus of Reform's new prog...,"The primary focus of Reform's new programme, ""...","The primary focus of Reform's new programme, ""...",1.0,0.74,1.0,0.991734
8,What percentage of respondents strongly or som...,Respondents who strongly or somewhat agreed wi...,62 per cent,1.0,0.0,1.0,0.819587
9,What was the rank of the UK in terms of measle...,The UK's measles immunisation coverage ranked ...,31st,1.0,,1.0,0.769989


In [14]:
print("Summary:")
round_up = 2  # number of decimal places used when printing the averages


def _scores_or_zero(metric):
    """Return the metric column as an array with missing scores replaced by 0."""
    return eval_results[metric].fillna(0).values


def _mean_of(scores):
    """Arithmetic mean of ``scores`` (sum divided by count)."""
    return sum(scores) / len(scores)


# Rows without a score count as 0 in every average below.
context_recall = _scores_or_zero("context_recall")
cr_avg = _mean_of(context_recall)
print(f"Context Recall: {round(cr_avg, round_up)}")

factual_correctness = _scores_or_zero("factual_correctness")
fc_avg = _mean_of(factual_correctness)
print(f"Factual Correctness: {round(fc_avg, round_up)}")

faithfulness = _scores_or_zero("faithfulness")
ff_avg = _mean_of(faithfulness)
print(f"Faithfulness: {round(ff_avg, round_up)}")

semantic_similarity = _scores_or_zero("semantic_similarity")
ss_avg = _mean_of(semantic_similarity)
print(f"Semantic Similarity: {round(ss_avg, round_up)}")

Summary:
Context Recall: 0.96
Factual Correctness: 0.33
Faithfulness: 0.88
Semantic Similarity: 0.89
