In [23]:
import toml
import os
from openai import OpenAI
import pandas as pd
from datasets import Dataset

from langchain_chroma import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from phoenix.otel import register

from ragas.metrics import (
    FactualCorrectness,
    Faithfulness,
    SemanticSimilarity,
    LLMContextRecall,
)
from ragas import evaluate
from ragas import EvaluationDataset
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper


import streamlit as st

# --- Paths -----------------------------------------------------------------
# Every location is derived from the kernel's current working directory.
BASE_DIR = os.getcwd()
_parent_dir = os.path.join(BASE_DIR, "..")

# Folders that live next to the notebook
docs_folder = os.path.join(BASE_DIR, "docs")
prompt_folder = os.path.join(BASE_DIR, "prompts")

# Locations one or two levels above the working directory
secrets_path = os.path.join(_parent_dir, "..", "config", "secrets.toml")
chroma_dir = os.path.join(_parent_dir, "chroma_db")
output_folder = os.path.join(_parent_dir, "eval-ds")
queries_path = os.path.join(output_folder, "Queries.xlsx")

# --- Credentials and OpenAI clients ----------------------------------------
# The API key is read from the TOML secrets file rather than hardcoded here.
_secrets = toml.load(secrets_path)
API_KEY = _secrets["OPENAI_API_KEY"]

embedding_model = "text-embedding-3-large"
embeddings = OpenAIEmbeddings(model=embedding_model, api_key=API_KEY)

client = OpenAI(api_key=API_KEY)

In [24]:

# Vector store: the persisted Chroma collection, queried with `embeddings`
_chroma_settings = {
    "persist_directory": chroma_dir,
    "embedding_function": embeddings,
    "collection_name": "polly-rag",
}
vector_store = Chroma(**_chroma_settings)

# The judge models below are constructed without an explicit api_key,
# presumably relying on this environment variable being set first.
os.environ["OPENAI_API_KEY"] = API_KEY

# LLM judge and judge embeddings, wrapped for ragas
_judge_chat_model = ChatOpenAI(model="gpt-4o")
evaluator_llm = LangchainLLMWrapper(_judge_chat_model)

_judge_embedding_model = OpenAIEmbeddings()
evaluator_embeddings = LangchainEmbeddingsWrapper(_judge_embedding_model)


def query_db(query: str, k: int = 3):
    """Run a similarity search against the module-level ``vector_store``.

    Args:
        query: Text to search for.
        k: Number of results to request. Defaults to 3, the value that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        Whatever ``vector_store.similarity_search`` returns for the query.
    """
    return vector_store.similarity_search(query=query, k=k)


def _read_prompt_template(file_name):
    """Return the text of the prompt template ``file_name`` in ``prompt_folder``."""
    template_path = os.path.join(prompt_folder, file_name)
    with open(template_path, "r", encoding="utf-8") as handle:
        return handle.read()


# Prepare chat function
def generate_response(question):
    """Answer ``question`` using context retrieved from the vector store.

    The retrieved page contents are joined with newlines and substituted into
    the ``system.md`` template; the question is substituted into ``user.md``.

    Returns:
        A ``(response, context)`` tuple: the content of the first completion
        choice and the list of retrieved page contents.
    """
    # Query VectorDB
    retrieved_docs = query_db(question)
    context = []
    for doc in retrieved_docs:
        context.append(doc.page_content)

    system_template = _read_prompt_template("system.md")
    user_template = _read_prompt_template("user.md")

    messages = [
        {
            "role": "system",
            "content": system_template.format(context="\n".join(context)),
        },
        {"role": "user", "content": user_template.format(question=question)},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
    )
    answer = completion.choices[0].message.content

    return answer, context



In [25]:

# Evaluation
# Prepare dataset - one record per query with the columns
# "user_input", "response", "retrieved_contexts" and "reference"
# Metrics: Context Recall, Factual Correctness, Faithfulness, Semantic Similarity

# Iterate through queries - make dataframe with reference, query, response


def build_eval_ds(queries_df, evals_df):
    """Generate a response for every query and package the results for ragas.

    Args:
        queries_df: DataFrame with a "Queries" column (the questions) and a
            "Ground truth" column (the reference answers).
        evals_df: DataFrame that receives the "user_input", "response",
            "retrieved_contexts" and "reference" columns. It is modified in
            place.

    Returns:
        The result of ``EvaluationDataset.from_pandas(evals_df)``.
    """
    print("Building evaluation dataset...")
    total = len(queries_df)
    questions = []
    answers = []
    contexts = []
    references = []

    # Count rows positionally: the index labels are not guaranteed to be
    # 0..n-1 (or even numeric), so they cannot drive the progress counter.
    for position, (_, row) in enumerate(queries_df.iterrows(), start=1):
        print(f"Working on question: {position}/{total}")
        question = row["Queries"]
        answer, context = generate_response(question)

        questions.append(question)
        answers.append(answer)
        contexts.append(context)
        # Ground-truth cells may be non-string (e.g. numeric), so stringify.
        references.append(str(row["Ground truth"]))

    evals_df["user_input"] = questions
    evals_df["response"] = answers
    evals_df["retrieved_contexts"] = contexts
    evals_df["reference"] = references

    return EvaluationDataset.from_pandas(evals_df)


def run_eval(df, llm=evaluator_llm, emd=evaluator_embeddings):
    """Score an evaluation dataset with the notebook's four ragas metrics.

    Args:
        df: Dataset handed to ``ragas.evaluate``.
        llm: Judge LLM used by the context-recall, factual-correctness and
            faithfulness metrics. Defaults to ``evaluator_llm``.
        emd: Embeddings used by the semantic-similarity metric. Defaults to
            ``evaluator_embeddings``.

    Returns:
        The evaluation result converted with ``.to_pandas()``.
    """
    print("Running evaluation...")
    metrics = [
        LLMContextRecall(llm=llm),
        FactualCorrectness(llm=llm),
        Faithfulness(llm=llm),
        # Honour the `emd` argument; the module-level embeddings used to be
        # referenced here directly, which silently ignored the parameter.
        SemanticSimilarity(embeddings=emd),
    ]
    results = evaluate(dataset=df, metrics=metrics).to_pandas()

    return results



In [26]:


eval_df = pd.DataFrame()
query_df = pd.read_excel(queries_path)

# Initiating test

response_ds = build_eval_ds(queries_df=query_df, evals_df=eval_df)
eval_results = run_eval(response_ds)

# Create the destination up front so the results of the evaluation run are
# not lost to a missing-directory error at save time.
output_dir = os.path.join(output_folder, "output")
os.makedirs(output_dir, exist_ok=True)

Saved = False
while not Saved:
    # Prompt inside the loop: a rejected name (empty or already taken) must
    # be re-entered, otherwise the loop can never terminate.
    filename = input("Save table as: ").strip()

    if not filename:
        print("Please enter a valid name to save the file.")
        continue

    print("Filename exists...")
    target_path = os.path.join(output_dir, f"{filename}.xlsx")

    # Never overwrite an earlier results file; ask for another name instead.
    if os.path.exists(target_path):
        print("File already exists")
        continue

    print("Filename is valid...")
    eval_results.to_excel(target_path, index=False)
    print("Saved file...")
    print(f"Save {filename}.xlsx successfully")
    Saved = True


Building evaluation dataset...
Working on question: 1/12
Working on question: 2/12
Working on question: 3/12
Working on question: 4/12
Working on question: 5/12
Working on question: 6/12
Working on question: 7/12
Working on question: 8/12
Working on question: 9/12
Working on question: 10/12
Working on question: 11/12
Working on question: 12/12
Running evaluation...


Evaluating:  27%|██▋       | 13/48 [00:06<00:24,  1.42it/s]Exception raised in Job[17]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
Evaluating:  29%|██▉       | 14/48 [00:06<00:19,  1.71it/s]Exception raised in Job[9]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
Evaluating:  52%|█████▏    | 25/48 [00:14<00:14,  1.64it/s]Exception raised in Job[33]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
Evaluating:  65%|██████▍   | 31/48 [00:16<00:06,  2.66it/s]Exception raised in Job[37]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting 

Filename exists...
Filename is valid...
Saved file...
Save init-test-1.xlsx successfully


In [27]:
# Columns to display: the inputs followed by the four metric scores
_display_columns = [
    "user_input",
    "response",
    "reference",
    "context_recall",
    "factual_correctness",
    "faithfulness",
    "semantic_similarity",
]
eval_results[_display_columns]

Unnamed: 0,user_input,response,reference,context_recall,factual_correctness,faithfulness,semantic_similarity
0,"What is the purpose of the ""General Power of C...","The ""General Power of Competence"" was introduc...","The ""General Power of Competence,"" as outlined...",0.0,0.69,0.0,0.971981
1,What are the causes for concern in England's l...,The causes for concern in England's local gove...,The main concerns with England's local governm...,1.0,0.76,0.904762,0.975568
2,What percentage of line managers somewhat disa...,"According to the Reform/CSW survey, 41% of lin...",0.41,1.0,,1.0,0.777819
3,How does the Department of Health and Social C...,"According to the response to the FOI request, ...",The response to the FOI request is not clear o...,1.0,0.67,1.0,0.903955
4,What percentage of line managers strongly disa...,"According to the Reform /CSW survey, 41% of li...",0.21,1.0,,0.5,0.762745
5,What contributed to the Windrush scandal?,The Windrush scandal was primarily contributed...,“Operational and organisational failing” at th...,0.0,0.0,0.04,0.784868
6,What are the delivery and administrative power...,Local government in England has various delive...,"In England, local governments possess specific...",0.0,0.44,0.060606,0.962029
7,What is the primary focus of Reform's new prog...,"The primary focus of Reform's new programme, ""...","The primary focus of Reform's new programme, ""...",0.75,0.74,1.0,0.982347
8,What percentage of respondents strongly or som...,The provided content does not include informat...,62 per cent,0.0,,0.0,0.783762
9,What was the rank of the UK in terms of measle...,"In 2021, the UK ranked 31st in terms of measle...",31st,1.0,,1.0,0.769668


In [43]:
print("Summary:")
round_up = 2  # number of decimals shown in the summary


def _metric_scores_and_mean(column):
    """Return ``(scores, mean)`` for one metric column of ``eval_results``.

    Rows without a score (NaN) are counted as 0 for every metric. Previously
    only factual_correctness was filled, so a NaN in any other metric would
    have turned its printed average into ``nan``.
    """
    scores = eval_results[column].fillna(0)
    return scores.values, scores.mean()


context_recall, cr_avg = _metric_scores_and_mean("context_recall")
print(f"Context Recall: {round(cr_avg, round_up)}")

factual_correctness, fc_avg = _metric_scores_and_mean("factual_correctness")
print(f"Factual Correctness: {round(fc_avg, round_up)}")

faithfulness, ff_avg = _metric_scores_and_mean("faithfulness")
print(f"Faithfulness: {round(ff_avg, round_up)}")

semantic_similarity, ss_avg = _metric_scores_and_mean("semantic_similarity")
print(f"Semantic Similarity: {round(ss_avg, round_up)}")

Summary:
Context Recall: 0.65
Factual Correctness: 0.34
Faithfulness: 0.63
Semantic Similarity: 0.88
