In [None]:
# Import all the libraries we need
import ast
import json
from statistics import median

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from langchain.chains import LLMChain
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_community.chat_models import ChatMlflow
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI

In [None]:
# Structure to use to get the output from the evaluation chain
class EvalResultSchema(BaseModel):
    # Shape of the structured verdict requested from the evaluation model.
    # NOTE(review): documented with `#` comments on purpose — a class docstring
    # would presumably be emitted as the schema's `description` and thereby
    # alter the generated format instructions; confirm before adding one.

    # Score for how closely the answer matches the ground truth.
    match_score: int = Field(
        description="A scalar value representing how well the answer matches the ground truth",
    )

    # Free-text justification for the score that was assigned.
    eval_description: str = Field(
        description="An explanation that justifies the score that was assigned for how well the answer matches ground truth",
    )


# Parser bound to the schema above, and the formatting instructions it
# generates for inclusion in the evaluation prompt.
pydantic_parser = PydanticOutputParser(pydantic_object=EvalResultSchema)
format_instructions = pydantic_parser.get_format_instructions()

In [None]:
# Load the model and set up the prompt used for the evaluation

# Uncomment to use the AI gateway instead (and comment out the ChatOpenAI line
# below); change the name of the endpoint as required.
# NOTE: this alternative reads os.environ, so it also needs `import os`.
# eval_model = ChatMlflow(
#     target_uri=os.environ["DOMINO_MLFLOW_DEPLOYMENTS"],
#     endpoint="chat-gpt35turbo-sm",
# )

# temperature=0 keeps the judge as repeatable as the API allows, so the same
# answer pair is not scored differently from one run to the next.
eval_model = ChatOpenAI(model_name="gpt-3.5-turbo-0125", temperature=0)

# The three placeholders are filled per row: {format_instructions} with the
# parser's output-format text, {llm_answer} and {gt_answer} with the answers
# being compared.
EVAL_PROMPT = """
Your goal is to give an integer score in the range of 1 to 10 based on how well the LLM Answer matches the Ground Truth Answer.
While assigning a score, focus only on the facts and entities that match the Ground Truth Answer. Also give higher scores to answers that are terse.
A bad match will have a low integer score and a good match will have a high integer score.

{format_instructions}

LLM Answer:
{llm_answer}

Ground Truth Answer:
{gt_answer}
"""

eval_prompt = ChatPromptTemplate.from_template(template=EVAL_PROMPT)

# extract_match_scores reads the generated reply from each chain result's
# 'text' key, so the chain type is kept as LLMChain.
eval_chain = LLMChain(llm=eval_model, prompt=eval_prompt)

In [43]:
# Plot the distribution of the scores of how well the RAG response matches with the answer provided as ground truth
def plot_match_scores_distributions(graph_rag_scores, rag_scores):
    """Overlay histograms of two sets of match scores and print their medians.

    Args:
        graph_rag_scores: Sequence of numeric match scores for Graph RAG.
        rag_scores: Sequence of numeric match scores for RAG.

    Both histograms are drawn over the same 10 bin edges, computed from the
    combined data. Binning each series independently would give the two
    overlaid histograms different bar positions and widths, making a visual
    comparison misleading.

    An empty series is reported with a message instead of a median, since
    ``statistics.median`` raises on empty input.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # One shared set of bin edges spanning both series.
    combined_scores = list(graph_rag_scores) + list(rag_scores)
    shared_bins = np.histogram_bin_edges(combined_scores, bins=10)

    series = (
        ('Graph RAG', graph_rag_scores, '#aec6cf'),
        ('RAG', rag_scores, '#ff6961'),
    )
    for label, scores, color in series:
        ax.hist(scores, bins=shared_bins, alpha=0.5, color=color, label=label)

    # Titles, axis labels and legend
    ax.set_title('Distribution of Match Scores')
    ax.set_xlabel('Match Score')
    ax.set_ylabel('Frequency')
    ax.legend()

    plt.show()

    # Report the median of each series
    for label, scores, _ in series:
        if len(scores) == 0:
            print(f"No match scores available for {label}")
        else:
            print(f"The median match score for {label} is: {median(scores)}")

In [1]:
# Function to extract the scores from the response from the evaluation chain
def extract_match_scores(results):
    """Pull the ``match_score`` out of every evaluation-chain result.

    Args:
        results: Iterable of chain outputs; each must expose the model's raw
            reply as a string under the ``'text'`` key.

    Returns:
        list: The ``match_score`` value of each result, in input order.

    Raises:
        ValueError: If a reply contains no parseable JSON / dict object.
        KeyError: If a parsed reply has no ``match_score`` entry.
    """
    return [_parse_eval_reply(result['text'])['match_score'] for result in results]


def _parse_eval_reply(text):
    """Parse one raw model reply into a dict without executing it."""
    # SECURITY: the reply is model-generated text, so it must never be passed
    # to eval(). It is parsed as data only: JSON first, then a Python literal
    # as a fallback for replies that use single-quoted dict syntax.
    first, last = text.find('{'), text.rfind('}')
    if first == -1 or last < first:
        raise ValueError(f"No JSON object found in evaluation reply: {text!r}")

    # Slicing to the outermost braces drops Markdown code fences or stray
    # prose that may surround the object.
    payload = text[first:last + 1]
    try:
        parsed = json.loads(payload)
    except json.JSONDecodeError:
        try:
            parsed = ast.literal_eval(payload)
        except (ValueError, SyntaxError, TypeError) as exc:
            raise ValueError(f"Could not parse evaluation reply: {text!r}") from exc

    if not isinstance(parsed, dict):
        raise ValueError(f"Evaluation reply is not an object: {text!r}")
    return parsed

In [48]:
# Function that loads the QA dataset for evaluation and gets the scores from the evaluation chain
def get_match_scores(csv_file_name):
    """Score every row of a QA dataset with the evaluation chain.

    Args:
        csv_file_name: Path of a CSV file that has ``llm_answer`` and
            ``gt_answer`` columns.

    Returns:
        The match scores extracted from the chain's batch results.
    """
    qa_frame = pd.read_csv(csv_file_name)

    # Pair each generated answer with its ground truth, row by row
    answer_pairs = zip(qa_frame['llm_answer'].tolist(), qa_frame['gt_answer'].tolist())

    # One prompt-variable dict per row; the format instructions are identical for all
    eval_inputs = []
    for answer, truth in answer_pairs:
        eval_inputs.append({
            "llm_answer": answer,
            "gt_answer": truth,
            "format_instructions": format_instructions,
        })

    # Run the whole batch through the chain, then reduce the replies to scores
    chain_outputs = eval_chain.batch(eval_inputs)
    return extract_match_scores(chain_outputs)