In this notebook, we perform a more fine-grained evaluation: we evaluate the retriever to see whether it always retrieves the right documents for a given user question. In order to do so, we will ask the judge model to evaluate whether the answer is present in the retrieved documents.

# Import libraries & retrieve questions

In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizedQuery
from dotenv import load_dotenv
import json
from openai import AzureOpenAI
import os
import pandas as pd
import requests
from requests.auth import AuthBase
import time
from tqdm.auto import tqdm

In [None]:
# Load variables from a .env file into the environment before the os.environ lookups below.
load_dotenv()

In [None]:
# --- Azure OpenAI settings (read from the environment) ---
OPENAI_API_ENDPOINT = os.getenv("OPENAI_API_ENDPOINT")
OPENAI_API_VERSION = os.getenv("OPENAI_API_VERSION")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL")

# --- Azure AI Search settings ---
SEARCH_ENDPOINT = os.getenv("SEARCH_ENDPOINT")
SEARCH_KEY = os.getenv("SEARCH_KEY")
SEARCH_INDEX = os.getenv("SEARCH_INDEX")

# --- Mistral endpoint settings ---
MISTRAL_URL = os.getenv("MISTRAL_URL")
MISTRAL_KEY = os.getenv("MISTRAL_KEY")

# Azure OpenAI client shared by the embedding and chat-completion calls.
aoai_client = AzureOpenAI(
    api_version=OPENAI_API_VERSION,
    api_key=OPENAI_API_KEY,
    azure_endpoint=OPENAI_API_ENDPOINT,
)

# Search client bound to the index that the retriever queries.
search_credential = AzureKeyCredential(SEARCH_KEY)
search_client = SearchClient(SEARCH_ENDPOINT, SEARCH_INDEX, search_credential)

In [None]:
# Load the evaluation set (semicolon-separated CSV).
question_dataframe = pd.read_csv("comparison.csv", sep=";")
# Keep only the two columns used in this notebook. The explicit .copy() makes
# this an independent frame, so adding a column to it later does not raise
# pandas' SettingWithCopyWarning.
reduced_dataset = question_dataframe[["question", "answer"]].copy()

In [None]:
# Preview the first rows only instead of rendering the whole frame.
reduced_dataset.head()

## Retriever evaluation
---

Now that we have the questions and answers, we will evaluate the retriever's ability to find chunks of text that contain the answer. To do so, we will ask a powerful AI (GPT-4 in our case) to check whether the given answer is in the retrieved chunks.

In [None]:
# Prompt sent to the judge model. It is filled in with str.format() and expects
# three fields: {documentation} (the retrieved chunks), {question} and
# {response} (the reference answer). The model is instructed to reply with
# only "True" or "False".
EVALUATION_PROMPT = """###Task Description:
Your task is to assess if a given documentation gives you enough information to answer a question.
You will be given a documentation, a factoid question and its answer. If the documentation contains the specific, factual piece of information needed to answer the question answer "True", else, answer "False".
Your answers should only be "True" or "False".

###Documentation to evaluate:
{documentation}

###Question and Answer to look for:
question: {question}
response: {response}

###Your answer:
"""

In [None]:
def generate_embeddings(user_question, aoai_client=aoai_client):
    """
    Embed a question with Azure OpenAI so that it can be used for hybrid search in Azure AI Search.

    input :
        user_question (str) : text to embed
        aoai_client : client used to call the embeddings endpoint (defaults to the notebook-level client)

    output :
        embedding of the question, taken from the first item of the API response
    """
    embedding_response = aoai_client.embeddings.create(
        model=OPENAI_EMBEDDING_MODEL,
        input=[user_question],
    )
    first_item = embedding_response.data[0]
    return first_item.embedding

def retrieve_doc(user_question, nb_doc=5, search_client=search_client):
    """
    Retrieve the documents most relevant to a user question with Azure AI Search.

    The question is embedded with generate_embeddings() and sent as a hybrid
    query: full-text search on the "content" field plus a vector query on the
    "contentVector" field, with query_type="semantic".

    input :
        user_question (str) : question to search for
        nb_doc (int) : number of results requested (used for both `top` and the vector query's k nearest neighbours)
        search_client : Azure AI Search client to query (defaults to the notebook-level client)

    output :
        doc_list (list) : content of the retrieved documents, without duplicates,
            in the order returned by the search service
    """
    embedded_query = generate_embeddings(user_question)
    vector_query = VectorizedQuery(vector=embedded_query, k_nearest_neighbors=nb_doc, fields="contentVector")
    ai_search_results = search_client.search(
        search_text=user_question,
        search_fields=["content"],
        vector_queries=[vector_query],
        select=["content", "title", "url"],
        query_type="semantic",
        semantic_configuration_name="default",
        top=nb_doc,
    )

    doc_list = [my_result["content"] for my_result in ai_search_results]
    # dict.fromkeys() removes duplicates while keeping first-seen order. A set()
    # would discard the ranking and order the documents differently from one
    # kernel run to the next, making the judge prompt non-reproducible.
    return list(dict.fromkeys(doc_list))

In [None]:
# Raw judge replies (one string per row of reduced_dataset, in row order).
list_evaluation = []

# One retrieval and one judge call per row, so this cell is slow: tqdm shows progress.
qa_pairs = zip(reduced_dataset["question"], reduced_dataset["answer"])
for question, answer in tqdm(qa_pairs, total=len(reduced_dataset), desc="Evaluating retriever"):
    list_doc = retrieve_doc(question)
    # Number the retrieved chunks so the judge sees them as separate documents.
    document_str = "".join(
        f"document number {i}: {doc}\n" for i, doc in enumerate(list_doc)
    )
    retriever_evaluation = aoai_client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": EVALUATION_PROMPT.format(documentation=document_str, question=question, response=answer)}],
        temperature=0.2,
        top_p=1,
        max_tokens=800,
    ).choices[0].message.content

    # Pause between calls -- presumably to stay under the service rate limit (TODO confirm).
    time.sleep(2)
    list_evaluation.append(retriever_evaluation)

In [None]:
# The judge replies with the *strings* "True" / "False". bool() of any
# non-empty string is True, so casting would mark every row as found; the text
# has to be parsed instead.
parsed_verdicts = (
    pd.Series(list_evaluation, index=reduced_dataset.index, dtype=object)
    .str.strip()          # surrounding whitespace / newlines
    .str.strip("\"'.")    # tolerate replies such as '"True"' or 'True.'
    .str.lower()
    .eq("true")           # anything else (including a missing reply) counts as False
)
# assign() returns a new frame: no in-place write on a slice, and the cell can be re-run safely.
reduced_dataset = reduced_dataset.assign(retriever_finds_document=parsed_verdicts)

In [None]:
# Save the questions, reference answers and retriever verdicts (semicolon-separated, no index column).
reduced_dataset.to_csv("retriever_evaluation.csv", sep=";", index=False)