This notebook's purpose is to evaluate the performance of RAG models for the Azure architecture.

We have uploaded the Wikipedia files to Azure AI Search and we want to evaluate our RAG models. In this notebook, we will test four configurations:
- Azure AI Search (with simple search) + GPT-3.5
- Azure AI Search (with hybrid search) + GPT-3.5
- Azure AI Search (with hybrid search and a few more "noise" documents containing no answers) + GPT-3.5
- Azure AI Search (with hybrid search and a few more "noise" documents containing no answers)  + Mistral Large

# Import libraries & retrieve questions

In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizedQuery
from dotenv import load_dotenv
import json
from openai import AzureOpenAI
import os
import pandas as pd
import requests
from requests.auth import AuthBase
import time
from tqdm.auto import tqdm

In [None]:
# Load variables from a local .env file (if present) into the process
# environment; the next cell reads its settings through os.environ.
load_dotenv()

In [None]:
# --- Azure OpenAI (chat and embedding deployments) ---
OPENAI_API_ENDPOINT = os.getenv("OPENAI_API_ENDPOINT")
OPENAI_API_VERSION = os.getenv("OPENAI_API_VERSION")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL")

# --- Azure AI Search (index holding the documents) ---
SEARCH_ENDPOINT = os.getenv("SEARCH_ENDPOINT")
SEARCH_KEY = os.getenv("SEARCH_KEY")
SEARCH_INDEX = os.getenv("SEARCH_INDEX")

# --- Mistral endpoint ---
MISTRAL_URL = os.getenv("MISTRAL_URL")
MISTRAL_KEY = os.getenv("MISTRAL_KEY")

# Client used for chat completions and for embedding the questions.
aoai_client = AzureOpenAI(
    api_version=OPENAI_API_VERSION,
    api_key=OPENAI_API_KEY,
    azure_endpoint=OPENAI_API_ENDPOINT,
)

# Client used to query the search index directly.
search_client = SearchClient(
    endpoint=SEARCH_ENDPOINT,
    index_name=SEARCH_INDEX,
    credential=AzureKeyCredential(SEARCH_KEY),
)

In [None]:
# Path of the question benchmark; adapt it to the name you chose for your
# question dataset. The file is read as a ';'-separated CSV.
QUESTION_FILE = "question_benchmark.csv"
question_dataframe = pd.read_csv(QUESTION_FILE, sep=";")

# Generation of answers (GPT-35)

In [None]:
# We ask each of the questions to our RAG system.
# Here, we evaluate the answers of a GPT-35 using simple search and hybrid search.

# Search parameters that only apply to the hybrid (vector + semantic) configuration.
_HYBRID_SEARCH_PARAMETERS = {
    "semantic_configuration": "default",
    "embedding_dependency": {
        "deployment_name": OPENAI_EMBEDDING_MODEL,
        "type": "deployment_name",
    },
}


def _ask_gpt_35(question, query_type, extra_parameters=None):
    """Ask gpt-35-turbo one question, grounded on the Azure AI Search index.

    Both benchmark configurations share the same request; only the search
    ``query_type`` (and, for hybrid search, a few extra search parameters)
    differ, so the payload is built in a single place.

    Args:
        question: The user question, sent as the only chat message.
        query_type: Value of the data source ``query_type`` parameter
            (``"simple"`` or ``"vector_semantic_hybrid"`` in this notebook).
        extra_parameters: Optional dict merged into the data source
            ``parameters`` (used for the hybrid-only settings).

    Returns:
        The content of the first choice returned by the chat completion.
    """
    parameters = {
        "endpoint": SEARCH_ENDPOINT,
        "index_name": SEARCH_INDEX,
        "role_information": "You are an AI assistant that helps people find information.",
        "query_type": query_type,
        "filter": None,
        "fields_mapping": {},
        "in_scope": True,
        "strictness": 3,
        "top_n_documents": 5,
        "authentication": {
            "type": "api_key",
            "key": SEARCH_KEY,
        },
    }
    if extra_parameters:
        parameters.update(extra_parameters)

    completion = aoai_client.chat.completions.create(
        model="gpt-35-turbo",
        messages=[{"role": "user", "content": question}],
        temperature=0.2,
        top_p=1,
        max_tokens=800,
        extra_body={
            "data_sources": [
                {
                    "type": "azure_search",
                    "parameters": parameters,
                }
            ]
        },
    )
    return completion.choices[0].message.content


list_classic_answer = []
list_vector_answer = []

for my_question in question_dataframe["question"].values:
    gpt_35_classic_answer = _ask_gpt_35(my_question, "simple")
    gpt_35_vector_answer = _ask_gpt_35(
        my_question,
        "vector_semantic_hybrid",
        extra_parameters=_HYBRID_SEARCH_PARAMETERS,
    )
    # Pause between questions before issuing the next pair of requests.
    time.sleep(2)
    # Both answers are appended only once both calls succeeded, so the two
    # lists always stay aligned with each other.
    list_classic_answer.append(gpt_35_classic_answer)
    list_vector_answer.append(gpt_35_vector_answer)


# Generation of answers (Mistral)

In [None]:
# Test for Mistral. We use a Mistral Large model deployed in Azure. Since there is no dedicated Python library, we call the API directly with the requests library.
# We create a method to generate an embedding from our question, a method to search our Azure AI Search index, and a prompt to link the two.

class TokenAuth(AuthBase):
    """Bearer-token authentication hook for ``requests``.

    An instance is passed as ``auth=`` to a request; when called it writes
    the stored token into the outgoing request's ``Authorization`` header.
    """

    def __init__(self, token):
        # Token inserted after "Bearer " in the Authorization header.
        self.token = token

    def __call__(self, request):
        """Set ``Authorization: Bearer <token>`` on *request* and return it."""
        bearer_header = {"Authorization": f"Bearer {self.token}"}
        request.headers.update(bearer_header)
        return request
    
# Prompt template sent to Mistral as the user message. It is filled with
# str.format: {document} receives the concatenated retrieved documents and
# {question} receives the benchmark question.
MISTRAL_PROMPT = """
Your task is to answer a complicated factoid question given documents at your disposal.
The questions demand a good understanding of the documents.
Your factoid answer should be specific, concise informations retrieved from the documents.

Documents:::
{document}

Question:::
{question}

Answer:::"""
    
# Embeds a question with Azure OpenAI so that it can be used for hybrid search in Azure AI Search
def generate_embeddings(user_question, aoai_client=aoai_client):
    """Return the embedding of ``user_question``.

    Args:
        user_question: Text to embed.
        aoai_client: Client whose ``embeddings.create`` is called; defaults
            to the notebook-level Azure OpenAI client.

    Returns:
        The ``embedding`` attribute of the first item of the response data,
        computed with the ``OPENAI_EMBEDDING_MODEL`` deployment.
    """
    embedding_response = aoai_client.embeddings.create(
        model=OPENAI_EMBEDDING_MODEL,
        input=[user_question],
    )
    first_item = embedding_response.data[0]
    return first_item.embedding

def retrieve_doc(user_question, nb_doc=5, search_client=search_client):
    """Retrieve the documents most relevant to a question from Azure AI Search.

    The question is embedded and sent as a combined text + vector query with
    semantic ranking (semantic configuration ``"default"``).

    Args:
        user_question (str): Question to search for.
        nb_doc (int): Number of nearest neighbours requested for the vector
            query and maximum number of results requested (``top``).
        search_client: Client whose ``search`` method is called; defaults to
            the notebook-level search client.

    Returns:
        list: The ``content`` field of each result, without duplicates, in
        the order the search service returned them.
    """
    embedded_query = generate_embeddings(user_question)
    vector_query = VectorizedQuery(
        vector=embedded_query,
        k_nearest_neighbors=nb_doc,
        fields="contentVector",
    )
    ai_search_results = search_client.search(
        search_text=user_question,
        search_fields=["content"],
        vector_queries=[vector_query],
        select=["content", "title", "url"],
        query_type="semantic",
        semantic_configuration_name="default",
        top=nb_doc,
    )

    doc_list = [my_result["content"] for my_result in ai_search_results]
    # Deduplicate with dict.fromkeys rather than set(): a set loses the
    # ranking returned by the search service and makes the document order
    # (and therefore the prompt) vary from one run to the next.
    return list(dict.fromkeys(doc_list))

In [None]:
# Upper bound (in seconds) on each Mistral HTTP call, so that a stalled
# connection cannot block the whole benchmark indefinitely.
MISTRAL_TIMEOUT_S = 120

list_mistral_answer = []

for my_question in question_dataframe["question"].values:
    # Retrieve the supporting documents and number them inside the prompt.
    list_doc = retrieve_doc(my_question)
    document_str = "".join(
        f"document number {i}: {doc}\n" for i, doc in enumerate(list_doc)
    )
    response = requests.post(
        MISTRAL_URL + "/v1/chat/completions",
        auth=TokenAuth(MISTRAL_KEY),
        json={
            "messages": [
                {
                    "role": "system",
                    "content": "You are an AI assistant that helps people find information.",
                },
                {
                    "role": "user",
                    "content": MISTRAL_PROMPT.format(document=document_str, question=my_question),
                },
            ],
            "temperature": 0.2,
            "top_p": 1,
            "max_tokens": 800,
        },
        timeout=MISTRAL_TIMEOUT_S,
    )
    # Fail with the HTTP error itself instead of a KeyError on "choices"
    # when the endpoint answers with an error payload.
    response.raise_for_status()

    # Pause between questions before issuing the next request.
    time.sleep(2)
    list_mistral_answer.append(response.json()["choices"][0]["message"]["content"])

# Evaluation of answers

In [None]:
# Prompt used for the evaluation.
# It is a str.format template: {instruction}, {response} and
# {reference_answer} are substituted by evaluate_answer, while the doubled
# braces ({{ ... }}) come out as literal braces in the final prompt.
# The judge is asked to end its output with "[RESULT] <score>"; the cleaning
# cell splits the evaluation text on that "[RESULT]" marker.

EVALUATION_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:"""

In [None]:
def evaluate_answer(question, response, reference_answer, model="gpt-4"):
    """Ask a judge model to grade ``response`` against ``reference_answer``.

    Args:
        question: The instruction/question that was asked.
        response: The answer to evaluate.
        reference_answer: The answer considered worth the maximum score.
        model: Deployment name used as the judge.

    Returns:
        The raw text of the judge's first choice (feedback followed by the
        score requested by ``EVALUATION_PROMPT``).
    """
    judge_prompt = EVALUATION_PROMPT.format(
        instruction=question,
        response=response,
        reference_answer=reference_answer,
    )
    completion = aoai_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": judge_prompt}],
        temperature=0.2,
        top_p=1,
        max_tokens=800,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# Questions and their reference answers, as parallel arrays.
all_questions = question_dataframe["question"].values
all_answer = question_dataframe["answer"].values

# Raw judge outputs, one entry per question and per configuration.
evaluation_classic = []
evaluation_vector = []
evaluation_mistral = []

for row in range(len(question_dataframe)):
    current_question = all_questions[row]
    current_reference = all_answer[row]
    # Pair each output list with the answer it has to grade for this row;
    # the three answers are looked up before any judge call is made.
    to_grade = (
        (evaluation_classic, list_classic_answer[row]),
        (evaluation_vector, list_vector_answer[row]),
        (evaluation_mistral, list_mistral_answer[row]),
    )
    for destination, candidate_answer in to_grade:
        destination.append(
            evaluate_answer(current_question, candidate_answer, current_reference)
        )

# Cleaning answer from the evaluation

In [None]:
# Marker the judge is asked to put between its feedback and its score.
RESULT_MARKER = "[RESULT]"


def _split_evaluation(evaluation):
    """Split a raw judge output into ``(critique, grade)``.

    The critique is everything before the first ``[RESULT]`` marker and the
    grade is the text after it, stripped of surrounding whitespace. If the
    judge did not emit the marker, the whole text is kept as the critique
    and the grade is an empty string, so one malformed evaluation does not
    abort the cleaning of all the others.
    """
    critique, _, grade = evaluation.partition(RESULT_MARKER)
    return critique, grade.strip()


classic_critique = []
classic_grade = []
vector_critique = []
vector_grade = []
mistral_critique = []
mistral_grade = []

for classic_eval, vector_eval, mistral_eval in tqdm(
    zip(evaluation_classic, evaluation_vector, evaluation_mistral),
    total=len(evaluation_classic),
):
    critique, grade = _split_evaluation(classic_eval)
    classic_critique.append(critique)
    classic_grade.append(grade)

    critique, grade = _split_evaluation(vector_eval)
    vector_critique.append(critique)
    vector_grade.append(grade)

    critique, grade = _split_evaluation(mistral_eval)
    mistral_critique.append(critique)
    mistral_grade.append(grade)

In [None]:
# Attach, for each configuration, the generated answer, the judge's critique
# and the judge's grade as new columns of the question dataframe.
# NOTE(review): the "*_anwer" column names look like a typo for "answer";
# they are kept unchanged because they define the schema of the exported CSV.
result_columns = {
    "classic_anwer": list_classic_answer,
    "classic_critique": classic_critique,
    "classic_grade": classic_grade,
    "vector_anwer": list_vector_answer,
    "vector_critique": vector_critique,
    "vector_grade": vector_grade,
    "mistral_anwer": list_mistral_answer,
    "mistral_critique": mistral_critique,
    "mistral_grade": mistral_grade,
}
for column_name, column_values in result_columns.items():
    question_dataframe[column_name] = column_values

In [None]:
# Export the questions with every answer, critique and grade as a
# ';'-separated CSV, without the dataframe index.
COMPARISON_FILE = "comparison.csv"
question_dataframe.to_csv(COMPARISON_FILE, sep=";", index=False)