<div class="alert alert-block alert-info">

# RAG System Evaluation
    
This notebook is a follow-up to the previous notebook, in which we explored the overall evaluation approach and a RAG system's overall accuracy.

In this notebook, we will take a closer look at specific RAG metrics and explore how different components and configurations can impact overall accuracy.



## Solution architecture
<img src="https://d3q8adh3y5sxpk.cloudfront.net/meetingrecordings/modelevaluation/architecture.png" alt="LLM selection process" width="900" height="550">

Based on the solution architecture, we will experiment with the RAG components below and evaluate their impact on several metrics relevant to RAG.

- 1) Embedding model: amazon.titan-embed-text-v1 vs amazon.titan-e1t-medium 
- 2) Text Splitter: TokenTextSplitter vs CharacterTextSplitter
- 3) Retriever: OpenSearch VectorStoreRetriever search types “similarity” vs “mmr”
- 4) Prompt Template: For each LLM we evaluate two different prompt templates


## RAG evaluation metrics

This notebook explores the following metrics:

Langsmith evaluators: 
-  a. "cot_qa"
-  b. "conciseness"
-  c. "relevance"

Also review https://docs.smith.langchain.com/evaluation/datasets

In [None]:
# install dependencies
%pip install --force-reinstall -r requirements.txt

In [None]:
# restart kernel to ensure proper version of libraries is loaded
from IPython.display import display_html
def restartkernel():
    """Ask the notebook front end to restart the kernel.

    Emits a raw HTML ``<script>`` snippet through ``display_html``; the restart
    itself is carried out by the JavaScript environment that renders the output.
    """
    restart_script = "<script>Jupyter.notebook.kernel.restart()</script>"
    display_html(restart_script, raw=True)
restartkernel()

In [None]:
!pip list | grep -E "awscli|boto3|botocore|langchain|langsmith|plotly|tiktoken|nltk|python-dotenv|xmltodict|requests-aws4auth|pypdf|opensearch-py|sagemaker|nest-asyncio"
# also review requirements.txt for reference if needed

In [None]:
# load environment variables 
import boto3
import os
import botocore
from botocore.config import Config
import langchain
import sagemaker
import pandas as pd

from langchain.llms.bedrock import Bedrock
from langchain.llms import SagemakerEndpoint
from langchain.llms.sagemaker_endpoint import LLMContentHandler
from typing import Dict

import json
import requests
import csv
import time
import pandas as pd
import nltk
import sys

from langchain.llms import Bedrock
from dotenv import load_dotenv, find_dotenv

# Load environment variables stored in the local file dev-langsmith.env.
# override=True lets values from the file replace variables already set in the process.
load_dotenv(find_dotenv('dev-langsmith.env'),override=True)

# Fail early with a readable message when a required setting is not defined.
# An empty value is accepted on purpose: an empty OPENSEARCH_COLLECTION makes a
# later cell create a new collection. Only undefined variables are rejected.
required_env_vars = [
    'OPENSEARCH_COLLECTION',
    'AWS_ACCESS_KEY',
    'AWS_SECRET_TOKEN',
    'REGION',
    'LANGCHAIN_ENDPOINT',
    'LANGCHAIN_API_KEY',
    'LANGCHAIN_PROJECT',
    'LANGCHAIN_TRACING',
]
missing_env_vars = [name for name in required_env_vars if os.getenv(name) is None]
if missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variables (expected in dev-langsmith.env): "
        + ", ".join(missing_env_vars)
    )

session = sagemaker.Session()
bucket = session.default_bucket()

# The V2 tracing flag takes its value from LANGCHAIN_TRACING in the env file;
# the LANGCHAIN_TRACING variable itself is then forced to "false".
os.environ['LANGCHAIN_TRACING_V2'] = os.environ['LANGCHAIN_TRACING']
os.environ["LANGCHAIN_TRACING"]="false"
os.environ["LANGCHAIN_SESSION"] = "rag-eval"

# Initialize Bedrock runtime
# Retry settings for the client: up to 8 attempts per call.
config = Config(retries={'max_attempts': 8})
bedrock_runtime = boto3.client(service_name="bedrock-runtime", config=config)

In [None]:
# Initialize LLMs (Claude-V2, Cohere, LLama2)
# Each model gets its own kwargs dict because the providers name the output-length
# limit differently (max_tokens_to_sample / max_tokens / max_gen_len).
# All models run with temperature 0 and the same 545-token output limit.

## 1a. Initialize Claude-v2
llm01_inference_modifier = {
    "max_tokens_to_sample": 545,
    "temperature": 0,
    "stop_sequences": ["\n\nHuman"],
}
LLM_01_NAME= "anthropic.claude-v2"
llm01 = langchain.llms.bedrock.Bedrock( #create a Bedrock llm client
    model_id=LLM_01_NAME,
    model_kwargs=llm01_inference_modifier
)

## 1b. Initialize Cohere Command
llm02_inference_modifier = { 
    "max_tokens": 545,
    "temperature": 0,    
}
LLM_02_NAME= "cohere.command-text-v14"
llm02 = langchain.llms.bedrock.Bedrock( #create a Bedrock llm client
    model_id=LLM_02_NAME,
    model_kwargs=llm02_inference_modifier
)

## 1c. Initialize Llama
llm03_inference_modifier = { 
    "max_gen_len": 545,
    "top_p": 0.9, 
    "temperature": 0,    
}
LLM_03_NAME= "meta.llama2-13b-chat-v1"
llm03 = langchain.llms.bedrock.Bedrock( #create a Bedrock llm client
    model_id=LLM_03_NAME,
    model_kwargs=llm03_inference_modifier
)

# Models under test, in the order they are evaluated.
llms = [
    llm01,
    llm02,
    llm03
]

## 1d. Initialize eval llm
# Separate, larger Llama 2 model used as the judge (passed as eval_llm to the
# LangSmith evaluation config); it is deliberately not part of `llms`.
inference_modifier = { 
    "max_gen_len": 545,
    "top_p": 0.9, 
    "temperature": 0,    
}
LLM_EVAL_NAME= "meta.llama2-70b-chat-v1"
langchain_eval_llm = langchain.llms.bedrock.Bedrock( #create a Bedrock llm client
    model_id=LLM_EVAL_NAME,
    model_kwargs=inference_modifier
)

In [None]:
## 2a. download ground truth dataset
import xmltodict
url = 'https://d3q8adh3y5sxpk.cloudfront.net/rageval/qsdata_20.xml'

# Send an HTTP GET request to download the file. The timeout keeps the cell from
# hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=60)

# Stop here with a clear message if the download failed; otherwise the cell
# would continue and fail later on an undefined `xml_data`.
if response.status_code != 200:
    raise RuntimeError(
        f"Failed to download ground truth dataset from {url}: HTTP {response.status_code}"
    )
xml_data = xmltodict.parse(response.text)

# Convert the dictionary to a Pandas DataFrame
qa_dataset = pd.DataFrame(xml_data['data']['records'])

# Flatten each record into the plain-string dict used by the rest of the notebook.
# row[0] is the DataFrame index; row[1] is the first column, which holds the parsed
# record (presumably a dict per <record> element; verify against the XML layout).
prompts = []
for row in qa_dataset.itertuples():
    record = row[1]
    prompts.append({
        'prompt': str(record['Question']),
        'context': str(record['Context']),
        'output': str(record['Answer']['question_answer']),
        'page': str(record['Page'])
    })

# example prompt
print(prompts[0])

In [None]:
# 2b. create ground truth dataset in langsmith
from langsmith import Client
from langsmith.utils import LangSmithError

client = Client()
dataset_name = "AMZN_groundtruthdata_20"

# Reuse the dataset when it is already registered; otherwise create it and
# upload one example (question -> reference answer) per ground-truth prompt.
try:
    dataset = client.read_dataset(dataset_name=dataset_name)
except LangSmithError:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Amazon 10k evaluation dataset",
    )
    for qa_pair in prompts:
        client.create_example(
            inputs={"input": qa_pair['prompt']},
            outputs={"answer": qa_pair['output']},
            dataset_id=dataset.id,
        )

    print("Created a new dataset: ", dataset.name)
else:
    print("using existing dataset: ", dataset.name)

In [None]:
# 3. Create token_text_splitter and char_text_splitter for evaluation

## 3a. download context / Amazon annual report
import numpy as np
import pypdf
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from urllib.request import urlretrieve

os.makedirs("data", exist_ok=True)
files = [ "https://d3q8adh3y5sxpk.cloudfront.net/rageval/AMZN-2023-10k.pdf"]
# Save every source file under ./data, named after the last URL path segment.
for source_url in files:
    local_name = source_url.rsplit("/", 1)[-1]
    urlretrieve(source_url, os.path.join("data", local_name))

# Load every PDF found in the data directory.
loader = PyPDFDirectoryLoader("./data/")
documents = loader.load()

# Both splitters use the same size/overlap settings so that only the splitting
# strategy differs between the two chunk sets.
# NOTE(review): the "character" variant is a RecursiveCharacterTextSplitter, while
# the printed label and index names say CharacterTextSplitter. Confirm which is intended.
token_text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=100)
char_text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

token_text_list = token_text_splitter.split_documents(documents)
char_text_list = char_text_splitter.split_documents(documents)

print(f"TokenTextSplitter split documents in to {len(token_text_list)} chunks.\n")
print(f"CharacterTextSplitter split documents in to {len(char_text_list)} chunks.\n")

In [None]:
# 4. create vectors and store each document chunk in it's own index in vector database (OpenSearch Serverless)
## 4a. connect to OpenSearchServerless
import time
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth

host = os.environ['OPENSEARCH_COLLECTION']  # serverless collection endpoint, without https://
print(f"host: {host}")
region = os.environ['REGION']  # e.g. us-east-1
print(f'region: {region}')


service = 'aoss'
credentials = boto3.Session().get_credentials()
# Requests to the collection are signed for `region`, so the collection must live there.
auth = AWSV4SignerAuth(credentials, region, service)

## 4b. create vectordatabase if it does not exist yet
# An empty OPENSEARCH_COLLECTION value means "no collection yet": create the
# encryption/network policies, the collection itself, and a data access policy
# for the calling identity.
if host == '':
    print('creating collection')
    vector_store_name = 'rag-eval'
    encryption_policy_name = "rag-eval-ep"
    network_policy_name = "rag-eval-np"
    access_policy_name = 'rag-eval-ap'
    identity = boto3.client('sts').get_caller_identity()['Arn']

    # Pin the client to `region` so the collection is created in the same region
    # that the request signer above uses.
    aoss_client = boto3.client('opensearchserverless', region_name=region)

    security_policy = aoss_client.create_security_policy(
        name = encryption_policy_name,
        policy = json.dumps(
            {
                'Rules': [{'Resource': ['collection/' + vector_store_name],
                'ResourceType': 'collection'}],
                'AWSOwnedKey': True
            }),
        type = 'encryption'
    )

    network_policy = aoss_client.create_security_policy(
        name = network_policy_name,
        policy = json.dumps(
            [
                {'Rules': [{'Resource': ['collection/' + vector_store_name],
                'ResourceType': 'collection'}],
                'AllowFromPublic': True}
            ]),
        type = 'network'
    )

    collection = aoss_client.create_collection(name=vector_store_name,type='VECTORSEARCH')

    # Poll every 10 seconds until the collection reaches a terminal state.
    while True:
        status = aoss_client.list_collections(collectionFilters={'name':vector_store_name})['collectionSummaries'][0]['status']
        if status in ('ACTIVE', 'FAILED'):
            break
        time.sleep(10)

    # A FAILED collection cannot be used; stop instead of reporting success.
    if status == 'FAILED':
        raise RuntimeError(f'creation of collection {vector_store_name} failed')
    print(f'new collection {vector_store_name} created')

    access_policy = aoss_client.create_access_policy(
        name = access_policy_name,
        policy = json.dumps(
            [
                {
                    'Rules': [
                        {
                            'Resource': ['collection/' + vector_store_name],
                            'Permission': [
                                'aoss:CreateCollectionItems',
                                'aoss:DeleteCollectionItems',
                                'aoss:UpdateCollectionItems',
                                'aoss:DescribeCollectionItems'],
                            'ResourceType': 'collection'
                        },
                        {
                            'Resource': ['index/' + vector_store_name + '/*'],
                            'Permission': [
                                'aoss:CreateIndex',
                                'aoss:DeleteIndex',
                                'aoss:UpdateIndex',
                                'aoss:DescribeIndex',
                                'aoss:ReadDocument',
                                'aoss:WriteDocument'],
                            'ResourceType': 'index'
                        }],
                    'Principal': [identity],
                    'Description': 'Easy data policy'}
            ]),
        type = 'data'
    )

    # Collection endpoint host name (no scheme, no port), built from the collection id
    # and the region the collection was created in.
    host = collection['createCollectionDetail']['id'] + '.' + region + '.aoss.amazonaws.com'
    print(f'new aoss host: {host}')

aospy_client = OpenSearch(
    hosts=[{'host': host, 'port': 443}],
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    pool_maxsize=20,
)
print(f'aospy client:{aospy_client}')

In [None]:
## 4c. Create index for CharacterTextSplitter in Amazon Opensearch Service 

# langchain version
# k-NN index definition: a 1536-dimensional vector field plus the chunk text.
knn_index = {
    "settings": {
        "index.knn": True,
        
    },
    "mappings": {
        "properties": {
            "vector_field": {
                "type": "knn_vector",
                "dimension": 1536,
                "store": True
            },
            "text": {
                "type": "text",
                "store": True
            },
        }
    }
}

index_name = "rag-eval-charactertextsplitter"
# Start from a clean index on every run. A 404 on delete only means there was
# nothing to remove, so it is ignored explicitly; any other failure (auth,
# connectivity, ...) now surfaces instead of being swallowed by a bare except.
aospy_client.indices.delete(index=index_name, ignore=[404])
aospy_client.indices.create(index=index_name,body=knn_index,ignore=400)
# Fetch the index definition to confirm it exists (shown as the cell output).
aospy_client.indices.get(index=index_name)

In [None]:
## 4d. Create index for TokenTextSplitter in Amazon Opensearch Service 

# langchain version
# k-NN index definition: a 1536-dimensional vector field plus the chunk text.
knn_index = {
    "settings": {
        "index.knn": True,
        
    },
    "mappings": {
        "properties": {
            "vector_field": {
                "type": "knn_vector",
                "dimension": 1536,
                "store": True
            },
            "text": {
                "type": "text",
                "store": True
            },
        }
    }
}

index_name = "rag-eval-tokentextsplitter"
# Start from a clean index on every run. A 404 on delete only means there was
# nothing to remove, so it is ignored explicitly; any other failure (auth,
# connectivity, ...) now surfaces instead of being swallowed by a bare except.
aospy_client.indices.delete(index=index_name, ignore=[404])
aospy_client.indices.create(index=index_name,body=knn_index,ignore=400)
# Fetch the index definition to confirm it exists (shown as the cell output).
aospy_client.indices.get(index=index_name)

In [None]:
# 5. Use Titan Embeddings Model to generate embeddings

from langchain.embeddings import BedrockEmbeddings


# # LangChain requires AWS4Auth
# from requests_aws4auth import AWS4Auth
# def get_aws4_auth():
#     region = os.environ.get("Region", os.environ["REGION"])
#     service = "aoss"
#     credentials = boto3.Session().get_credentials()
#     return AWS4Auth(
#         credentials.access_key,
#         credentials.secret_key,
#         region,
#         service,
#         session_token=credentials.token,
#     )
# aws4_auth = get_aws4_auth()

# Embedding client shared by both vector stores; it reuses the Bedrock runtime client
# (and its retry configuration) created earlier. No model_id is passed, so the
# library default applies.
# NOTE(review): the k-NN indices are created with dimension 1536. Confirm the default
# embedding model produces vectors of that size.
bedrock_embeddings = BedrockEmbeddings(client=bedrock_runtime)

In [None]:
## 5a. Use Titan Embeddings Model to generate embeddings for TokenTextSplitter
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings  
from langchain.vectorstores import OpenSearchVectorSearch

# Build the endpoint from `host` rather than the OPENSEARCH_COLLECTION variable:
# when the collection was created by this notebook the variable is empty, while
# `host` holds the endpoint of the new collection. In the normal case both are equal.
full_opensearch_endpoint = 'https://' + host
index_name = "rag-eval-tokentextsplitter"  
# Embed the token-based chunks and bulk-load them into their own index.
vectorstore_token = OpenSearchVectorSearch.from_documents(
            index_name = index_name,
            documents=token_text_list,
            embedding=bedrock_embeddings,
            opensearch_url=full_opensearch_endpoint,
            http_auth=auth,
            use_ssl=True,
            verify_certs=True,
            connection_class=RequestsHttpConnection,
            timeout=60*3,
            bulk_size=1000,
            is_aoss=True
        )  
retriever_token = vectorstore_token.as_retriever()

In [None]:
## 5b. Use Titan Embeddings Model to generate embeddings for CharacterTextSplitter
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings  
from langchain.vectorstores import OpenSearchVectorSearch

# Build the endpoint from `host` rather than the OPENSEARCH_COLLECTION variable:
# when the collection was created by this notebook the variable is empty, while
# `host` holds the endpoint of the new collection. In the normal case both are equal.
full_opensearch_endpoint = 'https://' + host
index_name = "rag-eval-charactertextsplitter"  
# Embed the character-based chunks (char_text_list) and bulk-load them into their own
# index. Indexing the token-based chunks here would make both vector stores identical
# and the splitter comparison meaningless.
vectorstore_character = OpenSearchVectorSearch.from_documents(
            index_name = index_name,
            documents=char_text_list,
            embedding=bedrock_embeddings,
            opensearch_url=full_opensearch_endpoint,
            http_auth=auth,
            use_ssl=True,
            verify_certs=True,
            connection_class=RequestsHttpConnection,
            timeout=60*3,
            bulk_size=1000,
            is_aoss=True
        )  
retriever_character = vectorstore_character.as_retriever()

In [None]:
# 6. create and save prompt templates for eval
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain import hub


### Claude prompt templates
prompt_template_claude_1 = """
        Human: Given report provided, please read it and analyse the content.
        Please answer the following question: {question} basing the answer only on the information from the report
        and return it inside <question_answer></question_answer> XML tags.

        If a particular bit of information is not present, return an empty string.
        Each returned answer should be concise, remove extra information if possible.
        The report will be given between <report></report> XML tags.

        <report>
        {context}
        </report>

        Return the answer inside <question_answer></question_answer> XML tags.
        Assistant:"""

PROMPT_CLAUDE_1 = PromptTemplate(
    template=prompt_template_claude_1, input_variables=["question", "context"]
)

prompt_template_claude_2 = """
        Human: 
        You are a helpful, respectful, and honest assistant, dedicated to providing valuable and accurate information.

        Assistant:
        Understood. I will provide information based on the context given, without relying on prior knowledge.

        Human:
        If you don't see answer in the context just reply "not available" in XML tags.

        Assistant:
        Noted. I will respond with "not available" if the information is not available in the context.

        Human:
        Now read this context and answer the question and return the answer inside <question_answer></question_answer> XML tags. 
        {context}

        Assistant:
        Based on the provided context above and information from the retriever source, I will provide the answer in  and return it inside <question_answer></question_answer> XML tags to the below question
        {question}
        """

PROMPT_CLAUDE_2 = PromptTemplate(
    template=prompt_template_claude_2, input_variables=["question", "context"]
)

### Llama2 prompt templates
prompt_template_llama_1 = """
        [INST] Given report provided, please read it and analyse the content.
        Please answer the following question: {question} basing the answer only on the information from the report
        and return it inside <question_answer></question_answer> XML tags.

        If a particular bit of information is not present, return an empty string.
        Each returned answer should be concise, remove extra information if possible.
        The report will be given between <report></report> XML tags.

        <report>
        {context}
        </report>

        Return the answer inside <question_answer></question_answer> XML tags. [/INST]
        """
PROMPT_LLAMA_1 = PromptTemplate(
    template=prompt_template_llama_1, input_variables=["question", "context"]
)

prompt_template_llama_2 = """
        [INST]
        You are a helpful, respectful, and honest assistant, dedicated to providing valuable and accurate information.
        [/INST]

        Understood. I will provide information based on the context given, without relying on prior knowledge.

        [INST]
        If you don't see answer in the context just reply "not available" in XML tags.
        [/INST]

        Noted. I will respond with "not available" if the information is not available in the context.

        [INST]
        Now read this context and answer the question and return the answer inside <question_answer></question_answer> XML tags. 
        {context}
        [/INST]

        Based on the provided context above and information from the retriever source, I will provide the answer in  and return it inside <question_answer></question_answer> XML tags to the below question
        {question}
        """
PROMPT_LLAMA_2 = PromptTemplate(
    template=prompt_template_llama_2, input_variables=["question", "context"]
)


### Cohere Command prompt templates
prompt_template_command_1 = """
        Human: Given report provided, please read it and analyse the content.
        Please answer the following question: {question} basing the answer only on the information from the report
        and return it inside <question_answer></question_answer> XML tags.

        If a particular bit of information is not present, return an empty string.
        Each returned answer should be concise, remove extra information if possible.
        The report will be given between <report></report> XML tags.

        <report>
        {context}
        </report>

        Return the answer inside <question_answer></question_answer> XML tags.
        Assistant:"""

PROMPT_COMMAND_1 = PromptTemplate(
    template=prompt_template_command_1, input_variables=["question", "context"]
)

prompt_template_command_2 = """
        Human: 
        You are a helpful, respectful, and honest assistant, dedicated to providing valuable and accurate information.

        Assistant:
        Understood. I will provide information based on the context given, without relying on prior knowledge.

        Human:
        If you don't see answer in the context just reply "not available" in XML tags.

        Assistant:
        Noted. I will respond with "not available" if the information is not available in the context.

        Human:
        Now read this context and answer the question and return the answer inside <question_answer></question_answer> XML tags. 
        {context}

        Assistant:
        Based on the provided context above and information from the retriever source, I will provide the answer in  and return it inside <question_answer></question_answer> XML tags to the below question
        {question}
        """
PROMPT_COMMAND_2 = PromptTemplate(
    template=prompt_template_command_2, input_variables=["question", "context"]
)

# generic prompt template for all LLMs
# Fetched by name from the LangChain hub at run time (requires network access).
generic_rag_template = hub.pull("rlm/rag-prompt")

# Registry of every template under evaluation. 'template_name' is the label used in
# test names and tags; 'template' is the prompt object handed to the QA chain.
# The list mixes the generic template with the model-specific ones; the evaluation
# loop pairs every LLM with every entry, including templates written for other models.
prompttemplates = [
    {'template_name': 'generic_rag_template', 'template': generic_rag_template},
    {'template_name': 'prompt_template_claude_1', 'template': PROMPT_CLAUDE_1},
    {'template_name': 'prompt_template_claude_2', 'template': PROMPT_CLAUDE_2},
    {'template_name': 'prompt_template_command_1', 'template': PROMPT_COMMAND_1},
    {'template_name': 'prompt_template_command_2', 'template': PROMPT_COMMAND_2},
    {'template_name': 'prompt_template_llama_1', 'template': PROMPT_LLAMA_1},
    {'template_name': 'prompt_template_llama_2', 'template': PROMPT_LLAMA_2},
]

In [None]:
# 7. create custom evaluators for LangSmith
## 7a) Custom Evaluator with llama_index SemanticSimilarityEvaluator

from typing import Optional
from langsmith.evaluation import EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run
import nest_asyncio
from llama_index.llms import Bedrock
from llama_index.embeddings import BedrockEmbedding
from llama_index import (
    ServiceContext
)

from llama_index.evaluation import (
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    CorrectnessEvaluator,
    SemanticSimilarityEvaluator
)
from llama_index.embeddings import SimilarityMode
from llama_index import Document

class LlamaIndexEvaluator(RunEvaluator):
    """LangSmith run evaluator that scores a run by semantic similarity.

    The generated answer of a run is compared with the reference answer of the
    dataset example using llama_index's ``SemanticSimilarityEvaluator``; the
    similarity score is reported under the feedback key ``"Similarity"``.
    """

    def __init__(self, model: str = "anthropic.claude-v2"):
        """Create the Bedrock LLM, embedding model and llama_index evaluators.

        Args:
            model: Bedrock model id used as the evaluation LLM.
        """

        self.model = model

        self.eval_llm = Bedrock(model=self.model,
                    temperature=0,
                    additional_kwargs={'max_tokens_to_sample': 512,'top_k': 10})

        self.embed_model = BedrockEmbedding().from_credentials(
            model_name='amazon.titan-embed-g1-text-02'
        )

        self.service_context_eval = ServiceContext.from_defaults(
            llm=self.eval_llm, 
            embed_model=self.embed_model, 
        )
        # Only the semantic evaluator is used by evaluate_run; the others are
        # created for convenience and kept as attributes.
        self.faithfulness_evaluator = FaithfulnessEvaluator(service_context=self.service_context_eval)
        self.relevancy_evaluator = RelevancyEvaluator(service_context=self.service_context_eval)
        self.similarity_threshold = 0.8
        self.semantic_evaluator = SemanticSimilarityEvaluator(service_context=self.service_context_eval,
                                                        similarity_mode=SimilarityMode.DEFAULT,
                                                        similarity_threshold=self.similarity_threshold) # 0.8 default
        self.correctness_evaluator = CorrectnessEvaluator(service_context=self.service_context_eval) # encountered parsing errors with this class


    def evaluate_run(self, run: Run, example: Optional[Example] = None) -> EvaluationResult:
        """Score one run against its reference example.

        Args:
            run: The traced run; ``run.outputs["result"]`` holds the generated answer.
            example: The dataset example; ``example.outputs["answer"]`` holds the
                reference answer.

        Returns:
            An ``EvaluationResult`` with key ``"Similarity"`` and the semantic
            similarity score.

        Raises:
            ValueError: If the run has no outputs or no example is supplied.
        """
        if run.outputs is None:
            raise ValueError("Run outputs cannot be None")
        if example is None:
            raise ValueError("Examples cannot be None")

        generated_answer=run.outputs["result"]
        reference_answer=example.outputs["answer"]
        # The question is only logged, so a missing input must not abort the evaluation.
        question = (run.inputs or {}).get("query")

        print(f'example answer value: {str(reference_answer)}')
        print(f'example question value: {str(question)}')
        print(f'run answer value: {str(generated_answer)}')

        # Allows the evaluator to run inside the notebook's already-running event loop.
        nest_asyncio.apply()
        semantic_results = self.semantic_evaluator.evaluate(
            response=generated_answer,
            reference=reference_answer
        )

        return EvaluationResult(key="Similarity", score=semantic_results.score)

In [None]:
## 7b) Custom Evaluator with RAGAS framework for context_recall

from typing import Optional
from langsmith.evaluation import EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run
import nest_asyncio

from datasets import Dataset
import ragas

from ragas import evaluate
from ragas.metrics import (
    context_precision,
    faithfulness,
    context_recall,
    answer_relevancy,
)

class RagasContextRecallEvaluator(RunEvaluator):
    """LangSmith run evaluator that reports the RAGAS ``context_recall`` metric.

    For every run a one-row RAGAS dataset is built from the question, the
    generated answer and the reference answer, and evaluated with
    ``ragas.evaluate``. The score is reported under the key ``"ContextRecall"``.
    """

    def __init__(self, model: str = "anthropic.claude-v2"):
        """Create the Bedrock LLM and embedding model handles.

        ``Bedrock`` and ``BedrockEmbedding`` are not imported in this cell; they
        come from the llama_index imports of the previous evaluator cell.

        Args:
            model: Bedrock model id stored on the instance.
        """

        self.model = model

        # NOTE(review): eval_llm and embed_model are stored but never passed to
        # ragas.evaluate below, so RAGAS runs with its own default models. Confirm
        # whether they should be wired in.
        self.eval_llm = Bedrock(model=self.model,
                    temperature=0,
                    additional_kwargs={'max_tokens_to_sample': 512,'top_k': 10})

        self.embed_model = BedrockEmbedding().from_credentials(
            model_name='amazon.titan-embed-g1-text-02'
        )

    def evaluate_run(self, run: Run, example: Optional[Example] = None) -> EvaluationResult:
        """Compute context recall for one run.

        Args:
            run: The traced run; reads ``run.inputs["query"]`` and
                ``run.outputs["result"]``.
            example: The dataset example; reads ``example.outputs["answer"]``.

        Returns:
            An ``EvaluationResult`` with key ``"ContextRecall"`` and the metric
            value as a float.

        Raises:
            ValueError: If run inputs, run outputs or the example are missing.
        """
        if run.inputs is None:
            raise ValueError("Run inputs cannot be None")
        if run.outputs is None:
            raise ValueError("Run outputs cannot be None")
        if example is None:
            raise ValueError("Examples cannot be None")

        generated_answer=run.outputs["result"]
        reference_answer=example.outputs["answer"]
        question=run.inputs["query"]

        print(f'example answer value: {str(reference_answer)}')
        print(f'example question value: {str(question)}')
        print(f'run answer value: {str(generated_answer)}')

        # Allows RAGAS to run inside the notebook's already-running event loop.
        nest_asyncio.apply()
        # list of metrics we're going to use
        metrics = [
            #faithfulness,
            #answer_relevancy,
            context_recall,
            #context_precision,
            # harmfulness,
        ]

        # NOTE(review): "contexts" is a single empty string, so the metric is computed
        # without the chunks that were actually retrieved. Confirm whether the
        # retrieved documents should be supplied here.
        basic_qa_ragas_dataset = [
            {"question" :question,
             "answer" : generated_answer,
             "contexts" : [""],
             "ground_truths" : [reference_answer]
            }
        ]
        basic_qa_ragas_df = pd.DataFrame(basic_qa_ragas_dataset)
        basic_qa_ragas_df = Dataset.from_pandas(basic_qa_ragas_df)

        # evaluate
        result = evaluate(basic_qa_ragas_df, metrics=metrics)
        context_recall_results_df = result.to_pandas()

        # The dataset has exactly one row; take that value explicitly instead of
        # calling float() on the whole Series, which pandas has deprecated.
        context_recall_score = float(context_recall_results_df['context_recall'].iloc[0])
        return EvaluationResult(key="ContextRecall", score=context_recall_score)

In [None]:
import langsmith
from langchain import hub
from langchain import chat_models, prompts, smith
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.schema import output_parser

def langsmith_evaluate(test_name, dataset_name, tags, chain):
    """Run ``chain`` over a LangSmith dataset and evaluate every run.

    Args:
        test_name: Project name under which the test run is recorded.
        dataset_name: Name of the LangSmith dataset to run on.
        tags: Tags attached to the test run.
        chain: The chain (or chain factory) to evaluate.

    Returns:
        Whatever ``Client.run_on_dataset`` returns for the test run.
    """
    # Built-in LangSmith evaluators, judged by the notebook's evaluation LLM.
    builtin_evaluators = [
        "cot_qa",
        smith.RunEvalConfig.LabeledCriteria("conciseness"),
        smith.RunEvalConfig.LabeledCriteria("relevance"),
    ]
    # Custom evaluators defined earlier in this notebook.
    custom_evaluators = [
        LlamaIndexEvaluator(),
        RagasContextRecallEvaluator(),
    ]
    eval_config = smith.RunEvalConfig(
        evaluators=builtin_evaluators,
        custom_evaluators=custom_evaluators,
        eval_llm=langchain_eval_llm,
    )

    langsmith_client = langsmith.Client()
    return langsmith_client.run_on_dataset(
        dataset_name=dataset_name,
        llm_or_chain_factory=chain,
        evaluation=eval_config,
        project_name=test_name,
        concurrency_level=5,
        verbose=True,
        tags=tags,
    )

vectorstores = [vectorstore_token, vectorstore_character]
overall_results = []
dataset_name="AMZN_groundtruthdata_20"

# Evaluate every LLM with every prompt template against the token-splitter
# vector store and collect the LangSmith results.
for llm in llms:
    for prompttemplate in prompttemplates:
        print(f'llm: {llm.model_id}')
        print(f'prompt template: {prompttemplate["template_name"]}')
        
        prompt = prompttemplate["template"]
        chain_type="stuff"
        search_type="similarity" # alternative: "mmr", or "similarity_score_threshold" (Default: similarity)
        retriever_k = 4 # Amount of documents to return (Default: 4)
        # The three settings below document the available tuning knobs; they are
        # not passed to the retriever in this configuration.
        score_threshold = 0 # Minimum relevance threshold for similarity_score_threshold
        fetch_k = 20 # Amount of documents to pass to MMR algorithm (Default: 20)
        lambda_mult = 0.5 # Diversity of results returned by MMR, 1 for minimum diversity and 0 for maximum. (Default: 0.5)

        test_name=f'LLM_{llm.model_id}_vectorstore_token_template_{str(prompttemplate["template_name"])}_search_{search_type}_chain_{chain_type}_k_{retriever_k}_21'
        k_value = f'k_{retriever_k}'
        chain_type_value = f'chain_{chain_type}'
        tags = [llm.model_id, prompttemplate["template_name"],search_type, chain_type_value, k_value]
        print(test_name)

        # The retriever reads the number of documents from the "k" key; any other
        # key name leaves the default in place and retriever_k has no effect.
        search_kwargs = {
            "k": retriever_k
        }

        retriever = vectorstore_token.as_retriever(search_type = search_type, search_kwargs=search_kwargs)

        qa_chain = RetrievalQA.from_chain_type(
                llm=llm,
                chain_type=chain_type,
                retriever=retriever,
                chain_type_kwargs = {"prompt": prompt}
            )

        chain = qa_chain

        chain_results = langsmith_evaluate(test_name, dataset_name, tags, chain)
        overall_results.append(chain_results)

In [None]:
# LLAMA_INDEX EVAL

## use results from LLMInformationExtraction.ipynb
### query,llm,output,trainingoutput,context,trainingcontext,evaluationmetric,score,feedback
predictions_df = pd.read_csv('eval_run_predictions.csv')
print(f'column names: {predictions_df.columns}')
# len() gives the number of rows; DataFrame.count() would instead return the
# non-null count of every column.
print(f'no of rows: {len(predictions_df)}')

In [None]:
# run evaluation directly with llama_index on an existing dataframe
## Faithfulness: measure if the response from a query engine matches any source nodes
## Relevancy: measure if the response and source nodes match the query
## Correctness: assess the relevance and correctness of a generated answer against a reference answer
## Semantic Similarity: evaluates the quality of a question answering system via semantic similarity

from llama_index.llms import Bedrock
from llama_index.embeddings import BedrockEmbedding
from llama_index import (
    ServiceContext
)

from llama_index.evaluation import (
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    CorrectnessEvaluator,
    SemanticSimilarityEvaluator
)
from llama_index.embeddings import SimilarityMode
from llama_index import Document


# Claude inference settings kept for reference (the evaluation LLM below sets
# the same values inline).
model_kwargs_claude = {"temperature": 0, "top_k": 10, "max_tokens_to_sample": 512}

#LLM_EVAL_NAME= "meta.llama2-70b-chat-v1"
# Evaluation LLM and embedding model shared by all llama_index evaluators.
eval_llm = Bedrock(
    model="anthropic.claude-v2",
    #context_size=512,
    temperature=0,
    additional_kwargs={'max_tokens_to_sample': 512,'top_k': 10},
)

embed_model = BedrockEmbedding().from_credentials(model_name='amazon.titan-embed-g1-text-02')

service_context_eval = ServiceContext.from_defaults(llm=eval_llm, embed_model=embed_model)

# One evaluator per metric, all bound to the same service context.
faithfulness_evaluator = FaithfulnessEvaluator(service_context=service_context_eval)
relevancy_evaluator = RelevancyEvaluator(service_context=service_context_eval)
similarity_threshold = 0.8  # 0.8 default
semantic_evaluator = SemanticSimilarityEvaluator(
    service_context=service_context_eval,
    similarity_mode=SimilarityMode.DEFAULT,
    similarity_threshold=similarity_threshold,
)
correctness_evaluator = CorrectnessEvaluator(service_context=service_context_eval) # encountered parsing errors with this class

def run_evals(qa_df):
    """Score every row of a question/answer dataframe with the llama_index evaluators.

    Uses the module-level ``faithfulness_evaluator``, ``relevancy_evaluator`` and
    ``semantic_evaluator`` (and ``similarity_threshold``) defined in the setup cell.

    Args:
        qa_df: DataFrame with the columns ``query`` (question), ``trainingoutput``
            (reference answer), ``output`` (generated answer) and ``context``
            (retrieved context as a single string).

    Returns:
        DataFrame with one row per input row holding the query, the generated
        answer and the passing flag / feedback / score of each metric. Metrics
        that were not evaluated for a row keep the defaults ``False``,
        ``'not calculated'`` and ``0.0``.
    """
    results_list = []
    for row in qa_df.itertuples(index=False):
        question = row.query
        reference_answer = row.trainingoutput
        generated_answer = row.output

        # A missing context (None/NaN) is treated like an empty one instead of
        # raising on .replace().
        raw_context = row.context if isinstance(row.context, str) else ''
        # NOTE(review): the separator is the literal two-character string "/n",
        # not a newline ("\n") -- confirm against the code that builds the
        # `context` column before changing it.
        retrieved_context = raw_context.replace('[]', '').split("/n")
        #print(f'retrieved context: {retrieved_context}')
        #print(f'retrieved context type: {type(retrieved_context)}')

        # Defaults used when a metric is not evaluated for this row.
        faithfulness = False
        faithfulness_feedback = 'not calculated'
        faithfulness_score = 0.0
        relevancy = False
        relevancy_feedback = 'not calculated'
        relevancy_score = 0.0
        correctness = False
        correctness_feedback = 'not calculated'
        # Was 1.0, which contradicted `correctness = False` and reported a perfect
        # mean for a metric that is never calculated; aligned with the other defaults.
        correctness_score = 0.0

        # str.split always returns at least one element, so checking the first
        # fragment is enough to detect an empty context.
        has_context = retrieved_context[0] != ''
        if has_context:
            faithfulness_results = faithfulness_evaluator.evaluate(
                query=question,
                response=generated_answer,
                contexts=retrieved_context
            )
            relevancy_results = relevancy_evaluator.evaluate(
                query=question,
                response=generated_answer,
                contexts=retrieved_context
            )
            faithfulness = faithfulness_results.passing
            faithfulness_feedback = faithfulness_results.feedback
            faithfulness_score = faithfulness_results.score
            relevancy = relevancy_results.passing
            relevancy_feedback = relevancy_results.feedback
            relevancy_score = relevancy_results.score

        # Semantic similarity only compares the answer with the reference, so it
        # runs for every row regardless of the retrieved context.
        semantic_results = semantic_evaluator.evaluate(
            response=generated_answer,
            reference=reference_answer
        )

        # Correctness evaluation is disabled; the correctness_* columns keep
        # their defaults. To re-enable:
        # correctness_results = correctness_evaluator.evaluate(
        #     query=question,
        #     response=generated_answer,
        #     reference=reference_answer
        # )
        # correctness= correctness_results.passing
        # correctness_feedback= correctness_results.feedback
        # correctness_score= correctness_results.score

        cur_result_dict = {
            "query": question,
            "generated_answer": generated_answer,
            "correctness": correctness,
            "correctness_feedback": correctness_feedback,
            "correctness_score": correctness_score,
            "semantic_similarity": semantic_results.passing,
            "semantic_similarity_threshold": similarity_threshold,
            "semantic_similarity_score": semantic_results.score,
            "faithfulness": faithfulness,
            "faithfulness_feedback": faithfulness_feedback,
            "faithfulness_score": faithfulness_score,
            "relevancy": relevancy,
            "relevancy_feedback": relevancy_feedback,
            "relevancy_score": relevancy_score
        }
        results_list.append(cur_result_dict)
    evals_df = pd.DataFrame(results_list)
    return evals_df

In [None]:
# Score every row of predictions_df with run_evals; the result has one row per prediction.
evals_df = run_evals(predictions_df)

In [None]:
# verify results

# Print the mean score of each evaluation dimension.
# correctness_score holds the default assigned in run_evals, because the
# correctness evaluator call is commented out there.
for label, score_column in (
    ("faithfulness", "faithfulness_score"),
    ("relevancy", "relevancy_score"),
    ("semantic", "semantic_similarity_score"),
    ("correctness", "correctness_score"),
):
    print(f'{label} mean: {evals_df[score_column].mean()}')

In [None]:
# TEST LLAMA_INDEX

In [None]:
## load data
!mkdir -p ./data

from urllib.request import urlretrieve
urls = [
    'https://d3q8adh3y5sxpk.cloudfront.net/rageval/AMZN-2023-10k.pdf',
]

filenames = [
    'AMZN-2023-10k.pdf',
]

data_root = "./data/"

for idx, url in enumerate(urls):
    file_path = data_root + filenames[idx]
    urlretrieve(url, file_path)

In [None]:
from llama_index import (
    SimpleDirectoryReader,
    LLMPredictor,
    ServiceContext,
    get_response_synthesizer,
    set_global_service_context
)
from llama_index.indices.document_summary import DocumentSummaryIndex
import nest_asyncio

nest_asyncio.apply()


In [None]:
from llama_index.llms import Bedrock
from llama_index.embeddings import BedrockEmbedding

# Generation settings for Claude. This dict is the single source of truth for
# the values passed to the llama_index LLM below.
model_kwargs_claude = {
    "temperature": 0,
    "top_k": 10,
    "max_tokens_to_sample": 512
}

llm = Bedrock(model="anthropic.claude-v2",
              #context_size=512,
              temperature=model_kwargs_claude["temperature"],
              # every setting except temperature is forwarded as an additional kwarg
              additional_kwargs={name: value
                                 for name, value in model_kwargs_claude.items()
                                 if name != "temperature"})

# from_credentials is a factory; call it on the class instead of first building
# a throwaway default BedrockEmbedding instance.
embed_model = BedrockEmbedding.from_credentials(
    model_name='amazon.titan-embed-g1-text-02'
)

# Chunking parameters used when documents are split for indexing.
chunk_overlap = 20
chunk_size = 512
# Build the service context once (the earlier duplicate without chunk_overlap was
# overwritten immediately) and register it as the global default.
service_context = ServiceContext.from_defaults(llm=llm,
                                               embed_model=embed_model,
                                               chunk_size=chunk_size,
                                               chunk_overlap=chunk_overlap,
                                            )
set_global_service_context(service_context)



In [None]:
def filename_fn(filename):
    """Build the metadata dict attached to a document loaded from ``filename``.

    Args:
        filename: Path string of the file being loaded.

    Returns:
        Dict with ``file_path`` (the path as given) and ``file_name`` (the path
        with every ``data/`` and ``.pdf`` substring removed).
    """
    short_name = filename.replace('data/', "").replace('.pdf', "")
    return {"file_path": filename, "file_name": short_name}

# Load everything under ./data; filename_fn supplies the metadata for each file.
documents = SimpleDirectoryReader(input_dir="./data", file_metadata=filename_fn).load_data()

In [None]:
#review metadata
# Preview the document at index 50, or the last one if fewer than 51 were loaded,
# so the cell does not raise IndexError on smaller inputs.
sample_index = min(50, len(documents) - 1)
print(documents[sample_index].metadata)

In [None]:
from llama_index import SimpleDirectoryReader
from llama_index.vector_stores import (
    OpensearchVectorStore,
    OpensearchVectorClient,
)
from llama_index import VectorStoreIndex, StorageContext

In [None]:
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth

# --- connection settings ---
host = os.environ['OPENSEARCH_COLLECTION'] # OpenSearch endpoint, for example: my-test-domain.us-east-1.aoss.amazonaws.com
endpoint = f'https://{host}'
print(f'endpoint: {endpoint}')

# --- SigV4 request signing with the current boto3 session credentials ---
service = 'aoss'
region = 'us-east-1'
credentials = boto3.Session().get_credentials()
auth = AWSV4SignerAuth(credentials, region, service)

# --- index layout ---
index_name = "rag-eval-v1"
# OpensearchVectorClient stores text in this field by default
text_field = "content"
# OpensearchVectorClient stores embeddings in this field by default
embedding_field = "embedding"

# --- vector client ---
client = OpensearchVectorClient(
    endpoint=endpoint,
    index=index_name,
    dim=1536,  # embedding dimension passed to the index
    embedding_field=embedding_field,
    text_field=text_field,
    # remaining arguments configure the underlying HTTP connection
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    timeout=10,
)
print(client)

In [None]:
# Wrap the OpenSearch client in a vector store and use it as the storage backend.
vector_store = OpensearchVectorStore(client)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# Build the index from the loaded documents on top of that storage context.
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

In [None]:
# Ask the index a sample question and show the answer text.
sample_question = "Who is Amazon's Senior Vice President and General Counsel?"
query_engine = index.as_query_engine()
res = query_engine.query(sample_question)
res.response

In [None]:
# query with metadata filtering
from llama_index import Document
from llama_index.vector_stores.types import MetadataFilters, ExactMatchFilter, MetadataFilter,FilterOperator
import regex as re

# Create a query engine that only searches certain documents.
# NOTE(review): for the OpenSearch vector store the filter `key` appears to be the
# OpenSearch query type ("term") and `value` the JSON body of that query. Document
# metadata appears to be stored under the "metadata" object, so the field must be
# addressed as "metadata.<name>". An exact match on a dynamically mapped string
# presumably needs the ".keyword" sub-field. The previous filter targeted the bare
# "file_path" field and returned nothing -- TODO confirm the field names against
# the index mapping.
metadata_query_engine = index.as_query_engine(
    filters=MetadataFilters(
        filters=[
            ExactMatchFilter(
                key="term", value='{"metadata.file_path.keyword": "data/AMZN-2023-10k.pdf"}'
            )
            #ExactMatchFilter(key="term", value='{"metadata.file_name.keyword": "AMZN-2023-10k"}')

        ]
    )
)

res = metadata_query_engine.query(
    "who is Amazon's Senior Vice President and General Counsel?"
)
res.response

In [None]:
# check what the llm and embeddings model get to see
from llama_index import Document
from llama_index.schema import MetadataMode

# Show the first document's content as rendered for the LLM and for the embedding model.
document = documents[0]
for heading, mode in (
    ("The LLM sees this: \n", MetadataMode.LLM),
    ("The Embedding model sees this: \n", MetadataMode.EMBED),
):
    print(heading, document.get_content(metadata_mode=mode))

In [None]:
# use Bedrock Knowledgebase retriever
from langchain.retrievers.bedrock import AmazonKnowledgeBasesRetriever

# Replace with the id of an existing Bedrock knowledge base before running.
kb_id = "<knowledge_base_id>"

# Longer connect/read timeouts and no automatic retries for the Bedrock clients.
bedrock_config = Config(connect_timeout=120, read_timeout=120, retries={'max_attempts': 0})
bedrock_client = boto3.client('bedrock-runtime')
bedrock_agent_client = boto3.client("bedrock-agent-runtime",
                              config=bedrock_config)

# Pass the configured agent-runtime client explicitly. Without `client=` the
# retriever builds its own default client and bedrock_config is never applied.
retriever = AmazonKnowledgeBasesRetriever(
        knowledge_base_id=kb_id,
        retrieval_config={"vectorSearchConfiguration": {"numberOfResults": 4}},
        client=bedrock_agent_client,
    )

from langchain.chains import RetrievalQA
# NOTE(review): `llm` was last bound to a llama_index Bedrock LLM in an earlier cell,
# while RetrievalQA is a LangChain chain -- confirm `llm` is a LangChain LLM at this
# point. `claude_prompt` is expected to be defined earlier in the notebook.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": claude_prompt}
)

In [None]:
# New service context for eval
# good blog: https://levelup.gitconnected.com/evaluation-driven-development-the-swiss-army-knife-for-rag-pipelines-dba24218d47e
