In [6]:
# !pip install -Uq ragas==0.1.20
# !pip install -Uq langchain==0.2.16 langchain_aws "langchain-community>=0.2.41" langchain-core==0.2.41 langchain-experimental==0.0.60
# !pip install -Uq nest-asyncio

In [7]:
import pandas as pd

# Load the QA evaluation set; lines=True reads the file as JSON Lines
# (one JSON record per line).
df = pd.read_json("data/qa_dataset.jsonl", lines=True)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,question,ground_truth,question_type,contexts
0,"How do temperature, Top K, and Top P parameter...","Temperature, Top K, and Top P are parameters t...",complex,"• If you set a high temperature, the probabili..."
1,How long will Amazon Bedrock support base mode...,Amazon Bedrock will support base models for a ...,simple,• EOL: This version is no longer available for...
2,How does the system handle a scenario where a ...,The system doesn't explicitly show a function ...,complex,"'payment_date': ['2021-10-05', '2021-10-06', '..."
3,What is the purpose of an S3 retrieval node in...,An S3 retrieval node lets you retrieve data fr...,simple,An S3 retrieval node lets you retrieve data fr...
4,How can a developer create a new prompt versio...,"To create a new prompt version, retrieve its i...",complex,make a CreatePromptVersion Agents for Amazon B...


In [8]:
from datasets import Dataset

# Number of leading rows kept for this evaluation run.
subset_length = 10
# Convert the first `subset_length` rows of the DataFrame into a Dataset.
test_dataset = Dataset.from_pandas(df.head(subset_length))

In [9]:
import ast
import re

def clean_string(s):
    """Return ``s`` without non-ASCII characters and with ``'`` swapped for ``"``.

    Args:
        s: Text to normalize.

    Returns:
        str: The characters of ``s`` whose code point is below 128, with every
        single quote replaced by a double quote.
    """
    # Keep only code points 0-127; everything else is dropped outright.
    ascii_only = "".join(ch for ch in s if ord(ch) < 128)
    return ascii_only.replace("'", '"')

def convert_to_list(example):
    """Parse the ``contexts`` field of a dataset row into a Python object.

    The raw value is normalized with ``clean_string`` and then handed to
    ``ast.literal_eval``. If the cleaned text is not a valid Python literal,
    the cleaned string itself is kept as the result.

    Args:
        example: Row mapping that has a ``"contexts"`` entry.

    Returns:
        dict: ``{"contexts": value}`` where ``value`` is the evaluated literal,
        or the cleaned string when evaluation fails.
    """
    cleaned_context = clean_string(example["contexts"])
    try:
        contexts = ast.literal_eval(cleaned_context)
    except Exception:
        # Best-effort parse: fall back to the cleaned text when it is not a
        # literal. Catching Exception (instead of a bare except) lets
        # KeyboardInterrupt and SystemExit propagate, so a long map() run can
        # still be interrupted.
        contexts = cleaned_context
    return {"contexts": contexts}

# Apply convert_to_list to every row. NOTE: this rebinds `test_dataset`, so
# re-running the cell feeds the already-converted rows through the cleaner again.
test_dataset = test_dataset.map(convert_to_list)
print(test_dataset)

Map: 100%|██████████| 10/10 [00:00<00:00, 3060.20 examples/s]

Dataset({
    features: ['question', 'ground_truth', 'question_type', 'contexts'],
    num_rows: 10
})





In [10]:
# Show the first question as the cell output.
test_dataset[0]['question']

"How do temperature, Top K, and Top P parameters interact in Amazon Bedrock's foundation models, and how might adjusting these affect the output when generating text about different types of equines?"

In [11]:
# RAG implementation sample 1
from libs.bedrock_kb_util import context_retrieval_from_kb

# Retrieve passages for the first question. The positional arguments after the
# question are presumably top-k, region, knowledge base id and search type
# (a later cell passes top_k, kb_region, kb_id in those positions) — confirm
# against libs.bedrock_kb_util.
question = test_dataset[0]['question']
search_result = context_retrieval_from_kb(question, 3, 'us-west-2', 'CNDSUOPKAS', 'SEMANTIC')
print(search_result[0])

# Join the 'content' field of every hit into one text, separated by "--" lines.
contexts = "\n--\n".join([result['content'] for result in search_result])

{'index': 1, 'content': 'Randomness and diversity 239        Amazon Bedrock User Guide   â€¢ If you set Top K as 2, the model only considers the top 2 most likely candidates: "horses" and  "zebras."   â€¢ If you set Top P as 0.7, the model only considers "horses" because it is the only candidate that  lies in the top 70% of the probability distribution. If you set Top P as 0.9, the model considers  "horses" and "zebras" as they are in the top 90% of probability distribution.   Length   Foundation models typically support parameters that limit the length of the response. Examples of  these parameters are provided below.   â€¢ Response length â€“ An exact value to specify the minimum or maximum number of tokens to  return in the generated response.   â€¢ Penalties â€“ Specify the degree to which to penalize outputs in a response. Examples include the  following.   â€¢ The length of the response.   â€¢ Repeated tokens in a response.   â€¢ Frequency of tokens in a response.   â€¢ Types of 

In [12]:
# Print the joined retrieval text built in the previous cell.
print(contexts)

Randomness and diversity 239        Amazon Bedrock User Guide   â€¢ If you set Top K as 2, the model only considers the top 2 most likely candidates: "horses" and  "zebras."   â€¢ If you set Top P as 0.7, the model only considers "horses" because it is the only candidate that  lies in the top 70% of the probability distribution. If you set Top P as 0.9, the model considers  "horses" and "zebras" as they are in the top 90% of probability distribution.   Length   Foundation models typically support parameters that limit the length of the response. Examples of  these parameters are provided below.   â€¢ Response length â€“ An exact value to specify the minimum or maximum number of tokens to  return in the generated response.   â€¢ Penalties â€“ Specify the degree to which to penalize outputs in a response. Examples include the  following.   â€¢ The length of the response.   â€¢ Repeated tokens in a response.   â€¢ Frequency of tokens in a response.   â€¢ Types of tokens in a response.   â

In [13]:
import boto3
from botocore.config import Config

# Model id used by generate_answer, and the AWS region for the client.
model_id = "anthropic.claude-3-5-sonnet-20240620-v1:0"
region = 'us-west-2'

# botocore client configuration: target region plus "standard" retry mode
# with max_attempts set to 10.
retry_config = Config(
    region_name=region,
    retries={"max_attempts": 10, "mode": "standard"}
)
# Bedrock runtime client; generate_answer calls converse() on it.
boto3_client = boto3.client("bedrock-runtime", config=retry_config)

In [14]:
def generate_answer(question, contexts):
    """Ask the Bedrock model to answer ``question`` from ``contexts``.

    Sends a single user turn plus a system prompt through
    ``boto3_client.converse`` using the module-level ``model_id``.

    Args:
        question: Question text interpolated into the user prompt.
        contexts: Retrieved context text interpolated into the user prompt.

    Returns:
        The ``text`` of the first content item in the model's output message.
    """
    # The leading spaces on continuation lines and the trailing space after
    # "accurately." are part of the prompt text sent to the model.
    system_prompt = (
        "You are an AI assistant that uses retrieved context to answer questions accurately. \n"
        "    Follow these guidelines:\n"
        "    1. Use the provided context to inform your answers.\n"
        "    2. If the context doesn't contain relevant information, say \"I don't have enough information to answer that.\"\n"
        "    3. Be concise and to the point in your responses."
    )

    user_prompt = (
        f"Context: {contexts}\n"
        "\n"
        f"    Question: {question}\n"
        "\n"
        "    Please answer the question based on the given context."
    )

    conversation = [{'role': 'user', 'content': [{'text': user_prompt}]}]
    reply = boto3_client.converse(
        modelId=model_id,
        messages=conversation,
        system=[{'text': system_prompt}],
    )

    # The answer is the text of the first content item of the output message.
    return reply['output']['message']['content'][0]['text']

In [15]:
# Display the dataset summary (features and row count) as the cell output.
test_dataset

Dataset({
    features: ['question', 'ground_truth', 'question_type', 'contexts'],
    num_rows: 10
})

In [16]:
from tqdm import tqdm
from time import sleep

# Settings passed by process_item to context_retrieval_from_kb:
# region, knowledge base id, and the count argument (3).
kb_region = 'us-west-2'
kb_id = 'CNDSUOPKAS'
top_k = 3

def process_item(item):
    """Retrieve contexts and generate an answer for one dataset row.

    Waits five seconds, queries the knowledge base with the row's question
    (using the module-level ``top_k``, ``kb_region`` and ``kb_id``), then asks
    ``generate_answer`` for a reply based on the retrieved passages.

    Args:
        item: Row mapping with ``'question'``, ``'ground_truth'`` and
            ``'contexts'`` entries.

    Returns:
        dict: The question and ground truth, the row's own contexts under
        ``'original_contexts'``, the retrieved passages under
        ``'retrieved_contexts'`` and the generated ``'answer'``.
    """
    sleep(5)  # Prevent throttling

    query = item['question']
    hits = context_retrieval_from_kb(query, top_k, kb_region, kb_id, 'SEMANTIC')
    passages = [hit['content'] for hit in hits]

    # The generator receives all passages as one text separated by "--" lines.
    merged_passages = "\n--\n".join(passages)
    reply = generate_answer(query, merged_passages)

    return dict(
        question=item['question'],
        ground_truth=item['ground_truth'],
        original_contexts=item['contexts'],
        retrieved_contexts=passages,
        answer=reply,
    )

# Run retrieval + answer generation for every row. process_item sleeps 5 s
# per row, so this cell takes a while.
updated_dataset = test_dataset.map(process_item)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]


KeyboardInterrupt: 

In [None]:
import json
# Destination file for the rows produced by process_item.
output_file = "data/updated_qa_dataset.jsonl"

# Write one JSON object per line; ensure_ascii=False keeps non-ASCII
# characters as-is instead of \u escapes.
with open(output_file, 'w', encoding='utf-8') as f:
    for item in updated_dataset:
        json.dump(item, f, ensure_ascii=False)
        f.write('\n')

print(f"Dataset saved to {output_file}")

In [None]:
import json
from datasets import Dataset

# JSON Lines file read back by read_jsonl below.
input_file = "data/updated_qa_dataset.jsonl"
def read_jsonl(file_path):
    """Yield one parsed JSON value per non-blank line of a JSON Lines file.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Yields:
        The value decoded by ``json.loads`` from each non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            record = line.strip()
            # Blank lines (e.g. an extra newline at the end of the file) hold
            # no record; json.loads("") would raise, so skip them.
            if not record:
                continue
            yield json.loads(record)

# Materialize the generator into a list and build a Dataset from the records.
updated_dataset = Dataset.from_list(list(read_jsonl(input_file)))

# Spot-check the first record.
item = updated_dataset[0]
print(f"Question: {item['question']}")
print(f"Answer: {item['answer']}")
print("-" * 50)

In [None]:
# Display the reloaded dataset as the cell output.
updated_dataset

In [1]:
import json
from datasets import Dataset

# JSON Lines file read back by read_jsonl below.
input_file = "data/updated_qa_dataset.jsonl"
def read_jsonl(file_path):
    """Yield one parsed JSON value per non-blank line of a JSON Lines file.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Yields:
        The value decoded by ``json.loads`` from each non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            record = line.strip()
            # Blank lines (e.g. an extra newline at the end of the file) hold
            # no record; json.loads("") would raise, so skip them.
            if not record:
                continue
            yield json.loads(record)

# Materialize the generator into a list and build a Dataset from the records.
updated_dataset = Dataset.from_list(list(read_jsonl(input_file)))

# Spot-check the first record.
item = updated_dataset[0]
print(f"Question: {item['question']}")
print(f"Answer: {item['answer']}")
print("-" * 50)

  from .autonotebook import tqdm as notebook_tqdm


Question: How do temperature, Top K, and Top P parameters interact in Amazon Bedrock's foundation models, and how might adjusting these affect the output when generating text about different types of equines?
Answer: Based on the provided context, here's how temperature, Top K, and Top P parameters interact in Amazon Bedrock's foundation models, and how adjusting them might affect the output when generating text about different types of equines:

1. Temperature: 
- Lower values increase the likelihood of higher-probability tokens and decrease the likelihood of lower-probability tokens. This would make the model more likely to choose "horses" in the given example.
- Higher values increase the likelihood of lower-probability tokens and decrease the likelihood of higher-probability tokens. This would make the model more likely to consider "zebras" or even "unicorns" in the example.

2. Top K:
- A lower Top K value (e.g., 2) would limit the model to consider only the top K most likely cand

In [2]:
def evaluate(dataset, metrics, llm_id, emb_id, region):
    """Score every row of ``dataset`` with each metric and average the results.

    Before scoring, ``AnswerRelevancy`` instances receive ``llm_id``,
    ``emb_id`` and ``region``; ``Faithfulness`` instances receive ``llm_id``
    and ``region``. Rows whose scoring raises are reported on stdout and left
    out of the average.

    Args:
        dataset: Iterable of rows; each row is passed as-is to ``metric.score``.
        metrics: Metric objects exposing a ``score(row)`` method.
        llm_id (str): LLM model id assigned to the metrics.
        emb_id (str): Embedding model id assigned to ``AnswerRelevancy`` metrics.
        region (str): AWS region assigned to the metrics.

    Returns:
        dict: Maps each metric's class name to the mean of its row scores, or
        to the string ``"No valid scores"`` when no row could be scored.
    """
    summary = {}

    for current in metrics:
        # Push the shared model settings onto the metric types that use them.
        if isinstance(current, AnswerRelevancy):
            current.llm_id = llm_id
            current.emb_id = emb_id
            current.region = region
        if isinstance(current, Faithfulness):
            current.llm_id = llm_id
            current.region = region

        collected = []
        for sample in dataset:
            try:
                collected.append(current.score(sample))
            except Exception as err:
                # A failing row is logged and excluded from the average.
                print(f"Error processing row: {err}")

        label = current.__class__.__name__
        summary[label] = sum(collected) / len(collected) if collected else "No valid scores"

    return summary

In [3]:
from libs.eval_metrics import AnswerRelevancy, Faithfulness

# Model ids (LLM and embedding model) and AWS region handed to the metrics
# and to evaluate().
llm_id = "anthropic.claude-3-5-sonnet-20240620-v1:0"
emb_id = "amazon.titan-embed-text-v2:0"
region = "us-west-2"

# Alternative metric selection, currently disabled:
#metrics = [AnswerRelevancy(llm_id=llm_id, emb_id=emb_id, region=region, strictness=1)]
metrics = [Faithfulness(llm_id=llm_id, region=region)]

def map_dataset(example):
    """Copy a row's fields under the column names used for evaluation.

    Args:
        example: Row mapping with ``"question"``, ``"retrieved_contexts"``,
            ``"original_contexts"``, ``"answer"`` and ``"ground_truth"``.

    Returns:
        dict: The same values keyed as ``"user_input"``,
        ``"retrieved_contexts"``, ``"referenced_contexts"``, ``"response"``
        and ``"reference"``.
    """
    # (target column, source column) pairs, in output order.
    column_pairs = (
        ("user_input", "question"),
        ("retrieved_contexts", "retrieved_contexts"),
        ("referenced_contexts", "original_contexts"),
        ("response", "answer"),
        ("reference", "ground_truth"),
    )
    return {target: example[source] for target, source in column_pairs}

# Add the evaluation column names via map_dataset, then keep only the first
# 2 rows for this run.
dataset = updated_dataset.map(map_dataset).select(range(2))
# Average score per metric, keyed by metric class name.
results = evaluate(dataset, metrics, llm_id, emb_id, region)
print(results)

Map: 100%|██████████| 10/10 [00:00<00:00, 1900.89 examples/s]


[1, 1, 1, 1, 1, 1, 0]
[1]
{'Faithfulness': 0.9285714285714286}


In [None]:
# NOTE(review): `result` is not assigned anywhere in the visible notebook. The
# evaluation cell binds `results`, the plain dict returned by evaluate(), and a
# dict has no `to_pandas` method. As written this cell looks like it raises
# NameError on a fresh run — confirm which object was intended.
result_df = result.to_pandas()
result_df.head()

In [None]:
# Write the evaluation table to CSV without the index column. Depends on
# `result_df` from the previous cell.
result_df.to_csv("data/ragas_evaluation_result.csv", index=False)

In [None]:
# Label-based slice of all rows, columns "context_precision" through
# "context_recall" (inclusive).
# NOTE(review): the evaluation cell only enables Faithfulness — confirm these
# columns exist in `result_df`.
result_df.loc[:, "context_precision":"context_recall"]