In [None]:
# !pip install -Uq ragas==0.1.20
# !pip install -Uq langchain==0.2.16 langchain_aws "langchain-community>=0.2.41" langchain-core==0.2.41 langchain-experimental==0.0.60
# !pip install -Uq nest-asyncio

In [None]:
import pandas as pd

# Load the QA set; lines=True tells pandas the file is JSON Lines (one record per line).
df = pd.read_json("data/qa_dataset.jsonl", lines=True)
# Preview the first rows.
df.head()

In [None]:
from datasets import Dataset

# Number of leading DataFrame rows used for this run.
subset_length = 10
# Build a `datasets.Dataset` from only the first `subset_length` rows.
test_dataset = Dataset.from_pandas(df.head(subset_length))

In [None]:
import ast
import re

def clean_string(s):
    """Return `s` with non-ASCII characters removed and every `'` replaced by `"`.

    The quote replacement is unconditional, so apostrophes inside the text
    are rewritten as well.
    """
    ascii_only = re.sub(r'[^\x00-\x7F]+', '', s)
    return ascii_only.replace("'", '"')

def convert_to_list(example):
    """Parse the stringified `contexts` field of a row into a Python object.

    Parsing is attempted on the untouched text first, because `clean_string`
    drops non-ASCII characters and rewrites apostrophes, which can corrupt a
    literal that was valid to begin with. Only when that fails is the cleaned
    text tried. If neither parses, the cleaned text itself is returned
    (best effort, same fallback value as before).

    Args:
        example: Mapping with a "contexts" entry. A non-string value is
            returned unchanged instead of being passed to the string cleanup.

    Returns:
        dict: {"contexts": <parsed object, or the cleaned string on failure>}
    """
    raw_contexts = example["contexts"]

    # Already-structured values (e.g. a list) need no parsing.
    if not isinstance(raw_contexts, str):
        return {"contexts": raw_contexts}

    # The exception types ast.literal_eval is documented to raise on malformed
    # input; anything else (KeyboardInterrupt, real bugs) now propagates.
    parse_errors = (ValueError, TypeError, SyntaxError, MemoryError, RecursionError)

    try:
        return {"contexts": ast.literal_eval(raw_contexts)}
    except parse_errors:
        pass

    cleaned_context = clean_string(raw_contexts)
    try:
        contexts = ast.literal_eval(cleaned_context)
    except parse_errors:
        # Still unparseable: keep the cleaned text rather than failing the row.
        contexts = cleaned_context
    return {"contexts": contexts}

# Run convert_to_list on every row of the dataset.
test_dataset = test_dataset.map(convert_to_list)
# Print the dataset summary after the conversion.
print(test_dataset)

In [None]:
# Show the question text of the first row.
test_dataset[0]['question']

In [None]:
# RAG implementation sample 1
from libs.bedrock_kb_util import context_retrieval_from_kb

# Use the first row's question for a single retrieval try-out.
question = test_dataset[0]['question']
# Positional arguments line up with the later batch call, which passes
# (question, top_k, kb_region, kb_id, 'SEMANTIC').
search_result = context_retrieval_from_kb(question, 3, 'us-west-2', 'CNDSUOPKAS', 'SEMANTIC')
# Inspect the first hit; indexing element 0 assumes at least one result came back.
print(search_result[0])

# Concatenate each result's 'content' value, separated by a "--" line.
contexts = "\n--\n".join([result['content'] for result in search_result])

In [None]:
# Print the joined context text built from the retrieval results.
print(contexts)

In [None]:
import boto3
from botocore.config import Config

# Model identifier passed as modelId by generate_answer, and the AWS region for the client.
model_id = "anthropic.claude-3-5-sonnet-20240620-v1:0"
region = 'us-west-2'

# Client configuration: target region plus retries in "standard" mode with max_attempts=10.
retry_config = Config(
    region_name=region,
    retries={"max_attempts": 10, "mode": "standard"}
)
# Module-level bedrock-runtime client; generate_answer reads this name directly.
boto3_client = boto3.client("bedrock-runtime", config=retry_config)

In [None]:
def generate_answer(question, contexts):
    """Answer `question` with the Bedrock model, using `contexts` as grounding text.

    Sends a single user turn plus a system prompt through the module-level
    `boto3_client` (Converse API) with the module-level `model_id`.

    Args:
        question: The question text inserted into the user prompt.
        contexts: The retrieved context text inserted into the user prompt.

    Returns:
        The 'text' of the first content item in the model's reply message.
    """
    instructions = """You are an AI assistant that uses retrieved context to answer questions accurately. 
    Follow these guidelines:
    1. Use the provided context to inform your answers.
    2. If the context doesn't contain relevant information, say "I don't have enough information to answer that."
    3. Be concise and to the point in your responses."""

    prompt = f"""Context: {contexts}

    Question: {question}

    Please answer the question based on the given context."""

    user_turn = {'role': 'user', 'content': [{'text': prompt}]}
    system_blocks = [{'text': instructions}]

    reply = boto3_client.converse(
        modelId=model_id,
        messages=[user_turn],
        system=system_blocks,
    )

    # The reply text lives in the first content item of the output message.
    return reply['output']['message']['content'][0]['text']

In [None]:
# Display the dataset summary before the retrieval/generation pass.
test_dataset

In [None]:
from tqdm import tqdm
from time import sleep

# Knowledge-base settings read by process_item when it calls context_retrieval_from_kb.
kb_region = 'us-west-2'
kb_id = 'CNDSUOPKAS'
top_k = 3  # passed as the second positional argument of the retrieval call

def process_item(item):
    """Retrieve contexts and generate an answer for a single dataset row.

    Waits five seconds, queries the knowledge base with the row's question
    (using the module-level `top_k`, `kb_region`, `kb_id`), then asks
    `generate_answer` for an answer based on the joined result contents.

    Args:
        item: Row with 'question', 'ground_truth' and 'contexts' entries.

    Returns:
        dict with keys 'question', 'ground_truth', 'original_contexts',
        'retrieved_contexts' (list of result contents) and 'answer'.
    """
    sleep(5)  # Prevent throttling
    query = item['question']
    hits = context_retrieval_from_kb(query, top_k, kb_region, kb_id, 'SEMANTIC')

    retrieved = []
    for hit in hits:
        retrieved.append(hit['content'])

    # Generate before assembling the record, matching the original call order.
    generated = generate_answer(query, "\n--\n".join(retrieved))

    record = dict(
        question=item['question'],
        ground_truth=item['ground_truth'],
        original_contexts=item['contexts'],
        retrieved_contexts=retrieved,
        answer=generated,
    )
    return record

# Run process_item over every row; each call starts with sleep(5), so expect at least ~5 s per row.
updated_dataset = test_dataset.map(process_item)

In [None]:
import json
# Destination file for the rows enriched with retrieved contexts and answers.
output_file = "data/updated_qa_dataset.jsonl"

with open(output_file, 'w', encoding='utf-8') as f:
    for item in updated_dataset:
        # One JSON object per line; ensure_ascii=False writes non-ASCII characters as-is.
        json.dump(item, f, ensure_ascii=False)
        f.write('\n')

print(f"Dataset saved to {output_file}")

In [None]:
import json
from datasets import Dataset

input_file = "data/updated_qa_dataset.jsonl"
def read_jsonl(file_path):
    """Yield one parsed JSON value per non-blank line of a JSON Lines file.

    Args:
        file_path: Path of the UTF-8 encoded .jsonl file to read.

    Yields:
        The value decoded by json.loads from each non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            record = line.strip()
            # Blank lines (e.g. an extra newline at the end of the file) hold no
            # record; json.loads('') would raise, so skip them.
            if not record:
                continue
            yield json.loads(record)

# Materialize the generator into a list and rebuild the Dataset from the saved file.
updated_dataset = Dataset.from_list(list(read_jsonl(input_file)))

# Spot-check the first record's question and generated answer.
item = updated_dataset[0]
print(f"Question: {item['question']}")
print(f"Answer: {item['answer']}")
print("-" * 50)

In [None]:
# Display the summary of the dataset reloaded from disk.
updated_dataset

In [None]:
# import json
# import numpy as np
# import boto3
# from botocore.config import Config

# class Faithfulness:
#     def __init__(self, llm_id, emb_id, region):
#         self.llm_id = llm_id
#         self.emb_id = emb_id
#         self.region = region
#         retry_config = Config(
#             region_name=region,
#             retries={"max_attempts": 10, "mode": "standard"}
#         )
#         self.boto3_client = boto3.client("bedrock-runtime", config=retry_config)
#         self.tool_config = self.init_tool()

#     def init_tool(self):
#         tool_config = {
#             "tools": [
#                 {
#                     "toolSpec": {
#                         "name": "StatementGenerator",
#                         "description": "Generates simpler statements from paragraphs.",
#                         "inputSchema": {
#                             "json": {
#                                 "type": "object",
#                                 "properties": {
#                                     "paragraph_index": {
#                                         "type": "integer",
#                                         "description": "The index of the original paragraph"
#                                     },
#                                     "simpler_statements": {
#                                         "type": "array",
#                                         "items": {
#                                             "type": "string"
#                                         },
#                                         "description": "An array of simpler statements derived from the original paragraph"
#                                     }
#                                 },
#                                 "required": ["paragraph_index", "simpler_statements"]
#                             }
#                         }
#                     }
#                 },
#                 {
#                     "toolSpec": {
#                         "name": "FaithfulnessChecker",
#                         "description": "Checks the faithfulness of statements based on a given context.",
#                         "inputSchema": {
#                             "json": {
#                                 "type": "object",
#                                 "properties": {
#                                     "statement": {
#                                         "type": "string",
#                                         "description": "The statement to check for faithfulness"
#                                     },
#                                     "reason": {
#                                         "type": "string",
#                                         "description": "The reason for the verdict"
#                                     },
#                                     "verdict": {
#                                         "type": "integer",
#                                         "description": "1 if the statement is faithful, 0 if not"
#                                     }
#                                 },
#                                 "required": ["statement", "reason", "verdict"]
#                             }
#                         }
#                     }
#                 }
#             ]
#         }
#         return tool_config

#     def create_message_format(self, sys_template, user_template):
#         sys_prompt = [{"text": sys_template}]
#         usr_prompt = [{"role": "user", "content": [{"text": user_template}]}]
#         return sys_prompt, usr_prompt

#     def converse_with_bedrock_tools(self, sys_prompt, usr_prompt):
#         inference_config = {"temperature": 0.0, "topP": 0.1}
#         response = self.boto3_client.converse(
#             modelId=self.llm_id,
#             messages=usr_prompt,
#             system=sys_prompt,
#             toolConfig=self.tool_config,
#             inferenceConfig=inference_config
#         )
#         return response

#     def parse_tool_use(self, message):
#         stop_reason = message['stopReason']
#         if stop_reason == 'tool_use':
#             tool_requests = message['output']['message']['content']
#             results = []
#             for tool_request in tool_requests:
#                 if 'toolUse' in tool_request:
#                     tool = tool_request['toolUse']
#                     results.append(tool['input'])
#             return results
#         return None

#     def segment_paragraphs(self, text):
#         paragraphs = text.split('\n\n')
#         paragraphs = [p.strip() for p in paragraphs if p.strip()]
#         return paragraphs

#     def generate_statements(self, question, answer):
#         sys_template = """
#         Given a question, an answer, and paragraphs from the answer, analyze each paragraph and break it down into one or more fully understandable statements while ensuring no pronouns are used in each statement. Use the StatementGenerator tool for each paragraph.
#         """
#         paragraphs = self.segment_paragraphs(answer)
#         paragraphs_str = '\n'.join([f"{i}: {p}" for i, p in enumerate(paragraphs)])
#         user_template = f"""
#         Question: {question}
#         Answer: {answer}
#         Paragraphs:
#         {paragraphs_str}
#         Use the StatementGenerator tool for each paragraph.
#         """
#         sys_prompt, user_prompt = self.create_message_format(sys_template, user_template)
#         response = self.converse_with_bedrock_tools(sys_prompt, user_prompt)
#         output = self.parse_tool_use(response)

#         statements = []
#         if output:
#             for item in output:
#                 statements.extend(item['simpler_statements'])
#         return statements

#     def check_faithfulness(self, context, statements):
#         sys_template = """
#         Your task is to judge the faithfulness of a series of statements based on given paragraphs. For each statement, use the FaithfulnessChecker tool to determine if the statement can be directly inferred from any of the paragraphs.
#         """
#         paragraphs = self.segment_paragraphs(context)
#         paragraphs_str = json.dumps(paragraphs, ensure_ascii=False)
#         statements_str = json.dumps(statements, ensure_ascii=False)
#         user_template = f"""
#         Paragraphs: {paragraphs_str}
#         Statements: {statements_str}
#         Use the FaithfulnessChecker tool for each statement.
#         """
#         sys_prompt, user_prompt = self.create_message_format(sys_template, user_template)
#         response = self.converse_with_bedrock_tools(sys_prompt, user_prompt)
#         output = self.parse_tool_use(response)

#         verdicts = []
#         if output:
#             for item in output:
#                 verdicts.append(item['verdict'])
#         return verdicts

#     def score(self, row):
#         question = row['user_input']
#         answer = row['response']
#         context = '\n'.join(row['retrieved_contexts'])
#         statements = self.generate_statements(question, answer)
#         if not statements:
#             return 0.0

#         verdicts = self.check_faithfulness(context, statements)
#         if not verdicts:
#             return 0.0

#         faithful_statements = sum(verdicts)
#         total_statements = len(verdicts)
#         score = faithful_statements / total_statements
#         return score

In [1]:
import json
from datasets import Dataset

input_file = "data/updated_qa_dataset.jsonl"
def read_jsonl(file_path):
    """Yield one parsed JSON value per non-blank line of a JSON Lines file.

    Args:
        file_path: Path of the UTF-8 encoded .jsonl file to read.

    Yields:
        The value decoded by json.loads from each non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            record = line.strip()
            # Blank lines (e.g. an extra newline at the end of the file) hold no
            # record; json.loads('') would raise, so skip them.
            if not record:
                continue
            yield json.loads(record)

# Rebuild the Dataset from the saved JSONL file (this cell repeats the earlier load step,
# so the evaluation part can start from a fresh kernel).
updated_dataset = Dataset.from_list(list(read_jsonl(input_file)))

# Spot-check the first record's question and generated answer.
item = updated_dataset[0]
print(f"Question: {item['question']}")
print(f"Answer: {item['answer']}")
print("-" * 50)

  from .autonotebook import tqdm as notebook_tqdm


Question: How do temperature, Top K, and Top P parameters interact in Amazon Bedrock's foundation models, and how might adjusting these affect the output when generating text about different types of equines?
Answer: Based on the provided context, here's how temperature, Top K, and Top P parameters interact in Amazon Bedrock's foundation models, and how adjusting them might affect the output when generating text about different types of equines:

1. Temperature: 
- Lower values increase the likelihood of higher-probability tokens and decrease the likelihood of lower-probability tokens. This would make the model more likely to choose "horses" in the given example.
- Higher values increase the likelihood of lower-probability tokens and decrease the likelihood of higher-probability tokens. This would make the model more likely to consider "zebras" or even "unicorns" in the example.

2. Top K:
- A lower Top K value (e.g., 2) would limit the model to consider only the top K most likely cand

In [2]:
def evaluate(dataset, metrics, llm_id, emb_id, region):
    """
    Score every row of the dataset with each metric and average the scores.

    Args:
    dataset (Iterable[Dict]): Rows to score; each row is passed unchanged to `metric.score(row)`.
    metrics (List): Metric objects exposing a `score(row)` method.
    llm_id (str): ID of the LLM model to use.
    emb_id (str): ID of the embeddings model to use.
    region (str): AWS region to use for Bedrock.

    Returns:
    Dict: Maps each metric's class name to the mean of the scores of the rows
    that were scored without error, or to the string "No valid scores" when no
    row could be scored. Rows whose scoring raises are reported via print and
    left out of the mean. Two metrics of the same class share one key, so the
    later one overwrites the earlier one.
    """
    # AnswerRelevancy is imported in a different cell than this definition. Look it
    # up at call time so evaluating other metrics does not die with NameError when
    # that import has not been executed.
    answer_relevancy_cls = globals().get("AnswerRelevancy")

    results = {}

    for metric in metrics:
        if answer_relevancy_cls is not None and isinstance(metric, answer_relevancy_cls):
            metric.llm_id = llm_id
            metric.emb_id = emb_id
            metric.region = region

        scores = []
        for row in dataset:
            try:
                scores.append(metric.score(row))
            except Exception as e:
                # Best effort: report the failure and keep scoring the remaining rows.
                print(f"Error processing row: {e}")
                continue

        metric_name = metric.__class__.__name__
        if scores:
            results[metric_name] = sum(scores) / len(scores)
        else:
            results[metric_name] = "No valid scores"

    return results

In [3]:
from libs.eval_metrics import AnswerRelevancy, Faithfulness

# Model identifiers and region handed to the metric constructor and to evaluate().
llm_id = "anthropic.claude-3-sonnet-20240229-v1:0"
emb_id = "amazon.titan-embed-text-v2:0"
region = "us-west-2"

# Only Faithfulness is active; the AnswerRelevancy variant is kept commented out as an alternative.
#metrics = [AnswerRelevancy(llm_id=llm_id, emb_id=emb_id, region=region, strictness=1)]
metrics = [Faithfulness(llm_id=llm_id, emb_id=emb_id, region=region)]

def map_dataset(example):
    """Re-key a row from the notebook's field names to the evaluation field names.

    Args:
        example: Row with 'question', 'retrieved_contexts', 'original_contexts',
            'answer' and 'ground_truth' entries.

    Returns:
        dict with keys 'user_input', 'retrieved_contexts', 'referenced_contexts',
        'response' and 'reference', holding the corresponding source values.
    """
    # target key -> source key, in the order the output keys are produced
    field_map = (
        ("user_input", "question"),
        ("retrieved_contexts", "retrieved_contexts"),
        ("referenced_contexts", "original_contexts"),
        ("response", "answer"),
        ("reference", "ground_truth"),
    )
    return {target: example[source] for target, source in field_map}

# Evaluate only a small sample. Select the rows first so map_dataset runs on the
# sample alone rather than on every row, and cap the size so a dataset shorter
# than the sample does not make select() fail on out-of-range indices.
eval_sample_size = 2
sample_indices = range(min(eval_sample_size, len(updated_dataset)))
dataset = updated_dataset.select(sample_indices).map(map_dataset)
results = evaluate(dataset, metrics, llm_id, emb_id, region)
print(results)

Map: 100%|██████████| 10/10 [00:00<00:00, 2093.49 examples/s]


{'Faithfulness': 1.0}


In [None]:
# evaluate() returns a plain dict of {metric class name: score} bound to `results`;
# a dict has no to_pandas(), so wrap it in a one-row DataFrame instead.
result_df = pd.DataFrame([results])
result_df.head()

In [None]:
# Write the evaluation table to CSV, omitting the index column.
result_df.to_csv("data/ragas_evaluation_result.csv", index=False)

In [None]:
# Label-slice the columns from "context_precision" through "context_recall" (inclusive).
# NOTE(review): evaluate() in this notebook keys its results by metric class name
# (e.g. 'Faithfulness'), so these column labels may not exist in result_df — confirm
# before running this cell.
result_df.loc[:, "context_precision":"context_recall"]