# Generate question and answer pairs based on retrieved documents

### Use case
- generating ground truth data with the help of an LLM to augment human labeling
- generating sample data

### Tech stack
- langchain as orchestrator
- OpenSearch Serverless as vector store
- Amazon Bedrock with Claude V2 as LLM

In [57]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import OpenSearchVectorSearch
from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock

from langchain.chains.question_answering import load_qa_chain
from langchain.chains import RetrievalQA
from langchain.chat_models import BedrockChat
from langchain.evaluation.qa import QAEvalChain
from langchain.prompts import PromptTemplate

from langchain.evaluation.qa import QAGenerateChain
from langchain.evaluation.qa import QAEvalChain

from dotenv import load_dotenv, find_dotenv
import os
import boto3
from botocore.config import Config

load_dotenv(find_dotenv('dev.env'),override=True)

os.environ['LANGCHAIN_ASSUME_ROLE'] = os.getenv('LANGCHAIN_ASSUME_ROLE')
os.environ['BEDROCK_REGION_NAME'] = os.getenv('BEDROCK_REGION_NAME')
os.environ['BEDROCK_ENDPOINT_URL'] = os.getenv('BEDROCK_ENDPOINT_URL')
os.environ['OPENSEARCH_COLLECTION'] = os.getenv('OPENSEARCH_COLLECTION')
os.environ['OPENSEARCH_REGION'] = os.getenv('OPENSEARCH_REGION')

# Initialize the Bedrock runtime
config = Config(
   retries = {
      'max_attempts': 8
   }
)
bedrock_runtime = boto3.client(
        service_name="bedrock-runtime",
        region_name=os.environ.get("BEDROCK_REGION_NAME", None),
        config=config
)

In [4]:
import numpy as np
import pypdf
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from urllib.request import urlretrieve

os.makedirs("data", exist_ok=True)
files = [ "https://d3q8adh3y5sxpk.cloudfront.net/meetingrecordings/modelevaluationdata/amazon_10k_2023.pdf"]
for url in files:
    file_path = os.path.join("data", url.rpartition("/")[2])
    urlretrieve(url, file_path)
    

loader = PyPDFDirectoryLoader("./data/")
documents = loader.load()

In [5]:
## Split documents into smaller chunks 
### Compare results/impact of Character split and TokenTextSplitter

token_text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=0)
char_text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)

token_text_list = token_text_splitter.split_documents(documents)
char_text_list = char_text_splitter.split_documents(documents)
    
print("TokenTextSplitter split documents in to " + str(len(token_text_list)) + " chunks.\n")
print("CharacterTextSplitter split documents in to " + str(len(char_text_list)) + " chunks.\n")

TokenTextSplitter split documents in to 165 chunks.

CharacterTextSplitter split documents in to 651 chunks.



In [58]:
## create vectors and store them in our vector database (OpenSearch Serverless)
### Connect to OpenSearchServerless
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth

host = os.environ['OPENSEARCH_COLLECTION']  # serverless collection endpoint, without https://
region = os.environ['OPENSEARCH_REGION']  # e.g. us-east-1

service = 'aoss'
credentials = boto3.Session().get_credentials()
auth = AWSV4SignerAuth(credentials, region, service)

aos_client = OpenSearch(
    hosts=[{'host': host, 'port': 443}],
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    pool_maxsize=20,
)

In [94]:
### Create a index in Amazon Opensearch Service 

# langchain version
knn_index = {
    "settings": {
        "index.knn": True,
        
    },
    "mappings": {
        "properties": {
            "vector_field": {
                "type": "knn_vector",
                "dimension": 1536,
                "store": True
            },
            "text": {
                "type": "text",
                "store": True
            },
        }
    }
}

index_name = "llmqageneration"
try:
    aos_client.indices.delete(index=index_name)
    print("Recreating index '" + index_name + "' on OpenSearch.")
    aos_client.indices.create(index=index_name,body=knn_index,ignore=400)
    aos_client.indices.get(index=index_name)
except:
    print("Index '" + index_name + "' not found. Creating index on OpenSearch.")
    aos_client.indices.create(index=index_name,body=knn_index,ignore=400)
    aos_client.indices.get(index=index_name)

Recreating index 'llmqageneration' on OpenSearch.


In [7]:
index_name = "llmqageneration"
aos_client.indices.get(index=index_name)

{'llmqageneration': {'aliases': {},
  'mappings': {'properties': {'text': {'type': 'text', 'store': True},
    'vector_field': {'type': 'knn_vector', 'store': True, 'dimension': 1536}}},
  'settings': {'index': {'number_of_shards': '2',
    'provided_name': 'llmqageneration',
    'knn': 'true',
    'creation_date': '1696180007445',
    'number_of_replicas': '0',
    'uuid': 'GBI07IoBlXWWdUA78dGU',
    'version': {'created': '135217827'}}}}}

In [97]:
### Insert embeddings into OpenSearch

def get_embedding(body, modelId, accept, contentType):
    response = bedrock_runtime.invoke_model(body=body, modelId=modelId, accept=accept, contentType=contentType)
    response_body = json.loads(response.get('body').read())
    embedding = response_body.get('embedding')
    return embedding

def embed_phrase(phrase):
    body = json.dumps({"inputText": str(phrase)})
    modelId = 'amazon.titan-embed-text-v1' #'amazon.titan-e1t-medium' # not available yet
    contentType = 'application/json'
    accept = 'application/json'
    embedding = get_embedding(body, modelId, accept, contentType)
    return embedding
    
def os_import(record, aos_client, index_name):
    search_vector = embed_phrase(record)
    aos_client.index(index=index_name,
             body={"vector_field": search_vector,
                   "text": record
                  },
            request_timeout=60*3,  # 3 minutes
            )

def query_opensearch(index_name, phrase, n=1):
    search_vector = embed_phrase(phrase)
    osquery={
        "_source": {
            "exclude": [ "vector_field" ]
        },
        
      "size": n,
      "query": {
        "knn": {
          "vector_field": {
            "vector":search_vector,
            "k":n
          }
        }
      }
    }

    res = aos_client.search(index=index_name, 
                           body=osquery,
                           stored_fields=["text"],
                           explain = True)
    print(res)
    result = {
            "text":""
        }
    if res['hits']['hits']:
        top_result = res['hits']['hits'][0]
    
        result = {
            "text":top_result['_source']['text'],

        }
    
    return result

### test embedding
#embed_phrase("pairs well with chocolate")

# test insert

#for record in char_text_list: 
    #print(record.page_content)
#    os_import(record.page_content, aos_client, index_name)

#for record in token_text_list: 
    #print(record.page_content)
#    os_import(record.page_content, aos_client, index_name)
    
#for record in token_text_list: 
#    print(record.page_content)
query_opensearch(index_name, 'Amazon')

{'took': 108, 'timed_out': False, '_shards': {'total': 0, 'successful': 0, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 11, 'relation': 'eq'}, 'max_score': 0.0018039581, 'hits': [{'_shard': '[026459568683::89rogzwnoq45nomc0tvh::VECTORSEARCH::llmqageneration:0][1]', '_node': '10.0.153.86', '_index': 'llmqageneration', '_id': '1%3A0%3AATI17IoBdWq38RY9sq9j', '_score': 0.0018039581, '_source': {'text': 'We serve consumers through our online and physical stores and focus on selection, price, and convenience. We design our stores to enable hundreds of\nmillions of unique products to be sold by us and by third parties across dozens of product categories. Customers access our offerings through our websites,\nmobile apps, Alexa, devices, streaming, and physically visiting our stores. We also manufacture and sell electronic devices, including Kindle, Fire tablet, Fire'}, 'fields': {'text': ['We serve consumers through our online and physical stores and focus on selection, price, and c

{'text': 'We serve consumers through our online and physical stores and focus on selection, price, and convenience. We design our stores to enable hundreds of\nmillions of unique products to be sold by us and by third parties across dozens of product categories. Customers access our offerings through our websites,\nmobile apps, Alexa, devices, streaming, and physically visiting our stores. We also manufacture and sell electronic devices, including Kindle, Fire tablet, Fire'}

In [59]:
### Use Titan Embeddings Model to generate embeddings

from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock

# LangChain requires AWS4Auth
from requests_aws4auth import AWS4Auth
def get_aws4_auth():
    region = os.environ.get("Region", os.environ["AWS_REGION"])
    service = "aoss"
    credentials = boto3.Session().get_credentials()
    return AWS4Auth(
        credentials.access_key,
        credentials.secret_key,
        region,
        service,
        session_token=credentials.token,
    )
aws4_auth = get_aws4_auth()

bedrock_embeddings = BedrockEmbeddings(client=bedrock_runtime)


In [None]:
## Alternative use Langchain Insert embeddings into OpenSearch - had some issues,that's why I used above instead

from langchain.vectorstores import OpenSearchVectorSearch

full_opensearch_endpoint = 'https://' + os.environ['OPENSEARCH_COLLECTION']
    
# get embeddings and do insert to OpenSearch.
def opensearch_insert(source_doc_list, target_index_name):
    doc_search=OpenSearchVectorSearch.from_documents(
            index_name = target_index_name,
            documents=source_doc_list,
            embedding=bedrock_embeddings,
            opensearch_url=full_opensearch_endpoint,
            http_auth=auth,
            use_ssl=True,
            verify_certs=True,
            connection_class=RequestsHttpConnection,
            timeout=60*4,
            bulk_size=1000,
            is_aoss=True
        )
    inserted_doc_length = len(source_doc_list)
    input_doclist_name = f'{char_text_list=}'.split('=')[0]
    print(f'ingested {inserted_doc_length} documents from source {input_doclist_name} to target {target_index_name}')
    return doc_search

doc_search=opensearch_insert(source_doc_list=char_text_list, target_index_name=index_name)

In [60]:
### validate load
res = aos_client.search(index=index_name, body={"query": {"match_all": {}}})
print("Records found: %d." % res['hits']['total']['value'])

Records found: 651.


In [102]:
# Set up retrieval QA chain
from langchain.evaluation.qa import QAEvalChain
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

inference_modifier = {
    "max_tokens_to_sample": 8000,
    "temperature": 0,
    "top_k": 250,
    "top_p": 1,
    "stop_sequences": ["\n\nHuman"],
}

llm = Bedrock( #create a Bedrock llm client
    region_name=os.environ.get("BEDROCK_REGION_NAME"), #sets the region name (if not the default)
    endpoint_url=os.environ.get("BEDROCK_ENDPOINT_URL"), #sets the endpoint URL (if necessary)
    model_id="anthropic.claude-v2",
    model_kwargs=inference_modifier
)

from langchain.vectorstores import OpenSearchVectorSearch
full_opensearch_endpoint = 'https://' + os.environ['OPENSEARCH_COLLECTION']

### define VectorStore Retriever
doc_search = OpenSearchVectorSearch(
        index_name=index_name,
        embedding_function=bedrock_embeddings,
        opensearch_url=full_opensearch_endpoint,
        http_auth=auth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
        is_aoss=True
    ) 
    
prompt_template = """
        Human: Given report provided, please read it and analyse the content.
        {question}
        - Return each response in <prompt> </prompt> XML tags.
        - Inside <promp> tag, return the question inside <question></question> XML tags.
        - Inside <promp> tag, return the answer inside <question_answer> </question_answer> XML tags.
        <report>
        {context}
        </report>

        Return the result in XML format
        
        
        Assistant:"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["question", "context"]
)
# see also https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch.html#langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch.as_retriever
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=doc_search.as_retriever(search_kwargs={'k': 100}),
    return_source_documents=True,
    chain_type_kwargs = {"prompt": PROMPT}
    )


In [108]:
query = "Generate several pages of question and answer pairs and return the result in XML format. Only return the answers in xml tags"
results = qa({"query": query})

In [109]:
# used context
i = 0
str_value = ''
print(len(results['source_documents']))
while i < int(len(results['source_documents'])):
    str_value = str_value + results['source_documents'][i].page_content
    i+=1

import tiktoken

encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

token_count = len(encoding.encode(str_value))
print(f"The context contains ~ {token_count} tokens.")

100
The context contains ~ 9245 tokens.


In [112]:
### parse output to csv file

from xml.etree import ElementTree
import csv
import xml.etree.ElementTree as ET

def convert_to_xml_method1(data_str):
    root = ET.fromstring(data_str)
    return root

# PARSE XML
root = convert_to_xml_method1(results['result'].split("\n", 1)[1])

# check if file exists

# Open our existing CSV file in append mode
# Create a file object for this file
filename = 'qsdata.csv'

if os.path.isfile(filename):
    with open(filename, 'a',encoding='utf-8') as csvfile:
        # Pass this file object to csv.writer()
        # and get a writer object
        csvfile_writer = csv.writer(csvfile)

        for prompt in root.findall('prompt'):
            if(prompt):
                # EXTRACT DETAILS  
                question = prompt.find("question")
                answer = prompt.find("question_answer")
                csv_line = [question.text, "<question_answer>" + answer.text + "</question_answer>"]
                # ADD A NEW ROW TO CSV FILE
                csvfile_writer.writerow(csv_line)

        # Close the file object
        csvfile.close()

else:
    # CREATE CSV FILE
    csvfile = open(filename,'w',encoding='utf-8')
    csvfile_writer = csv.writer(csvfile)

    # ADD THE HEADER TO CSV FILE
    csvfile_writer.writerow(["question","answer"])

    # FOR EACH PROMPT
    for prompt in root.findall('prompt'):
        if(prompt):
            # EXTRACT DETAILS  
            question = prompt.find("question")
            answer = prompt.find("question_answer")
            csv_line = [question.text, "<question_answer>" + answer.text + "</question_answer>"]
            # ADD A NEW ROW TO CSV FILE
            csvfile_writer.writerow(csv_line)
    csvfile.close()

In [113]:
### Alternative if you need more questions/context
from xml.etree import ElementTree
import csv
import xml.etree.ElementTree as ET

def convert_to_xml_method1(data_str):
    root = ET.fromstring(data_str)
    return root

prompt_template = """
        Human: Given report provided, please read it and analyse the content.
        {question}
        - Return each response in <prompt> </prompt> XML tags.
        - Inside <promp> tag, return the question inside <question></question> XML tags.
        - Inside <promp> tag, return the answer inside <question_answer> </question_answer> XML tags.
        <report>
        {context}
        </report>

        Return only the result in XML format
        
        
        Assistant:"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["question", "context"]
)

filename = 'qsdata.csv'

query = "Generate 10 questions and answers and return the result in XML format."
resultset = []

for token in token_text_list:
    # Pass in values to the input variables
    prompt = PROMPT.format(question=query, 
                                     context=token
         )
    response = llm(prompt)
    resultset.append(response.split("\n",2)[2])
    

# PARSE XML
# remove first two lines of response
xmlstring = ' '.join([str(elem) for elem in resultset])
result = '<prompts>' + xmlstring + '</prompts>'
try:
    root = convert_to_xml_method1(result)

    # check if file exists
    # Open our existing CSV file in append mode

    if os.path.isfile(filename):
        with open(filename, 'a',encoding='utf-8') as csvfile:
            # Pass this file object to csv.writer()
            # and get a writer object
            csvfile_writer = csv.writer(csvfile)

            for prompt in root.findall('prompt'):
                if(prompt):
                    # EXTRACT DETAILS  
                    question = prompt.find("question")
                    answer = prompt.find("question_answer")
                    csv_line = [question.text, "<question_answer>" + answer.text + "</question_answer>"]
                    # ADD A NEW ROW TO CSV FILE
                    csvfile_writer.writerow(csv_line)

            # Close the file object
            csvfile.close()

    else:
        # CREATE CSV FILE
        csvfile = open(filename,'w',encoding='utf-8')
        csvfile_writer = csv.writer(csvfile)

        # ADD THE HEADER TO CSV FILE
        csvfile_writer.writerow(["question","answer"])

        # FOR EACH PROMPT
        for prompt in root.findall('prompt'):
            if(prompt):
                # EXTRACT DETAILS  
                question = prompt.find("question")
                answer = prompt.find("question_answer")
                csv_line = [question.text, "<question_answer>" + answer.text + "</question_answer>"]
                # ADD A NEW ROW TO CSV FILE
                csvfile_writer.writerow(csv_line)
        csvfile.close()
except:
    print('error occured, continue with next tokensplit')

ValueError: Error raised by bedrock service: Read timeout on endpoint URL: "https://bedrock-runtime.us-east-1.amazonaws.com/model/anthropic.claude-v2/invoke"