In [1]:
import json
import os
import sys
import boto3

import warnings
# Install a blanket "ignore" filter for Python warnings.
# NOTE(review): this can also hide deprecation notices from the libraries
# used below — confirm that silencing everything is intended.
warnings.filterwarnings("ignore")

In [2]:
# 'bedrock' service client: used further down to list the foundation models.
br = boto3.client('bedrock')

# 'bedrock-runtime' service client: handed to the LangChain Bedrock LLM and
# BedrockEmbeddings wrappers created in later cells.
bedrock = boto3.client('bedrock-runtime')

In [3]:
from langchain.document_loaders import AmazonTextractPDFLoader
from langchain.llms import Bedrock
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import BedrockEmbeddings
from langchain.vectorstores import FAISS

# Classification

In [4]:
# Local sample image that is sent through the Textract-backed loader.
sample_image_path = "../data/Sample Document for Amazon Textract 1.jpg"

loader = AmazonTextractPDFLoader(sample_image_path)
# `load()` returns a list of LangChain Document objects.
document = loader.load()

In [5]:
# Show the loaded Document list (extracted page text plus source/page metadata).
document

[Document(page_content='INVOICE\r\nDATE\r\nINVOICE NO\r\nTech Skills (dot) Ninja\r\n12395 Skye Park\r\n9/1/2021\r\n245001\r\nSan Antonio TX 78243\r\n(210) 123-4567\r\nsome.body@gmail.com\r\nINVOICE TO\r\n123 Main Street\r\nSan Antonio TX 78231\r\n(210) 333-4444\r\nany.body@gmail.com\r\nSALESPERSON\r\nJOB\r\nPAYMENT TERMS\r\nDUE DATE\r\nJacob Johnson\r\n#00435\r\nDue on Receipt\r\n9/2/2021\r\nQUANTITY\r\nDESCRIPTION\r\nUNIT PRICE\r\nLINE TOTAL\r\n3\r\nTesting Description 1\r\n$3,754.78\r\n$11,264.34\r\n1\r\nTesting Description 2\r\n$256.99\r\n$256.99\r\n2\r\nTesting Description 3\r\n$122.99\r\n$245.98\r\n1\r\nTesting Description 4\r\n$573.00\r\n$573.00\r\nSubtotal\r\n12,340.31\r\nSales Tax\r\n1,018.07\r\nTotal\r\n13,358.38', metadata={'source': '../data/Sample Document for Amazon Textract 1.jpg', 'page': 1})]

In [6]:
# Single-label classification prompt. XML-style tags delimit the candidate
# classes and the document text; the prompt deliberately ends on an open
# <classification> tag so the model's completion is the class name.
# Fix: the document section is now closed with </document> (it was previously
# "closed" with a second opening <document> tag, leaving the prompt malformed).
template = """

Given a list of classes, classify the document into one of these classes. Skip any preamble text and just give the class name.

<classes>INVOICE, RECEIPT, SHIPPING</classes>
<document>{doc_text}</document>
<classification>"""

prompt = PromptTemplate(template=template, input_variables=["doc_text"])
bedrock_llm = Bedrock(client=bedrock, model_id="amazon.titan-text-lite-v1")

llm_chain = LLMChain(prompt=prompt, llm=bedrock_llm)

# Pass the prompt variable by name so the call does not rely on the chain
# having exactly one input key. The result is a dict; the completion is under
# 'text' and may still carry a trailing "</classification>" emitted by the model.
class_name = llm_chain.invoke({"doc_text": document[0].page_content})

In [7]:
# The chain result is a dict; the 'text' entry holds the model's completion.
class_name['text']

'INVOICE</classification>'

# Summarization

In [8]:
# Multi-page PDF stored in S3; the loader is given the s3:// URI directly.
policy_pdf_uri = "s3://aws-textract-bedrock-langchain/Sample Document for Amazon Textract 2.pdf"

loader = AmazonTextractPDFLoader(policy_pdf_uri)
# Rebinds `document` (previously the invoice image) to the PDF's Document list.
document = loader.load()

In [9]:
# Show the Document list loaded from the PDF.
# NOTE(review): this dumps every page's full text into the cell output;
# consider displaying only a slice.
document

[Document(page_content='1/19/24, 8:03 AM\r\nPrivacy - WordPress.org\r\nAbout\r\nDomains License Accessibility Privacy Policy Statistics\r\nPrivacy policy\r\nWordPress.org websites (collectively "WordPress.org" in this document) refer to sites\r\nhosted on the WordPress.org, WordPress.net, WordCamp.org, BuddyPress.org,\r\nbbPress.org, and other related domains and subdomains thereof. This privacy policy\r\ndescribes how WordPress.org uses and protects any information that you give us. We are\r\ncommitted to ensuring that your privacy is protected. If you provide us with personal\r\ninformation through WordPress.org, you can be assured that it will only be used in\r\naccordance with this privacy statement.\r\nWebsite visitors\r\nLike most website operators, WordPress.org collects non-personally-identifying\r\ninformation of the sort that web browsers and servers typically make available, such as\r\nthe browser type, language preference, referring site, and the date and time of each\r\nvi

In [10]:
# The loader produced one Document per entry in `document`; report how many
# there are and an approximate token count for each, as measured by the LLM
# wrapper's own tokenizer helper.
num_docs = len(document)
print(f"There are {num_docs} pages in the document")

for page_no, page in enumerate(document, start=1):
    approx_tokens = bedrock_llm.get_num_tokens(page.page_content)
    print(f"Page {page_no} has approx. {approx_tokens} tokens")

There are 9 pages in the document


None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Page 1 has approx. 399 tokens
Page 2 has approx. 383 tokens
Page 3 has approx. 474 tokens
Page 4 has approx. 341 tokens
Page 5 has approx. 466 tokens
Page 6 has approx. 397 tokens
Page 7 has approx. 355 tokens
Page 8 has approx. 330 tokens
Page 9 has approx. 64 tokens


In [11]:
# Build a map-reduce summarisation chain: each Document is summarised on its
# own and the partial summaries are then combined into a single summary.
summary_chain = load_summarize_chain(
    llm=bedrock_llm,
    chain_type='map_reduce',
    verbose=True,  # print every formatted prompt while the chain runs
)

# `Chain.run` is deprecated; use `invoke`, consistent with the classification
# cell. `invoke` returns a dict, so pull out 'output_text' to keep `output` a
# plain string for the cell that prints it.
output = summary_chain.invoke({"input_documents": document})["output_text"]

  warn_deprecated(




[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mWrite a concise summary of the following:


"1/19/24, 8:03 AM
Privacy - WordPress.org
About
Domains License Accessibility Privacy Policy Statistics
Privacy policy
WordPress.org websites (collectively "WordPress.org" in this document) refer to sites
hosted on the WordPress.org, WordPress.net, WordCamp.org, BuddyPress.org,
bbPress.org, and other related domains and subdomains thereof. This privacy policy
describes how WordPress.org uses and protects any information that you give us. We are
committed to ensuring that your privacy is protected. If you provide us with personal
information through WordPress.org, you can be assured that it will only be used in
accordance with this privacy statement.
Website visitors
Like most website operators, WordPress.org collects non-personally-identifying
information of the sort that web browsers and servers typically

In [12]:
# Print the summary without leading/trailing whitespace.
print(output.strip())

WordPress.org websites collect non-personally identifying information to better understand how visitors use their website. They may release non-personally-identifying information in the aggregate and collect potentially personally-identifying information like IP addresses. WordPress.org does not use IP addresses to identify visitors, but may disclose such information under the same circumstances that it uses and discloses personally-identifying information.
WordPress.org gathers personally-identifying information from certain visitors. This information is used to fulfill the purpose of the visitor's interaction with WordPress.org. WordPress.org does not disclose personally-identifying information other than


# Question and Answer

In [13]:
# Break the pages into small chunks (size limit 400, no overlap) so they can
# be embedded and retrieved individually in the cells below.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=0,
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
)
texts = text_splitter.split_documents(document)

# Print every chunk with its 1-based position and the page it came from.
for chunk_no, chunk in enumerate(texts, start=1):
    source_page = chunk.metadata['page']
    print(f"==== Chunk {chunk_no}, From Page {source_page} ====")
    print(chunk.page_content)
    print("\n")

==== Chunk 1, From Page 1 ====
1/19/24, 8:03 AM
Privacy - WordPress.org
About
Domains License Accessibility Privacy Policy Statistics
Privacy policy
WordPress.org websites (collectively "WordPress.org" in this document) refer to sites
hosted on the WordPress.org, WordPress.net, WordCamp.org, BuddyPress.org,
bbPress.org, and other related domains and subdomains thereof. This privacy policy


==== Chunk 2, From Page 1 ====
describes how WordPress.org uses and protects any information that you give us. We are
committed to ensuring that your privacy is protected. If you provide us with personal
information through WordPress.org, you can be assured that it will only be used in
accordance with this privacy statement.
Website visitors
Like most website operators, WordPress.org collects non-personally-identifying


==== Chunk 3, From Page 1 ====
information of the sort that web browsers and servers typically make available, such as
the browser type, language preference, referring site, and the

In [14]:
# Ask Bedrock for the foundation models whose output modality is EMBEDDING
# and print the ID of each one.
embedding_models = br.list_foundation_models(byOutputModality='EMBEDDING')['modelSummaries']

for summary in embedding_models:
    print(summary['modelId'])

amazon.titan-embed-g1-text-02
amazon.titan-embed-text-v1:2:8k
amazon.titan-embed-text-v1
amazon.titan-embed-image-v1:0
amazon.titan-embed-image-v1
cohere.embed-english-v3
cohere.embed-multilingual-v3


In [15]:
# Create an embeddings wrapper on the runtime client using the wrapper's
# default model.
# NOTE(review): `embeddings` is reassigned in the next cell with an explicit
# model_id, so this cell looks redundant — confirm and consider removing it.
embeddings = BedrockEmbeddings(client=bedrock)

In [16]:
# Embeddings wrapper pinned to an explicit Titan text-embedding model.
embeddings = BedrockEmbeddings(
    model_id="amazon.titan-embed-text-v1",
    client=bedrock,
)

# Embed every chunk and index the vectors in an in-memory FAISS store.
vector_db = FAISS.from_documents(texts, embeddings)

In [17]:
# Natural-language question to look up in the indexed chunks.
query = "What information does Wordpress collect from website visitors?"

# Retrieve the chunks most similar to the question (the store's default
# number of results is used).
docs = vector_db.similarity_search(query=query)

In [18]:
# Show the raw Document objects returned by the similarity search.
docs

[Document(page_content="example, we ask visitors who use our forums to provide a username and email address.\r\nIn each case, WordPress.org collects such information only insofar as is necessary or\r\nappropriate to fulfill the purpose of the visitor's interaction with WordPress.org.\r\nWordPress.org does not disclose personally-identifying information other than as", metadata={'source': 's3://aws-textract-bedrock-langchain/Sample Document for Amazon Textract 2.pdf', 'page': 2}),
 Document(page_content='policy only applies to the websites listed at the beginning of this document, so when you\r\nvisit other websites, even when you click on a link posted on WordPress.org, you should\r\nread their own privacy policies.\r\nAggregated statistics\r\nWordPress.org may collect statistics about the behavior of visitors to its websites. For', metadata={'source': 's3://aws-textract-bedrock-langchain/Sample Document for Amazon Textract 2.pdf', 'page': 7}),
 Document(page_content='describes how Wor

In [19]:
# Number of chunks the similarity search returned.
print(len(docs))

4


In [20]:
# Print each retrieved chunk followed by where it came from.
for hit in docs:
    origin = hit.metadata
    print(f"\nContent: {hit.page_content}")
    print(f"Source: {origin['source']}")
    print(f"Page Number: {origin['page']}")


Content: example, we ask visitors who use our forums to provide a username and email address.
In each case, WordPress.org collects such information only insofar as is necessary or
appropriate to fulfill the purpose of the visitor's interaction with WordPress.org.
WordPress.org does not disclose personally-identifying information other than as
Source: s3://aws-textract-bedrock-langchain/Sample Document for Amazon Textract 2.pdf
Page Number: 2

Content: policy only applies to the websites listed at the beginning of this document, so when you
visit other websites, even when you click on a link posted on WordPress.org, you should
read their own privacy policies.
Aggregated statistics
WordPress.org may collect statistics about the behavior of visitors to its websites. For
Source: s3://aws-textract-bedrock-langchain/Sample Document for Amazon Textract 2.pdf
Page Number: 7

Content: describes how WordPress.org uses and protects any information that you give us. We are
committed to ensuring t