In [37]:
!pip install boto3==1.34.131

Collecting boto3==1.34.131
  Downloading boto3-1.34.131-py3-none-any.whl.metadata (6.6 kB)
Downloading boto3-1.34.131-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.2/139.2 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: boto3
  Attempting uninstall: boto3
    Found existing installation: boto3 1.34.149
    Uninstalling boto3-1.34.149:
      Successfully uninstalled boto3-1.34.149
Successfully installed boto3-1.34.131
[0m

In [1]:
import pdfplumber

document = "../data/User_Manual_W11_Acer_1.0_A_A_SHORT.pdf"
pdf_obj = pdfplumber.open(document)
page = pdf_obj.pages[1]

In [3]:
PDF_TEXT = page.extract_text(
    layout=True, 
)
print(PDF_TEXT)

                                                          
                                                          
   14 - Acer 筆記型電腦導覽                                      
   左視圖                                                    
                                                          
       P214-52 / P214-52G / P214-41 / P214-41-G2 / P214-41-G3
                                                          
                                                          
                                                          
                 1 2   3   4  5  6  7   8  9              
                                                          
       P214-53 / P214-53G                                 
                                                          
                                                          
                                                          
                 1 2   3   4  5  6  10  8  9              
                                                     

In [4]:
image_path = './output_image.png'

with open(image_path, "rb") as f:
    IMAGE = f.read()

In [6]:
system_text = "You will be given an image of a PDF page and the text extracted from that PDF using a parser. Your task is to extract all contents from the image, analyze them, and create a structured markdown output while comparing the extracted text with the provided PDF text. Maintain the original order and language of the content."


In [7]:
input_text_pre = f"""
First, you will be provided with the text extracted from the PDF:

<filename>
{{FILENAME}}
</filename>

<page_number>
{{PAGE_NUMBER}}
</page_number>

<pdf_text>
{{PDF_TEXT}}
</pdf_text>

Next, you will be given an image.
"""

input_text_post = f"""
Follow these steps to complete the task:

1. Analyze the image content and identify the types of content present (graphs, tables, text).

2. For each type of content in the image:
   a. If it's a graph/diagram/figure:
      - Provide a short description of the graph/diagram/figure
      - Do not provide a fake link
   b. If it's a table:
      - Extract all cells correctly
      - Compare the extracted cell values with the corresponding section in the PDF text
      - Use the PDF text as the correct version if there are discrepancies
      - Format the table in markdown
      - Ensure each cell value is aligned with its column
   c. If it's text and not cell values of (b):
      - Extract the text
      - Compare the extracted text with the corresponding section in the PDF text
      - Use the PDF text as the correct version if there are discrepancies
      - Ensure there are no duplicates in both (b) and (c)

3. Maintain the original order of content as it appears in the image.

4. Use the **ORIGINAL MAIN LANGUAGE** in the PDF page, **DONOT** do translation.

5. Format the final content in clean, structured markdown.

6. Ensure that all content is included and accurately represented.

7. **DONOT** add any prefix before <result>.

Present your final result within <result> tags. The content inside these tags should be the clean, structured markdown representation of the PDF page content, suitable for use as RAG input.

Return <filename> and <page_number> info in a JSON format.

Remember to include all relevant information from both the PDF text and the image content, prioritizing accuracy and maintaining the original order and language of the content.
"""

In [8]:
import boto3

bedrock_client = boto3.client(service_name='bedrock-runtime')

model_id = "anthropic.claude-3-sonnet-20240229-v1:0"
temperature = 0.5
top_k = 3

inference_config = {"temperature": temperature}
additional_model_fields = {"top_k": top_k}
system_prompts = [{"text": system_text}]
message = {
    "role": "user",
    "content": [
        {
            "text": input_text_pre
        },
        {
            "image": {
                "format": 'png',
                "source": {
                    "bytes": IMAGE
                }
            }
        },
        {
            "text": input_text_post
        }
    ]
}
messages = []
messages.append(message)
    
response = bedrock_client.converse(
    modelId=model_id,
    messages=messages,
    system=system_prompts,
    inferenceConfig=inference_config,
    additionalModelRequestFields=additional_model_fields
)

In [9]:
print(response['output']['message']['content'][0]['text'])

<result>
# Windows 作業系統畫面

![Windows 作業系統畫面](image.png)

| # | 項目 | 說明 |
| --- | --- | --- |
| 1 | [Settings] (設定) | 變更電腦的應用程式設定。 |
|   |   | 透過 [Pinned] (已釘選)、[Recommended] (推薦) 或 [All apps] (所有應用程式)，來瀏覽應用程式。 |
| 2 | [Start] (開始) | |
| 3 | [Search] (搜尋) | 輸入查看電腦和網路裝置上搜尋的文字。 |
| 4 | [Task View] (工作檢視) | 分鏡在執行中的應用程式之間進行切換，並新增虛擬桌面平台。 |
| 5 | [Widgets] (小工具) | 提供天氣、新聞、股票和體育資訊。 |
| 6 | [Power] (電源) | 開機、睡眠或重新啟動電腦。 |
| 7 | [Quick settings] (快速設定) | 顯示網路連線狀況、電池電量和音效設定。您可以切換快速控制項目以即時調整或開啟設定。 |
| 8 | [Notification] (通知) | 顯示通知和行事曆。 |
</result>

<filename_and_page_number>
{
  "filename": "{FILENAME}",
  "page_number": {PAGE_NUMBER}
}
</filename_and_page_number>
