# 02 - Download Raw PDF documents to analyze

Use this notebook to download a set of Amazon financial reports to use as input for testing and demonstrating how the `aws-agentic-document-assistant` solution works. You can replace the links below with links to your own documents and customize the code for your use case. Alternatively, you can put your own documents on `Amazon S3` and update the code to read them from there instead.

Run the cells below to download the documents.

In [2]:
# Folder (relative to the working directory) that receives the downloaded PDFs,
# one sub-folder per company.
raw_base_directory = "raw_documents"

In [3]:
raw_base_directory

'raw_documents'

In [4]:
import os

# Create the download folder up front. exist_ok=True makes re-running this cell
# a no-op and avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(raw_base_directory, exist_ok=True)

In [5]:
# Documents to download, grouped by company name (the company name is also used
# as the sub-folder name). Each entry is a dict with:
#   - "doc_url": where to fetch the PDF; entries with an empty URL are skipped.
#   - "year": used to build the local file name (annual_report_<year>.pdf).
#   - "pages" (optional): 1-based page numbers to keep when trimming the PDF;
#     when missing or empty, the whole document is copied as is.
docs_mapping = {
    "Amazon": [
        {
            "doc_url": "https://s2.q4cdn.com/299287126/files/doc_financials/2023/ar/Amazon-2022-Annual-Report.pdf",
            "year": "2022",
            "pages": [15, 17, 18, 47, 48],
        },
        {
            "doc_url": "https://s2.q4cdn.com/299287126/files/doc_financials/2022/ar/Amazon-2021-Annual-Report.pdf",
            "year": "2021",
            "pages": [14, 16, 17, 18, 46, 47],
        },
        # Empty template for adding another report; ignored while "doc_url" is empty.
        {"doc_url": "", "year": ""},
    ]
}

In [6]:
import os
import requests


def download_pdf_files(base_directory, docs_mapping, headers, timeout=30):
    """Download every document listed in ``docs_mapping`` into per-company folders.

    Each document is saved as
    ``<base_directory>/<company>/annual_report_<year>.pdf``. Documents that are
    already on disk are not downloaded again. A document that cannot be
    downloaded (network error, timeout, or non-200 response) is reported with a
    printed message and skipped, so the remaining documents are still fetched.

    Args:
        base_directory: Folder that receives one sub-folder per company. It is
            created if it does not exist.
        docs_mapping: Dict mapping a company name to a list of dicts with the
            keys ``"doc_url"`` and ``"year"``. Entries whose ``"doc_url"`` is
            empty are skipped.
        headers: HTTP headers sent with every request.
        timeout: Seconds to wait for the server before giving up on a
            document. Without a timeout a stalled connection would block the
            notebook indefinitely.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    # exist_ok avoids the race between checking for the folder and creating it
    os.makedirs(base_directory, exist_ok=True)

    for company, docs in docs_mapping.items():
        company_directory = os.path.join(base_directory, company)
        os.makedirs(company_directory, exist_ok=True)

        for doc_info in docs:
            doc_url = doc_info["doc_url"]
            year = doc_info["year"]

            # Skip placeholder entries that have no URL
            if not doc_url:
                continue

            # The local file name depends only on the year
            filename = f"annual_report_{year}.pdf"
            file_path = os.path.join(company_directory, filename)

            # Never re-download a document that is already on disk
            if os.path.exists(file_path):
                print(f"{filename} already exists for {company}")
                continue

            try:
                response = requests.get(doc_url, headers=headers, timeout=timeout)
            except requests.RequestException as error:
                # One unreachable document must not abort the remaining downloads
                print(f"Failed to download {filename} for {company} ({error})")
                continue

            if response.status_code != 200:
                print(
                    f"Failed to download {filename} for {company}"
                    f" (Status Code: {response.status_code})"
                )
                continue

            with open(file_path, "wb") as file:
                file.write(response.content)
            print(f"Downloaded {filename} for {company}")

In [7]:
# Send a desktop-browser User-Agent so the request mimics a browser
# (presumably the host rejects the default client User-Agent — TODO confirm).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
# Fetch every document listed in docs_mapping into raw_base_directory;
# files that are already on disk are not downloaded again.
download_pdf_files(raw_base_directory, docs_mapping, headers)

Downloaded annual_report_2022.pdf for Amazon
Downloaded annual_report_2021.pdf for Amazon


In [8]:
ls {raw_base_directory}/Amazon

annual_report_2021.pdf  annual_report_2022.pdf


## Keep relevant pages

Although you can run the full PDF documents through the solution, we suggest that you keep only the relevant pages of each PDF document to optimize the extraction costs.

In [9]:
%pip install -q pypdf 2> /dev/null

Note: you may need to restart the kernel to use updated packages.


In [10]:
import json
from pypdf import PdfReader, PdfWriter


def keep_relevant_pages_in_pdf(input_pdf_path, output_pdf_path, pages):
    """Write a new PDF that contains only the selected pages of an existing one.

    Args:
        input_pdf_path: Path of the PDF to read.
        output_pdf_path: Path of the trimmed PDF to write.
        pages: 1-based page numbers to keep, in the order they should appear
            in the output.

    Raises:
        IndexError: If a page number is smaller than 1 or larger than the
            number of pages in the input. Nothing is written in that case.
    """
    input_pdf = PdfReader(input_pdf_path)
    total_pages = len(input_pdf.pages)
    print(f"Number of pages is {total_pages}")
    print(f"Relevant pages are {pages}")

    # Validate before writing: page 0 (or a negative number) would otherwise
    # become a negative index and silently pick a page counted from the end.
    invalid_pages = [
        page_num for page_num in pages if not 1 <= page_num <= total_pages
    ]
    if invalid_pages:
        raise IndexError(
            f"Page numbers {invalid_pages} are outside the valid range"
            f" 1-{total_pages} for {input_pdf_path}"
        )

    output_pdf = PdfWriter()
    for page_num in pages:
        # Page numbers are 1-based, the reader's page list is 0-based
        output_pdf.add_page(input_pdf.pages[page_num - 1])

    with open(output_pdf_path, "wb") as f:
        output_pdf.write(f)


def save_json(json_data, file_path):
    """Serialize ``json_data`` as JSON into ``file_path``, replacing any existing file."""
    with open(file_path, mode="w") as out_file:
        json.dump(obj=json_data, fp=out_file)

In [11]:
import shutil


def keep_relevant_pages_in_pdfs(
    raw_base_directory, prepared_base_directory, docs_mapping
):
    """Create the prepared copy of every downloaded report and record its metadata.

    For each document in ``docs_mapping`` the raw PDF
    ``<raw_base_directory>/<company>/annual_report_<year>.pdf`` is either
    trimmed to the pages listed under ``"pages"`` or, when no pages are
    listed, copied unchanged to the same relative location under
    ``prepared_base_directory``. A ``metadata.json`` file describing every
    prepared document is written to ``prepared_base_directory``.

    Args:
        raw_base_directory: Folder that holds the downloaded PDFs.
        prepared_base_directory: Folder that receives the prepared PDFs and
            ``metadata.json``. It is created if it does not exist.
        docs_mapping: Dict mapping a company name to a list of dicts with the
            keys ``"doc_url"``, ``"year"`` and optionally ``"pages"``
            (1-based page numbers). Entries with an empty ``"doc_url"`` are
            skipped.

    Returns:
        True once ``metadata.json`` has been written.
    """
    metadata = []
    # exist_ok avoids the race between checking for the folder and creating it
    os.makedirs(prepared_base_directory, exist_ok=True)

    for company, docs in docs_mapping.items():
        raw_company_directory = os.path.join(raw_base_directory, company)
        prepared_company_directory = os.path.join(prepared_base_directory, company)
        os.makedirs(prepared_company_directory, exist_ok=True)

        for doc_info in docs:
            doc_url = doc_info["doc_url"]
            year = doc_info["year"]
            pages = doc_info.get("pages", [])

            # Skip placeholder entries that have no URL
            if not doc_url:
                continue

            # The local file name depends only on the year
            filename = f"annual_report_{year}.pdf"
            input_pdf_path = os.path.join(raw_company_directory, filename)
            output_pdf_path = os.path.join(prepared_company_directory, filename)

            # A report whose download failed has no raw file. Report and skip it
            # so the other documents are still prepared and metadata.json only
            # lists PDFs that actually exist.
            if not os.path.isfile(input_pdf_path):
                print(
                    f"Skipping {filename} for {company}:"
                    f" {input_pdf_path} was not found"
                )
                continue

            current_metadata = {
                "company": company,
                "year": year,
                "doc_url": doc_url,
                "local_pdf_path": output_pdf_path,
            }

            if pages:
                current_metadata["pages_kept"] = pages
                keep_relevant_pages_in_pdf(input_pdf_path, output_pdf_path, pages)
            else:
                # When page numbers are not defined, we assume the user wants
                # to process the full file, therefore, copy it as is
                # to the prepared folder
                shutil.copyfile(input_pdf_path, output_pdf_path)

            metadata.append(current_metadata)

    save_json(metadata, os.path.join(prepared_base_directory, "metadata.json"))

    return True

In [12]:
# The prepared (trimmed) PDFs and metadata.json live in a sub-folder of the raw
# download folder.
# NOTE(review): the trailing slash stays in the joined path, so cells that append
# "/metadata.json" to it end up with a double slash — it worked in the recorded run.
prepared_base_directory = os.path.join(raw_base_directory, "prepared/")
prepared_base_directory

'raw_documents/prepared/'

In [13]:
keep_relevant_pages_in_pdfs(raw_base_directory, prepared_base_directory, docs_mapping)

Number of pages is 88
Relevant pages are [15, 17, 18, 47, 48]
Number of pages is 86
Relevant pages are [14, 16, 17, 18, 46, 47]


True


Interesting entities:

* Amazon annual report 2022:
    * Human capital - pg 15.
    * Risks - pg 17, 18.
    * Consolidated statements of cash flows (in millions) - pg 47.
    * Consolidated statements of operations (in millions, except per share data) - pg 48
* Amazon annual report 2021:
    * Human capital - pg 14.
    * Risks - pg 16, 17, 18.
    * Consolidated statements of cash flows (in millions) - pg 46.
    * Consolidated statements of operations (in millions, except per share data) - pg 47

In [14]:
prepared_base_directory

'raw_documents/prepared/'

In [15]:
ls {prepared_base_directory}

[0m[01;34mAmazon[0m/  metadata.json


In [16]:
cat {prepared_base_directory}/metadata.json | python -m json.tool

[
    {
        "company": "Amazon",
        "year": "2022",
        "doc_url": "https://s2.q4cdn.com/299287126/files/doc_financials/2023/ar/Amazon-2022-Annual-Report.pdf",
        "local_pdf_path": "raw_documents/prepared/Amazon/annual_report_2022.pdf",
        "pages_kept": [
            15,
            17,
            18,
            47,
            48
        ]
    },
    {
        "company": "Amazon",
        "year": "2021",
        "doc_url": "https://s2.q4cdn.com/299287126/files/doc_financials/2022/ar/Amazon-2021-Annual-Report.pdf",
        "local_pdf_path": "raw_documents/prepared/Amazon/annual_report_2021.pdf",
        "pages_kept": [
            14,
            16,
            17,
            18,
            46,
            47
        ]
    }
]
