From b3e32c81c6a9c7e20469dcf845b98a0b7940fb07 Mon Sep 17 00:00:00 2001
From: jwilliams-elastic
Date: Mon, 17 Mar 2025 14:52:49 -0400
Subject: [PATCH 1/3] pdf parsing using azure ai document intelligence

---
 .../pdf-azure-ai-document-intelligence.ipynb  | 474 ++++++++++++++++++
 1 file changed, 474 insertions(+)
 create mode 100644 notebooks/ingestion-and-chunking/pdf-azure-ai-document-intelligence.ipynb

diff --git a/notebooks/ingestion-and-chunking/pdf-azure-ai-document-intelligence.ipynb b/notebooks/ingestion-and-chunking/pdf-azure-ai-document-intelligence.ipynb
new file mode 100644
index 000000000..fb70d7210
--- /dev/null
+++ b/notebooks/ingestion-and-chunking/pdf-azure-ai-document-intelligence.ipynb
@@ -0,0 +1,474 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "source": [
+    "# Overview\n",
+    "This notebook does the following:\n",
+    "\n",
+    "1. Parses PDFs that contain text and tables with [Azure AI Document Intelligence](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence/). Each PDF is saved as a JSON file so that it can be loaded into Elasticsearch.\n",
+    "2. Loads the JSON files into Elasticsearch. This notebook uses the Elasticsearch Python client to create an index with E5 and ELSER `semantic_text` mappings.\n",
+    "3. Once the data is loaded into Elasticsearch, you can ask questions in Playground and get answers grounded in the source documents. The index \"id\" field uses the naming convention `PDF_FILENAME.pdf_PAGENUMBER`, which lets you see the PDF and page number in the \"document sources\" link.\n",
+    "\n",
+    "**This notebook does not parse images in PDFs.**"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Install Python dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install elasticsearch python-dotenv tqdm azure-core azure-ai-documentintelligence requests httpx"
+   ]
+  },
\n", + "\n", + "## Elasticsearch \n", + "- You must have a functional elasticsearch environment that has an `enterprise` level license\n", + "- The fastest way to get up and running is to use the [Elastic Serverless - Get started](https://www.elastic.co/guide/en/serverless/current/elasticsearch-get-started.html) guide\n", + "\n", + "```\n", + "ES_URL=?\n", + "ES_API_KEY=?\n", + "```\n", + "\n", + "## Azure AI Document Intelligence\n", + "\n", + "```\n", + "AZURE_AI_DOCUMENT_INTELLIGENCE_ENDPOINT=?\n", + "AZURE_AI_DOCUMENT_INTELLIGENCE_API_KEY=?\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create input and output folders\n", + "\n", + "- /pdf - place your PDF files in this input folder\n", + "- /json - parser will output one json file for each pdf in this output folder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "input_folder_pdf = \"./pdf\"\n", + "output_folder_pdf = \"./json\"\n", + "\n", + "folders = [input_folder_pdf, output_folder_pdf]\n", + "\n", + "\n", + "def create_folders_if_not_exist(folders):\n", + " for folder in folders:\n", + " os.makedirs(folder, exist_ok=True)\n", + " print(f\"Folder '{folder}' created or already exists.\")\n", + "\n", + "\n", + "create_folders_if_not_exist(folders)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Download PDF files\n", + "\n", + "- This notebook downloads 4 recent Elastic SEC 10-Q quarterly reports\n", + "- If you already have PDF files, feel free to place them in `./pdf` folder " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import requests\n", + "\n", + "\n", + "def download_pdf(url, directory=\"./pdf\", filename=None):\n", + " if not os.path.exists(directory):\n", + " os.makedirs(directory)\n", + "\n", + " response = requests.get(url)\n", + " if response.status_code == 200:\n", + " if filename is None:\n", + " filename = url.split(\"/\")[-1]\n", + " filepath = os.path.join(directory, filename)\n", + " with open(filepath, \"wb\") as file:\n", + " file.write(response.content)\n", + " print(f\"Downloaded {filepath}\")\n", + " else:\n", + " print(f\"Failed to download file from {url}\")\n", + "\n", + "\n", + "print(\"Downloading 4 recent 10-Q reports for Elastic NV.\")\n", + "base_url = \"https://s201.q4cdn.com/217177842/files/doc_financials\"\n", + "download_pdf(\n", + " f\"{base_url}/2025/q2/e5aa7a0a-6f56-468d-a5bd-661792773d71.pdf\",\n", + " filename=\"elastic-10Q-Q2-2025.pdf\",\n", + ")\n", + "download_pdf(\n", + " f\"{base_url}/2025/q1/18656e06-8107-4423-8e2b-6f2945438053.pdf\",\n", + " filename=\"elastic-10Q-Q1-2025.pdf\",\n", + ")\n", + "download_pdf(\n", + " f\"{base_url}/2024/q4/9949f03b-09fb-4941-b105-62a304dc1411.pdf\",\n", + " filename=\"elastic-10Q-Q4-2024.pdf\",\n", + ")\n", + "download_pdf(\n", + " f\"{base_url}/2024/q3/7e60e3bd-ff50-4ae8-ab12-5b3ae19420e6.pdf\",\n", + " filename=\"elastic-10Q-Q3-2024.pdf\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Set Azure AI Document Intelligence Imports and Environment Variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from azure.core.credentials import AzureKeyCredential\n", + "from azure.ai.documentintelligence import DocumentIntelligenceClient\n", + "from azure.ai.documentintelligence.models import 
AnalyzeResult\n", + "from azure.ai.documentintelligence.models import AnalyzeDocumentRequest\n", + "import json\n", + "from dotenv import load_dotenv\n", + "from tqdm import tqdm\n", + "\n", + "load_dotenv()\n", + "\n", + "AZURE_AI_DOCUMENT_INTELLIGENCE_ENDPOINT = os.getenv(\n", + " \"AZURE_AI_DOCUMENT_INTELLIGENCE_ENDPOINT\"\n", + ")\n", + "AZURE_AI_DOCUMENT_INTELLIGENCE_API_KEY = os.getenv(\n", + " \"AZURE_AI_DOCUMENT_INTELLIGENCE_API_KEY\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Parse paragraphs using AnalyzeResult\n", + "\n", + "This function extracts the paragraph text via an AnalyzeResult on a PDF file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def parse_paragraphs(analyze_result):\n", + " table_offsets = []\n", + " page_content = {}\n", + "\n", + " for paragraph in analyze_result.paragraphs:\n", + " for span in paragraph.spans:\n", + " if span.offset not in table_offsets:\n", + " for region in paragraph.bounding_regions:\n", + " page_number = region.page_number\n", + " if page_number not in page_content:\n", + " page_content[page_number] = []\n", + " page_content[page_number].append(\n", + " {\"content_text\": paragraph.content}\n", + " )\n", + " return page_content, table_offsets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Parse tables using AnalyzeResult\n", + "\n", + "This function extracts the paragraph text via an AnalyzeResult on a PDF file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def parse_tables(analyze_result, table_offsets):\n", + " page_content = {}\n", + "\n", + " for table in analyze_result.tables:\n", + " table_data = []\n", + " for region in table.bounding_regions:\n", + " page_number = region.page_number\n", + " for cell in table.cells:\n", + " for span in cell.spans:\n", + " table_offsets.append(span.offset)\n", + " table_data.append(\n", + " f\"Cell [{cell.row_index}, {cell.column_index}]: {cell.content}\"\n", + " )\n", + "\n", + " if page_number not in page_content:\n", + " page_content[page_number] = []\n", + "\n", + " page_content[page_number].append({\"content_text\": \"\\n\".join(table_data)})\n", + "\n", + " return page_content" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Combine paragraph and table text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def combine_paragraphs_tables(filepath, paragraph_content, table_content):\n", + " page_content_concatenated = {}\n", + " structured_data = []\n", + "\n", + " # Combine paragraph and table content\n", + " for p_number, contents in {**paragraph_content, **table_content}.items():\n", + " concatenated_text = \"\"\n", + " for content in contents:\n", + " concatenated_text += content[\"content_text\"] + \"\\n\"\n", + "\n", + " page_content_concatenated[p_number] = concatenated_text.strip()\n", + "\n", + " # Append a single item per page to the structured_data list\n", + " for p_number, concatenated_text in page_content_concatenated.items():\n", + " structured_data.append(\n", + " {\n", + " \"page_number\": p_number,\n", + " \"content_text\": concatenated_text,\n", + " \"pdf_file\": os.path.basename(filepath),\n", + " }\n", + " )\n", + "\n", + " return structured_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Bring it all together" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pdf_files = [\n", + " os.path.join(input_folder_pdf, file)\n", + " for file in os.listdir(input_folder_pdf)\n", + " if file.endswith(\".pdf\")\n", + "]\n", + "\n", + "document_intelligence_client = DocumentIntelligenceClient(\n", + " endpoint=AZURE_AI_DOCUMENT_INTELLIGENCE_ENDPOINT,\n", + " credential=AzureKeyCredential(AZURE_AI_DOCUMENT_INTELLIGENCE_API_KEY),\n", + " connection_timeout=600,\n", + ")\n", + "\n", + "for filepath in tqdm(pdf_files, desc=\"Parsing PDF files\"):\n", + " with open(filepath, \"rb\") as file:\n", + " poller = document_intelligence_client.begin_analyze_document(\n", + " \"prebuilt-layout\", AnalyzeDocumentRequest(bytes_source=file.read())\n", + " )\n", + "\n", + " analyze_result: AnalyzeResult = poller.result()\n", + "\n", + " paragraph_content, table_offsets = parse_paragraphs(analyze_result)\n", + " table_content = parse_tables(analyze_result, table_offsets)\n", + " structured_data = combine_paragraphs_tables(\n", + " filepath, paragraph_content, table_content\n", + " )\n", + "\n", + " # Convert the structured data to JSON format\n", + " json_output = json.dumps(structured_data, indent=4)\n", + "\n", + " # Get the filename without the \".pdf\" extension\n", + " filename_without_ext = os.path.splitext(os.path.basename(filepath))[0]\n", + " # Write the JSON output to a file\n", + " output_json_file = f\"{output_folder_pdf}/{filename_without_ext}.json\"\n", + "\n", + " with open(output_json_file, \"w\") as json_file:\n", + " json_file.write(json_output)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Set imports for the elasticsearch client and environment variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "from dotenv import load_dotenv\n", + "from elasticsearch import Elasticsearch\n", + "from tqdm import tqdm\n", + "import os\n", + "\n", + "load_dotenv()\n", + "\n", + "ES_URL = os.getenv(\"ES_URL\")\n", + "ES_API_KEY = os.getenv(\"ES_API_KEY\")\n", + "\n", + "es = Elasticsearch(hosts=ES_URL, api_key=ES_API_KEY, request_timeout=60)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create index in Elastic Cloud Serverless" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "index_name = \"pdf-chat\"\n", + "index_body = {\n", + " \"mappings\": {\n", + " \"properties\": {\n", + " \"page_content\": {\n", + " \"type\": \"text\",\n", + " \"copy_to\": [\"page_content_sparse\", \"page_content_dense\"],\n", + " },\n", + " \"page_content_sparse\": {\n", + " \"type\": \"semantic_text\",\n", + " \"inference_id\": \".elser-2-elasticsearch\",\n", + " },\n", + " \"page_content_dense\": {\n", + " \"type\": \"semantic_text\",\n", + " \"inference_id\": \".multilingual-e5-small-elasticsearch\",\n", + " },\n", + " \"page_number\": {\"type\": \"text\"},\n", + " \"pdf_file\": {\"type\": \"text\", \"fields\": {\"keyword\": {\"type\": \"keyword\"}}},\n", + " }\n", + " }\n", + "}\n", + "\n", + "if es.indices.exists(index=index_name):\n", + " es.indices.delete(index=index_name)\n", + " print(f\"Index '{index_name}' deleted successfully.\")\n", + "\n", + "response = es.indices.create(index=index_name, body=index_body)\n", + "if \"acknowledged\" in response and response[\"acknowledged\"]:\n", + " print(f\"Index '{index_name}' created successfully.\")\n", + "elif \"error\" in 
response:\n", + " print(f\"Failed to create: '{index_name}'\")\n", + " print(f\"Error: {response['error']['reason']}\")\n", + "else:\n", + " print(f\"Index '{index_name}' already exists.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "files = os.listdir(output_folder_pdf)\n", + "with tqdm(total=len(files), desc=\"Indexing PDF docs\") as pbar_files:\n", + " for file in files:\n", + " with open(output_folder_pdf + \"/\" + file) as f:\n", + " data = json.loads(f.read())\n", + "\n", + " with tqdm(total=len(data), desc=f\"Processing {file}\") as pbar_pages:\n", + " for page in data:\n", + " doc = {\n", + " \"page_content\": page[\"content_text\"],\n", + " \"page_number\": page[\"page_number\"],\n", + " \"pdf_file\": page[\"pdf_file\"],\n", + " }\n", + " id = f\"{page['pdf_file']}_{page['page_number']}\"\n", + " es.index(index=index_name, id=id, body=json.dumps(doc))\n", + " pbar_pages.update(1)\n", + "\n", + " pbar_files.update(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prompt List\n", + "\n", + "1. Compare/contrast subscription revenue for Q2-2025, Q1-2025, Q4-2024 and Q3-2024?\n", + "2. Provide an Income Taxes summary for Q2-2025, Q1-2025, Q4-2024 and Q3-2024?\n", + "3. How has the balance sheet changed for Q2-2025, Q1-2025, Q4-2024 and Q3-2024?" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 92ecb60bac1aa6e57917884c9040eba77dd58642 Mon Sep 17 00:00:00 2001 From: jwilliams-elastic Date: Mon, 17 Mar 2025 17:11:06 -0400 Subject: [PATCH 2/3] pdf parsing using azure ai document intelligence. moving to supporting-blog-content --- .../pdf-azure-ai-document-intelligence.ipynb | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {notebooks/ingestion-and-chunking => supporting-blog-content/pdf-azure-ai-document-intelligence}/pdf-azure-ai-document-intelligence.ipynb (100%) diff --git a/notebooks/ingestion-and-chunking/pdf-azure-ai-document-intelligence.ipynb b/supporting-blog-content/pdf-azure-ai-document-intelligence/pdf-azure-ai-document-intelligence.ipynb similarity index 100% rename from notebooks/ingestion-and-chunking/pdf-azure-ai-document-intelligence.ipynb rename to supporting-blog-content/pdf-azure-ai-document-intelligence/pdf-azure-ai-document-intelligence.ipynb From 1601badf7f74298d655eb61158fd3e440e8181f0 Mon Sep 17 00:00:00 2001 From: jwilliams-elastic Date: Tue, 18 Mar 2025 11:24:45 -0400 Subject: [PATCH 3/3] pdf parsing using azure ai document intelligence. fixed 2 code blocks. 1-increased elastic client timeout. 
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Prompt list\n",
+    "\n",
+    "1. Compare and contrast subscription revenue for Q2-2025, Q1-2025, Q4-2024, and Q3-2024.\n",
+    "2. Provide an income taxes summary for Q2-2025, Q1-2025, Q4-2024, and Q3-2024.\n",
+    "3. How has the balance sheet changed across Q2-2025, Q1-2025, Q4-2024, and Q3-2024?"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From 92ecb60bac1aa6e57917884c9040eba77dd58642 Mon Sep 17 00:00:00 2001
From: jwilliams-elastic
Date: Mon, 17 Mar 2025 17:11:06 -0400
Subject: [PATCH 2/3] pdf parsing using azure ai document intelligence. moving
 to supporting-blog-content

---
 .../pdf-azure-ai-document-intelligence.ipynb | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename {notebooks/ingestion-and-chunking => supporting-blog-content/pdf-azure-ai-document-intelligence}/pdf-azure-ai-document-intelligence.ipynb (100%)

diff --git a/notebooks/ingestion-and-chunking/pdf-azure-ai-document-intelligence.ipynb b/supporting-blog-content/pdf-azure-ai-document-intelligence/pdf-azure-ai-document-intelligence.ipynb
similarity index 100%
rename from notebooks/ingestion-and-chunking/pdf-azure-ai-document-intelligence.ipynb
rename to supporting-blog-content/pdf-azure-ai-document-intelligence/pdf-azure-ai-document-intelligence.ipynb

From 1601badf7f74298d655eb61158fd3e440e8181f0 Mon Sep 17 00:00:00 2001
From: jwilliams-elastic
Date: Tue, 18 Mar 2025 11:24:45 -0400
Subject: [PATCH 3/3] pdf parsing using azure ai document intelligence. fixed 2
 code blocks. 1-increased elastic client timeout. 2-fixed issue in combine
 paragraph and table text

---
 .../pdf-azure-ai-document-intelligence.ipynb | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/supporting-blog-content/pdf-azure-ai-document-intelligence/pdf-azure-ai-document-intelligence.ipynb b/supporting-blog-content/pdf-azure-ai-document-intelligence/pdf-azure-ai-document-intelligence.ipynb
index fb70d7210..7861aeaeb 100644
--- a/supporting-blog-content/pdf-azure-ai-document-intelligence/pdf-azure-ai-document-intelligence.ipynb
+++ b/supporting-blog-content/pdf-azure-ai-document-intelligence/pdf-azure-ai-document-intelligence.ipynb
@@ -265,10 +265,16 @@
     "    structured_data = []\n",
     "\n",
     "    # Combine paragraph and table content\n",
-    "    for p_number, contents in {**paragraph_content, **table_content}.items():\n",
+    "    for p_number in set(paragraph_content.keys()).union(table_content.keys()):\n",
     "        concatenated_text = \"\"\n",
-    "        for content in contents:\n",
-    "            concatenated_text += content[\"content_text\"] + \"\\n\"\n",
+    "\n",
+    "        if p_number in paragraph_content:\n",
+    "            for content in paragraph_content[p_number]:\n",
+    "                concatenated_text += content[\"content_text\"] + \"\\n\"\n",
+    "\n",
+    "        if p_number in table_content:\n",
+    "            for content in table_content[p_number]:\n",
+    "                concatenated_text += content[\"content_text\"] + \"\\n\"\n",
     "\n",
     "        page_content_concatenated[p_number] = concatenated_text.strip()\n",
     "\n",
@@ -360,7 +366,7 @@
     "ES_URL = os.getenv(\"ES_URL\")\n",
     "ES_API_KEY = os.getenv(\"ES_API_KEY\")\n",
     "\n",
-    "es = Elasticsearch(hosts=ES_URL, api_key=ES_API_KEY, request_timeout=60)"
+    "es = Elasticsearch(hosts=ES_URL, api_key=ES_API_KEY, request_timeout=300)"
    ]
   },
   {