From b3e32c81c6a9c7e20469dcf845b98a0b7940fb07 Mon Sep 17 00:00:00 2001
From: jwilliams-elastic
Date: Mon, 17 Mar 2025 14:52:49 -0400
Subject: [PATCH 1/3] pdf parsing using azure ai document intelligence

---
 .../pdf-azure-ai-document-intelligence.ipynb  | 474 ++++++++++++++++++
 1 file changed, 474 insertions(+)
 create mode 100644 notebooks/ingestion-and-chunking/pdf-azure-ai-document-intelligence.ipynb

diff --git a/notebooks/ingestion-and-chunking/pdf-azure-ai-document-intelligence.ipynb b/notebooks/ingestion-and-chunking/pdf-azure-ai-document-intelligence.ipynb
new file mode 100644
index 000000000..fb70d7210
--- /dev/null
+++ b/notebooks/ingestion-and-chunking/pdf-azure-ai-document-intelligence.ipynb
@@ -0,0 +1,474 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "source": [
+    "# Overview\n",
+    "This notebook does the following:\n",
+    "\n",
+    "1. Parses PDFs that contain text and tables with [Azure AI Document Intelligence](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence/). Each PDF is saved as a JSON file so that it can be loaded into Elasticsearch.\n",
+    "2. Loads the JSON files into Elasticsearch. This notebook uses the Elasticsearch Python client to create an index with E5 and ELSER `semantic_text` mappings.\n",
+    "3. Once the data is loaded into Elasticsearch, you can ask questions in Playground and get answers grounded in the source documents. The index \"id\" field uses the naming convention `PDF_FILENAME.pdf_PAGENUMBER`, which lets you see the PDF and page number in the \"document sources\" link.\n",
+    "\n",
+    "**This notebook does not parse images in PDFs.**"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Install Python dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install elasticsearch python-dotenv tqdm azure-core azure-ai-documentintelligence requests httpx"
+   ]
+  },
\n", + "\n", + "## Elasticsearch \n", + "- You must have a functional elasticsearch environment that has an `enterprise` level license\n", + "- The fastest way to get up and running is to use the [Elastic Serverless - Get started](https://www.elastic.co/guide/en/serverless/current/elasticsearch-get-started.html) guide\n", + "\n", + "```\n", + "ES_URL=?\n", + "ES_API_KEY=?\n", + "```\n", + "\n", + "## Azure AI Document Intelligence\n", + "\n", + "```\n", + "AZURE_AI_DOCUMENT_INTELLIGENCE_ENDPOINT=?\n", + "AZURE_AI_DOCUMENT_INTELLIGENCE_API_KEY=?\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create input and output folders\n", + "\n", + "- /pdf - place your PDF files in this input folder\n", + "- /json - parser will output one json file for each pdf in this output folder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "input_folder_pdf = \"./pdf\"\n", + "output_folder_pdf = \"./json\"\n", + "\n", + "folders = [input_folder_pdf, output_folder_pdf]\n", + "\n", + "\n", + "def create_folders_if_not_exist(folders):\n", + " for folder in folders:\n", + " os.makedirs(folder, exist_ok=True)\n", + " print(f\"Folder '{folder}' created or already exists.\")\n", + "\n", + "\n", + "create_folders_if_not_exist(folders)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Download PDF files\n", + "\n", + "- This notebook downloads 4 recent Elastic SEC 10-Q quarterly reports\n", + "- If you already have PDF files, feel free to place them in `./pdf` folder " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import requests\n", + "\n", + "\n", + "def download_pdf(url, directory=\"./pdf\", filename=None):\n", + " if not os.path.exists(directory):\n", + " os.makedirs(directory)\n", + "\n", + " response = requests.get(url)\n", + " if response.status_code == 200:\n", + " if filename is None:\n", + " filename = url.split(\"/\")[-1]\n", + " filepath = os.path.join(directory, filename)\n", + " with open(filepath, \"wb\") as file:\n", + " file.write(response.content)\n", + " print(f\"Downloaded {filepath}\")\n", + " else:\n", + " print(f\"Failed to download file from {url}\")\n", + "\n", + "\n", + "print(\"Downloading 4 recent 10-Q reports for Elastic NV.\")\n", + "base_url = \"https://s201.q4cdn.com/217177842/files/doc_financials\"\n", + "download_pdf(\n", + " f\"{base_url}/2025/q2/e5aa7a0a-6f56-468d-a5bd-661792773d71.pdf\",\n", + " filename=\"elastic-10Q-Q2-2025.pdf\",\n", + ")\n", + "download_pdf(\n", + " f\"{base_url}/2025/q1/18656e06-8107-4423-8e2b-6f2945438053.pdf\",\n", + " filename=\"elastic-10Q-Q1-2025.pdf\",\n", + ")\n", + "download_pdf(\n", + " f\"{base_url}/2024/q4/9949f03b-09fb-4941-b105-62a304dc1411.pdf\",\n", + " filename=\"elastic-10Q-Q4-2024.pdf\",\n", + ")\n", + "download_pdf(\n", + " f\"{base_url}/2024/q3/7e60e3bd-ff50-4ae8-ab12-5b3ae19420e6.pdf\",\n", + " filename=\"elastic-10Q-Q3-2024.pdf\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Set Azure AI Document Intelligence Imports and Environment Variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from azure.core.credentials import AzureKeyCredential\n", + "from azure.ai.documentintelligence import DocumentIntelligenceClient\n", + "from azure.ai.documentintelligence.models import 
AnalyzeResult\n", + "from azure.ai.documentintelligence.models import AnalyzeDocumentRequest\n", + "import json\n", + "from dotenv import load_dotenv\n", + "from tqdm import tqdm\n", + "\n", + "load_dotenv()\n", + "\n", + "AZURE_AI_DOCUMENT_INTELLIGENCE_ENDPOINT = os.getenv(\n", + " \"AZURE_AI_DOCUMENT_INTELLIGENCE_ENDPOINT\"\n", + ")\n", + "AZURE_AI_DOCUMENT_INTELLIGENCE_API_KEY = os.getenv(\n", + " \"AZURE_AI_DOCUMENT_INTELLIGENCE_API_KEY\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Parse paragraphs using AnalyzeResult\n", + "\n", + "This function extracts the paragraph text via an AnalyzeResult on a PDF file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def parse_paragraphs(analyze_result):\n", + " table_offsets = []\n", + " page_content = {}\n", + "\n", + " for paragraph in analyze_result.paragraphs:\n", + " for span in paragraph.spans:\n", + " if span.offset not in table_offsets:\n", + " for region in paragraph.bounding_regions:\n", + " page_number = region.page_number\n", + " if page_number not in page_content:\n", + " page_content[page_number] = []\n", + " page_content[page_number].append(\n", + " {\"content_text\": paragraph.content}\n", + " )\n", + " return page_content, table_offsets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Parse tables using AnalyzeResult\n", + "\n", + "This function extracts the paragraph text via an AnalyzeResult on a PDF file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def parse_tables(analyze_result, table_offsets):\n", + " page_content = {}\n", + "\n", + " for table in analyze_result.tables:\n", + " table_data = []\n", + " for region in table.bounding_regions:\n", + " page_number = region.page_number\n", + " for cell in table.cells:\n", + " for span in cell.spans:\n", + " table_offsets.append(span.offset)\n", + " table_data.append(\n", + " f\"Cell [{cell.row_index}, {cell.column_index}]: {cell.content}\"\n", + " )\n", + "\n", + " if page_number not in page_content:\n", + " page_content[page_number] = []\n", + "\n", + " page_content[page_number].append({\"content_text\": \"\\n\".join(table_data)})\n", + "\n", + " return page_content" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Combine paragraph and table text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def combine_paragraphs_tables(filepath, paragraph_content, table_content):\n", + " page_content_concatenated = {}\n", + " structured_data = []\n", + "\n", + " # Combine paragraph and table content\n", + " for p_number, contents in {**paragraph_content, **table_content}.items():\n", + " concatenated_text = \"\"\n", + " for content in contents:\n", + " concatenated_text += content[\"content_text\"] + \"\\n\"\n", + "\n", + " page_content_concatenated[p_number] = concatenated_text.strip()\n", + "\n", + " # Append a single item per page to the structured_data list\n", + " for p_number, concatenated_text in page_content_concatenated.items():\n", + " structured_data.append(\n", + " {\n", + " \"page_number\": p_number,\n", + " \"content_text\": concatenated_text,\n", + " \"pdf_file\": os.path.basename(filepath),\n", + " }\n", + " )\n", + "\n", + " return structured_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Bring it all together" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pdf_files = [\n", + " os.path.join(input_folder_pdf, file)\n", + " for file in os.listdir(input_folder_pdf)\n", + " if file.endswith(\".pdf\")\n", + "]\n", + "\n", + "document_intelligence_client = DocumentIntelligenceClient(\n", + " endpoint=AZURE_AI_DOCUMENT_INTELLIGENCE_ENDPOINT,\n", + " credential=AzureKeyCredential(AZURE_AI_DOCUMENT_INTELLIGENCE_API_KEY),\n", + " connection_timeout=600,\n", + ")\n", + "\n", + "for filepath in tqdm(pdf_files, desc=\"Parsing PDF files\"):\n", + " with open(filepath, \"rb\") as file:\n", + " poller = document_intelligence_client.begin_analyze_document(\n", + " \"prebuilt-layout\", AnalyzeDocumentRequest(bytes_source=file.read())\n", + " )\n", + "\n", + " analyze_result: AnalyzeResult = poller.result()\n", + "\n", + " paragraph_content, table_offsets = parse_paragraphs(analyze_result)\n", + " table_content = parse_tables(analyze_result, table_offsets)\n", + " structured_data = combine_paragraphs_tables(\n", + " filepath, paragraph_content, table_content\n", + " )\n", + "\n", + " # Convert the structured data to JSON format\n", + " json_output = json.dumps(structured_data, indent=4)\n", + "\n", + " # Get the filename without the \".pdf\" extension\n", + " filename_without_ext = os.path.splitext(os.path.basename(filepath))[0]\n", + " # Write the JSON output to a file\n", + " output_json_file = f\"{output_folder_pdf}/{filename_without_ext}.json\"\n", + "\n", + " with open(output_json_file, \"w\") as json_file:\n", + " json_file.write(json_output)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Set imports for the elasticsearch client and environment variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "from dotenv import load_dotenv\n", + "from elasticsearch import Elasticsearch\n", + "from tqdm import tqdm\n", + "import os\n", + "\n", + "load_dotenv()\n", + "\n", + "ES_URL = os.getenv(\"ES_URL\")\n", + "ES_API_KEY = os.getenv(\"ES_API_KEY\")\n", + "\n", + "es = Elasticsearch(hosts=ES_URL, api_key=ES_API_KEY, request_timeout=60)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create index in Elastic Cloud Serverless" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "index_name = \"pdf-chat\"\n", + "index_body = {\n", + " \"mappings\": {\n", + " \"properties\": {\n", + " \"page_content\": {\n", + " \"type\": \"text\",\n", + " \"copy_to\": [\"page_content_sparse\", \"page_content_dense\"],\n", + " },\n", + " \"page_content_sparse\": {\n", + " \"type\": \"semantic_text\",\n", + " \"inference_id\": \".elser-2-elasticsearch\",\n", + " },\n", + " \"page_content_dense\": {\n", + " \"type\": \"semantic_text\",\n", + " \"inference_id\": \".multilingual-e5-small-elasticsearch\",\n", + " },\n", + " \"page_number\": {\"type\": \"text\"},\n", + " \"pdf_file\": {\"type\": \"text\", \"fields\": {\"keyword\": {\"type\": \"keyword\"}}},\n", + " }\n", + " }\n", + "}\n", + "\n", + "if es.indices.exists(index=index_name):\n", + " es.indices.delete(index=index_name)\n", + " print(f\"Index '{index_name}' deleted successfully.\")\n", + "\n", + "response = es.indices.create(index=index_name, body=index_body)\n", + "if \"acknowledged\" in response and response[\"acknowledged\"]:\n", + " print(f\"Index '{index_name}' created successfully.\")\n", + "elif \"error\" in 
response:\n", + " print(f\"Failed to create: '{index_name}'\")\n", + " print(f\"Error: {response['error']['reason']}\")\n", + "else:\n", + " print(f\"Index '{index_name}' already exists.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "files = os.listdir(output_folder_pdf)\n", + "with tqdm(total=len(files), desc=\"Indexing PDF docs\") as pbar_files:\n", + " for file in files:\n", + " with open(output_folder_pdf + \"/\" + file) as f:\n", + " data = json.loads(f.read())\n", + "\n", + " with tqdm(total=len(data), desc=f\"Processing {file}\") as pbar_pages:\n", + " for page in data:\n", + " doc = {\n", + " \"page_content\": page[\"content_text\"],\n", + " \"page_number\": page[\"page_number\"],\n", + " \"pdf_file\": page[\"pdf_file\"],\n", + " }\n", + " id = f\"{page['pdf_file']}_{page['page_number']}\"\n", + " es.index(index=index_name, id=id, body=json.dumps(doc))\n", + " pbar_pages.update(1)\n", + "\n", + " pbar_files.update(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prompt List\n", + "\n", + "1. Compare/contrast subscription revenue for Q2-2025, Q1-2025, Q4-2024 and Q3-2024?\n", + "2. Provide an Income Taxes summary for Q2-2025, Q1-2025, Q4-2024 and Q3-2024?\n", + "3. How has the balance sheet changed for Q2-2025, Q1-2025, Q4-2024 and Q3-2024?" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 92ecb60bac1aa6e57917884c9040eba77dd58642 Mon Sep 17 00:00:00 2001 From: jwilliams-elastic Date: Mon, 17 Mar 2025 17:11:06 -0400 Subject: [PATCH 2/3] pdf parsing using azure ai document intelligence. moving to supporting-blog-content --- .../pdf-azure-ai-document-intelligence.ipynb | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {notebooks/ingestion-and-chunking => supporting-blog-content/pdf-azure-ai-document-intelligence}/pdf-azure-ai-document-intelligence.ipynb (100%) diff --git a/notebooks/ingestion-and-chunking/pdf-azure-ai-document-intelligence.ipynb b/supporting-blog-content/pdf-azure-ai-document-intelligence/pdf-azure-ai-document-intelligence.ipynb similarity index 100% rename from notebooks/ingestion-and-chunking/pdf-azure-ai-document-intelligence.ipynb rename to supporting-blog-content/pdf-azure-ai-document-intelligence/pdf-azure-ai-document-intelligence.ipynb From 1601badf7f74298d655eb61158fd3e440e8181f0 Mon Sep 17 00:00:00 2001 From: jwilliams-elastic Date: Tue, 18 Mar 2025 11:24:45 -0400 Subject: [PATCH 3/3] pdf parsing using azure ai document intelligence. fixed 2 code blocks. 1-increased elastic client timeout. 
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Prompt list\n",
+    "\n",
+    "1. Compare and contrast subscription revenue for Q2-2025, Q1-2025, Q4-2024, and Q3-2024.\n",
+    "2. Provide an income taxes summary for Q2-2025, Q1-2025, Q4-2024, and Q3-2024.\n",
+    "3. How has the balance sheet changed across Q2-2025, Q1-2025, Q4-2024, and Q3-2024?"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From 92ecb60bac1aa6e57917884c9040eba77dd58642 Mon Sep 17 00:00:00 2001
From: jwilliams-elastic
Date: Mon, 17 Mar 2025 17:11:06 -0400
Subject: [PATCH 2/3] pdf parsing using azure ai document intelligence. moving
 to supporting-blog-content

---
 .../pdf-azure-ai-document-intelligence.ipynb | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename {notebooks/ingestion-and-chunking => supporting-blog-content/pdf-azure-ai-document-intelligence}/pdf-azure-ai-document-intelligence.ipynb (100%)

diff --git a/notebooks/ingestion-and-chunking/pdf-azure-ai-document-intelligence.ipynb b/supporting-blog-content/pdf-azure-ai-document-intelligence/pdf-azure-ai-document-intelligence.ipynb
similarity index 100%
rename from notebooks/ingestion-and-chunking/pdf-azure-ai-document-intelligence.ipynb
rename to supporting-blog-content/pdf-azure-ai-document-intelligence/pdf-azure-ai-document-intelligence.ipynb

From 1601badf7f74298d655eb61158fd3e440e8181f0 Mon Sep 17 00:00:00 2001
From: jwilliams-elastic
Date: Tue, 18 Mar 2025 11:24:45 -0400
Subject: [PATCH 3/3] pdf parsing using azure ai document intelligence. fixed 2
 code blocks. 1-increased elastic client timeout. 2-fixed issue in combine
 paragraph and table text

---
 .../pdf-azure-ai-document-intelligence.ipynb | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/supporting-blog-content/pdf-azure-ai-document-intelligence/pdf-azure-ai-document-intelligence.ipynb b/supporting-blog-content/pdf-azure-ai-document-intelligence/pdf-azure-ai-document-intelligence.ipynb
index fb70d7210..7861aeaeb 100644
--- a/supporting-blog-content/pdf-azure-ai-document-intelligence/pdf-azure-ai-document-intelligence.ipynb
+++ b/supporting-blog-content/pdf-azure-ai-document-intelligence/pdf-azure-ai-document-intelligence.ipynb
@@ -265,10 +265,16 @@
     "    structured_data = []\n",
     "\n",
     "    # Combine paragraph and table content\n",
-    "    for p_number, contents in {**paragraph_content, **table_content}.items():\n",
+    "    for p_number in set(paragraph_content.keys()).union(table_content.keys()):\n",
     "        concatenated_text = \"\"\n",
-    "        for content in contents:\n",
-    "            concatenated_text += content[\"content_text\"] + \"\\n\"\n",
+    "\n",
+    "        if p_number in paragraph_content:\n",
+    "            for content in paragraph_content[p_number]:\n",
+    "                concatenated_text += content[\"content_text\"] + \"\\n\"\n",
+    "\n",
+    "        if p_number in table_content:\n",
+    "            for content in table_content[p_number]:\n",
+    "                concatenated_text += content[\"content_text\"] + \"\\n\"\n",
     "\n",
     "        page_content_concatenated[p_number] = concatenated_text.strip()\n",
     "\n",
@@ -360,7 +366,7 @@
     "ES_URL = os.getenv(\"ES_URL\")\n",
     "ES_API_KEY = os.getenv(\"ES_API_KEY\")\n",
     "\n",
-    "es = Elasticsearch(hosts=ES_URL, api_key=ES_API_KEY, request_timeout=60)"
+    "es = Elasticsearch(hosts=ES_URL, api_key=ES_API_KEY, request_timeout=300)"
    ]
   },
   {