
Commit 543ce52

docs: use direct API for pdf chunk and pdf extract (#2452)
This PR updates `notebooks/multimodal/multimodal_dataframe.ipynb` to demonstrate PDF text extraction and chunking using custom BigQuery Python UDFs with the `pypdf` library. Fixes #<478952827> 🦕
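For readers skimming the diff, here is a minimal local sketch of the idea the new cells implement: pull the text out of a PDF with `pypdf`, then split it into word-boundary chunks with a character overlap. The helper names `extract_text` and `chunk_text` are illustrative only, the chunking loop is a simplified variant rather than the exact loop in the notebook, and the BigQuery UDF registration (`@bpd.udf`), connection, and signed-URL download that the notebook adds are omitted.

```python
# Illustrative local sketch only; the notebook wraps equivalent logic in
# BigQuery Python UDFs and reads the PDF bytes from a signed URL instead.
import io

from pypdf import PdfReader


def extract_text(pdf_bytes: bytes) -> str:
    # Concatenate the text of every page; pages with no extractable text add "".
    reader = PdfReader(io.BytesIO(pdf_bytes), strict=False)
    return "".join(page.extract_text() or "" for page in reader.pages)


def chunk_text(text: str, chunk_size: int, overlap_size: int) -> list[str]:
    # Greedy splitter: cut at the last space before chunk_size when possible,
    # and start the next chunk overlap_size characters before the cut.
    chunks: list[str] = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        if end < len(text):
            split = text.rfind(" ", start, end)
            if split > start:
                end = split
        chunks.append(text[start:end])
        if end >= len(text):
            break
        start = max(end - overlap_size, start + 1)
    return chunks


if __name__ == "__main__":
    with open("sample.pdf", "rb") as f:  # any local PDF
        text = extract_text(f.read())
    for chunk in chunk_text(text, chunk_size=2000, overlap_size=200):
        print(len(chunk), chunk[:60].replace("\n", " "))
```

The notebook's registered `pdf_chunk` UDF takes the same `chunk_size`/`overlap_size` parameters, and the cell that applies it passes 2000 and 200.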
1 parent 3409acd commit 543ce52

File tree

1 file changed: +97 -139 lines changed


notebooks/multimodal/multimodal_dataframe.ipynb

Lines changed: 97 additions & 139 deletions
@@ -1317,161 +1317,119 @@
 "id": "iRUi8AjG7cIf"
 },
 "source": [
-"### 5. PDF chunking function"
+"### 5. PDF extraction and chunking function\n",
+"\n",
+"This section demonstrates how to extract text and chunk text from PDF files using custom BigQuery Python UDFs and the `pypdf` library."
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 3,
-"metadata": {
-"id": "oDDuYtUm5Yiy"
-},
+"execution_count": null,
+"metadata": {},
 "outputs": [],
 "source": [
-"df_pdf = bpd.from_glob_path(\"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*\", name=\"pdf\")"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 18,
-"metadata": {
-"colab": {
-"base_uri": "https://localhost:8080/"
-},
-"id": "7jLpMYaj7nj8",
-"outputId": "06d5456f-580f-4693-adff-2605104b056c"
-},
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
-"instead of using `db_dtypes` in the future when available in pandas\n",
-"(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
-" warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n",
-"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n",
-" return method(*args, **kwargs)\n",
-"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n",
-"future version. Use `json_value_array` instead.\n",
-" warnings.warn(bfe.format_message(msg), category=UserWarning)\n",
-"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n",
-"future version. Use `json_value_array` instead.\n",
-" warnings.warn(bfe.format_message(msg), category=UserWarning)\n"
-]
-}
-],
-"source": [
-"df_pdf[\"chunked\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\")"
+"# Construct the canonical connection ID\n",
+"FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n",
+"\n",
+"@bpd.udf(\n",
+"    input_types=[str],\n",
+"    output_type=str,\n",
+"    dataset=DATASET_ID,\n",
+"    name=\"pdf_extract\",\n",
+"    bigquery_connection=FULL_CONNECTION_ID,\n",
+"    packages=[\"pypdf\", \"requests\", \"cryptography\"],\n",
+")\n",
+"def pdf_extract(src_obj_ref_rt: str) -> str:\n",
+"    import io\n",
+"    import json\n",
+"    from pypdf import PdfReader\n",
+"    import requests\n",
+"    from requests import adapters\n",
+"    session = requests.Session()\n",
+"    session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n",
+"    src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n",
+"    src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n",
+"    response = session.get(src_url, timeout=30, stream=True)\n",
+"    response.raise_for_status()\n",
+"    pdf_bytes = response.content\n",
+"    pdf_file = io.BytesIO(pdf_bytes)\n",
+"    reader = PdfReader(pdf_file, strict=False)\n",
+"    all_text = \"\"\n",
+"    for page in reader.pages:\n",
+"        page_extract_text = page.extract_text()\n",
+"        if page_extract_text:\n",
+"            all_text += page_extract_text\n",
+"    return all_text\n",
+"\n",
+"@bpd.udf(\n",
+"    input_types=[str, int, int],\n",
+"    output_type=list[str],\n",
+"    dataset=DATASET_ID,\n",
+"    name=\"pdf_chunk\",\n",
+"    bigquery_connection=FULL_CONNECTION_ID,\n",
+"    packages=[\"pypdf\", \"requests\", \"cryptography\"],\n",
+")\n",
+"def pdf_chunk(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> list[str]:\n",
+"    import io\n",
+"    import json\n",
+"    from pypdf import PdfReader\n",
+"    import requests\n",
+"    from requests import adapters\n",
+"    session = requests.Session()\n",
+"    session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n",
+"    src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n",
+"    src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n",
+"    response = session.get(src_url, timeout=30, stream=True)\n",
+"    response.raise_for_status()\n",
+"    pdf_bytes = response.content\n",
+"    pdf_file = io.BytesIO(pdf_bytes)\n",
+"    reader = PdfReader(pdf_file, strict=False)\n",
+"    all_text_chunks = []\n",
+"    curr_chunk = \"\"\n",
+"    for page in reader.pages:\n",
+"        page_text = page.extract_text()\n",
+"        if page_text:\n",
+"            curr_chunk += page_text\n",
+"        while len(curr_chunk) >= chunk_size:\n",
+"            split_idx = curr_chunk.rfind(\" \", 0, chunk_size)\n",
+"            if split_idx == -1:\n",
+"                split_idx = chunk_size\n",
+"            actual_chunk = curr_chunk[:split_idx]\n",
+"            all_text_chunks.append(actual_chunk)\n",
+"            overlap = curr_chunk[split_idx + 1 : split_idx + 1 + overlap_size]\n",
+"            curr_chunk = overlap + curr_chunk[split_idx + 1 + overlap_size :]\n",
+"    if curr_chunk:\n",
+"        all_text_chunks.append(curr_chunk)\n",
+"    return all_text_chunks"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 19,
+"execution_count": null,
 "metadata": {},
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
-"instead of using `db_dtypes` in the future when available in pandas\n",
-"(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
-" warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n",
-"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n",
-" return method(*args, **kwargs)\n",
-"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n",
-"future version. Use `json_value_array` instead.\n",
-" warnings.warn(bfe.format_message(msg), category=UserWarning)\n"
-]
-},
-{
-"data": {
-"text/html": [
-"<div>\n",
-"<style scoped>\n",
-" .dataframe tbody tr th:only-of-type {\n",
-" vertical-align: middle;\n",
-" }\n",
-"\n",
-" .dataframe tbody tr th {\n",
-" vertical-align: top;\n",
-" }\n",
-"\n",
-" .dataframe thead th {\n",
-" text-align: right;\n",
-" }\n",
-"</style>\n",
-"<table border=\"1\" class=\"dataframe\">\n",
-" <thead>\n",
-" <tr style=\"text-align: right;\">\n",
-" <th></th>\n",
-" <th>chunked_verbose</th>\n",
-" </tr>\n",
-" </thead>\n",
-" <tbody>\n",
-" <tr>\n",
-" <th>0</th>\n",
-" <td>{'status': '', 'content': array([\"CritterCuisi...</td>\n",
-" </tr>\n",
-" </tbody>\n",
-"</table>\n",
-"<p>1 rows × 1 columns</p>\n",
-"</div>[1 rows x 1 columns in total]"
-],
-"text/plain": [
-" chunked_verbose\n",
-"0 {'status': '', 'content': array([\"CritterCuisi...\n",
-"\n",
-"[1 rows x 1 columns]"
-]
-},
-"execution_count": 19,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
+"outputs": [],
 "source": [
-"df_pdf[\"chunked_verbose\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\", verbose=True)\n",
-"df_pdf[[\"chunked_verbose\"]]"
+"df_pdf = bpd.from_glob_path(\"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*\", name=\"pdf\")\n",
+"\n",
+"# Generate a JSON string containing the runtime information (including signed read URLs)\n",
+"access_urls = get_runtime_json_str(df_pdf[\"pdf\"], mode=\"R\")\n",
+"\n",
+"# Apply PDF extraction\n",
+"df_pdf[\"extracted_text\"] = access_urls.apply(pdf_extract)\n",
+"\n",
+"# Apply PDF chunking\n",
+"df_pdf[\"chunked\"] = access_urls.apply(pdf_chunk, args=(2000, 200))\n",
+"\n",
+"df_pdf[[\"extracted_text\", \"chunked\"]]"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 20,
-"metadata": {
-"id": "kaPvJATN7zlw"
-},
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
-"instead of using `db_dtypes` in the future when available in pandas\n",
-"(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
-" warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n"
-]
-},
-{
-"data": {
-"text/plain": [
-"0 CritterCuisine Pro 5000 - Automatic Pet Feeder...\n",
-"0 on a level, stable surface to prevent tipping....\n",
-"0 included)\\nto maintain the schedule during pow...\n",
-"0 digits for Meal 1 will flash.\\n\u0000. Use the UP/D...\n",
-"0 paperclip) for 5\\nseconds. This will reset all...\n",
-"0 unit with a damp cloth. Do not immerse the bas...\n",
-"0 continues,\\ncontact customer support.\\nE2: Foo...\n",
-"Name: chunked, dtype: string"
-]
-},
-"execution_count": 20,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
+"execution_count": null,
+"metadata": {},
+"outputs": [],
 "source": [
+"# Explode the chunks to see each chunk as a separate row\n",
 "chunked = df_pdf[\"chunked\"].explode()\n",
 "chunked"
 ]
@@ -1674,7 +1632,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.13.0"
+"version": "3.10.15"
 }
 },
 "nbformat": 4,
