
Commit 543ce52

docs: use direct API for pdf chunk and pdf extract (#2452)
This PR updates `notebooks/multimodal/multimodal_dataframe.ipynb` to demonstrate PDF text extraction and chunking using custom BigQuery Python UDFs with the `pypdf` library. Fixes #<478952827> 🦕
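For readers skimming the diff, here is a minimal local sketch of the idea the new cells implement: pull the text out of a PDF with `pypdf`, then split it into word-boundary chunks with a character overlap. The helper names `extract_text` and `chunk_text` are illustrative only, the chunking loop is a simplified variant rather than the exact loop in the notebook, and the BigQuery UDF registration (`@bpd.udf`), connection, and signed-URL download that the notebook adds are omitted.

```python
# Illustrative local sketch only; the notebook wraps equivalent logic in
# BigQuery Python UDFs and reads the PDF bytes from a signed URL instead.
import io

from pypdf import PdfReader


def extract_text(pdf_bytes: bytes) -> str:
    # Concatenate the text of every page; pages with no extractable text add "".
    reader = PdfReader(io.BytesIO(pdf_bytes), strict=False)
    return "".join(page.extract_text() or "" for page in reader.pages)


def chunk_text(text: str, chunk_size: int, overlap_size: int) -> list[str]:
    # Greedy splitter: cut at the last space before chunk_size when possible,
    # and start the next chunk overlap_size characters before the cut.
    chunks: list[str] = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        if end < len(text):
            split = text.rfind(" ", start, end)
            if split > start:
                end = split
        chunks.append(text[start:end])
        if end >= len(text):
            break
        start = max(end - overlap_size, start + 1)
    return chunks


if __name__ == "__main__":
    with open("sample.pdf", "rb") as f:  # any local PDF
        text = extract_text(f.read())
    for chunk in chunk_text(text, chunk_size=2000, overlap_size=200):
        print(len(chunk), chunk[:60].replace("\n", " "))
```

The notebook's registered `pdf_chunk` UDF takes the same `chunk_size`/`overlap_size` parameters, and the cell that applies it passes 2000 and 200.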
1 parent 3409acd commit 543ce52

File tree

1 file changed: +97 -139 lines changed


notebooks/multimodal/multimodal_dataframe.ipynb

Lines changed: 97 additions & 139 deletions
@@ -1317,161 +1317,119 @@
 "id": "iRUi8AjG7cIf"
 },
 "source": [
-"### 5. PDF chunking function"
+"### 5. PDF extraction and chunking function\n",
+"\n",
+"This section demonstrates how to extract text and chunk text from PDF files using custom BigQuery Python UDFs and the `pypdf` library."
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 3,
-"metadata": {
-"id": "oDDuYtUm5Yiy"
-},
+"execution_count": null,
+"metadata": {},
 "outputs": [],
 "source": [
-"df_pdf = bpd.from_glob_path(\"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*\", name=\"pdf\")"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 18,
-"metadata": {
-"colab": {
-"base_uri": "https://localhost:8080/"
-},
-"id": "7jLpMYaj7nj8",
-"outputId": "06d5456f-580f-4693-adff-2605104b056c"
-},
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
-"instead of using `db_dtypes` in the future when available in pandas\n",
-"(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
-" warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n",
-"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n",
-" return method(*args, **kwargs)\n",
-"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n",
-"future version. Use `json_value_array` instead.\n",
-" warnings.warn(bfe.format_message(msg), category=UserWarning)\n",
-"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n",
-"future version. Use `json_value_array` instead.\n",
-" warnings.warn(bfe.format_message(msg), category=UserWarning)\n"
-]
-}
-],
-"source": [
-"df_pdf[\"chunked\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\")"
+"# Construct the canonical connection ID\n",
+"FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n",
+"\n",
+"@bpd.udf(\n",
+"    input_types=[str],\n",
+"    output_type=str,\n",
+"    dataset=DATASET_ID,\n",
+"    name=\"pdf_extract\",\n",
+"    bigquery_connection=FULL_CONNECTION_ID,\n",
+"    packages=[\"pypdf\", \"requests\", \"cryptography\"],\n",
+")\n",
+"def pdf_extract(src_obj_ref_rt: str) -> str:\n",
+"    import io\n",
+"    import json\n",
+"    from pypdf import PdfReader\n",
+"    import requests\n",
+"    from requests import adapters\n",
+"    session = requests.Session()\n",
+"    session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n",
+"    src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n",
+"    src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n",
+"    response = session.get(src_url, timeout=30, stream=True)\n",
+"    response.raise_for_status()\n",
+"    pdf_bytes = response.content\n",
+"    pdf_file = io.BytesIO(pdf_bytes)\n",
+"    reader = PdfReader(pdf_file, strict=False)\n",
+"    all_text = \"\"\n",
+"    for page in reader.pages:\n",
+"        page_extract_text = page.extract_text()\n",
+"        if page_extract_text:\n",
+"            all_text += page_extract_text\n",
+"    return all_text\n",
+"\n",
+"@bpd.udf(\n",
+"    input_types=[str, int, int],\n",
+"    output_type=list[str],\n",
+"    dataset=DATASET_ID,\n",
+"    name=\"pdf_chunk\",\n",
+"    bigquery_connection=FULL_CONNECTION_ID,\n",
+"    packages=[\"pypdf\", \"requests\", \"cryptography\"],\n",
+")\n",
+"def pdf_chunk(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> list[str]:\n",
+"    import io\n",
+"    import json\n",
+"    from pypdf import PdfReader\n",
+"    import requests\n",
+"    from requests import adapters\n",
+"    session = requests.Session()\n",
+"    session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n",
+"    src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n",
+"    src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n",
+"    response = session.get(src_url, timeout=30, stream=True)\n",
+"    response.raise_for_status()\n",
+"    pdf_bytes = response.content\n",
+"    pdf_file = io.BytesIO(pdf_bytes)\n",
+"    reader = PdfReader(pdf_file, strict=False)\n",
+"    all_text_chunks = []\n",
+"    curr_chunk = \"\"\n",
+"    for page in reader.pages:\n",
+"        page_text = page.extract_text()\n",
+"        if page_text:\n",
+"            curr_chunk += page_text\n",
+"        while len(curr_chunk) >= chunk_size:\n",
+"            split_idx = curr_chunk.rfind(\" \", 0, chunk_size)\n",
+"            if split_idx == -1:\n",
+"                split_idx = chunk_size\n",
+"            actual_chunk = curr_chunk[:split_idx]\n",
+"            all_text_chunks.append(actual_chunk)\n",
+"            overlap = curr_chunk[split_idx + 1 : split_idx + 1 + overlap_size]\n",
+"            curr_chunk = overlap + curr_chunk[split_idx + 1 + overlap_size :]\n",
+"    if curr_chunk:\n",
+"        all_text_chunks.append(curr_chunk)\n",
+"    return all_text_chunks"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 19,
+"execution_count": null,
 "metadata": {},
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
-"instead of using `db_dtypes` in the future when available in pandas\n",
-"(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
-" warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n",
-"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n",
-" return method(*args, **kwargs)\n",
-"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n",
-"future version. Use `json_value_array` instead.\n",
-" warnings.warn(bfe.format_message(msg), category=UserWarning)\n"
-]
-},
-{
-"data": {
-"text/html": [
-"<div>\n",
-"<style scoped>\n",
-" .dataframe tbody tr th:only-of-type {\n",
-" vertical-align: middle;\n",
-" }\n",
-"\n",
-" .dataframe tbody tr th {\n",
-" vertical-align: top;\n",
-" }\n",
-"\n",
-" .dataframe thead th {\n",
-" text-align: right;\n",
-" }\n",
-"</style>\n",
-"<table border=\"1\" class=\"dataframe\">\n",
-" <thead>\n",
-" <tr style=\"text-align: right;\">\n",
-" <th></th>\n",
-" <th>chunked_verbose</th>\n",
-" </tr>\n",
-" </thead>\n",
-" <tbody>\n",
-" <tr>\n",
-" <th>0</th>\n",
-" <td>{'status': '', 'content': array([\"CritterCuisi...</td>\n",
-" </tr>\n",
-" </tbody>\n",
-"</table>\n",
-"<p>1 rows × 1 columns</p>\n",
-"</div>[1 rows x 1 columns in total]"
-],
-"text/plain": [
-" chunked_verbose\n",
-"0 {'status': '', 'content': array([\"CritterCuisi...\n",
-"\n",
-"[1 rows x 1 columns]"
-]
-},
-"execution_count": 19,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
+"outputs": [],
 "source": [
-"df_pdf[\"chunked_verbose\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\", verbose=True)\n",
-"df_pdf[[\"chunked_verbose\"]]"
+"df_pdf = bpd.from_glob_path(\"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*\", name=\"pdf\")\n",
+"\n",
+"# Generate a JSON string containing the runtime information (including signed read URLs)\n",
+"access_urls = get_runtime_json_str(df_pdf[\"pdf\"], mode=\"R\")\n",
+"\n",
+"# Apply PDF extraction\n",
+"df_pdf[\"extracted_text\"] = access_urls.apply(pdf_extract)\n",
+"\n",
+"# Apply PDF chunking\n",
+"df_pdf[\"chunked\"] = access_urls.apply(pdf_chunk, args=(2000, 200))\n",
+"\n",
+"df_pdf[[\"extracted_text\", \"chunked\"]]"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 20,
-"metadata": {
-"id": "kaPvJATN7zlw"
-},
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
-"instead of using `db_dtypes` in the future when available in pandas\n",
-"(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
-" warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n"
-]
-},
-{
-"data": {
-"text/plain": [
-"0 CritterCuisine Pro 5000 - Automatic Pet Feeder...\n",
-"0 on a level, stable surface to prevent tipping....\n",
-"0 included)\\nto maintain the schedule during pow...\n",
-"0 digits for Meal 1 will flash.\\n\u0000. Use the UP/D...\n",
-"0 paperclip) for 5\\nseconds. This will reset all...\n",
-"0 unit with a damp cloth. Do not immerse the bas...\n",
-"0 continues,\\ncontact customer support.\\nE2: Foo...\n",
-"Name: chunked, dtype: string"
-]
-},
-"execution_count": 20,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
+"execution_count": null,
+"metadata": {},
+"outputs": [],
 "source": [
+"# Explode the chunks to see each chunk as a separate row\n",
 "chunked = df_pdf[\"chunked\"].explode()\n",
 "chunked"
 ]
@@ -1674,7 +1632,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.13.0"
+"version": "3.10.15"
 }
 },
 "nbformat": 4,
