Skip to content

Commit 84c6f88

Browse files
shuoweil, GarrettWu, chelsea-lin, sycai
authored
docs: Add EXIF metadata extraction example to multimodal notebook (#2429)
This PR updates the notebooks/multimodal/multimodal_dataframe.ipynb notebook to include a comprehensive example of extracting EXIF metadata from images. Key Changes: * Added a new section "7. Extract EXIF metadata from images". * Implemented a custom remote function (UDF) using pillow and requests to retrieve and parse EXIF tags from image URLs. * Demonstrated how to apply this function efficiently within a BigFrames workflow to analyze image metadata. This addition provides users with a practical pattern for handling image metadata and using custom libraries within BigQuery DataFrames. Fixes #<478952827> 🦕 --------- Co-authored-by: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Co-authored-by: Chelsea Lin <chelsealin@google.com> Co-authored-by: Shenyang Cai <sycai@users.noreply.github.com>
1 parent 9f1ba1d commit 84c6f88

File tree

1 file changed

+92
-2
lines changed

1 file changed

+92
-2
lines changed

notebooks/multimodal/multimodal_dataframe.ipynb

Lines changed: 92 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -61,7 +61,8 @@
6161
"3. Conduct image transformations\n",
6262
"4. Use LLM models to ask questions and generate embeddings on images\n",
6363
"5. PDF chunking function\n",
64-
"6. Transcribe audio"
64+
"6. Transcribe audio\n",
65+
"7. Extract EXIF metadata from images"
6566
]
6667
},
6768
{
@@ -104,6 +105,11 @@
104105
"PROJECT = \"bigframes-dev\" # replace with your project. \n",
105106
"# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#required_roles for your required permissions\n",
106107
"\n",
108+
"LOCATION = \"us\" # replace with your location.\n",
109+
"\n",
110+
"# Dataset where the UDF will be created.\n",
111+
"DATASET_ID = \"bigframes_samples\" # replace with your dataset ID.\n",
112+
"\n",
107113
"OUTPUT_BUCKET = \"bigframes_blob_test\" # replace with your GCS bucket. \n",
108114
"# The connection (or bigframes-default-connection of the project) must have read/write permission to the bucket. \n",
109115
"# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#grant-permissions for setting up connection service account permissions.\n",
@@ -112,12 +118,14 @@
112118
"import bigframes\n",
113119
"# Setup project\n",
114120
"bigframes.options.bigquery.project = PROJECT\n",
121+
"bigframes.options.bigquery.location = LOCATION\n",
115122
"\n",
116123
"# Display options\n",
117124
"bigframes.options.display.blob_display_width = 300\n",
118125
"bigframes.options.display.progress_bar = None\n",
119126
"\n",
120-
"import bigframes.pandas as bpd"
127+
"import bigframes.pandas as bpd\n",
128+
"import bigframes.bigquery as bbq"
121129
]
122130
},
123131
{
@@ -1546,6 +1554,88 @@
15461554
"transcribed_series_verbose = df['audio'].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=True)\n",
15471555
"transcribed_series_verbose"
15481556
]
1557+
},
1558+
{
1559+
"cell_type": "markdown",
1560+
"metadata": {},
1561+
"source": [
1562+
"### 7. Extract EXIF metadata from images"
1563+
]
1564+
},
1565+
{
1566+
"cell_type": "markdown",
1567+
"metadata": {},
1568+
"source": [
1569+
"This section demonstrates how to extract EXIF metadata from images using a custom BigQuery Python UDF and the `Pillow` library."
1570+
]
1571+
},
1572+
{
1573+
"cell_type": "code",
1574+
"execution_count": null,
1575+
"metadata": {},
1576+
"outputs": [],
1577+
"source": [
1578+
"# Construct the canonical connection ID\n",
1579+
"FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n",
1580+
"\n",
1581+
"@bpd.udf(\n",
1582+
" input_types=[str],\n",
1583+
" output_type=str,\n",
1584+
" dataset=DATASET_ID,\n",
1585+
" name=\"extract_exif\",\n",
1586+
" bigquery_connection=FULL_CONNECTION_ID,\n",
1587+
" packages=[\"pillow\", \"requests\"],\n",
1588+
" max_batching_rows=8192,\n",
1589+
" container_cpu=0.33,\n",
1590+
" container_memory=\"512Mi\"\n",
1591+
")\n",
1592+
"def extract_exif(src_obj_ref_rt: str) -> str:\n",
1593+
" import io\n",
1594+
" import json\n",
1595+
" from PIL import ExifTags, Image\n",
1596+
" import requests\n",
1597+
" from requests import adapters\n",
1598+
" session = requests.Session()\n",
1599+
" session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n",
1600+
" src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n",
1601+
" src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n",
1602+
" response = session.get(src_url, timeout=30)\n",
1603+
" bts = response.content\n",
1604+
" image = Image.open(io.BytesIO(bts))\n",
1605+
" exif_data = image.getexif()\n",
1606+
" exif_dict = {}\n",
1607+
" if exif_data:\n",
1608+
" for tag, value in exif_data.items():\n",
1609+
" tag_name = ExifTags.TAGS.get(tag, tag)\n",
1610+
" exif_dict[tag_name] = value\n",
1611+
" return json.dumps(exif_dict)"
1612+
]
1613+
},
1614+
{
1615+
"cell_type": "code",
1616+
"execution_count": null,
1617+
"metadata": {},
1618+
"outputs": [],
1619+
"source": [
1620+
"# Create a Multimodal DataFrame from the sample image URIs\n",
1621+
"exif_image_df = bpd.from_glob_path(\n",
1622+
" \"gs://bigframes_blob_test/images_exif/*\",\n",
1623+
" name=\"blob_col\",\n",
1624+
")\n",
1625+
"\n",
1626+
"# Generate a JSON string containing the runtime information (including signed read URLs)\n",
1627+
"# This allows the UDF to download the images from Google Cloud Storage\n",
1628+
"access_urls = exif_image_df[\"blob_col\"].blob.get_runtime_json_str(mode=\"R\")\n",
1629+
"\n",
1630+
"# Apply the BigQuery Python UDF to the runtime JSON strings\n",
1631+
"# We cast to string to ensure the input matches the UDF's signature\n",
1632+
"exif_json = access_urls.astype(str).apply(extract_exif)\n",
1633+
"\n",
1634+
"# Parse the resulting JSON strings back into a structured JSON type for easier access\n",
1635+
"exif_data = bbq.parse_json(exif_json)\n",
1636+
"\n",
1637+
"exif_data"
1638+
]
15491639
}
15501640
],
15511641
"metadata": {

0 commit comments

Comments (0)