Skip to content

Commit 84c6f88

Browse files
shuoweil, GarrettWu, chelsea-lin, sycai
authored
docs: Add EXIF metadata extraction example to multimodal notebook (#2429)
This PR updates the notebooks/multimodal/multimodal_dataframe.ipynb notebook to include a comprehensive example of extracting EXIF metadata from images. Key Changes: * Added a new section "7. Extract EXIF metadata from images". * Implemented a custom remote function (UDF) using pillow and requests to retrieve and parse EXIF tags from image URLs. * Demonstrated how to apply this function efficiently within a BigFrames workflow to analyze image metadata. This addition provides users with a practical pattern for handling image metadata and using custom libraries within BigQuery DataFrames. Fixes #<478952827> 🦕 --------- Co-authored-by: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Co-authored-by: Chelsea Lin <chelsealin@google.com> Co-authored-by: Shenyang Cai <sycai@users.noreply.github.com>
1 parent 9f1ba1d commit 84c6f88

File tree

1 file changed

+92
-2
lines changed

1 file changed

+92
-2
lines changed

notebooks/multimodal/multimodal_dataframe.ipynb

Lines changed: 92 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -61,7 +61,8 @@
6161
"3. Conduct image transformations\n",
6262
"4. Use LLM models to ask questions and generate embeddings on images\n",
6363
"5. PDF chunking function\n",
64-
"6. Transcribe audio"
64+
"6. Transcribe audio\n",
65+
"7. Extract EXIF metadata from images"
6566
]
6667
},
6768
{
@@ -104,6 +105,11 @@
104105
"PROJECT = \"bigframes-dev\" # replace with your project. \n",
105106
"# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#required_roles for your required permissions\n",
106107
"\n",
108+
"LOCATION = \"us\" # replace with your location.\n",
109+
"\n",
110+
"# Dataset where the UDF will be created.\n",
111+
"DATASET_ID = \"bigframes_samples\" # replace with your dataset ID.\n",
112+
"\n",
107113
"OUTPUT_BUCKET = \"bigframes_blob_test\" # replace with your GCS bucket. \n",
108114
"# The connection (or bigframes-default-connection of the project) must have read/write permission to the bucket. \n",
109115
"# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#grant-permissions for setting up connection service account permissions.\n",
@@ -112,12 +118,14 @@
112118
"import bigframes\n",
113119
"# Setup project\n",
114120
"bigframes.options.bigquery.project = PROJECT\n",
121+
"bigframes.options.bigquery.location = LOCATION\n",
115122
"\n",
116123
"# Display options\n",
117124
"bigframes.options.display.blob_display_width = 300\n",
118125
"bigframes.options.display.progress_bar = None\n",
119126
"\n",
120-
"import bigframes.pandas as bpd"
127+
"import bigframes.pandas as bpd\n",
128+
"import bigframes.bigquery as bbq"
121129
]
122130
},
123131
{
@@ -1546,6 +1554,88 @@
15461554
"transcribed_series_verbose = df['audio'].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=True)\n",
15471555
"transcribed_series_verbose"
15481556
]
1557+
},
1558+
{
1559+
"cell_type": "markdown",
1560+
"metadata": {},
1561+
"source": [
1562+
"### 7. Extract EXIF metadata from images"
1563+
]
1564+
},
1565+
{
1566+
"cell_type": "markdown",
1567+
"metadata": {},
1568+
"source": [
1569+
"This section demonstrates how to extract EXIF metadata from images using a custom BigQuery Python UDF and the `Pillow` library."
1570+
]
1571+
},
1572+
{
1573+
"cell_type": "code",
1574+
"execution_count": null,
1575+
"metadata": {},
1576+
"outputs": [],
1577+
"source": [
1578+
"# Construct the canonical connection ID\n",
1579+
"FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n",
1580+
"\n",
1581+
"@bpd.udf(\n",
1582+
" input_types=[str],\n",
1583+
" output_type=str,\n",
1584+
" dataset=DATASET_ID,\n",
1585+
" name=\"extract_exif\",\n",
1586+
" bigquery_connection=FULL_CONNECTION_ID,\n",
1587+
" packages=[\"pillow\", \"requests\"],\n",
1588+
" max_batching_rows=8192,\n",
1589+
" container_cpu=0.33,\n",
1590+
" container_memory=\"512Mi\"\n",
1591+
")\n",
1592+
"def extract_exif(src_obj_ref_rt: str) -> str:\n",
1593+
" import io\n",
1594+
" import json\n",
1595+
" from PIL import ExifTags, Image\n",
1596+
" import requests\n",
1597+
" from requests import adapters\n",
1598+
" session = requests.Session()\n",
1599+
" session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n",
1600+
" src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n",
1601+
" src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n",
1602+
" response = session.get(src_url, timeout=30)\n",
1603+
" bts = response.content\n",
1604+
" image = Image.open(io.BytesIO(bts))\n",
1605+
" exif_data = image.getexif()\n",
1606+
" exif_dict = {}\n",
1607+
" if exif_data:\n",
1608+
" for tag, value in exif_data.items():\n",
1609+
" tag_name = ExifTags.TAGS.get(tag, tag)\n",
1610+
" exif_dict[tag_name] = value\n",
1611+
" return json.dumps(exif_dict)"
1612+
]
1613+
},
1614+
{
1615+
"cell_type": "code",
1616+
"execution_count": null,
1617+
"metadata": {},
1618+
"outputs": [],
1619+
"source": [
1620+
"# Create a Multimodal DataFrame from the sample image URIs\n",
1621+
"exif_image_df = bpd.from_glob_path(\n",
1622+
" \"gs://bigframes_blob_test/images_exif/*\",\n",
1623+
" name=\"blob_col\",\n",
1624+
")\n",
1625+
"\n",
1626+
"# Generate a JSON string containing the runtime information (including signed read URLs)\n",
1627+
"# This allows the UDF to download the images from Google Cloud Storage\n",
1628+
"access_urls = exif_image_df[\"blob_col\"].blob.get_runtime_json_str(mode=\"R\")\n",
1629+
"\n",
1630+
"# Apply the BigQuery Python UDF to the runtime JSON strings\n",
1631+
"# We cast to string to ensure the input matches the UDF's signature\n",
1632+
"exif_json = access_urls.astype(str).apply(extract_exif)\n",
1633+
"\n",
1634+
"# Parse the resulting JSON strings back into a structured JSON type for easier access\n",
1635+
"exif_data = bbq.parse_json(exif_json)\n",
1636+
"\n",
1637+
"exif_data"
1638+
]
15491639
}
15501640
],
15511641
"metadata": {

0 commit comments

Comments (0)