|
61 | 61 | "3. Conduct image transformations\n", |
62 | 62 | "4. Use LLM models to ask questions and generate embeddings on images\n", |
63 | 63 | "5. PDF chunking function\n", |
64 | | - "6. Transcribe audio" |
| 64 | + "6. Transcribe audio\n", |
| 65 | + "7. Extract EXIF metadata from images" |
65 | 66 | ] |
66 | 67 | }, |
67 | 68 | { |
|
104 | 105 | "PROJECT = \"bigframes-dev\" # replace with your project. \n", |
105 | 106 | "# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#required_roles for your required permissions\n", |
106 | 107 | "\n", |
| 108 | + "LOCATION = \"us\" # replace with your location.\n", |
| 109 | + "\n", |
| 110 | + "# Dataset where the UDF will be created.\n", |
| 111 | + "DATASET_ID = \"bigframes_samples\" # replace with your dataset ID.\n", |
| 112 | + "\n", |
107 | 113 | "OUTPUT_BUCKET = \"bigframes_blob_test\" # replace with your GCS bucket. \n", |
108 | 114 | "# The connection (or bigframes-default-connection of the project) must have read/write permission to the bucket. \n", |
109 | 115 | "# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#grant-permissions for setting up connection service account permissions.\n", |
|
112 | 118 | "import bigframes\n", |
113 | 119 | "# Setup project\n", |
114 | 120 | "bigframes.options.bigquery.project = PROJECT\n", |
| 121 | + "bigframes.options.bigquery.location = LOCATION\n", |
115 | 122 | "\n", |
116 | 123 | "# Display options\n", |
117 | 124 | "bigframes.options.display.blob_display_width = 300\n", |
118 | 125 | "bigframes.options.display.progress_bar = None\n", |
119 | 126 | "\n", |
120 | | - "import bigframes.pandas as bpd" |
| 127 | + "import bigframes.pandas as bpd\n", |
| 128 | + "import bigframes.bigquery as bbq" |
121 | 129 | ] |
122 | 130 | }, |
123 | 131 | { |
|
1546 | 1554 | "transcribed_series_verbose = df['audio'].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=True)\n", |
1547 | 1555 | "transcribed_series_verbose" |
1548 | 1556 | ] |
| 1557 | + }, |
| 1558 | + { |
| 1559 | + "cell_type": "markdown", |
| 1560 | + "metadata": {}, |
| 1561 | + "source": [ |
| 1562 | + "### 7. Extract EXIF metadata from images" |
| 1563 | + ] |
| 1564 | + }, |
| 1565 | + { |
| 1566 | + "cell_type": "markdown", |
| 1567 | + "metadata": {}, |
| 1568 | + "source": [ |
| 1569 | + "This section demonstrates how to extract EXIF metadata from images using a custom BigQuery Python UDF and the `Pillow` library." |
| 1570 | + ] |
| 1571 | + }, |
| 1572 | + { |
| 1573 | + "cell_type": "code", |
| 1574 | + "execution_count": null, |
| 1575 | + "metadata": {}, |
| 1576 | + "outputs": [], |
| 1577 | + "source": [ |
| 1578 | + "# Construct the canonical connection ID\n", |
| 1579 | + "FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n", |
| 1580 | + "\n", |
| 1581 | + "@bpd.udf(\n", |
| 1582 | + " input_types=[str],\n", |
| 1583 | + " output_type=str,\n", |
| 1584 | + " dataset=DATASET_ID,\n", |
| 1585 | + " name=\"extract_exif\",\n", |
| 1586 | + " bigquery_connection=FULL_CONNECTION_ID,\n", |
| 1587 | + " packages=[\"pillow\", \"requests\"],\n", |
| 1588 | + " max_batching_rows=8192,\n", |
| 1589 | + " container_cpu=0.33,\n", |
| 1590 | + " container_memory=\"512Mi\"\n", |
| 1591 | + ")\n", |
| 1592 | + "def extract_exif(src_obj_ref_rt: str) -> str:\n", |
| 1593 | + " import io\n", |
| 1594 | + " import json\n", |
| 1595 | + " from PIL import ExifTags, Image\n", |
| 1596 | + " import requests\n", |
| 1597 | + " from requests import adapters\n", |
| 1598 | + " session = requests.Session()\n", |
| 1599 | + " session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n", |
| 1600 | + " src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n", |
| 1601 | + " src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n", |
| 1602 | + " response = session.get(src_url, timeout=30)\n", |
| 1603 | + " bts = response.content\n", |
| 1604 | + " image = Image.open(io.BytesIO(bts))\n", |
| 1605 | + " exif_data = image.getexif()\n", |
| 1606 | + " exif_dict = {}\n", |
| 1607 | + " if exif_data:\n", |
| 1608 | + " for tag, value in exif_data.items():\n", |
| 1609 | + " tag_name = ExifTags.TAGS.get(tag, tag)\n", |
| 1610 | + " exif_dict[tag_name] = value\n", |
| 1611 | + " return json.dumps(exif_dict)" |
| 1612 | + ] |
| 1613 | + }, |
| 1614 | + { |
| 1615 | + "cell_type": "code", |
| 1616 | + "execution_count": null, |
| 1617 | + "metadata": {}, |
| 1618 | + "outputs": [], |
| 1619 | + "source": [ |
| 1620 | + "# Create a Multimodal DataFrame from the sample image URIs\n", |
| 1621 | + "exif_image_df = bpd.from_glob_path(\n", |
| 1622 | + " \"gs://bigframes_blob_test/images_exif/*\",\n", |
| 1623 | + " name=\"blob_col\",\n", |
| 1624 | + ")\n", |
| 1625 | + "\n", |
| 1626 | + "# Generate a JSON string containing the runtime information (including signed read URLs)\n", |
| 1627 | + "# This allows the UDF to download the images from Google Cloud Storage\n", |
| 1628 | + "access_urls = exif_image_df[\"blob_col\"].blob.get_runtime_json_str(mode=\"R\")\n", |
| 1629 | + "\n", |
| 1630 | + "# Apply the BigQuery Python UDF to the runtime JSON strings\n", |
| 1631 | + "# We cast to string to ensure the input matches the UDF's signature\n", |
| 1632 | + "exif_json = access_urls.astype(str).apply(extract_exif)\n", |
| 1633 | + "\n", |
| 1634 | + "# Parse the resulting JSON strings back into a structured JSON type for easier access\n", |
| 1635 | + "exif_data = bbq.parse_json(exif_json)\n", |
| 1636 | + "\n", |
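| | + "# A minimal follow-up sketch: pull a single tag out of the JSON strings.\n", |
| | + "# The \"Make\" tag and bbq.json_extract are illustrative assumptions here,\n", |
| | + "# not part of the tutorial's sample data; uncomment to try it:\n", |
| | + "# camera_make = bbq.json_extract(exif_json, \"$.Make\")\n", |
| | + "\n", |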
| 1637 | + "exif_data" |
| 1638 | + ] |
1549 | 1639 | } |
1550 | 1640 | ], |
1551 | 1641 | "metadata": { |
|