diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py index 2a11974b8d..3dfe38811b 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -14,6 +14,7 @@ from dataclasses import dataclass import inspect +import typing from typing import Callable, Iterable, Union import google.cloud.bigquery as bigquery @@ -70,6 +71,12 @@ def _input_bq_signature(self): def _output_bq_type(self): sig = inspect.signature(self._func) + return_annotation = sig.return_annotation + origin = typing.get_origin(return_annotation) + if origin is Union: + args = typing.get_args(return_annotation) + if len(args) == 2 and args[1] is type(None): + return _PYTHON_TO_BQ_TYPES[args[0]] return _PYTHON_TO_BQ_TYPES[sig.return_annotation] def _create_udf(self): @@ -78,7 +85,7 @@ def _create_udf(self): self._session._anon_dataset_manager.generate_unique_resource_id() ) - func_body = inspect.getsource(self._func) + func_body = "import typing\n" + inspect.getsource(self._func) func_name = self._func.__name__ packages = str(list(self._requirements)) @@ -120,43 +127,50 @@ def udf(self): def exif_func(src_obj_ref_rt: str, verbose: bool) -> str: - import io - import json + try: + import io + import json - from PIL import ExifTags, Image - import requests - from requests import adapters + from PIL import ExifTags, Image + import requests + from requests import adapters - result_dict = {"status": "", "content": "{}"} - try: session = requests.Session() session.mount("https://", adapters.HTTPAdapter(max_retries=3)) src_obj_ref_rt_json = json.loads(src_obj_ref_rt) - src_url = src_obj_ref_rt_json["access_urls"]["read_url"] response = session.get(src_url, timeout=30) + response.raise_for_status() bts = response.content image = Image.open(io.BytesIO(bts)) exif_data = image.getexif() exif_dict = {} + if exif_data: for tag, value in exif_data.items(): tag_name = ExifTags.TAGS.get(tag, tag) - # Pillow might return bytes, which are not serializable. - if isinstance(value, bytes): - value = value.decode("utf-8", "replace") - exif_dict[tag_name] = value - result_dict["content"] = json.dumps(exif_dict) - except Exception as e: - result_dict["status"] = str(e) + # Convert non-serializable types to strings + try: + json.dumps(value) + exif_dict[tag_name] = value + except (TypeError, ValueError): + exif_dict[tag_name] = str(value) + + if verbose: + return json.dumps({"status": "", "content": json.dumps(exif_dict)}) + else: + return json.dumps(exif_dict) - if verbose: - return json.dumps(result_dict) - else: - return result_dict["content"] + except Exception as e: + # Return error as JSON with error field + error_result = {"status": f"{type(e).__name__}: {str(e)}", "content": "{}"} + if verbose: + return json.dumps(error_result) + else: + return "{}" exif_func_def = FunctionDef(exif_func, ["pillow", "requests"]) @@ -170,12 +184,10 @@ def image_blur_func( ksize_y: int, ext: str, verbose: bool, -) -> str: - import json - - result_dict = {"status": "", "content": dst_obj_ref_rt} - +) -> typing.Optional[str]: try: + import json + import cv2 as cv # type: ignore import numpy as np import requests @@ -193,35 +205,52 @@ def image_blur_func( dst_url = dst_obj_ref_rt_json["access_urls"]["write_url"] response = session.get(src_url, timeout=30) + response.raise_for_status() # Raise exception for HTTP errors bts = response.content nparr = np.frombuffer(bts, np.uint8) img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) + + if img is None: + raise ValueError( + "Failed to decode image - possibly corrupted or unsupported format" + ) + img_blurred = cv.blur(img, ksize=(ksize_x, ksize_y)) - bts = cv.imencode(ext, img_blurred)[1].tobytes() + success, encoded = cv.imencode(ext, img_blurred) + if not success: + raise ValueError(f"Failed to encode image with extension {ext}") + + bts = encoded.tobytes() ext = ext.replace(".", "") ext_mappings = {"jpg": "jpeg", "tif": "tiff"} ext = ext_mappings.get(ext, ext) content_type = "image/" + ext - session.put( + put_response = session.put( url=dst_url, data=bts, - headers={ - "Content-Type": content_type, - }, + headers={"Content-Type": content_type}, timeout=30, ) + put_response.raise_for_status() - except Exception as e: - result_dict["status"] = str(e) + if verbose: + return json.dumps({"status": "", "content": dst_obj_ref_rt}) + else: + return dst_obj_ref_rt - if verbose: - return json.dumps(result_dict) - else: - return result_dict["content"] + except Exception as e: + if verbose: + error_result = { + "status": f"Error: {type(e).__name__}: {str(e)}", + "content": "", + } + return json.dumps(error_result) + else: + return None image_blur_def = FunctionDef(image_blur_func, ["opencv-python", "numpy", "requests"]) @@ -233,9 +262,6 @@ def image_blur_to_bytes_func( import base64 import json - status = "" - content = b"" - try: import cv2 as cv # type: ignore import numpy as np @@ -251,22 +277,36 @@ def image_blur_to_bytes_func( src_url = src_obj_ref_rt_json["access_urls"]["read_url"] response = session.get(src_url, timeout=30) + response.raise_for_status() bts = response.content nparr = np.frombuffer(bts, np.uint8) img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) + if img is None: + raise ValueError( + "Failed to decode image - possibly corrupted or unsupported format" + ) img_blurred = cv.blur(img, ksize=(ksize_x, ksize_y)) - content = cv.imencode(ext, img_blurred)[1].tobytes() + success, encoded = cv.imencode(ext, img_blurred) + if not success: + raise ValueError(f"Failed to encode image with extension {ext}") + content = encoded.tobytes() + + encoded_content = base64.b64encode(content).decode("utf-8") + result_dict = {"status": "", "content": encoded_content} + if verbose: + return json.dumps(result_dict) + else: + return result_dict["content"] except Exception as e: - status = str(e) - - encoded_content = base64.b64encode(content).decode("utf-8") - result_dict = {"status": status, "content": encoded_content} - if verbose: - return json.dumps(result_dict) - else: - return result_dict["content"] + status = f"Error: {type(e).__name__}: {str(e)}" + encoded_content = base64.b64encode(b"").decode("utf-8") + result_dict = {"status": status, "content": encoded_content} + if verbose: + return json.dumps(result_dict) + else: + return result_dict["content"] image_blur_to_bytes_def = FunctionDef( @@ -283,12 +323,10 @@ def image_resize_func( fy: float, ext: str, verbose: bool, -) -> str: - import json - - result_dict = {"status": "", "content": dst_obj_ref_rt} - +) -> typing.Optional[str]: try: + import json + import cv2 as cv # type: ignore import numpy as np import requests @@ -306,20 +344,28 @@ def image_resize_func( dst_url = dst_obj_ref_rt_json["access_urls"]["write_url"] response = session.get(src_url, timeout=30) + response.raise_for_status() bts = response.content nparr = np.frombuffer(bts, np.uint8) img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) + if img is None: + raise ValueError( + "Failed to decode image - possibly corrupted or unsupported format" + ) img_resized = cv.resize(img, dsize=(dsize_x, dsize_y), fx=fx, fy=fy) - bts = cv.imencode(ext, img_resized)[1].tobytes() + success, encoded = cv.imencode(ext, img_resized) + if not success: + raise ValueError(f"Failed to encode image with extension {ext}") + bts = encoded.tobytes() ext = ext.replace(".", "") ext_mappings = {"jpg": "jpeg", "tif": "tiff"} ext = ext_mappings.get(ext, ext) content_type = "image/" + ext - session.put( + put_response = session.put( url=dst_url, data=bts, headers={ @@ -327,14 +373,22 @@ def image_resize_func( }, timeout=30, ) + put_response.raise_for_status() - except Exception as e: - result_dict["status"] = str(e) + if verbose: + return json.dumps({"status": "", "content": dst_obj_ref_rt}) + else: + return dst_obj_ref_rt - if verbose: - return json.dumps(result_dict) - else: - return result_dict["content"] + except Exception as e: + if verbose: + error_result = { + "status": f"Error: {type(e).__name__}: {str(e)}", + "content": "", + } + return json.dumps(error_result) + else: + return None image_resize_def = FunctionDef( @@ -354,9 +408,6 @@ def image_resize_to_bytes_func( import base64 import json - status = "" - content = b"" - try: import cv2 as cv # type: ignore import numpy as np @@ -372,22 +423,36 @@ def image_resize_to_bytes_func( src_url = src_obj_ref_rt_json["access_urls"]["read_url"] response = session.get(src_url, timeout=30) + response.raise_for_status() bts = response.content nparr = np.frombuffer(bts, np.uint8) img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) + if img is None: + raise ValueError( + "Failed to decode image - possibly corrupted or unsupported format" + ) img_resized = cv.resize(img, dsize=(dsize_x, dsize_y), fx=fx, fy=fy) - content = cv.imencode(".jpeg", img_resized)[1].tobytes() + success, encoded = cv.imencode(ext, img_resized) + if not success: + raise ValueError(f"Failed to encode image with extension {ext}") + content = encoded.tobytes() + + encoded_content = base64.b64encode(content).decode("utf-8") + result_dict = {"status": "", "content": encoded_content} + if verbose: + return json.dumps(result_dict) + else: + return result_dict["content"] except Exception as e: - status = str(e) - - encoded_content = base64.b64encode(content).decode("utf-8") - result_dict = {"status": status, "content": encoded_content} - if verbose: - return json.dumps(result_dict) - else: - return result_dict["content"] + status = f"Error: {type(e).__name__}: {str(e)}" + encoded_content = base64.b64encode(b"").decode("utf-8") + result_dict = {"status": status, "content": encoded_content} + if verbose: + return json.dumps(result_dict) + else: + return result_dict["content"] image_resize_to_bytes_def = FunctionDef( @@ -403,12 +468,10 @@ def image_normalize_func( norm_type: str, ext: str, verbose: bool, -) -> str: - import json - - result_dict = {"status": "", "content": dst_obj_ref_rt} - +) -> typing.Optional[str]: try: + import json + import cv2 as cv # type: ignore import numpy as np import requests @@ -433,22 +496,30 @@ def image_normalize_func( dst_url = dst_obj_ref_rt_json["access_urls"]["write_url"] response = session.get(src_url, timeout=30) + response.raise_for_status() bts = response.content nparr = np.frombuffer(bts, np.uint8) img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) + if img is None: + raise ValueError( + "Failed to decode image - possibly corrupted or unsupported format" + ) img_normalized = cv.normalize( img, None, alpha=alpha, beta=beta, norm_type=norm_type_mapping[norm_type] ) - bts = cv.imencode(ext, img_normalized)[1].tobytes() + success, encoded = cv.imencode(ext, img_normalized) + if not success: + raise ValueError(f"Failed to encode image with extension {ext}") + bts = encoded.tobytes() ext = ext.replace(".", "") ext_mappings = {"jpg": "jpeg", "tif": "tiff"} ext = ext_mappings.get(ext, ext) content_type = "image/" + ext - session.put( + put_response = session.put( url=dst_url, data=bts, headers={ @@ -456,14 +527,22 @@ def image_normalize_func( }, timeout=30, ) + put_response.raise_for_status() - except Exception as e: - result_dict["status"] = str(e) + if verbose: + return json.dumps({"status": "", "content": dst_obj_ref_rt}) + else: + return dst_obj_ref_rt - if verbose: - return json.dumps(result_dict) - else: - return result_dict["content"] + except Exception as e: + if verbose: + error_result = { + "status": f"Error: {type(e).__name__}: {str(e)}", + "content": "", + } + return json.dumps(error_result) + else: + return None image_normalize_def = FunctionDef( @@ -482,8 +561,6 @@ def image_normalize_to_bytes_func( import base64 import json - result_dict = {"status": "", "content": ""} - try: import cv2 as cv # type: ignore import numpy as np @@ -506,25 +583,39 @@ def image_normalize_to_bytes_func( src_url = src_obj_ref_rt_json["access_urls"]["read_url"] response = session.get(src_url, timeout=30) + response.raise_for_status() bts = response.content nparr = np.frombuffer(bts, np.uint8) img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) + if img is None: + raise ValueError( + "Failed to decode image - possibly corrupted or unsupported format" + ) img_normalized = cv.normalize( img, None, alpha=alpha, beta=beta, norm_type=norm_type_mapping[norm_type] ) - bts = cv.imencode(".jpeg", img_normalized)[1].tobytes() + success, encoded = cv.imencode(ext, img_normalized) + if not success: + raise ValueError(f"Failed to encode image with extension {ext}") + content = encoded.tobytes() - content_b64 = base64.b64encode(bts).decode("utf-8") - result_dict["content"] = content_b64 + encoded_content = base64.b64encode(content).decode("utf-8") + result_dict = {"status": "", "content": encoded_content} - except Exception as e: - result_dict["status"] = str(e) + if verbose: + return json.dumps(result_dict) + else: + return result_dict["content"] - if verbose: - return json.dumps(result_dict) - else: - return result_dict["content"] + except Exception as e: + status = f"Error: {type(e).__name__}: {str(e)}" + encoded_content = base64.b64encode(b"").decode("utf-8") + result_dict = {"status": status, "content": encoded_content} + if verbose: + return json.dumps(result_dict) + else: + return result_dict["content"] image_normalize_to_bytes_def = FunctionDef( diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 1f6b75a8f5..577de458f4 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -193,6 +193,20 @@ def _df_apply_udf( return s + def _apply_udf_or_raise_error( + self, df: bigframes.dataframe.DataFrame, udf, operation_name: str + ) -> bigframes.series.Series: + """Helper to apply UDF with consistent error handling.""" + try: + res = self._df_apply_udf(df, udf) + except Exception as e: + raise RuntimeError(f"{operation_name} UDF execution failed: {e}") from e + + if res is None: + raise RuntimeError(f"{operation_name} returned None result") + + return res + def read_url(self) -> bigframes.series.Series: """Retrieve the read URL of the Blob. @@ -343,6 +357,10 @@ def exif( Returns: bigframes.series.Series: JSON series of key-value pairs if verbose=False, or struct with status and content if verbose=True. + + Raises: + ValueError: If engine is not 'pillow'. + RuntimeError: If EXIF extraction fails or returns invalid structure. """ if engine is None or engine.casefold() != "pillow": raise ValueError("Must specify the engine, supported value is 'pillow'.") @@ -364,22 +382,28 @@ def exif( container_memory=container_memory, ).udf() - res = self._df_apply_udf(df, exif_udf) + res = self._apply_udf_or_raise_error(df, exif_udf, "EXIF extraction") if verbose: - exif_content_series = bbq.parse_json( - res._apply_unary_op(ops.JSONValue(json_path="$.content")) - ).rename("exif_content") - exif_status_series = res._apply_unary_op( - ops.JSONValue(json_path="$.status") - ) + try: + exif_content_series = bbq.parse_json( + res._apply_unary_op(ops.JSONValue(json_path="$.content")) + ).rename("exif_content") + exif_status_series = res._apply_unary_op( + ops.JSONValue(json_path="$.status") + ) + except Exception as e: + raise RuntimeError(f"Failed to parse EXIF JSON result: {e}") from e results_df = bpd.DataFrame( {"status": exif_status_series, "content": exif_content_series} ) results_struct = bbq.struct(results_df).rename("exif_results") return results_struct else: - return bbq.parse_json(res) + try: + return bbq.parse_json(res) + except Exception as e: + raise RuntimeError(f"Failed to parse EXIF JSON result: {e}") from e def image_blur( self, @@ -411,6 +435,10 @@ def image_blur( Returns: bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ. If verbose=True, returns struct with status and content. + + Raises: + ValueError: If engine is not 'opencv' or parameters are invalid. + RuntimeError: If image blur operation fails. """ if engine is None or engine.casefold() != "opencv": raise ValueError("Must specify the engine, supported value is 'opencv'.") @@ -437,7 +465,7 @@ def image_blur( df["ksize_x"], df["ksize_y"] = ksize df["ext"] = ext # type: ignore df["verbose"] = verbose - res = self._df_apply_udf(df, image_blur_udf) + res = self._apply_udf_or_raise_error(df, image_blur_udf, "Image blur") if verbose: blurred_content_b64_series = res._apply_unary_op( @@ -486,7 +514,7 @@ def image_blur( df["ext"] = ext # type: ignore df["verbose"] = verbose - res = self._df_apply_udf(df, image_blur_udf) + res = self._apply_udf_or_raise_error(df, image_blur_udf, "Image blur") res.cache() # to execute the udf if verbose: @@ -540,6 +568,10 @@ def image_resize( Returns: bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ. If verbose=True, returns struct with status and content. + + Raises: + ValueError: If engine is not 'opencv' or parameters are invalid. + RuntimeError: If image resize operation fails. """ if engine is None or engine.casefold() != "opencv": raise ValueError("Must specify the engine, supported value is 'opencv'.") @@ -570,11 +602,11 @@ def image_resize( container_memory=container_memory, ).udf() - df["dsize_x"], df["dsizye_y"] = dsize + df["dsize_x"], df["dsize_y"] = dsize df["fx"], df["fy"] = fx, fy df["ext"] = ext # type: ignore df["verbose"] = verbose - res = self._df_apply_udf(df, image_resize_udf) + res = self._apply_udf_or_raise_error(df, image_resize_udf, "Image resize") if verbose: resized_content_b64_series = res._apply_unary_op( @@ -620,12 +652,12 @@ def image_resize( dst_rt = dst.blob.get_runtime_json_str(mode="RW") df = df.join(dst_rt, how="outer") - df["dsize_x"], df["dsizye_y"] = dsize + df["dsize_x"], df["dsize_y"] = dsize df["fx"], df["fy"] = fx, fy df["ext"] = ext # type: ignore df["verbose"] = verbose - res = self._df_apply_udf(df, image_resize_udf) + res = self._apply_udf_or_raise_error(df, image_resize_udf, "Image resize") res.cache() # to execute the udf if verbose: @@ -679,6 +711,10 @@ def image_normalize( Returns: bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ. If verbose=True, returns struct with status and content. + + Raises: + ValueError: If engine is not 'opencv' or parameters are invalid. + RuntimeError: If image normalize operation fails. """ if engine is None or engine.casefold() != "opencv": raise ValueError("Must specify the engine, supported value is 'opencv'.") @@ -707,7 +743,9 @@ def image_normalize( df["norm_type"] = norm_type df["ext"] = ext # type: ignore df["verbose"] = verbose - res = self._df_apply_udf(df, image_normalize_udf) + res = self._apply_udf_or_raise_error( + df, image_normalize_udf, "Image normalize" + ) if verbose: normalized_content_b64_series = res._apply_unary_op( @@ -758,7 +796,7 @@ def image_normalize( df["ext"] = ext # type: ignore df["verbose"] = verbose - res = self._df_apply_udf(df, image_normalize_udf) + res = self._apply_udf_or_raise_error(df, image_normalize_udf, "Image normalize") res.cache() # to execute the udf if verbose: @@ -809,6 +847,10 @@ def pdf_extract( depend on the "verbose" parameter. Contains the extracted text from the PDF file. Includes error messages if verbosity is enabled. + + Raises: + ValueError: If engine is not 'pypdf'. + RuntimeError: If PDF extraction fails or returns invalid structure. """ if engine is None or engine.casefold() != "pypdf": raise ValueError("Must specify the engine, supported value is 'pypdf'.") @@ -830,18 +872,29 @@ def pdf_extract( df = self.get_runtime_json_str(mode="R").to_frame() df["verbose"] = verbose - res = self._df_apply_udf(df, pdf_extract_udf) + + res = self._apply_udf_or_raise_error(df, pdf_extract_udf, "PDF extraction") if verbose: - extracted_content_series = res._apply_unary_op( - ops.JSONValue(json_path="$.content") - ) - status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status")) - results_df = bpd.DataFrame( - {"status": status_series, "content": extracted_content_series} - ) - results_struct = bbq.struct(results_df).rename("extracted_results") - return results_struct + # Extract content with error handling + try: + content_series = res._apply_unary_op( + ops.JSONValue(json_path="$.content") + ) + except Exception as e: + raise RuntimeError( + f"Failed to extract content field from PDF result: {e}" + ) from e + try: + status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status")) + except Exception as e: + raise RuntimeError( + f"Failed to extract status field from PDF result: {e}" + ) from e + + res_df = bpd.DataFrame({"status": status_series, "content": content_series}) + struct_series = bbq.struct(res_df).rename("extracted_results") + return struct_series else: return res.rename("extracted_content") @@ -884,6 +937,10 @@ def pdf_chunk( depend on the "verbose" parameter. where each string is a chunk of text extracted from PDF. Includes error messages if verbosity is enabled. + + Raises: + ValueError: If engine is not 'pypdf'. + RuntimeError: If PDF chunking fails or returns invalid structure. """ if engine is None or engine.casefold() != "pypdf": raise ValueError("Must specify the engine, supported value is 'pypdf'.") @@ -915,13 +972,25 @@ def pdf_chunk( df["overlap_size"] = overlap_size df["verbose"] = verbose - res = self._df_apply_udf(df, pdf_chunk_udf) + res = self._apply_udf_or_raise_error(df, pdf_chunk_udf, "PDF chunking") + + try: + content_series = bbq.json_extract_string_array(res, "$.content") + except Exception as e: + raise RuntimeError( + f"Failed to extract content array from PDF chunk result: {e}" + ) from e if verbose: - chunked_content_series = bbq.json_extract_string_array(res, "$.content") - status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status")) + try: + status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status")) + except Exception as e: + raise RuntimeError( + f"Failed to extract status field from PDF chunk result: {e}" + ) from e + results_df = bpd.DataFrame( - {"status": status_series, "content": chunked_content_series} + {"status": status_series, "content": content_series} ) resultes_struct = bbq.struct(results_df).rename("chunked_results") return resultes_struct @@ -962,6 +1031,10 @@ def audio_transcribe( depend on the "verbose" parameter. Contains the transcribed text from the audio file. Includes error messages if verbosity is enabled. + + Raises: + ValueError: If engine is not 'bigquery'. + RuntimeError: If the transcription result structure is invalid. """ if engine.casefold() != "bigquery": raise ValueError("Must specify the engine, supported value is 'bigquery'.") @@ -984,6 +1057,10 @@ def audio_transcribe( model_params={"generationConfig": {"temperature": 0.0}}, ) + # Validate that the result is not None + if transcribed_results is None: + raise RuntimeError("Transcription returned None result") + transcribed_content_series = transcribed_results.struct.field("result").rename( "transcribed_content" ) diff --git a/notebooks/multimodal/multimodal_dataframe.ipynb b/notebooks/multimodal/multimodal_dataframe.ipynb index c04463fc4c..0822ee4c2d 100644 --- a/notebooks/multimodal/multimodal_dataframe.ipynb +++ b/notebooks/multimodal/multimodal_dataframe.ipynb @@ -60,7 +60,8 @@ "2. Combine unstructured data with structured data\n", "3. Conduct image transformations\n", "4. Use LLM models to ask questions and generate embeddings on images\n", - "5. PDF chunking function" + "5. PDF chunking function\n", + "6. Transcribe audio" ] }, { @@ -215,23 +216,23 @@ " \n", " \n", " 0\n", - " \n", + " \n", " \n", " \n", " 1\n", - " \n", + " \n", " \n", " \n", " 2\n", - " \n", + " \n", " \n", " \n", " 3\n", - " \n", + " \n", " \n", " \n", " 4\n", - " \n", + " \n", " \n", " \n", "\n", @@ -297,21 +298,21 @@ "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:124: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:121: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", "version. Use `json_query` instead.\n", " warnings.warn(bfe.format_message(msg), category=UserWarning)\n", "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:124: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:121: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", "version. Use `json_query` instead.\n", " warnings.warn(bfe.format_message(msg), category=UserWarning)\n", "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:124: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:121: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", "version. Use `json_query` instead.\n", " warnings.warn(bfe.format_message(msg), category=UserWarning)\n", "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", @@ -351,7 +352,7 @@ " \n", " \n", " 0\n", - " \n", + " \n", " alice\n", " image/png\n", " 1591240\n", @@ -359,7 +360,7 @@ " \n", " \n", " 1\n", - " \n", + " \n", " bob\n", " image/png\n", " 1182951\n", @@ -367,7 +368,7 @@ " \n", " \n", " 2\n", - " \n", + " \n", " bob\n", " image/png\n", " 1520884\n", @@ -375,7 +376,7 @@ " \n", " \n", " 3\n", - " \n", + " \n", " alice\n", " image/png\n", " 1235401\n", @@ -383,7 +384,7 @@ " \n", " \n", " 4\n", - " \n", + " \n", " bob\n", " image/png\n", " 1591923\n", @@ -463,7 +464,7 @@ "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:124: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:121: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", "version. Use `json_query` instead.\n", " warnings.warn(bfe.format_message(msg), category=UserWarning)\n" ] @@ -471,7 +472,7 @@ { "data": { "text/html": [ - "" + "" ], "text/plain": [ "" @@ -483,7 +484,7 @@ { "data": { "text/html": [ - "" + "" ], "text/plain": [ "" @@ -527,19 +528,19 @@ "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", " return method(*args, **kwargs)\n", "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", " return method(*args, **kwargs)\n", "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", " return method(*args, **kwargs)\n" ] } @@ -579,7 +580,7 @@ "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", " return method(*args, **kwargs)\n" ] } @@ -589,9 +590,119 @@ "df_image[\"blur_resized\"] = df_image[\"blurred\"].blob.image_resize((300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_resize_transformed/\", engine=\"opencv\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using `verbose` mode for detailed output\\n\n", + "\\n\n", + "All multimodal functions support a `verbose` parameter, which defaults to `False`.\\n\n", + "\\n\n", + "* When `verbose=False` (the default), the function will only return the main content of the result (e.g., the transformed image, the extracted text).\\n\n", + "* When `verbose=True`, the function returns a `STRUCT` containing two fields:\\n\n", + " * `content`: The main result of the operation.\\n\n", + " * `status`: An informational field. If the operation is successful, this will be empty. If an error occurs during the processing of a specific row, this field will contain the error message, allowing the overall job to complete without failing.\\n\n", + "\\n\n", + "Using `verbose=True` is highly recommended for debugging and for workflows where you need to handle potential failures on a row-by-row basis. Let's see it in action with the `image_blur` function." + ] + }, { "cell_type": "code", "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + " return method(*args, **kwargs)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
blurred_verbose
0{'status': '', 'content': {'uri': 'gs://bigfra...
1{'status': '', 'content': {'uri': 'gs://bigfra...
2{'status': '', 'content': {'uri': 'gs://bigfra...
3{'status': '', 'content': {'uri': 'gs://bigfra...
4{'status': '', 'content': {'uri': 'gs://bigfra...
\n", + "

5 rows × 1 columns

\n", + "
[5 rows x 1 columns in total]" + ], + "text/plain": [ + " blurred_verbose\n", + "0 {'status': '', 'content': {'uri': 'gs://bigfra...\n", + "1 {'status': '', 'content': {'uri': 'gs://bigfra...\n", + "2 {'status': '', 'content': {'uri': 'gs://bigfra...\n", + "3 {'status': '', 'content': {'uri': 'gs://bigfra...\n", + "4 {'status': '', 'content': {'uri': 'gs://bigfra...\n", + "\n", + "[5 rows x 1 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_image[\"blurred_verbose\"] = df_image[\"image\"].blob.image_blur(\n", + " (20, 20), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_transformed_verbose/\", engine=\"opencv\", verbose=True\n", + ")\n", + "df_image[[\"blurred_verbose\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -657,73 +768,79 @@ " resized\n", " normalized\n", " blur_resized\n", + " blurred_verbose\n", " \n", " \n", " \n", " \n", " 0\n", - " \n", + " \n", " alice\n", " image/png\n", " 1591240\n", " 2025-03-20 17:45:04+00:00\n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " {'status': '', 'content': {'uri': 'gs://bigframes_blob_test/image_blur_transformed_verbose/k9-guard-dog-paw-balm.png', 'version': None, 'authorizer': 'bigframes-dev.us.bigframes-default-connection', 'details': None}}\n", " \n", " \n", " 1\n", - " \n", + " \n", " bob\n", " image/png\n", " 1182951\n", " 2025-03-20 17:45:02+00:00\n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " {'status': '', 'content': {'uri': 'gs://bigframes_blob_test/image_blur_transformed_verbose/k9-guard-dog-hot-spot-spray.png', 'version': None, 'authorizer': 'bigframes-dev.us.bigframes-default-connection', 'details': None}}\n", " \n", " \n", " 2\n", - " \n", + " \n", " bob\n", " image/png\n", " 1520884\n", " 2025-03-20 17:44:55+00:00\n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " {'status': '', 'content': {'uri': 'gs://bigframes_blob_test/image_blur_transformed_verbose/fluffy-buns-chinchilla-food-variety-pack.png', 'version': None, 'authorizer': 'bigframes-dev.us.bigframes-default-connection', 'details': None}}\n", " \n", " \n", " 3\n", - " \n", + " \n", " alice\n", " image/png\n", " 1235401\n", " 2025-03-20 17:45:19+00:00\n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " {'status': '', 'content': {'uri': 'gs://bigframes_blob_test/image_blur_transformed_verbose/purrfect-perch-cat-scratcher.png', 'version': None, 'authorizer': 'bigframes-dev.us.bigframes-default-connection', 'details': None}}\n", " \n", " \n", " 4\n", - " \n", + " \n", " bob\n", " image/png\n", " 1591923\n", " 2025-03-20 17:44:47+00:00\n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " {'status': '', 'content': {'uri': 'gs://bigframes_blob_test/image_blur_transformed_verbose/chirpy-seed-deluxe-bird-food.png', 'version': None, 'authorizer': 'bigframes-dev.us.bigframes-default-connection', 'details': None}}\n", " \n", " \n", "\n", - "

5 rows × 9 columns

\n", - "[5 rows x 9 columns in total]" + "

5 rows × 10 columns

\n", + "[5 rows x 10 columns in total]" ], "text/plain": [ " image author content_type \\\n", @@ -761,17 +878,24 @@ "3 {'uri': 'gs://bigframes_blob_test/image_normal... \n", "4 {'uri': 'gs://bigframes_blob_test/image_normal... \n", "\n", - " blur_resized \n", - "0 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", - "1 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", - "2 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", - "3 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", - "4 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + " blur_resized \\\n", + "0 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + "1 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + "2 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + "3 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + "4 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", "\n", - "[5 rows x 9 columns]" + " blurred_verbose \n", + "0 {'status': '', 'content': {'uri': 'gs://bigfra... \n", + "1 {'status': '', 'content': {'uri': 'gs://bigfra... \n", + "2 {'status': '', 'content': {'uri': 'gs://bigfra... \n", + "3 {'status': '', 'content': {'uri': 'gs://bigfra... \n", + "4 {'status': '', 'content': {'uri': 'gs://bigfra... \n", + "\n", + "[5 rows x 10 columns]" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -791,7 +915,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": { "id": "mRUGfcaFVW-3" }, @@ -800,7 +924,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", "default model will be removed in BigFrames 3.0. Please supply an\n", "explicit model to avoid this message.\n", " return method(*args, **kwargs)\n" @@ -814,7 +938,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -874,13 +998,13 @@ " \n", " \n", " 0\n", - " The item is a tin of K9Guard Dog Paw Balm.\n", - " \n", + " The item is a tin of K9 Guard dog paw balm.\n", + " \n", " \n", " \n", " 1\n", - " The item is a bottle of K9 Guard Dog Hot Spot Spray.\n", - " \n", + " The item is K9 Guard Dog Hot Spot Spray.\n", + " \n", " \n", " \n", "\n", @@ -888,9 +1012,9 @@ "[2 rows x 2 columns in total]" ], "text/plain": [ - " ml_generate_text_llm_result \\\n", - "0 The item is a tin of K9Guard Dog Paw Balm. \n", - "1 The item is a bottle of K9 Guard Dog Hot Spot ... \n", + " ml_generate_text_llm_result \\\n", + "0 The item is a tin of K9 Guard dog paw balm. \n", + "1 The item is K9 Guard Dog Hot Spot Spray. \n", "\n", " image \n", "0 {'uri': 'gs://cloud-samples-data/bigquery/tuto... \n", @@ -899,7 +1023,7 @@ "[2 rows x 2 columns]" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -913,7 +1037,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": { "id": "IG3J3HsKhyBY" }, @@ -936,7 +1060,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -996,13 +1120,13 @@ " \n", " \n", " 0\n", - " The item is dog paw balm.\n", - " \n", + " The item is a tin of K9Guard Dog Paw Balm.\n", + " \n", " \n", " \n", " 1\n", - " The picture features a white bottle with a light blue spray nozzle and accents. The background is a neutral gray.\\n\n", - " \n", + " The bottle is mostly white, with a light blue accents. The background is a light gray. There are also black and green elements on the bottle's label.\n", + " \n", " \n", " \n", "\n", @@ -1011,8 +1135,8 @@ ], "text/plain": [ " ml_generate_text_llm_result \\\n", - "0 The item is dog paw balm. \n", - "1 The picture features a white bottle with a lig... \n", + "0 The item is a tin of K9Guard Dog Paw Balm. \n", + "1 The bottle is mostly white, with a light blue ... \n", "\n", " image \n", "0 {'uri': 'gs://cloud-samples-data/bigquery/tuto... \n", @@ -1021,7 +1145,7 @@ "[2 rows x 2 columns]" ] }, - "execution_count": 14, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1033,7 +1157,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1047,7 +1171,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", "default model will be removed in BigFrames 3.0. Please supply an\n", "explicit model to avoid this message.\n", " return method(*args, **kwargs)\n", @@ -1096,19 +1220,19 @@ " \n", " \n", " 0\n", - " [ 0.00638846 0.01666372 0.00451786 ... -0.02...\n", + " [ 0.00638842 0.01666344 0.00451782 ... -0.02...\n", " \n", " <NA>\n", " <NA>\n", - " {\"access_urls\":{\"expiry_time\":\"2025-10-09T12:2...\n", + " {\"access_urls\":{\"expiry_time\":\"2025-10-25T00:2...\n", " \n", " \n", " 1\n", - " [ 0.0097399 0.0214815 0.00244266 ... 0.00...\n", + " [ 0.00973689 0.02148374 0.00244311 ... 0.00...\n", " \n", " <NA>\n", " <NA>\n", - " {\"access_urls\":{\"expiry_time\":\"2025-10-09T12:2...\n", + " {\"access_urls\":{\"expiry_time\":\"2025-10-25T00:2...\n", " \n", " \n", "\n", @@ -1117,8 +1241,8 @@ ], "text/plain": [ " ml_generate_embedding_result \\\n", - "0 [ 0.00638846 0.01666372 0.00451786 ... -0.02... \n", - "1 [ 0.0097399 0.0214815 0.00244266 ... 0.00... \n", + "0 [ 0.00638842 0.01666344 0.00451782 ... -0.02... \n", + "1 [ 0.00973689 0.02148374 0.00244311 ... 0.00... \n", "\n", " ml_generate_embedding_status ml_generate_embedding_start_sec \\\n", "0 \n", @@ -1129,13 +1253,13 @@ "1 \n", "\n", " content \n", - "0 {\"access_urls\":{\"expiry_time\":\"2025-10-09T12:2... \n", - "1 {\"access_urls\":{\"expiry_time\":\"2025-10-09T12:2... \n", + "0 {\"access_urls\":{\"expiry_time\":\"2025-10-25T00:2... \n", + "1 {\"access_urls\":{\"expiry_time\":\"2025-10-25T00:2... \n", "\n", "[2 rows x 5 columns]" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1158,7 +1282,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": { "id": "oDDuYtUm5Yiy" }, @@ -1180,7 +1304,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1197,9 +1321,12 @@ "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", " return method(*args, **kwargs)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:244: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n", + "future version. Use `json_value_array` instead.\n", + " warnings.warn(bfe.format_message(msg), category=UserWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n", "future version. Use `json_value_array` instead.\n", " warnings.warn(bfe.format_message(msg), category=UserWarning)\n" ] @@ -1211,7 +1338,78 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + " return method(*args, **kwargs)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n", + "future version. Use `json_value_array` instead.\n", + " warnings.warn(bfe.format_message(msg), category=UserWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chunked_verbose
0{'status': '', 'content': array([\"CritterCuisi...
\n", + "

1 rows × 1 columns

\n", + "
[1 rows x 1 columns in total]" + ], + "text/plain": [ + " chunked_verbose\n", + "0 {'status': '', 'content': array([\"CritterCuisi...\n", + "\n", + "[1 rows x 1 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_pdf[\"chunked_verbose\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\", verbose=True)\n", + "df_pdf[[\"chunked_verbose\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, "metadata": { "id": "kaPvJATN7zlw" }, @@ -1239,7 +1437,7 @@ "Name: chunked, dtype: string" ] }, - "execution_count": 18, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1258,7 +1456,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -1279,7 +1477,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1303,7 +1501,7 @@ "Name: transcribed_content, dtype: string" ] }, - "execution_count": 20, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1312,6 +1510,42 @@ "transcribed_series = df['audio'].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=False)\n", "transcribed_series" ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/plain": [ + "0 {'status': '', 'content': 'Now, as all books, ...\n", + "Name: transcription_results, dtype: struct[pyarrow]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transcribed_series_verbose = df['audio'].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=True)\n", + "transcribed_series_verbose" + ] } ], "metadata": {