From fe774c273761d47cfbe42bba1f5965db23474c4a Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 23 Oct 2025 19:52:47 +0000 Subject: [PATCH 01/10] add error handling for audio_transcribe --- bigframes/operations/blob.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 1f6b75a8f5..7c32dd8513 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -962,6 +962,10 @@ def audio_transcribe( depend on the "verbose" parameter. Contains the transcribed text from the audio file. Includes error messages if verbosity is enabled. + + Raises: + ValueError: If engine is not 'bigquery'. + RuntimeError: If the transcription result structure is invalid. """ if engine.casefold() != "bigquery": raise ValueError("Must specify the engine, supported value is 'bigquery'.") @@ -984,6 +988,10 @@ def audio_transcribe( model_params={"generationConfig": {"temperature": 0.0}}, ) + # Validate that the result is not None + if transcribed_results is None: + raise RuntimeError("Transcription returned None result") + transcribed_content_series = transcribed_results.struct.field("result").rename( "transcribed_content" ) @@ -999,4 +1007,4 @@ def audio_transcribe( results_struct = bbq.struct(results_df).rename("transcription_results") return results_struct else: - return transcribed_content_series.rename("transcribed_content") + return transcribed_content_series From 8b31c8a8ed1dc5d80d0525839d23596a2ed0983c Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 23 Oct 2025 23:12:18 +0000 Subject: [PATCH 02/10] add error handling for pdf functions --- bigframes/operations/blob.py | 114 +++++++++++++++++------------------ 1 file changed, 56 insertions(+), 58 deletions(-) diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 7c32dd8513..74ab076207 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -790,25 +790,9 @@ def pdf_extract( ) -> bigframes.series.Series: """Extracts text from PDF URLs and saves the text as string. - Args: - engine ('pypdf' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified. - connection (str or None, default None): BQ connection used for - function internet transactions, and the output blob if "dst" - is str. If None, uses default connection of the session. - max_batching_rows (int, default 1): Max number of rows per batch - send to cloud run to execute the function. - container_cpu (int or float, default 2): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to intergers. - container_memory (str, default "1Gi"): container memory size. String of the format . Possible values are from 512Mi to 32Gi. - verbose (bool, default "False"): controls the verbosity of the output. - When set to True, both error messages and the extracted content - are displayed. Conversely, when set to False, only the extracted - content is presented, suppressing error messages. - - Returns: - bigframes.series.Series: str or struct[str, str], - depend on the "verbose" parameter. - Contains the extracted text from the PDF file. - Includes error messages if verbosity is enabled. + Raises: + ValueError: If engine is not 'pypdf'. + RuntimeError: If PDF extraction fails or returns invalid structure. """ if engine is None or engine.casefold() != "pypdf": raise ValueError("Must specify the engine, supported value is 'pypdf'.") @@ -830,20 +814,37 @@ def pdf_extract( df = self.get_runtime_json_str(mode="R").to_frame() df["verbose"] = verbose - res = self._df_apply_udf(df, pdf_extract_udf) + + try: + res = self._df_apply_udf(df, pdf_extract_udf) + except Exception as e: + raise RuntimeError(f"PDF extraction UDF failed: {e}") from e + + # Validate result is not None + if res is None: + raise RuntimeError("PDF extraction returned None result") + + # Extract content with error handling + try: + content_series = res._apply_unary_op(ops.JSONValue(json_path="$.content")) + except Exception as e: + raise RuntimeError( + f"Failed to extract content field from PDF result: {e}" + ) from e if verbose: - extracted_content_series = res._apply_unary_op( - ops.JSONValue(json_path="$.content") - ) - status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status")) - results_df = bpd.DataFrame( - {"status": status_series, "content": extracted_content_series} - ) - results_struct = bbq.struct(results_df).rename("extracted_results") - return results_struct + try: + status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status")) + except Exception as e: + raise RuntimeError( + f"Failed to extract status field from PDF result: {e}" + ) from e + + res_df = bpd.DataFrame({"status": status_series, "content": content_series}) + struct_series = bbq.struct(res_df) + return struct_series else: - return res.rename("extracted_content") + return content_series def pdf_chunk( self, @@ -860,30 +861,9 @@ def pdf_chunk( """Extracts and chunks text from PDF URLs and saves the text as arrays of strings. - Args: - engine ('pypdf' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified. - connection (str or None, default None): BQ connection used for - function internet transactions, and the output blob if "dst" - is str. If None, uses default connection of the session. - chunk_size (int, default 2000): the desired size of each text chunk - (number of characters). - overlap_size (int, default 200): the number of overlapping characters - between consective chunks. The helps to ensure context is - perserved across chunk boundaries. - max_batching_rows (int, default 1): Max number of rows per batch - send to cloud run to execute the function. - container_cpu (int or float, default 2): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to intergers. - container_memory (str, default "1Gi"): container memory size. String of the format . Possible values are from 512Mi to 32Gi. - verbose (bool, default "False"): controls the verbosity of the output. - When set to True, both error messages and the extracted content - are displayed. Conversely, when set to False, only the extracted - content is presented, suppressing error messages. - - Returns: - bigframe.series.Series: array[str] or struct[str, array[str]], - depend on the "verbose" parameter. - where each string is a chunk of text extracted from PDF. - Includes error messages if verbosity is enabled. + Raises: + ValueError: If engine is not 'pypdf'. + RuntimeError: If PDF chunking fails or returns invalid structure. """ if engine is None or engine.casefold() != "pypdf": raise ValueError("Must specify the engine, supported value is 'pypdf'.") @@ -915,13 +895,31 @@ def pdf_chunk( df["overlap_size"] = overlap_size df["verbose"] = verbose - res = self._df_apply_udf(df, pdf_chunk_udf) + try: + res = self._df_apply_udf(df, pdf_chunk_udf) + except Exception as e: + raise RuntimeError(f"PDF chunking UDF failed: {e}") from e + + if res is None: + raise RuntimeError("PDF chunking returned None result") + + try: + content_series = bbq.json_extract_string_array(res, "$.content") + except Exception as e: + raise RuntimeError( + f"Failed to extract content array from PDF chunk result: {e}" + ) from e if verbose: - chunked_content_series = bbq.json_extract_string_array(res, "$.content") - status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status")) + try: + status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status")) + except Exception as e: + raise RuntimeError( + f"Failed to extract status field from PDF chunk result: {e}" + ) from e + results_df = bpd.DataFrame( - {"status": status_series, "content": chunked_content_series} + {"status": status_series, "content": content_series} ) resultes_struct = bbq.struct(results_df).rename("chunked_results") return resultes_struct From 6a37c1870b0d1c18fc49b363a2d5ba62cf55da36 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 24 Oct 2025 02:18:47 +0000 Subject: [PATCH 03/10] add eror handling for image functions --- bigframes/blob/_functions.py | 207 ++++++++++++++++++++++++----------- bigframes/operations/blob.py | 160 ++++++++++++++++++++++----- 2 files changed, 276 insertions(+), 91 deletions(-) diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py index 2a11974b8d..510de3b5a7 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -120,43 +120,50 @@ def udf(self): def exif_func(src_obj_ref_rt: str, verbose: bool) -> str: - import io - import json + try: + import io + import json - from PIL import ExifTags, Image - import requests - from requests import adapters + from PIL import ExifTags, Image + import requests + from requests import adapters - result_dict = {"status": "", "content": "{}"} - try: session = requests.Session() session.mount("https://", adapters.HTTPAdapter(max_retries=3)) src_obj_ref_rt_json = json.loads(src_obj_ref_rt) - src_url = src_obj_ref_rt_json["access_urls"]["read_url"] response = session.get(src_url, timeout=30) + response.raise_for_status() bts = response.content image = Image.open(io.BytesIO(bts)) exif_data = image.getexif() exif_dict = {} + if exif_data: for tag, value in exif_data.items(): tag_name = ExifTags.TAGS.get(tag, tag) - # Pillow might return bytes, which are not serializable. - if isinstance(value, bytes): - value = value.decode("utf-8", "replace") - exif_dict[tag_name] = value - result_dict["content"] = json.dumps(exif_dict) - except Exception as e: - result_dict["status"] = str(e) + # Convert non-serializable types to strings + try: + json.dumps(value) + exif_dict[tag_name] = value + except (TypeError, ValueError): + exif_dict[tag_name] = str(value) + + if verbose: + return json.dumps({"status": "", "content": json.dumps(exif_dict)}) + else: + return json.dumps(exif_dict) - if verbose: - return json.dumps(result_dict) - else: - return result_dict["content"] + except Exception as e: + # Return error as JSON with error field + error_result = {"status": f"{type(e).__name__}: {str(e)}", "content": "{}"} + if verbose: + return json.dumps(error_result) + else: + return "{}" exif_func_def = FunctionDef(exif_func, ["pillow", "requests"]) @@ -171,11 +178,9 @@ def image_blur_func( ext: str, verbose: bool, ) -> str: - import json - - result_dict = {"status": "", "content": dst_obj_ref_rt} - try: + import json + import cv2 as cv # type: ignore import numpy as np import requests @@ -193,35 +198,52 @@ def image_blur_func( dst_url = dst_obj_ref_rt_json["access_urls"]["write_url"] response = session.get(src_url, timeout=30) + response.raise_for_status() # Raise exception for HTTP errors bts = response.content nparr = np.frombuffer(bts, np.uint8) img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) + + if img is None: + raise ValueError( + "Failed to decode image - possibly corrupted or unsupported format" + ) + img_blurred = cv.blur(img, ksize=(ksize_x, ksize_y)) - bts = cv.imencode(ext, img_blurred)[1].tobytes() + success, encoded = cv.imencode(ext, img_blurred) + if not success: + raise ValueError(f"Failed to encode image with extension {ext}") + + bts = encoded.tobytes() ext = ext.replace(".", "") ext_mappings = {"jpg": "jpeg", "tif": "tiff"} ext = ext_mappings.get(ext, ext) content_type = "image/" + ext - session.put( + put_response = session.put( url=dst_url, data=bts, - headers={ - "Content-Type": content_type, - }, + headers={"Content-Type": content_type}, timeout=30, ) + put_response.raise_for_status() - except Exception as e: - result_dict["status"] = str(e) + if verbose: + return json.dumps({"status": "", "content": dst_obj_ref_rt}) + else: + return dst_obj_ref_rt - if verbose: - return json.dumps(result_dict) - else: - return result_dict["content"] + except Exception as e: + # Return error in structured format + error_result = {"status": f"Error: {type(e).__name__}: {str(e)}", "content": ""} + if verbose: + return json.dumps(error_result) + else: + # The calling function expects a json string that can be parsed as a blob ref + # Return a valid blob ref json string with empty values. + return '{"access_urls": {"read_url": "", "write_url": ""}, "authorizer": "", "generation": "", "uri": ""}' image_blur_def = FunctionDef(image_blur_func, ["opencv-python", "numpy", "requests"]) @@ -251,12 +273,20 @@ def image_blur_to_bytes_func( src_url = src_obj_ref_rt_json["access_urls"]["read_url"] response = session.get(src_url, timeout=30) + response.raise_for_status() bts = response.content nparr = np.frombuffer(bts, np.uint8) img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) + if img is None: + raise ValueError( + "Failed to decode image - possibly corrupted or unsupported format" + ) img_blurred = cv.blur(img, ksize=(ksize_x, ksize_y)) - content = cv.imencode(ext, img_blurred)[1].tobytes() + success, encoded = cv.imencode(ext, img_blurred) + if not success: + raise ValueError(f"Failed to encode image with extension {ext}") + content = encoded.tobytes() except Exception as e: status = str(e) @@ -284,11 +314,9 @@ def image_resize_func( ext: str, verbose: bool, ) -> str: - import json - - result_dict = {"status": "", "content": dst_obj_ref_rt} - try: + import json + import cv2 as cv # type: ignore import numpy as np import requests @@ -306,20 +334,28 @@ def image_resize_func( dst_url = dst_obj_ref_rt_json["access_urls"]["write_url"] response = session.get(src_url, timeout=30) + response.raise_for_status() bts = response.content nparr = np.frombuffer(bts, np.uint8) img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) + if img is None: + raise ValueError( + "Failed to decode image - possibly corrupted or unsupported format" + ) img_resized = cv.resize(img, dsize=(dsize_x, dsize_y), fx=fx, fy=fy) - bts = cv.imencode(ext, img_resized)[1].tobytes() + success, encoded = cv.imencode(ext, img_resized) + if not success: + raise ValueError(f"Failed to encode image with extension {ext}") + bts = encoded.tobytes() ext = ext.replace(".", "") ext_mappings = {"jpg": "jpeg", "tif": "tiff"} ext = ext_mappings.get(ext, ext) content_type = "image/" + ext - session.put( + put_response = session.put( url=dst_url, data=bts, headers={ @@ -327,14 +363,22 @@ def image_resize_func( }, timeout=30, ) + put_response.raise_for_status() - except Exception as e: - result_dict["status"] = str(e) + if verbose: + return json.dumps({"status": "", "content": dst_obj_ref_rt}) + else: + return dst_obj_ref_rt - if verbose: - return json.dumps(result_dict) - else: - return result_dict["content"] + except Exception as e: + # Return error in structured format + error_result = {"status": f"Error: {type(e).__name__}: {str(e)}", "content": ""} + if verbose: + return json.dumps(error_result) + else: + # The calling function expects a json string that can be parsed as a blob ref + # Return a valid blob ref json string with empty values. + return '{"access_urls": {"read_url": "", "write_url": ""}, "authorizer": "", "generation": "", "uri": ""}' image_resize_def = FunctionDef( @@ -372,12 +416,20 @@ def image_resize_to_bytes_func( src_url = src_obj_ref_rt_json["access_urls"]["read_url"] response = session.get(src_url, timeout=30) + response.raise_for_status() bts = response.content nparr = np.frombuffer(bts, np.uint8) img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) + if img is None: + raise ValueError( + "Failed to decode image - possibly corrupted or unsupported format" + ) img_resized = cv.resize(img, dsize=(dsize_x, dsize_y), fx=fx, fy=fy) - content = cv.imencode(".jpeg", img_resized)[1].tobytes() + success, encoded = cv.imencode(ext, img_resized) + if not success: + raise ValueError(f"Failed to encode image with extension {ext}") + content = encoded.tobytes() except Exception as e: status = str(e) @@ -404,11 +456,9 @@ def image_normalize_func( ext: str, verbose: bool, ) -> str: - import json - - result_dict = {"status": "", "content": dst_obj_ref_rt} - try: + import json + import cv2 as cv # type: ignore import numpy as np import requests @@ -433,22 +483,30 @@ def image_normalize_func( dst_url = dst_obj_ref_rt_json["access_urls"]["write_url"] response = session.get(src_url, timeout=30) + response.raise_for_status() bts = response.content nparr = np.frombuffer(bts, np.uint8) img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) + if img is None: + raise ValueError( + "Failed to decode image - possibly corrupted or unsupported format" + ) img_normalized = cv.normalize( img, None, alpha=alpha, beta=beta, norm_type=norm_type_mapping[norm_type] ) - bts = cv.imencode(ext, img_normalized)[1].tobytes() + success, encoded = cv.imencode(ext, img_normalized) + if not success: + raise ValueError(f"Failed to encode image with extension {ext}") + bts = encoded.tobytes() ext = ext.replace(".", "") ext_mappings = {"jpg": "jpeg", "tif": "tiff"} ext = ext_mappings.get(ext, ext) content_type = "image/" + ext - session.put( + put_response = session.put( url=dst_url, data=bts, headers={ @@ -456,14 +514,22 @@ def image_normalize_func( }, timeout=30, ) + put_response.raise_for_status() - except Exception as e: - result_dict["status"] = str(e) + if verbose: + return json.dumps({"status": "", "content": dst_obj_ref_rt}) + else: + return dst_obj_ref_rt - if verbose: - return json.dumps(result_dict) - else: - return result_dict["content"] + except Exception as e: + # Return error in structured format + error_result = {"status": f"Error: {type(e).__name__}: {str(e)}", "content": ""} + if verbose: + return json.dumps(error_result) + else: + # The calling function expects a json string that can be parsed as a blob ref + # Return a valid blob ref json string with empty values. + return '{"access_urls": {"read_url": "", "write_url": ""}, "authorizer": "", "generation": "", "uri": ""}' image_normalize_def = FunctionDef( @@ -482,7 +548,8 @@ def image_normalize_to_bytes_func( import base64 import json - result_dict = {"status": "", "content": ""} + status = "" + content = b"" try: import cv2 as cv # type: ignore @@ -506,20 +573,28 @@ def image_normalize_to_bytes_func( src_url = src_obj_ref_rt_json["access_urls"]["read_url"] response = session.get(src_url, timeout=30) + response.raise_for_status() bts = response.content nparr = np.frombuffer(bts, np.uint8) img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) + if img is None: + raise ValueError( + "Failed to decode image - possibly corrupted or unsupported format" + ) img_normalized = cv.normalize( img, None, alpha=alpha, beta=beta, norm_type=norm_type_mapping[norm_type] ) - bts = cv.imencode(".jpeg", img_normalized)[1].tobytes() - - content_b64 = base64.b64encode(bts).decode("utf-8") - result_dict["content"] = content_b64 + success, encoded = cv.imencode(ext, img_normalized) + if not success: + raise ValueError(f"Failed to encode image with extension {ext}") + content = encoded.tobytes() except Exception as e: - result_dict["status"] = str(e) + status = str(e) + + encoded_content = base64.b64encode(content).decode("utf-8") + result_dict = {"status": status, "content": encoded_content} if verbose: return json.dumps(result_dict) diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 74ab076207..486909dd23 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -343,6 +343,10 @@ def exif( Returns: bigframes.series.Series: JSON series of key-value pairs if verbose=False, or struct with status and content if verbose=True. + + Raises: + ValueError: If engine is not 'pillow'. + RuntimeError: If EXIF extraction fails or returns invalid structure. """ if engine is None or engine.casefold() != "pillow": raise ValueError("Must specify the engine, supported value is 'pillow'.") @@ -364,22 +368,34 @@ def exif( container_memory=container_memory, ).udf() - res = self._df_apply_udf(df, exif_udf) + try: + res = self._df_apply_udf(df, exif_udf) + except Exception as e: + raise RuntimeError(f"EXIF extraction UDF execution failed: {e}") from e + + if res is None: + raise RuntimeError("EXIF extraction returned None result") if verbose: - exif_content_series = bbq.parse_json( - res._apply_unary_op(ops.JSONValue(json_path="$.content")) - ).rename("exif_content") - exif_status_series = res._apply_unary_op( - ops.JSONValue(json_path="$.status") - ) + try: + exif_content_series = bbq.parse_json( + res._apply_unary_op(ops.JSONValue(json_path="$.content")) + ).rename("exif_content") + exif_status_series = res._apply_unary_op( + ops.JSONValue(json_path="$.status") + ) + except Exception as e: + raise RuntimeError(f"Failed to parse EXIF JSON result: {e}") from e results_df = bpd.DataFrame( {"status": exif_status_series, "content": exif_content_series} ) results_struct = bbq.struct(results_df).rename("exif_results") return results_struct else: - return bbq.parse_json(res) + try: + return bbq.parse_json(res) + except Exception as e: + raise RuntimeError(f"Failed to parse EXIF JSON result: {e}") from e def image_blur( self, @@ -411,6 +427,10 @@ def image_blur( Returns: bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ. If verbose=True, returns struct with status and content. + + Raises: + ValueError: If engine is not 'opencv' or parameters are invalid. + RuntimeError: If image blur operation fails. """ if engine is None or engine.casefold() != "opencv": raise ValueError("Must specify the engine, supported value is 'opencv'.") @@ -437,7 +457,13 @@ def image_blur( df["ksize_x"], df["ksize_y"] = ksize df["ext"] = ext # type: ignore df["verbose"] = verbose - res = self._df_apply_udf(df, image_blur_udf) + try: + res = self._df_apply_udf(df, image_blur_udf) + except Exception as e: + raise RuntimeError(f"Image blur UDF execution failed: {e}") from e + + if res is None: + raise RuntimeError("Image blur returned None result") if verbose: blurred_content_b64_series = res._apply_unary_op( @@ -486,7 +512,13 @@ def image_blur( df["ext"] = ext # type: ignore df["verbose"] = verbose - res = self._df_apply_udf(df, image_blur_udf) + try: + res = self._df_apply_udf(df, image_blur_udf) + except Exception as e: + raise RuntimeError(f"Image blur UDF execution failed: {e}") from e + + if res is None: + raise RuntimeError("Image blur returned None result") res.cache() # to execute the udf if verbose: @@ -540,6 +572,10 @@ def image_resize( Returns: bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ. If verbose=True, returns struct with status and content. + + Raises: + ValueError: If engine is not 'opencv' or parameters are invalid. + RuntimeError: If image resize operation fails. """ if engine is None or engine.casefold() != "opencv": raise ValueError("Must specify the engine, supported value is 'opencv'.") @@ -570,11 +606,17 @@ def image_resize( container_memory=container_memory, ).udf() - df["dsize_x"], df["dsizye_y"] = dsize + df["dsize_x"], df["dsize_y"] = dsize df["fx"], df["fy"] = fx, fy df["ext"] = ext # type: ignore df["verbose"] = verbose - res = self._df_apply_udf(df, image_resize_udf) + try: + res = self._df_apply_udf(df, image_resize_udf) + except Exception as e: + raise RuntimeError(f"Image resize UDF execution failed: {e}") from e + + if res is None: + raise RuntimeError("Image resize returned None result") if verbose: resized_content_b64_series = res._apply_unary_op( @@ -620,12 +662,18 @@ def image_resize( dst_rt = dst.blob.get_runtime_json_str(mode="RW") df = df.join(dst_rt, how="outer") - df["dsize_x"], df["dsizye_y"] = dsize + df["dsize_x"], df["dsize_y"] = dsize df["fx"], df["fy"] = fx, fy df["ext"] = ext # type: ignore df["verbose"] = verbose - res = self._df_apply_udf(df, image_resize_udf) + try: + res = self._df_apply_udf(df, image_resize_udf) + except Exception as e: + raise RuntimeError(f"Image resize UDF execution failed: {e}") from e + + if res is None: + raise RuntimeError("Image resize returned None result") res.cache() # to execute the udf if verbose: @@ -679,6 +727,10 @@ def image_normalize( Returns: bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ. If verbose=True, returns struct with status and content. + + Raises: + ValueError: If engine is not 'opencv' or parameters are invalid. + RuntimeError: If image normalize operation fails. """ if engine is None or engine.casefold() != "opencv": raise ValueError("Must specify the engine, supported value is 'opencv'.") @@ -707,7 +759,13 @@ def image_normalize( df["norm_type"] = norm_type df["ext"] = ext # type: ignore df["verbose"] = verbose - res = self._df_apply_udf(df, image_normalize_udf) + try: + res = self._df_apply_udf(df, image_normalize_udf) + except Exception as e: + raise RuntimeError(f"Image normalize UDF execution failed: {e}") from e + + if res is None: + raise RuntimeError("Image normalize returned None result") if verbose: normalized_content_b64_series = res._apply_unary_op( @@ -758,7 +816,13 @@ def image_normalize( df["ext"] = ext # type: ignore df["verbose"] = verbose - res = self._df_apply_udf(df, image_normalize_udf) + try: + res = self._df_apply_udf(df, image_normalize_udf) + except Exception as e: + raise RuntimeError(f"Image normalize UDF execution failed: {e}") from e + + if res is None: + raise RuntimeError("Image normalize returned None result") res.cache() # to execute the udf if verbose: @@ -790,6 +854,26 @@ def pdf_extract( ) -> bigframes.series.Series: """Extracts text from PDF URLs and saves the text as string. + Args: + engine ('pypdf' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified. + connection (str or None, default None): BQ connection used for + function internet transactions, and the output blob if "dst" + is str. If None, uses default connection of the session. + max_batching_rows (int, default 1): Max number of rows per batch + send to cloud run to execute the function. + container_cpu (int or float, default 2): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to intergers. + container_memory (str, default "1Gi"): container memory size. String of the format . Possible values are from 512Mi to 32Gi. + verbose (bool, default "False"): controls the verbosity of the output. + When set to True, both error messages and the extracted content + are displayed. Conversely, when set to False, only the extracted + content is presented, suppressing error messages. + + Returns: + bigframes.series.Series: str or struct[str, str], + depend on the "verbose" parameter. + Contains the extracted text from the PDF file. + Includes error messages if verbosity is enabled. + Raises: ValueError: If engine is not 'pypdf'. RuntimeError: If PDF extraction fails or returns invalid structure. @@ -824,15 +908,16 @@ def pdf_extract( if res is None: raise RuntimeError("PDF extraction returned None result") - # Extract content with error handling - try: - content_series = res._apply_unary_op(ops.JSONValue(json_path="$.content")) - except Exception as e: - raise RuntimeError( - f"Failed to extract content field from PDF result: {e}" - ) from e - if verbose: + # Extract content with error handling + try: + content_series = res._apply_unary_op( + ops.JSONValue(json_path="$.content") + ) + except Exception as e: + raise RuntimeError( + f"Failed to extract content field from PDF result: {e}" + ) from e try: status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status")) except Exception as e: @@ -844,7 +929,7 @@ def pdf_extract( struct_series = bbq.struct(res_df) return struct_series else: - return content_series + return res def pdf_chunk( self, @@ -861,6 +946,31 @@ def pdf_chunk( """Extracts and chunks text from PDF URLs and saves the text as arrays of strings. + Args: + engine ('pypdf' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified. + connection (str or None, default None): BQ connection used for + function internet transactions, and the output blob if "dst" + is str. If None, uses default connection of the session. + chunk_size (int, default 2000): the desired size of each text chunk + (number of characters). + overlap_size (int, default 200): the number of overlapping characters + between consective chunks. The helps to ensure context is + perserved across chunk boundaries. + max_batching_rows (int, default 1): Max number of rows per batch + send to cloud run to execute the function. + container_cpu (int or float, default 2): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to intergers. + container_memory (str, default "1Gi"): container memory size. String of the format . Possible values are from 512Mi to 32Gi. + verbose (bool, default "False"): controls the verbosity of the output. + When set to True, both error messages and the extracted content + are displayed. Conversely, when set to False, only the extracted + content is presented, suppressing error messages. + + Returns: + bigframe.series.Series: array[str] or struct[str, array[str]], + depend on the "verbose" parameter. + where each string is a chunk of text extracted from PDF. + Includes error messages if verbosity is enabled. + Raises: ValueError: If engine is not 'pypdf'. RuntimeError: If PDF chunking fails or returns invalid structure. From 2d66362d8de1fde34ff50e8e14afcd6c4c3993a0 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 24 Oct 2025 04:19:36 +0000 Subject: [PATCH 04/10] final touch --- bigframes/blob/_functions.py | 75 ++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 33 deletions(-) diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py index 510de3b5a7..a39e3d2ce1 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -255,9 +255,6 @@ def image_blur_to_bytes_func( import base64 import json - status = "" - content = b"" - try: import cv2 as cv # type: ignore import numpy as np @@ -288,15 +285,21 @@ def image_blur_to_bytes_func( raise ValueError(f"Failed to encode image with extension {ext}") content = encoded.tobytes() - except Exception as e: - status = str(e) + encoded_content = base64.b64encode(content).decode("utf-8") + result_dict = {"status": "", "content": encoded_content} + if verbose: + return json.dumps(result_dict) + else: + return result_dict["content"] - encoded_content = base64.b64encode(content).decode("utf-8") - result_dict = {"status": status, "content": encoded_content} - if verbose: - return json.dumps(result_dict) - else: - return result_dict["content"] + except Exception as e: + status = f"Error: {type(e).__name__}: {str(e)}" + encoded_content = base64.b64encode(b"").decode("utf-8") + result_dict = {"status": status, "content": encoded_content} + if verbose: + return json.dumps(result_dict) + else: + return result_dict["content"] image_blur_to_bytes_def = FunctionDef( @@ -398,9 +401,6 @@ def image_resize_to_bytes_func( import base64 import json - status = "" - content = b"" - try: import cv2 as cv # type: ignore import numpy as np @@ -431,15 +431,21 @@ def image_resize_to_bytes_func( raise ValueError(f"Failed to encode image with extension {ext}") content = encoded.tobytes() - except Exception as e: - status = str(e) + encoded_content = base64.b64encode(content).decode("utf-8") + result_dict = {"status": "", "content": encoded_content} + if verbose: + return json.dumps(result_dict) + else: + return result_dict["content"] - encoded_content = base64.b64encode(content).decode("utf-8") - result_dict = {"status": status, "content": encoded_content} - if verbose: - return json.dumps(result_dict) - else: - return result_dict["content"] + except Exception as e: + status = f"Error: {type(e).__name__}: {str(e)}" + encoded_content = base64.b64encode(b"").decode("utf-8") + result_dict = {"status": status, "content": encoded_content} + if verbose: + return json.dumps(result_dict) + else: + return result_dict["content"] image_resize_to_bytes_def = FunctionDef( @@ -548,9 +554,6 @@ def image_normalize_to_bytes_func( import base64 import json - status = "" - content = b"" - try: import cv2 as cv # type: ignore import numpy as np @@ -590,16 +593,22 @@ def image_normalize_to_bytes_func( raise ValueError(f"Failed to encode image with extension {ext}") content = encoded.tobytes() - except Exception as e: - status = str(e) + encoded_content = base64.b64encode(content).decode("utf-8") + result_dict = {"status": "", "content": encoded_content} - encoded_content = base64.b64encode(content).decode("utf-8") - result_dict = {"status": status, "content": encoded_content} + if verbose: + return json.dumps(result_dict) + else: + return result_dict["content"] - if verbose: - return json.dumps(result_dict) - else: - return result_dict["content"] + except Exception as e: + status = f"Error: {type(e).__name__}: {str(e)}" + encoded_content = base64.b64encode(b"").decode("utf-8") + result_dict = {"status": status, "content": encoded_content} + if verbose: + return json.dumps(result_dict) + else: + return result_dict["content"] image_normalize_to_bytes_def = FunctionDef( From 052bf814d263d1a0d24a5295dd1ce294b498d70b Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 24 Oct 2025 17:11:08 +0000 Subject: [PATCH 05/10] restore rename --- bigframes/operations/blob.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 486909dd23..66ba9ee8e2 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -926,10 +926,10 @@ def pdf_extract( ) from e res_df = bpd.DataFrame({"status": status_series, "content": content_series}) - struct_series = bbq.struct(res_df) + struct_series = bbq.struct(res_df).rename("extracted_results") return struct_series else: - return res + return res.rename("extracted_content") def pdf_chunk( self, @@ -1115,4 +1115,4 @@ def audio_transcribe( results_struct = bbq.struct(results_df).rename("transcription_results") return results_struct else: - return transcribed_content_series + return transcribed_content_series.rename("transcribed_content") From 313f04a5de169ade6b9b0e49257ee9bc5b4fab31 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 24 Oct 2025 18:34:55 +0000 Subject: [PATCH 06/10] update notebook to better reflect our new code change --- .../multimodal/multimodal_dataframe.ipynb | 426 ++++++++++++++---- 1 file changed, 330 insertions(+), 96 deletions(-) diff --git a/notebooks/multimodal/multimodal_dataframe.ipynb b/notebooks/multimodal/multimodal_dataframe.ipynb index c04463fc4c..0822ee4c2d 100644 --- a/notebooks/multimodal/multimodal_dataframe.ipynb +++ b/notebooks/multimodal/multimodal_dataframe.ipynb @@ -60,7 +60,8 @@ "2. Combine unstructured data with structured data\n", "3. Conduct image transformations\n", "4. Use LLM models to ask questions and generate embeddings on images\n", - "5. PDF chunking function" + "5. PDF chunking function\n", + "6. Transcribe audio" ] }, { @@ -215,23 +216,23 @@ " \n", " \n", " 0\n", - " \n", + " \n", " \n", " \n", " 1\n", - " \n", + " \n", " \n", " \n", " 2\n", - " \n", + " \n", " \n", " \n", " 3\n", - " \n", + " \n", " \n", " \n", " 4\n", - " \n", + " \n", " \n", " \n", "\n", @@ -297,21 +298,21 @@ "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:124: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:121: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", "version. Use `json_query` instead.\n", " warnings.warn(bfe.format_message(msg), category=UserWarning)\n", "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:124: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:121: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", "version. Use `json_query` instead.\n", " warnings.warn(bfe.format_message(msg), category=UserWarning)\n", "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:124: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:121: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", "version. Use `json_query` instead.\n", " warnings.warn(bfe.format_message(msg), category=UserWarning)\n", "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", @@ -351,7 +352,7 @@ " \n", " \n", " 0\n", - " \n", + " \n", " alice\n", " image/png\n", " 1591240\n", @@ -359,7 +360,7 @@ " \n", " \n", " 1\n", - " \n", + " \n", " bob\n", " image/png\n", " 1182951\n", @@ -367,7 +368,7 @@ " \n", " \n", " 2\n", - " \n", + " \n", " bob\n", " image/png\n", " 1520884\n", @@ -375,7 +376,7 @@ " \n", " \n", " 3\n", - " \n", + " \n", " alice\n", " image/png\n", " 1235401\n", @@ -383,7 +384,7 @@ " \n", " \n", " 4\n", - " \n", + " \n", " bob\n", " image/png\n", " 1591923\n", @@ -463,7 +464,7 @@ "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:124: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:121: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", "version. Use `json_query` instead.\n", " warnings.warn(bfe.format_message(msg), category=UserWarning)\n" ] @@ -471,7 +472,7 @@ { "data": { "text/html": [ - "" + "" ], "text/plain": [ "" @@ -483,7 +484,7 @@ { "data": { "text/html": [ - "" + "" ], "text/plain": [ "" @@ -527,19 +528,19 @@ "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", " return method(*args, **kwargs)\n", "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", " return method(*args, **kwargs)\n", "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", " return method(*args, **kwargs)\n" ] } @@ -579,7 +580,7 @@ "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", " return method(*args, **kwargs)\n" ] } @@ -589,9 +590,119 @@ "df_image[\"blur_resized\"] = df_image[\"blurred\"].blob.image_resize((300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_resize_transformed/\", engine=\"opencv\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using `verbose` mode for detailed output\\n\n", + "\\n\n", + "All multimodal functions support a `verbose` parameter, which defaults to `False`.\\n\n", + "\\n\n", + "* When `verbose=False` (the default), the function will only return the main content of the result (e.g., the transformed image, the extracted text).\\n\n", + "* When `verbose=True`, the function returns a `STRUCT` containing two fields:\\n\n", + " * `content`: The main result of the operation.\\n\n", + " * `status`: An informational field. If the operation is successful, this will be empty. If an error occurs during the processing of a specific row, this field will contain the error message, allowing the overall job to complete without failing.\\n\n", + "\\n\n", + "Using `verbose=True` is highly recommended for debugging and for workflows where you need to handle potential failures on a row-by-row basis. Let's see it in action with the `image_blur` function." + ] + }, { "cell_type": "code", "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + " return method(*args, **kwargs)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
blurred_verbose
0{'status': '', 'content': {'uri': 'gs://bigfra...
1{'status': '', 'content': {'uri': 'gs://bigfra...
2{'status': '', 'content': {'uri': 'gs://bigfra...
3{'status': '', 'content': {'uri': 'gs://bigfra...
4{'status': '', 'content': {'uri': 'gs://bigfra...
\n", + "

5 rows × 1 columns

\n", + "
[5 rows x 1 columns in total]" + ], + "text/plain": [ + " blurred_verbose\n", + "0 {'status': '', 'content': {'uri': 'gs://bigfra...\n", + "1 {'status': '', 'content': {'uri': 'gs://bigfra...\n", + "2 {'status': '', 'content': {'uri': 'gs://bigfra...\n", + "3 {'status': '', 'content': {'uri': 'gs://bigfra...\n", + "4 {'status': '', 'content': {'uri': 'gs://bigfra...\n", + "\n", + "[5 rows x 1 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_image[\"blurred_verbose\"] = df_image[\"image\"].blob.image_blur(\n", + " (20, 20), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_transformed_verbose/\", engine=\"opencv\", verbose=True\n", + ")\n", + "df_image[[\"blurred_verbose\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -657,73 +768,79 @@ " resized\n", " normalized\n", " blur_resized\n", + " blurred_verbose\n", " \n", " \n", " \n", " \n", " 0\n", - " \n", + " \n", " alice\n", " image/png\n", " 1591240\n", " 2025-03-20 17:45:04+00:00\n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " {'status': '', 'content': {'uri': 'gs://bigframes_blob_test/image_blur_transformed_verbose/k9-guard-dog-paw-balm.png', 'version': None, 'authorizer': 'bigframes-dev.us.bigframes-default-connection', 'details': None}}\n", " \n", " \n", " 1\n", - " \n", + " \n", " bob\n", " image/png\n", " 1182951\n", " 2025-03-20 17:45:02+00:00\n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " {'status': '', 'content': {'uri': 'gs://bigframes_blob_test/image_blur_transformed_verbose/k9-guard-dog-hot-spot-spray.png', 'version': None, 'authorizer': 'bigframes-dev.us.bigframes-default-connection', 'details': None}}\n", " \n", " \n", " 2\n", - " \n", + " \n", " bob\n", " image/png\n", " 1520884\n", " 2025-03-20 17:44:55+00:00\n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " {'status': '', 'content': {'uri': 'gs://bigframes_blob_test/image_blur_transformed_verbose/fluffy-buns-chinchilla-food-variety-pack.png', 'version': None, 'authorizer': 'bigframes-dev.us.bigframes-default-connection', 'details': None}}\n", " \n", " \n", " 3\n", - " \n", + " \n", " alice\n", " image/png\n", " 1235401\n", " 2025-03-20 17:45:19+00:00\n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " {'status': '', 'content': {'uri': 'gs://bigframes_blob_test/image_blur_transformed_verbose/purrfect-perch-cat-scratcher.png', 'version': None, 'authorizer': 'bigframes-dev.us.bigframes-default-connection', 'details': None}}\n", " \n", " \n", " 4\n", - " \n", + " \n", " bob\n", " image/png\n", " 1591923\n", " 2025-03-20 17:44:47+00:00\n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " {'status': '', 'content': {'uri': 'gs://bigframes_blob_test/image_blur_transformed_verbose/chirpy-seed-deluxe-bird-food.png', 'version': None, 'authorizer': 'bigframes-dev.us.bigframes-default-connection', 'details': None}}\n", " \n", " \n", "\n", - "

5 rows × 9 columns

\n", - "[5 rows x 9 columns in total]" + "

5 rows × 10 columns

\n", + "[5 rows x 10 columns in total]" ], "text/plain": [ " image author content_type \\\n", @@ -761,17 +878,24 @@ "3 {'uri': 'gs://bigframes_blob_test/image_normal... \n", "4 {'uri': 'gs://bigframes_blob_test/image_normal... \n", "\n", - " blur_resized \n", - "0 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", - "1 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", - "2 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", - "3 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", - "4 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + " blur_resized \\\n", + "0 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + "1 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + "2 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + "3 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + "4 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", "\n", - "[5 rows x 9 columns]" + " blurred_verbose \n", + "0 {'status': '', 'content': {'uri': 'gs://bigfra... \n", + "1 {'status': '', 'content': {'uri': 'gs://bigfra... \n", + "2 {'status': '', 'content': {'uri': 'gs://bigfra... \n", + "3 {'status': '', 'content': {'uri': 'gs://bigfra... \n", + "4 {'status': '', 'content': {'uri': 'gs://bigfra... \n", + "\n", + "[5 rows x 10 columns]" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -791,7 +915,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": { "id": "mRUGfcaFVW-3" }, @@ -800,7 +924,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", "default model will be removed in BigFrames 3.0. Please supply an\n", "explicit model to avoid this message.\n", " return method(*args, **kwargs)\n" @@ -814,7 +938,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -874,13 +998,13 @@ " \n", " \n", " 0\n", - " The item is a tin of K9Guard Dog Paw Balm.\n", - " \n", + " The item is a tin of K9 Guard dog paw balm.\n", + " \n", " \n", " \n", " 1\n", - " The item is a bottle of K9 Guard Dog Hot Spot Spray.\n", - " \n", + " The item is K9 Guard Dog Hot Spot Spray.\n", + " \n", " \n", " \n", "\n", @@ -888,9 +1012,9 @@ "[2 rows x 2 columns in total]" ], "text/plain": [ - " ml_generate_text_llm_result \\\n", - "0 The item is a tin of K9Guard Dog Paw Balm. \n", - "1 The item is a bottle of K9 Guard Dog Hot Spot ... \n", + " ml_generate_text_llm_result \\\n", + "0 The item is a tin of K9 Guard dog paw balm. \n", + "1 The item is K9 Guard Dog Hot Spot Spray. \n", "\n", " image \n", "0 {'uri': 'gs://cloud-samples-data/bigquery/tuto... \n", @@ -899,7 +1023,7 @@ "[2 rows x 2 columns]" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -913,7 +1037,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": { "id": "IG3J3HsKhyBY" }, @@ -936,7 +1060,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -996,13 +1120,13 @@ " \n", " \n", " 0\n", - " The item is dog paw balm.\n", - " \n", + " The item is a tin of K9Guard Dog Paw Balm.\n", + " \n", " \n", " \n", " 1\n", - " The picture features a white bottle with a light blue spray nozzle and accents. The background is a neutral gray.\\n\n", - " \n", + " The bottle is mostly white, with a light blue accents. The background is a light gray. There are also black and green elements on the bottle's label.\n", + " \n", " \n", " \n", "\n", @@ -1011,8 +1135,8 @@ ], "text/plain": [ " ml_generate_text_llm_result \\\n", - "0 The item is dog paw balm. \n", - "1 The picture features a white bottle with a lig... \n", + "0 The item is a tin of K9Guard Dog Paw Balm. \n", + "1 The bottle is mostly white, with a light blue ... \n", "\n", " image \n", "0 {'uri': 'gs://cloud-samples-data/bigquery/tuto... \n", @@ -1021,7 +1145,7 @@ "[2 rows x 2 columns]" ] }, - "execution_count": 14, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1033,7 +1157,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1047,7 +1171,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", "default model will be removed in BigFrames 3.0. Please supply an\n", "explicit model to avoid this message.\n", " return method(*args, **kwargs)\n", @@ -1096,19 +1220,19 @@ " \n", " \n", " 0\n", - " [ 0.00638846 0.01666372 0.00451786 ... -0.02...\n", + " [ 0.00638842 0.01666344 0.00451782 ... -0.02...\n", " \n", " <NA>\n", " <NA>\n", - " {\"access_urls\":{\"expiry_time\":\"2025-10-09T12:2...\n", + " {\"access_urls\":{\"expiry_time\":\"2025-10-25T00:2...\n", " \n", " \n", " 1\n", - " [ 0.0097399 0.0214815 0.00244266 ... 0.00...\n", + " [ 0.00973689 0.02148374 0.00244311 ... 0.00...\n", " \n", " <NA>\n", " <NA>\n", - " {\"access_urls\":{\"expiry_time\":\"2025-10-09T12:2...\n", + " {\"access_urls\":{\"expiry_time\":\"2025-10-25T00:2...\n", " \n", " \n", "\n", @@ -1117,8 +1241,8 @@ ], "text/plain": [ " ml_generate_embedding_result \\\n", - "0 [ 0.00638846 0.01666372 0.00451786 ... -0.02... \n", - "1 [ 0.0097399 0.0214815 0.00244266 ... 0.00... \n", + "0 [ 0.00638842 0.01666344 0.00451782 ... -0.02... \n", + "1 [ 0.00973689 0.02148374 0.00244311 ... 0.00... \n", "\n", " ml_generate_embedding_status ml_generate_embedding_start_sec \\\n", "0 \n", @@ -1129,13 +1253,13 @@ "1 \n", "\n", " content \n", - "0 {\"access_urls\":{\"expiry_time\":\"2025-10-09T12:2... \n", - "1 {\"access_urls\":{\"expiry_time\":\"2025-10-09T12:2... \n", + "0 {\"access_urls\":{\"expiry_time\":\"2025-10-25T00:2... \n", + "1 {\"access_urls\":{\"expiry_time\":\"2025-10-25T00:2... \n", "\n", "[2 rows x 5 columns]" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1158,7 +1282,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": { "id": "oDDuYtUm5Yiy" }, @@ -1180,7 +1304,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1197,9 +1321,12 @@ "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", " return method(*args, **kwargs)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:244: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n", + "future version. Use `json_value_array` instead.\n", + " warnings.warn(bfe.format_message(msg), category=UserWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n", "future version. Use `json_value_array` instead.\n", " warnings.warn(bfe.format_message(msg), category=UserWarning)\n" ] @@ -1211,7 +1338,78 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + " return method(*args, **kwargs)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n", + "future version. Use `json_value_array` instead.\n", + " warnings.warn(bfe.format_message(msg), category=UserWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chunked_verbose
0{'status': '', 'content': array([\"CritterCuisi...
\n", + "

1 rows × 1 columns

\n", + "
[1 rows x 1 columns in total]" + ], + "text/plain": [ + " chunked_verbose\n", + "0 {'status': '', 'content': array([\"CritterCuisi...\n", + "\n", + "[1 rows x 1 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_pdf[\"chunked_verbose\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\", verbose=True)\n", + "df_pdf[[\"chunked_verbose\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, "metadata": { "id": "kaPvJATN7zlw" }, @@ -1239,7 +1437,7 @@ "Name: chunked, dtype: string" ] }, - "execution_count": 18, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1258,7 +1456,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -1279,7 +1477,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1303,7 +1501,7 @@ "Name: transcribed_content, dtype: string" ] }, - "execution_count": 20, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1312,6 +1510,42 @@ "transcribed_series = df['audio'].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=False)\n", "transcribed_series" ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/plain": [ + "0 {'status': '', 'content': 'Now, as all books, ...\n", + "Name: transcription_results, dtype: struct[pyarrow]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transcribed_series_verbose = df['audio'].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=True)\n", + "transcribed_series_verbose" + ] } ], "metadata": { From 639c0a500ac3c2f06f3a3b5d9aa3b30d6426710d Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 27 Oct 2025 19:36:58 +0000 Subject: [PATCH 07/10] return None on error with verbose=False for image functions --- bigframes/blob/_functions.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py index a39e3d2ce1..22780967ac 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -14,6 +14,7 @@ from dataclasses import dataclass import inspect +import typing from typing import Callable, Iterable, Union import google.cloud.bigquery as bigquery @@ -70,6 +71,12 @@ def _input_bq_signature(self): def _output_bq_type(self): sig = inspect.signature(self._func) + return_annotation = sig.return_annotation + origin = typing.get_origin(return_annotation) + if origin is Union: + args = typing.get_args(return_annotation) + if len(args) == 2 and args[1] is type(None): + return _PYTHON_TO_BQ_TYPES[args[0]] return _PYTHON_TO_BQ_TYPES[sig.return_annotation] def _create_udf(self): @@ -177,7 +184,7 @@ def image_blur_func( ksize_y: int, ext: str, verbose: bool, -) -> str: +) -> typing.Optional[str]: try: import json @@ -241,9 +248,7 @@ def image_blur_func( if verbose: return json.dumps(error_result) else: - # The calling function expects a json string that can be parsed as a blob ref - # Return a valid blob ref json string with empty values. - return '{"access_urls": {"read_url": "", "write_url": ""}, "authorizer": "", "generation": "", "uri": ""}' + return None image_blur_def = FunctionDef(image_blur_func, ["opencv-python", "numpy", "requests"]) @@ -316,7 +321,7 @@ def image_resize_func( fy: float, ext: str, verbose: bool, -) -> str: +) -> typing.Optional[str]: try: import json @@ -379,9 +384,7 @@ def image_resize_func( if verbose: return json.dumps(error_result) else: - # The calling function expects a json string that can be parsed as a blob ref - # Return a valid blob ref json string with empty values. - return '{"access_urls": {"read_url": "", "write_url": ""}, "authorizer": "", "generation": "", "uri": ""}' + return None image_resize_def = FunctionDef( @@ -461,7 +464,7 @@ def image_normalize_func( norm_type: str, ext: str, verbose: bool, -) -> str: +) -> typing.Optional[str]: try: import json @@ -533,9 +536,7 @@ def image_normalize_func( if verbose: return json.dumps(error_result) else: - # The calling function expects a json string that can be parsed as a blob ref - # Return a valid blob ref json string with empty values. - return '{"access_urls": {"read_url": "", "write_url": ""}, "authorizer": "", "generation": "", "uri": ""}' + return None image_normalize_def = FunctionDef( From 526ab6e4945fad2a53b9cb2247ff316254d7615d Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 27 Oct 2025 21:16:44 +0000 Subject: [PATCH 08/10] define typing module in udf --- bigframes/blob/_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py index 22780967ac..19c33ae7a1 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -85,7 +85,7 @@ def _create_udf(self): self._session._anon_dataset_manager.generate_unique_resource_id() ) - func_body = inspect.getsource(self._func) + func_body = "import typing\n" + inspect.getsource(self._func) func_name = self._func.__name__ packages = str(list(self._requirements)) From ebc9aeec3edee2d6e810502034b24fb604609c17 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 27 Oct 2025 21:25:46 +0000 Subject: [PATCH 09/10] only use local variable --- bigframes/blob/_functions.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py index 19c33ae7a1..3dfe38811b 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -243,9 +243,11 @@ def image_blur_func( return dst_obj_ref_rt except Exception as e: - # Return error in structured format - error_result = {"status": f"Error: {type(e).__name__}: {str(e)}", "content": ""} if verbose: + error_result = { + "status": f"Error: {type(e).__name__}: {str(e)}", + "content": "", + } return json.dumps(error_result) else: return None @@ -379,9 +381,11 @@ def image_resize_func( return dst_obj_ref_rt except Exception as e: - # Return error in structured format - error_result = {"status": f"Error: {type(e).__name__}: {str(e)}", "content": ""} if verbose: + error_result = { + "status": f"Error: {type(e).__name__}: {str(e)}", + "content": "", + } return json.dumps(error_result) else: return None @@ -531,9 +535,11 @@ def image_normalize_func( return dst_obj_ref_rt except Exception as e: - # Return error in structured format - error_result = {"status": f"Error: {type(e).__name__}: {str(e)}", "content": ""} if verbose: + error_result = { + "status": f"Error: {type(e).__name__}: {str(e)}", + "content": "", + } return json.dumps(error_result) else: return None From f14cf555925f4efdb422be895ed24e1bbe81d47f Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 29 Oct 2025 06:01:19 +0000 Subject: [PATCH 10/10] Refactor code --- bigframes/operations/blob.py | 89 ++++++++++-------------------------- 1 file changed, 25 insertions(+), 64 deletions(-) diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 66ba9ee8e2..577de458f4 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -193,6 +193,20 @@ def _df_apply_udf( return s + def _apply_udf_or_raise_error( + self, df: bigframes.dataframe.DataFrame, udf, operation_name: str + ) -> bigframes.series.Series: + """Helper to apply UDF with consistent error handling.""" + try: + res = self._df_apply_udf(df, udf) + except Exception as e: + raise RuntimeError(f"{operation_name} UDF execution failed: {e}") from e + + if res is None: + raise RuntimeError(f"{operation_name} returned None result") + + return res + def read_url(self) -> bigframes.series.Series: """Retrieve the read URL of the Blob. @@ -368,13 +382,7 @@ def exif( container_memory=container_memory, ).udf() - try: - res = self._df_apply_udf(df, exif_udf) - except Exception as e: - raise RuntimeError(f"EXIF extraction UDF execution failed: {e}") from e - - if res is None: - raise RuntimeError("EXIF extraction returned None result") + res = self._apply_udf_or_raise_error(df, exif_udf, "EXIF extraction") if verbose: try: @@ -457,13 +465,7 @@ def image_blur( df["ksize_x"], df["ksize_y"] = ksize df["ext"] = ext # type: ignore df["verbose"] = verbose - try: - res = self._df_apply_udf(df, image_blur_udf) - except Exception as e: - raise RuntimeError(f"Image blur UDF execution failed: {e}") from e - - if res is None: - raise RuntimeError("Image blur returned None result") + res = self._apply_udf_or_raise_error(df, image_blur_udf, "Image blur") if verbose: blurred_content_b64_series = res._apply_unary_op( @@ -512,13 +514,7 @@ def image_blur( df["ext"] = ext # type: ignore df["verbose"] = verbose - try: - res = self._df_apply_udf(df, image_blur_udf) - except Exception as e: - raise RuntimeError(f"Image blur UDF execution failed: {e}") from e - - if res is None: - raise RuntimeError("Image blur returned None result") + res = self._apply_udf_or_raise_error(df, image_blur_udf, "Image blur") res.cache() # to execute the udf if verbose: @@ -610,13 +606,7 @@ def image_resize( df["fx"], df["fy"] = fx, fy df["ext"] = ext # type: ignore df["verbose"] = verbose - try: - res = self._df_apply_udf(df, image_resize_udf) - except Exception as e: - raise RuntimeError(f"Image resize UDF execution failed: {e}") from e - - if res is None: - raise RuntimeError("Image resize returned None result") + res = self._apply_udf_or_raise_error(df, image_resize_udf, "Image resize") if verbose: resized_content_b64_series = res._apply_unary_op( @@ -667,13 +657,7 @@ def image_resize( df["ext"] = ext # type: ignore df["verbose"] = verbose - try: - res = self._df_apply_udf(df, image_resize_udf) - except Exception as e: - raise RuntimeError(f"Image resize UDF execution failed: {e}") from e - - if res is None: - raise RuntimeError("Image resize returned None result") + res = self._apply_udf_or_raise_error(df, image_resize_udf, "Image resize") res.cache() # to execute the udf if verbose: @@ -759,13 +743,9 @@ def image_normalize( df["norm_type"] = norm_type df["ext"] = ext # type: ignore df["verbose"] = verbose - try: - res = self._df_apply_udf(df, image_normalize_udf) - except Exception as e: - raise RuntimeError(f"Image normalize UDF execution failed: {e}") from e - - if res is None: - raise RuntimeError("Image normalize returned None result") + res = self._apply_udf_or_raise_error( + df, image_normalize_udf, "Image normalize" + ) if verbose: normalized_content_b64_series = res._apply_unary_op( @@ -816,13 +796,7 @@ def image_normalize( df["ext"] = ext # type: ignore df["verbose"] = verbose - try: - res = self._df_apply_udf(df, image_normalize_udf) - except Exception as e: - raise RuntimeError(f"Image normalize UDF execution failed: {e}") from e - - if res is None: - raise RuntimeError("Image normalize returned None result") + res = self._apply_udf_or_raise_error(df, image_normalize_udf, "Image normalize") res.cache() # to execute the udf if verbose: @@ -899,14 +873,7 @@ def pdf_extract( df = self.get_runtime_json_str(mode="R").to_frame() df["verbose"] = verbose - try: - res = self._df_apply_udf(df, pdf_extract_udf) - except Exception as e: - raise RuntimeError(f"PDF extraction UDF failed: {e}") from e - - # Validate result is not None - if res is None: - raise RuntimeError("PDF extraction returned None result") + res = self._apply_udf_or_raise_error(df, pdf_extract_udf, "PDF extraction") if verbose: # Extract content with error handling @@ -1005,13 +972,7 @@ def pdf_chunk( df["overlap_size"] = overlap_size df["verbose"] = verbose - try: - res = self._df_apply_udf(df, pdf_chunk_udf) - except Exception as e: - raise RuntimeError(f"PDF chunking UDF failed: {e}") from e - - if res is None: - raise RuntimeError("PDF chunking returned None result") + res = self._apply_udf_or_raise_error(df, pdf_chunk_udf, "PDF chunking") try: content_series = bbq.json_extract_string_array(res, "$.content")